cd1c11ae2a758a6c3a8902472a3017f35bd85220
braney
  Sat Feb 6 12:15:53 2021 -0800
tweaks to gencode V36

diff --git src/hg/makeDb/doc/ucscGenes/hg38.gencodeV36.sh src/hg/makeDb/doc/ucscGenes/hg38.gencodeV36.sh
index 615dfe2..10e5c32 100644
--- src/hg/makeDb/doc/ucscGenes/hg38.gencodeV36.sh
+++ src/hg/makeDb/doc/ucscGenes/hg38.gencodeV36.sh
@@ -1,27 +1,30 @@
 # This doc assumes that the gencode* tables have been built on $db
 db=hg38
 GENCODE_VERSION=V36
 PREV_GENCODE_VERSION=V35
 dir=/hive/data/genomes/$db/bed/gencode$GENCODE_VERSION/build
 genomes=/hive/data/genomes
 tempDb=knownGeneV36
 oldDb=knownGeneV35
 kent=$HOME/kent
 spDb=sp180404
 cpuFarm=ku
-export curVer=14
+export oldGeneDir=/cluster/data/hg38/bed/ucsc.20.1
+export oldGeneBed=$oldGeneDir/ucscGenes.bed
+export lastVer=12
+export curVer=13
 export Db=Hg38
 export xdb=mm10
 export Xdb=Mm10
 export ydb=canFam3
 export zdb=rheMac8
 export ratDb=rn6
 export RatDb=Rn6
 export fishDb=danRer11
 export flyDb=dm6
 export wormDb=ce11
 export yeastDb=sacCer3
 export tempFa=$dir/ucscGenes.faa
 export genomes=/hive/data/genomes
 export xdbFa=$genomes/$xdb/bed/ucsc.18.1/ucscGenes.faa
 export ratFa=$genomes/$ratDb/bed/ensGene.95/ensembl.faa
@@ -33,30 +36,42 @@
 export wormFa=$genomes/$wormDb/bed/ws245Genes/ws245Pep.faa
 #export wormFa=$genomes/$wormDb/bed/blastp/wormPep190.faa
 export yeastFa=$genomes/$yeastDb/bed/sgdAnnotations/blastTab/sacCer3.sgd.faa
 export scratchDir=/hive/users/braney/scratch
 
 export blastTab=hgBlastTab
 export xBlastTab=mmBlastTab
 export rnBlastTab=rnBlastTab
 export dbHost=hgwdev
 export ramFarm=ku
 export cpuFarm=ku
 
 mkdir -p $dir
 cd $dir
 
+# first get list of tables from push request in $lastVer.table.lst
+# here's the redmine http://redmine.soe.ucsc.edu/issues/21644
+wc -l $oldGeneDir/$lastVer.table.lst 
+# 62
+
+(
+cat $oldGeneDir/$lastVer.table.lst | grep -v "ToKg$lastVer" | grep -v "XrefOld" | grep -v "knownGeneOld" | grep -v "knownToGencode" 
+echo kg${lastVer}ToKg${curVer}
+echo knownGeneOld$lastVer
+echo kgXrefOld$lastVer
+) | sort >  $curVer.table.lst 
+
 echo "create database $tempDb" | hgsql ""
 echo "create table chromInfo like  $db.chromInfo" | hgsql $tempDb
 echo "insert into chromInfo select * from   $db.chromInfo" | hgsql $tempDb
 
 hgsql -e "select * from gencodeAnnot$GENCODE_VERSION" --skip-column-names $db | cut -f 2-16 |  genePredToBed stdin stdout | sort -k1,1 -k2,2n | gzip -c > gencode${GENCODE_VERSION}.bed.gz
 
 touch oldToNew.tab
 
 zcat gencode${GENCODE_VERSION}.bed.gz > ucscGenes.bed
 txBedToGraph ucscGenes.bed ucscGenes ucscGenes.txg
 txgAnalyze ucscGenes.txg $genomes/$db/$db.2bit stdout | sort | uniq | bedClip stdin /cluster/data/hg38/chrom.sizes  ucscSplice.bed
 hgLoadBed $tempDb knownAlt ucscSplice.bed
 
 zcat gencode${GENCODE_VERSION}.bed.gz |  awk '{print $4}' | sort > newGencodeName.txt
 hgsql $oldDb -Ne "select name,alignId from knownGene" | sort > oldGenToUcsc.txt
@@ -119,30 +134,33 @@
 tawk '{split($2,a,"."); for(ii = 1; ii <= a[2]; ii++) print $1,a[1] "." ii }' txToAcc.tab >> foo.alias
 tawk '{split($1,a,"."); for(ii = 1; ii <= a[2] - 1; ii++) print $1,a[1] "." ii }' txToAcc.tab >> foo.alias
 sort foo.alias | uniq > ucscGenes.alias
 sort foo.protAlias | uniq > ucscGenes.protAlias
 rm foo.alias foo.protAlias
 hgLoadSqlTab -notOnServer $tempDb kgAlias $kent/src/hg/lib/kgAlias.sql ucscGenes.alias
 hgLoadSqlTab -notOnServer $tempDb kgProtAlias $kent/src/hg/lib/kgProtAlias.sql ucscGenes.protAlias
 hgsql $tempDb -N -e 'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' > kgSpAlias_0.tmp
 
 hgsql $tempDb -N -e 'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID' >> kgSpAlias_0.tmp
 cat kgSpAlias_0.tmp|sort -u  > kgSpAlias.tab
 rm kgSpAlias_0.tmp
 
 hgLoadSqlTab -notOnServer $tempDb kgSpAlias $kent/src/hg/lib/kgSpAlias.sql kgSpAlias.tab
 
+txGeneExplainUpdate2 $oldGeneBed ucscGenes.bed kgOldToNew.tab
+hgLoadSqlTab -notOnServer $tempDb kg${lastVer}ToKg${curVer} $kent/src/hg/lib/kg1ToKg2.sql kgOldToNew.tab
+
 
 #ifdef NOTNOW
 hgsql $db -Ne "select * from gencodeToUniProt$GENCODE_VERSION" | tawk '{print $1,$2}'|  sort > uniProt.txt
 hgsql $db -Ne "select * from gencodeAnnot$GENCODE_VERSION" | tawk '{print $1,$12}' | sort > gene.txt
 join -a 1 gene.txt uniProt.txt > geneNames.txt
 #endif
 
 //genePredToBigGenePred -score=knownScore.txt  -colors=colors.txt -geneNames=geneNames.txt  -known gencodeAnnot$GENCODE_VERSION.txt  gencodeAnnot$GENCODE_VERSION.bgpInput
 #hgsql $db -Ne "select * from gencodeAnnot$GENCODE_VERSION" | cut -f 2- > gencodeAnnot$GENCODE_VERSION.txt
 #genePredToBigGenePred -colors=colors.txt gencodeAnnot$GENCODE_VERSION.txt stdout | sort -k1,1 -k2,2n >  gencodeAnnot$GENCODE_VERSION.bgpInput
 
 hgsql $tempDb -Ne "select kgId, geneSymbol, spID from kgXref" > geneNames.txt
 #hgsql $tempDb -Ne "select * from knownCds" > knownCds.txt
 #genePredToBigGenePred -colors=colors.txt -known knownGene.gp stdout | sort -k1,1 -k2,2n >  gencodeAnnot$GENCODE_VERSION.bgpInput
 genePredToBigGenePred -colors=colors.txt -geneNames=geneNames.txt -known -cds=knownCds.tab   knownGene.gp  stdout | sort -k1,1 -k2,2n >  gencodeAnnot$GENCODE_VERSION.bgpInput
@@ -627,15 +645,77 @@
 sort -k 2 /hive/data/outside/scop/1.75/model.tab > scop.model.tab
 scopCollapse scopPlusScore.tab scop.model.tab ucscScop.tab \
 	scopDesc.tab knownToSuper.tab
 hgLoadSqlTab -notOnServer $tempDb scopDesc $kent/src/hg/lib/scopDesc.sql scopDesc.tab
 hgLoadSqlTab $tempDb knownToSuper $kent/src/hg/lib/knownToSuper.sql knownToSuper.tab
 #hgsql $tempDb -e "delete k from knownToSuper k, kgXref x where k.gene = x.kgID and x.geneSymbol = 'abParts'"
 
 
 hgMapToGene -geneTableType=genePred -tempDb=$tempDb $db affyU133 knownGene knownToU133
 hgMapToGene -geneTableType=genePred -tempDb=$tempDb $db affyU95 knownGene knownToU95
     mkdir hprd
     cd hprd
     wget "http://www.hprd.org/edownload/HPRD_FLAT_FILES_041310"
     tar xvf HPRD_FLAT_FILES_041310
     knownToHprd $tempDb FLAT_FILES_072010/HPRD_ID_MAPPINGS.txt
+
+    time hgExpDistance $tempDb hgFixed.gnfHumanU95MedianRatio \
+	    hgFixed.gnfHumanU95Exps gnfU95Distance  -lookup=knownToU95
+    time hgExpDistance $tempDb hgFixed.gnfHumanAtlas2MedianRatio \
+	hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \
+	-lookup=knownToGnfAtlas2
+
+# Build knownToMupit
+
+mkdir mupit
+cd mupit
+
+# mupit-pdbids.txt was emailed from Kyle Moad (kmoad@insilico.us.com)
+# wc -l 
+cp /hive/data/outside/mupit/mupit-pdbids.txt .
+# get knownGene IDs and associated PDB IDS
+# the extDb{Ref} parts come from hg/hgGene/domains.c:domainsPrint()
+hgsql -Ne "select kgID, extAcc1 from $db.kgXref x \
+    inner join sp180404.extDbRef sp on x.spID = sp.acc \
+    inner join sp180404.extDb e on sp.extDb=e.id \
+    where x.spID != '' and e.val='PDB' order by kgID" \
+    > $db.knownToPdb.txt;
+# filter out pdbIds not found in mupit
+cat mupit-pdbids.txt | tr '[a-z]' '[A-Z]' | \
+    grep -Fwf - $db.knownToPdb.txt >  knownToMupit.txt;
+# check that it filtered correctly:
+# cut -f2 $db.knownToMuipit.txt | sort -u | wc -l;
+# load new table for hgGene/hgc
+hgLoadSqlTab $db knownToMupit ~/kent/src/hg/lib/knownTo.sql knownToMupit.txt
+
+#myGene2
+mkdir $dir/myGene2
+cd $dir/myGene2
+
+# copy list of genes from https://mygene2.org/MyGene2/genes 
+awk '{print $1}' | sort > genes.lst
+hgsql hg38 -Ne "select geneSymbol, kgId from kgXref" | sort > ids.txt
+join -t $'\t' genes.lst  ids.txt | tawk '{print $2,$1}' | sort > knownToMyGene2.txt
+hgLoadSqlTab $db knownToMyGene2 ~/kent/src/hg/lib/knownTo.sql knownToMyGene2.txt
+
+# make knownToNextProt
+mkdir -p $dir/nextProt
+cd $dir/nextProt
+
+wget "ftp://ftp.nextprot.org/pub/current_release/ac_lists/nextprot_ac_list_all.txt"
+awk '{print $0, $0}' nextprot_ac_list_all.txt | sed 's/NX_//' | sort > displayIdToNextProt.txt
+hgsql -e "select spID,kgId from kgXref" --skip-column-names $tempDb | awk '{if (NF == 2) print}' | sort > displayIdToKgId.txt
+join displayIdToKgId.txt displayIdToNextProt.txt | awk 'BEGIN {OFS="\t"} {print $2,$3}' > knownToNextProt.tab
+hgLoadSqlTab -notOnServer $tempDb  knownToNextProt $kent/src/hg/lib/knownTo.sql  knownToNextProt.tab
+
+# this should be done AFTER moving the new tables into hg38
+hgKgGetText $tempDb tempSearch.txt
+sort tempSearch.txt > tempSearch2.txt
+tawk '{split($2,a,"."); printf "%s\t", $1;for(ii = 1; ii <= a[2]; ii++) printf "%s ",a[1] "." ii; printf "\n" }' txToAcc.tab | sort > tempSearch3.txt
+tawk '{split($2,a,"."); printf "%s\t%s ", $1,a[1];for(ii = 1; ii <= a[2]; ii++) printf "%s ",a[1] "." ii; printf "\n" }' knownAttrs.tab | sort > tempSearch4.txt
+join tempSearch2.txt tempSearch3.txt | join /dev/stdin tempSearch4.txt | sort > gencode$GENCODE_VERSION.txt
+ixIxx gencode$GENCODE_VERSION.txt gencode$GENCODE_VERSION.ix gencode$GENCODE_VERSION.ixx
+ rm -rf /gbdb/$tempDb/gencode/gencode$GENCODE_VERSION.ix /gbdb/$tempDb/gencode/gencode$GENCODE_VERSION.ixx
+ln -s $dir/gencode$GENCODE_VERSION.ix  /gbdb/$tempDb/gencode/gencode$GENCODE_VERSION.ix
+ln -s $dir/gencode$GENCODE_VERSION.ixx /gbdb/$tempDb/gencode/gencode$GENCODE_VERSION.ixx  
+
+