src/hg/makeDb/doc/ucscGenes/hg38.ucscGenes20.sh c7d442d0346b223715d4ab83ff7bb5184843db4d

c7d442d0346b223715d4ab83ff7bb5184843db4d
braney
  Tue Oct 8 11:22:33 2019 -0700
final touches to mm10 and hg38 knownGene Gencode

diff --git src/hg/makeDb/doc/ucscGenes/hg38.ucscGenes20.sh src/hg/makeDb/doc/ucscGenes/hg38.ucscGenes20.sh
index fea823f..3780d19 100755
--- src/hg/makeDb/doc/ucscGenes/hg38.ucscGenes20.sh
+++ src/hg/makeDb/doc/ucscGenes/hg38.ucscGenes20.sh
@@ -133,31 +133,32 @@
 
 hgLoadSqlTab -notOnServer $tempDb knownGene $kent/src/hg/lib/knownGene.sql knownGene.gp
 hgLoadGenePred -genePredExt $tempDb  knownGeneExt knownGeneExt.gp
 
 #getRnaPred -genePredExt -peptides $tempDb knownGeneExt all ucscGenes.faa
 genePredToProt knownGeneExt.gp /cluster/data/$db/$db.2bit tmp.faa
 faFilter -uniq tmp.faa ucscGenes.faa
 hgPepPred $tempDb generic knownGenePep ucscGenes.faa
 
 hgMapToGene -type=psl -all -tempDb=$tempDb $db all_mrna knownGene knownToMrna
 hgMapToGene -tempDb=$tempDb $db refGene knownGene knownToRefSeq
 hgMapToGene -type=psl -tempDb=$tempDb $db all_mrna knownGene knownToMrnaSingle
 
 makeGencodeKnownGene $db $tempDb $GENCODE_VERSION txToAcc.tab
 
-hgsql $tempDb -Ne "select k.name, g.geneId, g.geneStatus, g.geneType,g.transcriptName,g.transcriptType,g.transcriptStatus, g.havanaGeneId,  g.ccdsId, g.level, g.transcriptClass from knownGene k, $db.wgEncodeGencodeAttrs$GENCODE_VERSION g where k.alignID=g.transcriptId" | sort | uniq > knownAttrs.tab
+hgsql $tempDb -Ne "select k.name, g.geneId, g.geneStatus, g.geneType,g.transcriptName,g.transcriptType,g.transcriptStatus, g.havanaGeneId,  g.ccdsId, g.level, g.transcriptClass from knownGene k, $db.wgEncodeGencodeAttrs$GENCODE_VERSION g where k.name=g.transcriptId" | sort | uniq > knownAttrs.tab
+
 hgLoadSqlTab -notOnServer $tempDb knownAttrs $kent/src/hg/lib/knownAttrs.sql knownAttrs.tab
 
 #tawk '$4=="new" {print $3}' oldToNew.tab | sort > new.txt
 #sort knownGene.gp | join -t $'\t' new.txt /dev/stdin > new.gp
 #sort knownGene.gp | join -t $'\t' lost.txt /dev/stdin | wc
 # should be zero
 # tawk '{print $12}' hg38.lost.gp | while read name; do grep $name /tmp/2; done | wc
 
 sort kgColor.tab | uniq | hgLoadSqlTab -notOnServer $tempDb kgColor $kent/src/hg/lib/kgColor.sql stdin
  
 hgLoadSqlTab -notOnServer $tempDb knownIsoforms $kent/src/hg/lib/knownIsoforms.sql knownIsoforms.tab
 
 hgLoadSqlTab -notOnServer $tempDb kgXref $kent/src/hg/lib/kgXref.sql kgXref.tab
 
 hgLoadSqlTab -notOnServer $tempDb knownCanonical $kent/src/hg/lib/knownCanonical.sql knownCanonical.tab
@@ -200,32 +201,34 @@
 # this should be done AFTER moving the new tables into hg38
 hgKgGetText $tempDb tempSearch.txt
 sort tempSearch.txt > tempSearch2.txt
 tawk '{split($2,a,"."); printf "%s\t", $1;for(ii = 1; ii <= a[2]; ii++) printf "%s ",a[1] "." ii; printf "\n" }' txToAcc.tab | sort > tempSearch3.txt
 join tempSearch2.txt tempSearch3.txt | sort > knownGene.txt
 ixIxx knownGene.txt knownGene.ix knownGene.ixx
  rm -rf /gbdb/$tempDb/knownGene.ix /gbdb/$tempDb/knownGene.ixx
 ln -s $dir/knownGene.ix  /gbdb/$tempDb/knownGene.ix
 ln -s $dir/knownGene.ixx /gbdb/$tempDb/knownGene.ixx  
 
 
 hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from hgFixed.refLink" $db > refToLl.txt
 hgMapToGene -tempDb=$tempDb $db refGene knownGene knownToLocusLink -lookup=refToLl.txt
 knownToVisiGene $tempDb -probesDb=$db
 
-awk '{OFS="\t"} {print $2,$1}' tmp1 | sort > knownToEnsembl.tab
-tawk '{print $2,$1}' tmp1 | sort > knownToGencode${GENCODE_VERSION}.tab
+awk '{OFS="\t"} {print $4,$4}' ucscGenes.bed | sort > knownToEnsembl.tab
+cp knownToEnsembl.tab knownToGencode${GENCODE_VERSION}.tab
+#awk '{OFS="\t"} {print $2,$1}' tmp1 | sort > knownToEnsembl.tab
+#tawk '{print $2,$1}' tmp1 | sort > knownToGencode${GENCODE_VERSION}.tab
 hgLoadSqlTab -notOnServer $tempDb  knownToEnsembl  $kent/src/hg/lib/knownTo.sql  knownToEnsembl.tab
 hgLoadSqlTab -notOnServer $tempDb  knownToGencode${GENCODE_VERSION}  $kent/src/hg/lib/knownTo.sql  knownToGencode${GENCODE_VERSION}.tab
 
 hgMapToGene -tempDb=$tempDb $db gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12'
 
 
 if ($db =~ hg*) then
     #hgMapToGene -exclude=abGenes.txt -tempDb=$tempDb $db HInvGeneMrna knownGene knownToHInv
     #hgMapToGene -exclude=abGenes.txt -tempDb=$tempDb $db affyU133Plus2 knownGene knownToU133Plus2
     hgMapToGene -tempDb=$tempDb $db affyU133 knownGene knownToU133
     hgMapToGene -tempDb=$tempDb $db affyU95 knownGene knownToU95
     mkdir hprd
     cd hprd
     wget "http://www.hprd.org/edownload/HPRD_FLAT_FILES_041310"
     tar xvf HPRD_FLAT_FILES_041310
@@ -922,54 +925,41 @@
 #
 # Finally, need to wait until after testing, but update databases in other organisms
 # with blastTabs
 
 # Load blastTabs
 cd $dir/hgNearBlastp
 hgLoadBlastTab $xdb $blastTab run.$xdb.$tempDb/out/*.tab
 hgLoadBlastTab $ratDb $blastTab run.$ratDb.$tempDb/out/*.tab 
 hgLoadBlastTab $flyDb $blastTab run.$flyDb.$tempDb/recipBest.tab
 hgLoadBlastTab $wormDb $blastTab run.$wormDb.$tempDb/recipBest.tab
 hgLoadBlastTab $yeastDb $blastTab run.$yeastDb.$tempDb/recipBest.tab
 hgLoadBlastTab $fishDb $blastTab run.$fishDb.$tempDb/recipBest.tab
 
 # Do synteny on mouse/human/rat
 synBlastp.csh $xdb $db
-#old number of unique query values: 45399
-#old number of unique target values 22999
-#new number of unique query values: 42015
-#new number of unique target values 22470
+# old number of unique query values: 61250
+# old number of unique target values 27574
+# new number of unique query values: 52518
+# new number of unique target values 25367
 
 synBlastp.csh $ratDb $db ensGene knownGene
-#old number of unique query values:  27888  
-#old number of unique target values  18988  
-#new number of unique query values:  24530  
-#new number of unique target values  18411  
-
-
-# need to generate multiz downloads
-#/usr/local/apache/htdocs-hgdownload/goldenPath/hg38/multiz46way/alignments/knownCanonical.exonAA.fa.gz
-#/usr/local/apache/htdocs-hgdownload/goldenPath/hg38/multiz46way/alignments/knownCanonical.exonNuc.fa.gz
-#/usr/local/apache/htdocs-hgdownload/goldenPath/hg38/multiz46way/alignments/knownGene.exonAA.fa.gz
-#/usr/local/apache/htdocs-hgdownload/goldenPath/hg38/multiz46way/alignments/knownGene.exonNuc.fa.gz
-#/usr/local/apache/htdocs-hgdownload/goldenPath/hg38/multiz46way/alignments/md5sum.txt
-
-echo
-echo "see the bottom of the script for details about knownToWikipedia"
-echo
-# Clean up
-rm -r run.*/out
+# old number of unique query values: 28159
+# old number of unique target values 19155
+# new number of unique query values: 23777
+# new number of unique target values 17885
+
 
 # Last step in setting up isPCR: after the new UCSC Genes with the new Known Gene isPcr
 # is released, take down the old isPcr gfServer  
 
 #######################
 ### The following is the process Briam Lee used to pull out only
 #   the genes from knownToLocusLink for which there are Wikipedia articles.
 ### get the full knownToLocusLinkTable
 # hgsql -Ne 'select value from knownToLocusLink' hg38 | sort -u >> knToLocusLink
 ###   query Wikipedia for each to if there is an article
 # for i in $(cat knToLocusLink); do lynx -dump "http://genewiki.sulab.org/map/wiki/"$i | grep -m 1 "no results" >trash ; echo $? $i | grep "1 "| awk '{print $2}'>> workingLinks; done
 ###   pull out all isoforms that have permitted LocusLinkIds
 # for i in $(cat workingLinks); do hgsql -Ne 'select * from knownToLocusLink where value like "'$i'"' hg38 >> knownToWikipediaNew; done
 ###   then load the table as knownToWikipedia using the knowToLocusLink INDICES.