c7d442d0346b223715d4ab83ff7bb5184843db4d braney Tue Oct 8 11:22:33 2019 -0700 final touches to mm10 and hg38 knownGene Gencode diff --git src/hg/makeDb/doc/ucscGenes/hg38.ucscGenes20.sh src/hg/makeDb/doc/ucscGenes/hg38.ucscGenes20.sh index fea823f..3780d19 100755 --- src/hg/makeDb/doc/ucscGenes/hg38.ucscGenes20.sh +++ src/hg/makeDb/doc/ucscGenes/hg38.ucscGenes20.sh @@ -133,31 +133,32 @@ hgLoadSqlTab -notOnServer $tempDb knownGene $kent/src/hg/lib/knownGene.sql knownGene.gp hgLoadGenePred -genePredExt $tempDb knownGeneExt knownGeneExt.gp #getRnaPred -genePredExt -peptides $tempDb knownGeneExt all ucscGenes.faa genePredToProt knownGeneExt.gp /cluster/data/$db/$db.2bit tmp.faa faFilter -uniq tmp.faa ucscGenes.faa hgPepPred $tempDb generic knownGenePep ucscGenes.faa hgMapToGene -type=psl -all -tempDb=$tempDb $db all_mrna knownGene knownToMrna hgMapToGene -tempDb=$tempDb $db refGene knownGene knownToRefSeq hgMapToGene -type=psl -tempDb=$tempDb $db all_mrna knownGene knownToMrnaSingle makeGencodeKnownGene $db $tempDb $GENCODE_VERSION txToAcc.tab -hgsql $tempDb -Ne "select k.name, g.geneId, g.geneStatus, g.geneType,g.transcriptName,g.transcriptType,g.transcriptStatus, g.havanaGeneId, g.ccdsId, g.level, g.transcriptClass from knownGene k, $db.wgEncodeGencodeAttrs$GENCODE_VERSION g where k.alignID=g.transcriptId" | sort | uniq > knownAttrs.tab +hgsql $tempDb -Ne "select k.name, g.geneId, g.geneStatus, g.geneType,g.transcriptName,g.transcriptType,g.transcriptStatus, g.havanaGeneId, g.ccdsId, g.level, g.transcriptClass from knownGene k, $db.wgEncodeGencodeAttrs$GENCODE_VERSION g where k.name=g.transcriptId" | sort | uniq > knownAttrs.tab + hgLoadSqlTab -notOnServer $tempDb knownAttrs $kent/src/hg/lib/knownAttrs.sql knownAttrs.tab #tawk '$4=="new" {print $3}' oldToNew.tab | sort > new.txt #sort knownGene.gp | join -t $'\t' new.txt /dev/stdin > new.gp #sort knownGene.gp | join -t $'\t' lost.txt /dev/stdin | wc # should be zero # tawk '{print $12}' hg38.lost.gp | while read 
name; do grep $name /tmp/2; done | wc sort kgColor.tab | uniq | hgLoadSqlTab -notOnServer $tempDb kgColor $kent/src/hg/lib/kgColor.sql stdin hgLoadSqlTab -notOnServer $tempDb knownIsoforms $kent/src/hg/lib/knownIsoforms.sql knownIsoforms.tab hgLoadSqlTab -notOnServer $tempDb kgXref $kent/src/hg/lib/kgXref.sql kgXref.tab hgLoadSqlTab -notOnServer $tempDb knownCanonical $kent/src/hg/lib/knownCanonical.sql knownCanonical.tab @@ -200,32 +201,34 @@ # this should be done AFTER moving the new tables into hg38 hgKgGetText $tempDb tempSearch.txt sort tempSearch.txt > tempSearch2.txt tawk '{split($2,a,"."); printf "%s\t", $1;for(ii = 1; ii <= a[2]; ii++) printf "%s ",a[1] "." ii; printf "\n" }' txToAcc.tab | sort > tempSearch3.txt join tempSearch2.txt tempSearch3.txt | sort > knownGene.txt ixIxx knownGene.txt knownGene.ix knownGene.ixx rm -rf /gbdb/$tempDb/knownGene.ix /gbdb/$tempDb/knownGene.ixx ln -s $dir/knownGene.ix /gbdb/$tempDb/knownGene.ix ln -s $dir/knownGene.ixx /gbdb/$tempDb/knownGene.ixx hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from hgFixed.refLink" $db > refToLl.txt hgMapToGene -tempDb=$tempDb $db refGene knownGene knownToLocusLink -lookup=refToLl.txt knownToVisiGene $tempDb -probesDb=$db -awk '{OFS="\t"} {print $2,$1}' tmp1 | sort > knownToEnsembl.tab -tawk '{print $2,$1}' tmp1 | sort > knownToGencode${GENCODE_VERSION}.tab +awk '{OFS="\t"} {print $4,$4}' ucscGenes.bed | sort > knownToEnsembl.tab +cp knownToEnsembl.tab knownToGencode${GENCODE_VERSION}.tab +#awk '{OFS="\t"} {print $2,$1}' tmp1 | sort > knownToEnsembl.tab +#tawk '{print $2,$1}' tmp1 | sort > knownToGencode${GENCODE_VERSION}.tab hgLoadSqlTab -notOnServer $tempDb knownToEnsembl $kent/src/hg/lib/knownTo.sql knownToEnsembl.tab hgLoadSqlTab -notOnServer $tempDb knownToGencode${GENCODE_VERSION} $kent/src/hg/lib/knownTo.sql knownToGencode${GENCODE_VERSION}.tab hgMapToGene -tempDb=$tempDb $db gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12' if ($db =~ hg*) then #hgMapToGene 
-exclude=abGenes.txt -tempDb=$tempDb $db HInvGeneMrna knownGene knownToHInv #hgMapToGene -exclude=abGenes.txt -tempDb=$tempDb $db affyU133Plus2 knownGene knownToU133Plus2 hgMapToGene -tempDb=$tempDb $db affyU133 knownGene knownToU133 hgMapToGene -tempDb=$tempDb $db affyU95 knownGene knownToU95 mkdir hprd cd hprd wget "http://www.hprd.org/edownload/HPRD_FLAT_FILES_041310" tar xvf HPRD_FLAT_FILES_041310 @@ -922,54 +925,41 @@ # # Finally, need to wait until after testing, but update databases in other organisms # with blastTabs # Load blastTabs cd $dir/hgNearBlastp hgLoadBlastTab $xdb $blastTab run.$xdb.$tempDb/out/*.tab hgLoadBlastTab $ratDb $blastTab run.$ratDb.$tempDb/out/*.tab hgLoadBlastTab $flyDb $blastTab run.$flyDb.$tempDb/recipBest.tab hgLoadBlastTab $wormDb $blastTab run.$wormDb.$tempDb/recipBest.tab hgLoadBlastTab $yeastDb $blastTab run.$yeastDb.$tempDb/recipBest.tab hgLoadBlastTab $fishDb $blastTab run.$fishDb.$tempDb/recipBest.tab # Do synteny on mouse/human/rat synBlastp.csh $xdb $db -#old number of unique query values: 45399 -#old number of unique target values 22999 -#new number of unique query values: 42015 -#new number of unique target values 22470 +# old number of unique query values: 61250 +# old number of unique target values 27574 +# new number of unique query values: 52518 +# new number of unique target values 25367 synBlastp.csh $ratDb $db ensGene knownGene -#old number of unique query values: 27888 -#old number of unique target values 18988 -#new number of unique query values: 24530 -#new number of unique target values 18411 - - -# need to generate multiz downloads -#/usr/local/apache/htdocs-hgdownload/goldenPath/hg38/multiz46way/alignments/knownCanonical.exonAA.fa.gz -#/usr/local/apache/htdocs-hgdownload/goldenPath/hg38/multiz46way/alignments/knownCanonical.exonNuc.fa.gz -#/usr/local/apache/htdocs-hgdownload/goldenPath/hg38/multiz46way/alignments/knownGene.exonAA.fa.gz 
-#/usr/local/apache/htdocs-hgdownload/goldenPath/hg38/multiz46way/alignments/knownGene.exonNuc.fa.gz -#/usr/local/apache/htdocs-hgdownload/goldenPath/hg38/multiz46way/alignments/md5sum.txt - -echo -echo "see the bottom of the script for details about knownToWikipedia" -echo -# Clean up -rm -r run.*/out +# old number of unique query values: 28159 +# old number of unique target values 19155 +# new number of unique query values: 23777 +# new number of unique target values 17885 + # Last step in setting up isPCR: after the new UCSC Genes with the new Known Gene isPcr # is released, take down the old isPcr gfServer ####################### ### The following is the process Brian Lee used to pull out only # the genes from knownToLocusLink for which there are Wikipedia articles. ### get the full knownToLocusLink table # hgsql -Ne 'select value from knownToLocusLink' hg38 | sort -u >> knToLocusLink ### query Wikipedia for each to see if there is an article # for i in $(cat knToLocusLink); do lynx -dump "http://genewiki.sulab.org/map/wiki/"$i | grep -m 1 "no results" >trash ; echo $? $i | grep "1 "| awk '{print $2}'>> workingLinks; done ### pull out all isoforms that have permitted LocusLinkIds # for i in $(cat workingLinks); do hgsql -Ne 'select * from knownToLocusLink where value like "'$i'"' hg38 >> knownToWikipediaNew; done ### then load the table as knownToWikipedia using the knownToLocusLink INDICES.