9c6055fba90d52e081a16fa54bd3a2b31a7243a6 braney Wed Jan 27 16:30:13 2021 -0800 changing back to the old knownGene model diff --git src/hg/makeDb/doc/ucscGenes/hg38.gencodeV36.sh src/hg/makeDb/doc/ucscGenes/hg38.gencodeV36.sh index b4ecf8c..615dfe2 100644 --- src/hg/makeDb/doc/ucscGenes/hg38.gencodeV36.sh +++ src/hg/makeDb/doc/ucscGenes/hg38.gencodeV36.sh @@ -44,30 +44,31 @@ mkdir -p $dir cd $dir echo "create database $tempDb" | hgsql "" echo "create table chromInfo like $db.chromInfo" | hgsql $tempDb echo "insert into chromInfo select * from $db.chromInfo" | hgsql $tempDb hgsql -e "select * from gencodeAnnot$GENCODE_VERSION" --skip-column-names $db | cut -f 2-16 | genePredToBed stdin stdout | sort -k1,1 -k2,2n | gzip -c > gencode${GENCODE_VERSION}.bed.gz touch oldToNew.tab zcat gencode${GENCODE_VERSION}.bed.gz > ucscGenes.bed txBedToGraph ucscGenes.bed ucscGenes ucscGenes.txg txgAnalyze ucscGenes.txg $genomes/$db/$db.2bit stdout | sort | uniq | bedClip stdin /cluster/data/hg38/chrom.sizes ucscSplice.bed +hgLoadBed $tempDb knownAlt ucscSplice.bed zcat gencode${GENCODE_VERSION}.bed.gz | awk '{print $4}' | sort > newGencodeName.txt hgsql $oldDb -Ne "select name,alignId from knownGene" | sort > oldGenToUcsc.txt kgAllocId oldGenToUcsc.txt newGencodeName.txt 5072896 stdout | sort -u > txToAcc.tab # lastId 5075761 makeUcscGeneTables -justKnown $db $tempDb $GENCODE_VERSION txToAcc.tab hgLoadSqlTab -notOnServer $tempDb knownGene $kent/src/hg/lib/knownGene.sql knownGene.gp #hgsql $db -Ne "create view $tempDb.all_mrna as select * from all_mrna" hgsql $db -Ne "create view $tempDb.trackDb as select * from trackDb" hgLoadGenePred -genePredExt $tempDb knownGeneExt knownGeneExt.gp hgMapToGene -type=psl -all -tempDb=$tempDb $db all_mrna knownGene knownToMrna hgMapToGene -tempDb=$tempDb $db refGene knownGene knownToRefSeq hgMapToGene -type=psl -tempDb=$tempDb $db all_mrna knownGene knownToMrnaSingle makeUcscGeneTables $db $tempDb $GENCODE_VERSION txToAcc.tab @@ -80,31 +81,32 @@ hgLoadSqlTab -notOnServer $tempDb kgXref $kent/src/hg/lib/kgXref.sql kgXref.tab #ifdef NOTNOW # calculate score field with bitfields hgsql $db -Ne "select * from gencodeAnnot$GENCODE_VERSION" | cut -f 2- | sort > gencodeAnnot$GENCODE_VERSION.txt hgsql $db -Ne "select name,2 from gencodeAnnot$GENCODE_VERSION" | sort > knownCanon.txt hgsql $db -Ne "select * from gencodeTag$GENCODE_VERSION" | grep basic | sed 's/basic/1/' | sort > knownTag.txt hgsql $db -Ne "select transcriptId,transcriptClass from gencodeAttrs$GENCODE_VERSION where transcriptClass='pseudo'" | sed 's/pseudo/4/' > knownAttrs.txt sort knownCanon.txt knownTag.txt knownAttrs.txt | awk '{if ($1 != last) {print last, sum; sum=$2; last=$1} else {sum += $2; }} END {print last, sum}' | tail -n +2 > knownScore.txt #endif hgsql -e "select * from gencodeAnnot$GENCODE_VERSION" --skip-column-names $db | cut -f 2-16 | tawk '{print $1,$13,$14,$8,$15}' | sort | uniq > knownCds.tab hgLoadSqlTab -notOnServer $tempDb knownCds $kent/src/hg/lib/knownCds.sql knownCds.tab -hgsql -e "select * from gencodeTag$GENCODE_VERSION" --skip-column-names $db | sort | uniq > knownToTag.tab +#hgsql -e "select * from gencodeTag$GENCODE_VERSION" --skip-column-names $db | sort | uniq > knownToTag.tab +hgsql -e "select * from wgEncodeGencodeTag$GENCODE_VERSION" --skip-column-names $db | sort | uniq > knownToTag.tab hgLoadSqlTab -notOnServer $tempDb knownToTag $kent/src/hg/lib/knownTo.sql knownToTag.tab hgsql $tempDb -Ne "select k.name, g.geneId, g.unused1, g.geneType,g.transcriptName,g.transcriptType,g.unused2, g.unused3, g.ccdsId, g.level, g.transcriptClass from knownGene k, $db.gencodeAttrs$GENCODE_VERSION g where k.name=g.transcriptId" | sort | uniq > knownAttrs.tab hgLoadSqlTab -notOnServer $tempDb knownAttrs $kent/src/hg/lib/knownAttrs.sql knownAttrs.tab cat << __EOF__ > colors.sed s/coding/12\t12\t120/ s/nonCoding/0\t100\t0/ s/pseudo/255\t51\t255/ s/other/254\t0\t0/ __EOF__ hgsql $db -Ne "select * from gencodeAttrs$GENCODE_VERSION" | tawk '{print $5,$13}' | sed -f colors.sed > colors.txt @@ -430,40 +432,40 @@ cd $dir/hgNearBlastp/run.$tempDb.$ratDb hgLoadBlastTab $tempDb $rnBlastTab -maxPer=1 out/*.tab # Remove non-syntenic hits for mouse and rat # Takes a few minutes mkdir -p /gbdb/$tempDb/liftOver rm -f /gbdb/$tempDb/liftOver/${tempDb}To$RatDb.over.chain.gz /gbdb/$tempDb/liftOver/${tempDb}To$Xdb.over.chain.gz ln -s $genomes/$db/bed/liftOver/${db}To$RatDb.over.chain.gz \ /gbdb/$tempDb/liftOver/${tempDb}To$RatDb.over.chain.gz ln -s $genomes/$db/bed/liftOver/${db}To${Xdb}.over.chain.gz \ /gbdb/$tempDb/liftOver/${tempDb}To$Xdb.over.chain.gz # delete non-syntenic genes from rat and mouse blastp tables cd $dir/hgNearBlastp synBlastp.csh $tempDb $xdb -# old number of unique query values: 93277 -# old number of unique target values 27504 -# new number of unique query values: 86329 -# new number of unique target values 26151 +# old number of unique query values: 94781 +# old number of unique target values 27519 +# new number of unique query values: 87746 +# new number of unique target values 26162 synBlastp.csh $tempDb $ratDb knownGene ensGene -#old number of unique query values: 92243 -#old number of unique target values 19898 -#new number of unique query values: 85663 -#new number of unique target values 19025 +# old number of unique query values: 93733 +# old number of unique target values 19909 +# new number of unique query values: 87149 +# new number of unique target values 19034 # Make reciprocal best subset for the blastp pairs that are too # Far for synteny to help # Us vs. fish cd $dir/hgNearBlastp export aToB=run.$db.$fishDb export bToA=run.$fishDb.$db cat $aToB/out/*.tab > $aToB/all.tab cat $bToA/out/*.tab > $bToA/all.tab blastRecipBest $aToB/all.tab $bToA/all.tab $aToB/recipBest.tab $bToA/recipBest.tab hgLoadBlastTab $tempDb drBlastTab $aToB/recipBest.tab # Us vs. fly cd $dir/hgNearBlastp @@ -500,15 +502,140 @@ hgLoadNetDist $genomes/hg19/p2p/hprd/hprd.pathLengths $tempDb humanHprdP2P \ -sqlRemap="select distinct value, name from knownToHprd" hgLoadNetDist $genomes/hg19/p2p/vidal/humanVidal.pathLengths $tempDb humanVidalP2P -sqlRemap="select distinct locusLinkID, kgID from hgFixed.refLink,kgXref where hgFixed.refLink.mrnaAcc = kgXref.refSeq" hgLoadNetDist $genomes/hg19/p2p/wanker/humanWanker.pathLengths $tempDb humanWankerP2P -sqlRemap="select distinct locusLinkID, kgID from hgFixed.refLink,kgXref where hgFixed.refLink.mrnaAcc = kgXref.refSeq" mkdir $dir/wikipedia cd $dir/wikipedia hgsql $tempDb -e "select geneSymbol,name from knownGene g, kgXref x where g.name=x.kgId " | sort > $tempDb.symbolToId.txt join -t $'\t' /hive/groups/browser/wikipediaScrape/symbolToPage.txt $tempDb.symbolToId.txt | tawk '{print $3,$2}' | sort | uniq > $tempDb.idToPage.txt hgLoadSqlTab $tempDb knownToWikipedia $HOME/kent/src/hg/lib/knownTo.sql $tempDb.idToPage.txt # make views for all the tables in the specific database hgsql knownGeneV35 -Ne "show full tables" | grep -v VIEW | grep -v history | grep -v masterGeneTrack | grep -v chromInfo | awk '{print $1}' | sort > v35.tables.txt hgsql knownGeneV35 -Ne "show full tables" | grep -v VIEW | grep -v history | grep -v masterGeneTrack | grep -v chromInfo | awk '{print "show tables like \""$1"\";"}' > showTables.txt hgsql knownGeneV35 -Ne "show full tables" | grep -v VIEW | grep -v history | grep -v masterGeneTrack | grep -v chromInfo | awk '{print "create view "$1" as select * from knownGeneV35."$1";"}' > makeViews.txt + +# Regenerate ccdsKgMap table +$kent/src/hg/makeDb/genbank/bin/x86_64/mkCcdsGeneMap -db=$tempDb -loadDb $db.ccdsGene knownGene ccdsKgMap + +mkdir -p $dir/pfam +cd $dir/pfam +rm -rf splitProt +mkdir splitProt +faSplit sequence $dir/ucscGenes.faa 10000 splitProt/ +mkdir -p result +ls -1 splitProt > prot.list +# /hive/data/outside/pfam/hmmpfam -E 0.1 /hive/data/outside/pfam/current/Pfam_fs \ +cat << '_EOF_' > doPfam +#!/bin/csh -ef +/hive/data/outside/pfam/Pfam29.0/PfamScan/hmmer-3.1b2-linux-intel-x86_64/binaries/hmmsearch --domtblout /scratch/tmp/pfam.$2.pf --noali -o /dev/null -E 0.1 /hive/data/outside/pfam/Pfam29.0/Pfam-A.hmm splitProt/$1 +mv /scratch/tmp/pfam.$2.pf $3 +_EOF_ + # << happy emacs +chmod +x doPfam +cat << '_EOF_' > template +#LOOP +doPfam $(path1) $(root1) {check out line+ result/$(root1).pf} +#ENDLOOP +_EOF_ +gensub2 prot.list single template jobList + +ssh $cpuFarm "cd $dir/pfam; para make jobList" +ssh $cpuFarm "cd $dir/pfam; para time > run.time" +cat run.time + +#Completed: 9428 of 9428 jobs +#CPU time in finished jobs: 2252658s 37544.30m 625.74h 26.07d 0.071 y +#IO & Wait Time: 476422s 7940.37m 132.34h 5.51d 0.015 y +#Average job time: 289s 4.82m 0.08h 0.00d +##Longest finished job: 2943s 49.05m 0.82h 0.03d +#Submission to last job: 8377s 139.62m 2.33h 0.10d + +# Make up pfamDesc.tab by converting pfam to a ra file first +cat << '_EOF_' > makePfamRa.awk +/^NAME/ {print} +/^ACC/ {print} +/^DESC/ {print} +/^TC/ {print $1,$3; printf("\n");} +_EOF_ +awk -f makePfamRa.awk /hive/data/outside/pfam/Pfam29.0/Pfam-A.hmm > pfamDesc.ra +raToTab -cols=ACC,NAME,DESC,TC pfamDesc.ra stdout | awk -F '\t' '{printf("%s\t%s\t%s\t%g\n", $1, $2, $3, $4);}' | sort > pfamDesc.tab + +# Convert output to tab-separated file. +cd $dir/pfam +catDir result | sed '/^#/d' > allResults.tab +awk 'BEGIN {OFS="\t"} { print $5,$1,$18-1,$19,$4,$14}' allResults.tab | sort > allUcscPfam.tab +join -t $'\t' -j 1 allUcscPfam.tab pfamDesc.tab | tawk '{if ($6 > $9) print $2, $3, $4, $5, $6, $1}' > ucscPfam.tab +cd $dir + +# Convert output to knownToPfam table +tawk '{print $1, gensub(/\.[0-9]+/, "", "g", $6)}' pfam/ucscPfam.tab | sort -u > knownToPfam.tab +hgLoadSqlTab -notOnServer $tempDb knownToPfam $kent/src/hg/lib/knownTo.sql knownToPfam.tab +tawk '{print gensub(/\.[0-9]+/, "", "g", $1), $2, $3}' pfam/pfamDesc.tab| hgLoadSqlTab -notOnServer $tempDb pfamDesc $kent/src/hg/lib/pfamDesc.sql stdin + +cd $dir/pfam +genePredToFakePsl $tempDb knownGene knownGene.psl cdsOut.tab +sort cdsOut.tab | sed 's/\.\./ /' > sortCdsOut.tab +sort ucscPfam.tab> sortPfam.tab +awk '{print $10, $11}' knownGene.psl > gene.sizes +join sortCdsOut.tab sortPfam.tab | awk '{print $1, $2 - 1 + 3 * $4, $2 - 1 + 3 * $5, $6}' | bedToPsl gene.sizes stdin domainToGene.psl +pslMap domainToGene.psl knownGene.psl stdout | pslToBed stdin stdout | bedOrBlocks -useName stdin domainToGenome.bed +hgLoadBed $tempDb ucscGenePfam domainToGenome.bed + +# Do scop run. Takes about 6 hours +#mkdir /hive/data/outside/scop/1.75 +#cd /hive/data/outside/scop/1.75 +#wget "ftp://license:SlithyToves@supfam.org/models/hmmlib_1.75.gz" +#gunzip hmmlib_1.75.gz +#wget "ftp://license:SlithyToves@supfam.org/models/model.tab.gz" +#gunzip model.tab.gz + +mkdir -p $dir/scop +cd $dir/scop +rm -rf result +mkdir result +ls -1 ../pfam/splitProt > prot.list +cat << '_EOF_' > doScop +#!/bin/csh -ef +/hive/data/outside/pfam/Pfam29.0/PfamScan/hmmer-3.1b2-linux-intel-x86_64/binaries/hmmsearch --domtblout /scratch/tmp/scop.$2.pf --noali -o /dev/null -E 0.1 /hive/data/outside/scop/1.75/hmmlib_1.75 ../pfam/splitProt/$1 +mv /scratch/tmp/scop.$2.pf $3 +_EOF_ + # << happy emacs +chmod +x doScop +cat << '_EOF_' > template +#LOOP +doScop $(path1) $(root1) {check out line+ result/$(root1).pf} +#ENDLOOP +_EOF_ + # << happy emacs +gensub2 prot.list single template jobList + +ssh $cpuFarm "cd $dir/scop; para make jobList" +ssh $cpuFarm "cd $dir/scop; para time > run.time" +cat run.time + +# Completed: 9428 of 9428 jobs +# CPU time in finished jobs: 2122993s 35383.22m 589.72h 24.57d 0.067 y +# IO & Wait Time: 479507s 7991.78m 133.20h 5.55d 0.015 y +# Average job time: 276s 4.60m 0.08h 0.00d +# Longest finished job: 2498s 41.63m 0.69h 0.03d +# Submission to last job: 8193s 136.55m 2.28h 0.09d + +# Convert scop output to tab-separated files +cd $dir +catDir scop/result | sed '/^#/d' | awk 'BEGIN {OFS="\t"} {if ($7 <= 0.0001) print $1,$18-1,$19,$4, $7,$8}' | sort > scopPlusScore.tab +sort -k 2 /hive/data/outside/scop/1.75/model.tab > scop.model.tab +scopCollapse scopPlusScore.tab scop.model.tab ucscScop.tab \ + scopDesc.tab knownToSuper.tab +hgLoadSqlTab -notOnServer $tempDb scopDesc $kent/src/hg/lib/scopDesc.sql scopDesc.tab +hgLoadSqlTab $tempDb knownToSuper $kent/src/hg/lib/knownToSuper.sql knownToSuper.tab +#hgsql $tempDb -e "delete k from knownToSuper k, kgXref x where k.gene = x.kgID and x.geneSymbol = 'abParts'" + + +hgMapToGene -geneTableType=genePred -tempDb=$tempDb $db affyU133 knownGene knownToU133 +hgMapToGene -geneTableType=genePred -tempDb=$tempDb $db affyU95 knownGene knownToU95 + mkdir hprd + cd hprd + wget "http://www.hprd.org/edownload/HPRD_FLAT_FILES_041310" + tar xvf HPRD_FLAT_FILES_041310 + knownToHprd $tempDb FLAT_FILES_072010/HPRD_ID_MAPPINGS.txt