# Excerpt of src/hg/makeDb/doc/ucscGenes/hg38.gencodeV35.sh as of commit
# 834f8db0db217654d786090bd9e25ad92292ee4e
# (braney, Wed Jan 13 11:20:37 2021 -0800, "ongoing work on UCSC genes").
#
# Only the two regions touched by that commit are reproduced here, in their
# post-commit state.  The lines of the file between the two regions are not
# part of this excerpt.
#
# The steps below read GENCODE tables from $db, write tab-separated work files
# into the current directory and load them into $tempDb.
#
# NOTE(review): "#ifdef NOTNOW" / "#endif" are ordinary comments to the shell;
# they do NOT disable the commands between them if this file is executed as a
# script.  Presumably this doc is pasted step by step -- confirm.

##############################################################################
# Region 1 (diff hunk @@ -76,57 +76,74 @@; hunk-header context line:
#   hgLoadSqlTab -notOnServer $tempDb kgXref $kent/src/hg/lib/kgXref.sql kgXref.tab)
##############################################################################

#ifdef NOTNOW
# calculate score field with bitfields
hgsql $db -Ne "select * from gencodeAnnot$GENCODE_VERSION" | cut -f 2- | sort > gencodeAnnot$GENCODE_VERSION.txt
hgsql $db -Ne "select name,2 from gencodeAnnot$GENCODE_VERSION" | sort > knownCanon.txt
hgsql $db -Ne "select * from gencodeTag$GENCODE_VERSION" | grep basic | sed 's/basic/1/' | sort > knownTag.txt
hgsql $db -Ne "select transcriptId,transcriptClass from gencodeAttrs$GENCODE_VERSION where transcriptClass='pseudo'" | sed 's/pseudo/4/' > knownAttrs.txt
# Sum the second column per transcript id; tail drops the empty first record
# that the awk program prints before it has seen any id.
sort knownCanon.txt knownTag.txt knownAttrs.txt | awk '{if ($1 != last) {print last, sum; sum=$2; last=$1} else {sum += $2; }} END {print last, sum}' | tail -n +2 > knownScore.txt
#endif

# knownCds: selected columns of gencodeAnnot, de-duplicated.
hgsql -e "select * from gencodeAnnot$GENCODE_VERSION" --skip-column-names $db \
  | cut -f 2-16 \
  | tawk '{print $1,$13,$14,$8,$15}' \
  | sort | uniq > knownCds.tab
hgLoadSqlTab -notOnServer $tempDb knownCds $kent/src/hg/lib/knownCds.sql knownCds.tab

# knownToTag: the gencodeTag table, de-duplicated, loaded with the knownTo schema.
hgsql -e "select * from gencodeTag$GENCODE_VERSION" --skip-column-names $db \
  | sort | uniq > knownToTag.tab
hgLoadSqlTab -notOnServer $tempDb knownToTag $kent/src/hg/lib/knownTo.sql knownToTag.tab

# knownAttrs: gencodeAttrs rows for the transcripts present in knownGene.
hgsql $tempDb -Ne "select k.name, g.geneId, g.unused1, g.geneType,g.transcriptName,g.transcriptType,g.unused2, g.unused3, g.ccdsId, g.level, g.transcriptClass from knownGene k, $db.gencodeAttrs$GENCODE_VERSION g where k.name=g.transcriptId" \
  | sort | uniq > knownAttrs.tab

hgLoadSqlTab -notOnServer $tempDb knownAttrs $kent/src/hg/lib/knownAttrs.sql knownAttrs.tab

# sed script that replaces a transcript class name with three tab-separated
# numbers (presumably an R,G,B colour -- confirm against kgColor.sql).
cat << __EOF__ > colors.sed
s/coding/12\t12\t120/
s/nonCoding/0\t100\t0/
s/pseudo/255\t51\t255/
s/other/254\t0\t0/
__EOF__
hgsql $db -Ne "select * from gencodeAttrs$GENCODE_VERSION" \
  | tawk '{print $4,$10}' \
  | sed -f colors.sed > colors.txt
hgLoadSqlTab -notOnServer $tempDb kgColor $kent/src/hg/lib/kgColor.sql colors.txt

# kgSpAlias: union of the kgAlias and kgProtAlias joins against kgXref.
hgsql $tempDb -N -e 'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' > kgSpAlias_0.tmp

hgsql $tempDb -N -e 'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID' >> kgSpAlias_0.tmp
# sort reads the file directly; no need to pipe it through cat.
sort -u kgSpAlias_0.tmp > kgSpAlias.tab
rm kgSpAlias_0.tmp

hgLoadSqlTab -notOnServer $tempDb kgSpAlias $kent/src/hg/lib/kgSpAlias.sql kgSpAlias.tab

#ifdef NOTNOW
hgsql $db -Ne "select * from gencodeToUniProt$GENCODE_VERSION" | tawk '{print $1,$2}'| sort > uniProt.txt
hgsql $db -Ne "select * from gencodeAnnot$GENCODE_VERSION" | tawk '{print $1,$12}' | sort > gene.txt
join -a 1 gene.txt uniProt.txt > geneNames.txt
#endif
# The next line was written with a leading "//", which the shell does not treat
# as a comment (it would try to execute the path //genePredToBigGenePred).
# It is disabled with "#" like the other retired variants below.
#genePredToBigGenePred -score=knownScore.txt -colors=colors.txt -geneNames=geneNames.txt -known gencodeAnnot$GENCODE_VERSION.txt gencodeAnnot$GENCODE_VERSION.bgpInput

hgsql $db -Ne "select * from gencodeAnnot$GENCODE_VERSION" | cut -f 2- > gencodeAnnot$GENCODE_VERSION.txt
#genePredToBigGenePred -colors=colors.txt gencodeAnnot$GENCODE_VERSION.txt stdout | sort -k1,1 -k2,2n > gencodeAnnot$GENCODE_VERSION.bgpInput

hgsql $tempDb -Ne "select kgId, geneSymbol, spID from kgXref" > geneNames.txt
#hgsql $tempDb -Ne "select * from knownCds" > knownCds.txt
#genePredToBigGenePred -colors=colors.txt -known knownGene.gp stdout | sort -k1,1 -k2,2n > gencodeAnnot$GENCODE_VERSION.bgpInput
genePredToBigGenePred -colors=colors.txt -geneNames=geneNames.txt -known -cds=knownCds.tab knownGene.gp stdout \
  | sort -k1,1 -k2,2n > gencodeAnnot$GENCODE_VERSION.bgpInput

# Build bgpInput.txt by prefixing every bigGenePred row with its name column
# ($4) as a join key and then joining one extra column per step
# (join1 -> join7).  Every join input is sorted on that key first.
tawk '{print $4,$0}' gencodeAnnot$GENCODE_VERSION.bgpInput | sort > join1

# transcript class, with "other" reported as "nonCoding"
hgsql $db -Ne "select transcriptId, transcriptClass from gencodeAttrs$GENCODE_VERSION" \
  | sed 's/other/nonCoding/' \
  | sort > attrs.txt
join -t $'\t' join1 attrs.txt > join2

# transcript source
hgsql $db -Ne "select transcriptId, source from gencodeTranscriptSource$GENCODE_VERSION" | sort > source.txt
join -t $'\t' join2 source.txt > join3

# transcript type
hgsql $db -Ne "select transcriptId, transcriptType from gencodeAttrs$GENCODE_VERSION" | sort > biotype.txt
join -t $'\t' join3 biotype.txt > join4

# tags, collapsed to one comma-separated list per transcript; transcripts
# without any tag get "none" from the left outer join (-a 1 -e"none").
hgsql $db -Ne "select transcriptId, tag from gencodeTag$GENCODE_VERSION" \
  | sort \
  | tawk '{if ($1 != last) {print last,buff; buff=$2}else {buff=buff "," $2} last=$1} END {print last,buff}' \
  | tail -n +2 > tags.txt
join -t $'\t' -a 1 -e"none" -o auto join4 tags.txt > join5

# level
hgsql $db -Ne "select transcriptId, level from gencodeAttrs$GENCODE_VERSION" | sort > level.txt
join -t $'\t' join5 level.txt > join6

# tier list per transcript, built from three keyed files whose numeric second
# column (0 canonical, 1 basic, 2 all) fixes the order inside the list.
grep basic tags.txt | tawk '{print $1, 1, "basic"}' > basic.txt
tawk '{print $5,0,"canonical"}' knownCanonical.tab | sort > canonical.txt
tawk '{print $4,2,"all"}' gencodeAnnot$GENCODE_VERSION.bgpInput | sort > all.txt
sort -k1,1 -k2,2n basic.txt canonical.txt all.txt \
  | tawk '{if ($1 != last) {print last,buff; buff=$3}else {buff=buff "," $3} last=$1} END {print last,buff}' \
  | tail -n +2 > tier.txt
join -t $'\t' join6 tier.txt > join7

# Drop the join key again and sort by the first two remaining columns.
cut -f 2- -d $'\t' join7 | sort -k1,1 -k2,2n > bgpInput.txt

##############################################################################
# Region 2 (diff hunk @@ -463,15 +480,31 @@; hunk-header context line:
#   hgLoadBlastTab $tempDb ceBlastTab $aToB/recipBest.tab)
##############################################################################

# Us vs. yeast
cd $dir/hgNearBlastp
export aToB=run.$db.$yeastDb
export bToA=run.$yeastDb.$db
cat $aToB/out/*.tab > $aToB/all.tab
cat $bToA/out/*.tab > $bToA/all.tab
blastRecipBest $aToB/all.tab $bToA/all.tab $aToB/recipBest.tab $bToA/recipBest.tab
hgLoadBlastTab $tempDb scBlastTab $aToB/recipBest.tab

# Clean up
cd $dir/hgNearBlastp
cat run.$tempDb.$tempDb/out/*.tab | gzip -c > run.$tempDb.$tempDb/all.tab.gz
gzip run.*/all.tab

# Load the three protein-interaction path-length files (taken from the hg19
# p2p directories) into $tempDb; -sqlRemap supplies the id mapping query.
hgLoadNetDist $genomes/hg19/p2p/hprd/hprd.pathLengths $tempDb humanHprdP2P \
    -sqlRemap="select distinct value, name from knownToHprd"
hgLoadNetDist $genomes/hg19/p2p/vidal/humanVidal.pathLengths $tempDb humanVidalP2P -sqlRemap="select distinct locusLinkID, kgID from hgFixed.refLink,kgXref where hgFixed.refLink.mrnaAcc = kgXref.refSeq"
hgLoadNetDist $genomes/hg19/p2p/wanker/humanWanker.pathLengths $tempDb humanWankerP2P -sqlRemap="select distinct locusLinkID, kgID from hgFixed.refLink,kgXref where hgFixed.refLink.mrnaAcc = kgXref.refSeq"

# knownToWikipedia: map gene ids to pages through the gene symbol.
# -p keeps this step re-runnable when the directory already exists.
mkdir -p "$dir/wikipedia"
cd $dir/wikipedia
# -N added: every other query in this excerpt that is piped into sort/join
# suppresses the column-header row (-N / --skip-column-names); without it a
# "geneSymbol<TAB>name" header line could end up in the join input.
hgsql $tempDb -N -e "select geneSymbol,name from knownGene g, kgXref x where g.name=x.kgId " | sort > $tempDb.symbolToId.txt
join -t $'\t' /hive/groups/browser/wikipediaScrape/symbolToPage.txt $tempDb.symbolToId.txt \
  | tawk '{print $3,$2}' \
  | sort | uniq > $tempDb.idToPage.txt
# NOTE(review): this load uses $HOME/kent and no -notOnServer, unlike the
# loads in region 1 -- confirm that is intended.
hgLoadSqlTab $tempDb knownToWikipedia $HOME/kent/src/hg/lib/knownTo.sql $tempDb.idToPage.txt

# make views for all the tables in the specific database
# The table listing is queried and filtered once, then reused for the three
# output files (it used to be queried three times with identical filters).
# NOTE(review): presumably knownGeneV35 is the same database as $tempDb --
# confirm before substituting the variable.
viewSrcDb=knownGeneV35
hgsql $viewSrcDb -Ne "show full tables" \
  | grep -v -e VIEW -e history -e masterGeneTrack -e chromInfo \
  | awk '{print $1}' > v35.tables.unsorted.tmp
sort v35.tables.unsorted.tmp > v35.tables.txt
awk '{print "show tables like \""$1"\";"}' v35.tables.unsorted.tmp > showTables.txt
awk -v srcDb="$viewSrcDb" '{print "create view "$1" as select * from "srcDb"."$1";"}' v35.tables.unsorted.tmp > makeViews.txt
rm v35.tables.unsorted.tmp