90ab34106679395529b254e024c8b0f350547195 braney Wed Nov 27 16:54:26 2019 -0800 add myGene2 link from hgGene on hg38 diff --git src/hg/makeDb/doc/ucscGenes/hg38.ucscGenes20.sh src/hg/makeDb/doc/ucscGenes/hg38.ucscGenes20.sh index 88d9eaf..60dcc78 100755 --- src/hg/makeDb/doc/ucscGenes/hg38.ucscGenes20.sh +++ src/hg/makeDb/doc/ucscGenes/hg38.ucscGenes20.sh @@ -1,1128 +1,1138 @@ export dir=/cluster/data/hg38/bed/ucsc.20.1 export GENCODE_VERSION=V32 export oldGeneDir=/cluster/data/hg38/bed/ucsc.19.1 export oldGeneBed=$oldGeneDir/ucscGenes.bed export db=hg38 export spDb=sp180404 export taxon=9606 export tempDb=tmpFoo20 export kent=$HOME/kent export lastVer=11 export curVer=12 export Db=Hg38 export xdb=mm10 export Xdb=Mm10 export ydb=canFam3 export zdb=rheMac8 export ratDb=rn6 export RatDb=Rn6 export fishDb=danRer11 export flyDb=dm6 export wormDb=ce11 export yeastDb=sacCer3 export tempFa=$dir/ucscGenes.faa export genomes=/hive/data/genomes export xdbFa=$genomes/$xdb/bed/ucsc.18.1/ucscGenes.faa export ratFa=$genomes/$ratDb/bed/ensGene.95/ensembl.faa #export ratFa=$genomes/$ratDb/bed/blastpRgdGene2/rgdGene2Pep.faa export fishFa=$genomes/$fishDb/bed/ensGene.95/ensembl.faa #export fishFa=$genomes/$fishDb/bed/blastp/ensembl.faa export flyFa=$genomes/$flyDb/bed/ensGene.95/ensembl.faa #export flyFa=$genomes/$flyDb/bed/hgNearBlastp/100806/$flyDb.flyBasePep.faa export wormFa=$genomes/$wormDb/bed/ws245Genes/ws245Pep.faa #export wormFa=$genomes/$wormDb/bed/blastp/wormPep190.faa export yeastFa=$genomes/$yeastDb/bed/sgdAnnotations/blastTab/sacCer3.sgd.faa export scratchDir=/hive/users/braney/scratch export blastTab=hgBlastTab export xBlastTab=mmBlastTab export rnBlastTab=rnBlastTab export dbHost=hgwdev export ramFarm=ku export cpuFarm=ku mkdir -p $dir cd $dir # first get list of tables from push request in $lastVer.table.lst # here's the redmine http://redmine.soe.ucsc.edu/issues/21644 wc -l $lastVer.table.lst # 62 ( cat $lastVer.table.lst | grep -v "ToKg$lastVer" | grep -v "XrefOld" | grep -v "knownGeneOld" | grep -v "knownToGencode" echo kg${lastVer}ToKg${curVer} echo knownGeneOld$lastVer echo kgXrefOld$lastVer ) | sort > $curVer.table.lst # check to see which are new (none should be now ;-) for i in `cat $curVer.table.lst`; do d=`hgsql hg38 -Ne "SELECT create_time FROM INFORMATION_SCHEMA.TABLES WHERE table_schema = 'hg38' AND table_name = '$i'"` ; echo $i $d; done | sort -nk2 echo "create database $tempDb" | hgsql "" echo "create table knownGeneOld$lastVer like $db.knownGene" | hgsql $tempDb echo "insert into knownGeneOld$lastVer select * from $db.knownGene" | hgsql $tempDb echo "create table kgXrefOld$lastVer like $db.kgXref" | hgsql $tempDb echo "insert into kgXrefOld$lastVer select * from $db.kgXref" | hgsql $tempDb hgsql -e "select * from wgEncodeGencodeComp$GENCODE_VERSION" --skip-column-names $db | cut -f 2-16 | genePredToBed stdin tmp hgsql -e "select * from wgEncodeGencodePseudoGene$GENCODE_VERSION" --skip-column-names $db | cut -f 2-16 | genePredToBed stdin tmp2 sort -k1,1 -k2,2n tmp tmp2 | gzip -c > gencode${GENCODE_VERSION}.bed.gz # get current list of ids zcat gencode${GENCODE_VERSION}.bed.gz | awk '{print $4}' | sort > newGencodeName.txt # grab ENST to UC map from the previous set hgsql $db -Ne "select name,alignId from knownGene" | sort > EnstToUC.txt # grab startId from the last run cat /cluster/data/hg38/bed/ucsc.19.1/startId startId # 5048446 kgAllocId EnstToUC.txt newGencodeName.txt 5048446 stdout | sort -u > txToAcc.tab # lastId 5070122 # make null file touch oldToNew.tab # check to make sure we don't have any dups. These two numbers should # be the same. awk '{print $2}' txToAcc.tab | sed 's/\..*//' | sort -u | wc -l # 247381 awk '{print $2}' txToAcc.tab | sed 's/\..*//' | sort | wc -l # 247381 # this should be the current db instead of olDdb if not the first release hgsql -Ne "select * from knownGene" $db | sort > $db.knownGene.gp #grep lost oldToNew.tab | tawk '{print $2}' | sort > lost.txt #join -t $'\t' lost.txt $db.knownGene.gp > $db.lost.gp #awk '{if ($7 == $6) print}' $db.lost.gp | wc -l # non-coding 341 #awk '{if ($7 != $6) print}' $db.lost.gp | wc -l # coding 806 # Assign permanent accessions to each transcript #txGeneAccession $oldGeneBed startId gencode${GENCODE_VERSION}.bed.gz txToAcc.tab oldToNew.tab zcat gencode${GENCODE_VERSION}.bed.gz > ucscGenes.bed twoBitToFa -noMask /cluster/data/$db/$db.2bit -bed=ucscGenes.bed stdout | faFilter -uniq stdin ucscGenes.fa hgPepPred $tempDb generic knownGeneMrna ucscGenes.fa bedToPsl /cluster/data/$db/chrom.sizes ucscGenes.bed ucscGenes.psl pslRecalcMatch ucscGenes.psl /cluster/data/$db/$db.2bit ucscGenes.fa kgTargetAli.psl # should be zero awk '$11 != $1 + $3+$4' kgTargetAli.psl echo "create table chromInfo like $db.chromInfo" | hgsql $tempDb echo "insert into chromInfo select * from $db.chromInfo" | hgsql $tempDb hgLoadPsl $tempDb kgTargetAli.psl txBedToGraph ucscGenes.bed ucscGenes ucscGenes.txg txgAnalyze ucscGenes.txg $genomes/$db/$db.2bit stdout | sort | uniq | bedClip stdin /cluster/data/hg38/chrom.sizes ucscSplice.bed hgLoadBed $tempDb knownAlt ucscSplice.bed #hgsql -N $spDb -e "select p.acc, p.val from protein p, accToTaxon x where x.taxon=$taxon and p.acc=x.acc" | awk '{print ">" $1;print $2}' >uniProt.fa #faSize uniProt.fa #needs two passes. First make knownGene, then supporting tables makeGencodeKnownGene -justKnown $db $tempDb $GENCODE_VERSION txToAcc.tab hgLoadSqlTab -notOnServer $tempDb knownGene $kent/src/hg/lib/knownGene.sql knownGene.gp hgLoadGenePred -genePredExt $tempDb knownGeneExt knownGeneExt.gp #getRnaPred -genePredExt -peptides $tempDb knownGeneExt all ucscGenes.faa genePredToProt knownGeneExt.gp /cluster/data/$db/$db.2bit tmp.faa faFilter -uniq tmp.faa ucscGenes.faa hgPepPred $tempDb generic knownGenePep ucscGenes.faa hgMapToGene -type=psl -all -tempDb=$tempDb $db all_mrna knownGene knownToMrna hgMapToGene -tempDb=$tempDb $db refGene knownGene knownToRefSeq hgMapToGene -type=psl -tempDb=$tempDb $db all_mrna knownGene knownToMrnaSingle makeGencodeKnownGene $db $tempDb $GENCODE_VERSION txToAcc.tab hgsql $tempDb -Ne "select k.name, g.geneId, g.geneStatus, g.geneType,g.transcriptName,g.transcriptType,g.transcriptStatus, g.havanaGeneId, g.ccdsId, g.level, g.transcriptClass from knownGene k, $db.wgEncodeGencodeAttrs$GENCODE_VERSION g where k.name=g.transcriptId" | sort | uniq > knownAttrs.tab hgLoadSqlTab -notOnServer $tempDb knownAttrs $kent/src/hg/lib/knownAttrs.sql knownAttrs.tab #tawk '$4=="new" {print $3}' oldToNew.tab | sort > new.txt #sort knownGene.gp | join -t $'\t' new.txt /dev/stdin > new.gp #sort knownGene.gp | join -t $'\t' lost.txt /dev/stdin | wc # should be zero # tawk '{print $12}' hg38.lost.gp | while read name; do grep $name /tmp/2; done | wc sort kgColor.tab | uniq | hgLoadSqlTab -notOnServer $tempDb kgColor $kent/src/hg/lib/kgColor.sql stdin sort knownIsoforms.tab | uniq | hgLoadSqlTab -notOnServer $tempDb knownIsoforms $kent/src/hg/lib/knownIsoforms.sql stdin hgLoadSqlTab -notOnServer $tempDb kgXref $kent/src/hg/lib/kgXref.sql kgXref.tab hgLoadSqlTab -notOnServer $tempDb knownCanonical $kent/src/hg/lib/knownCanonical.sql knownCanonical.tab hgsql $tempDb -e "select * from knownToMrna" | tail -n +2 | tawk '{if ($1 != last) {print last, count, buffer; count=1; buffer=$2} else {count++;buffer=$2","buffer} last=$1}' | tail -n +2 | sort > tmp1 hgsql $tempDb -e "select * from knownToMrnaSingle" | tail -n +2 | sort > tmp2 join tmp2 tmp1 > knownGene.ev txGeneAlias $db $spDb kgXref.tab knownGene.ev oldToNew.tab foo.alias foo.protAlias tawk '{split($2,a,"."); for(ii = 1; ii <= a[2]; ii++) print $1,a[1] "." ii }' txToAcc.tab >> foo.alias sort foo.alias | uniq > ucscGenes.alias sort foo.protAlias | uniq > ucscGenes.protAlias rm foo.alias foo.protAlias hgLoadSqlTab -notOnServer $tempDb kgAlias $kent/src/hg/lib/kgAlias.sql ucscGenes.alias hgLoadSqlTab -notOnServer $tempDb kgProtAlias $kent/src/hg/lib/kgProtAlias.sql ucscGenes.protAlias # Build kgSpAlias table, which combines content of both kgAlias and kgProtAlias tables. hgsql $tempDb -N -e 'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' > kgSpAlias_0.tmp hgsql $tempDb -N -e 'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID' >> kgSpAlias_0.tmp cat kgSpAlias_0.tmp|sort -u > kgSpAlias.tab rm kgSpAlias_0.tmp hgLoadSqlTab -notOnServer $tempDb kgSpAlias $kent/src/hg/lib/kgSpAlias.sql kgSpAlias.tab txGeneExplainUpdate2 $oldGeneBed ucscGenes.bed kgOldToNew.tab hgLoadSqlTab -notOnServer $tempDb kg${lastVer}ToKg${curVer} $kent/src/hg/lib/kg1ToKg2.sql kgOldToNew.tab # TODO add kg${lastVer}ToKg${curVer} to all.joiner !!!! #sort txToAcc.tab > tmp1 #hgsql -e "select * from wgEncodeGencodeComp$GENCODE_VERSION" --skip-column-names hg38 | cut -f 2-16 | tawk '{print $1 "." $2,$13,$14,$8,$15}' | sort | join /dev/stdin tmp1 | awk 'BEGIN {OFS="\t"} {print $6, $2, $3, $4, $5}' | sort > knownCds.tab hgsql -e "select * from wgEncodeGencodeComp$GENCODE_VERSION" --skip-column-names $db | cut -f 2-16 | tawk '{print $1,$13,$14,$8,$15}' | sort | uniq > knownCds.tab hgLoadSqlTab -notOnServer $tempDb knownCds $kent/src/hg/lib/knownCds.sql knownCds.tab hgsql -e "select * from wgEncodeGencodeTag$GENCODE_VERSION" --skip-column-names $db | sort | uniq > knownToTag.tab hgLoadSqlTab -notOnServer $tempDb knownToTag $kent/src/hg/lib/knownTo.sql knownToTag.tab # this should be done AFTER moving the new tables into hg38 hgKgGetText $tempDb tempSearch.txt sort tempSearch.txt > tempSearch2.txt tawk '{split($2,a,"."); printf "%s\t", $1;for(ii = 1; ii <= a[2]; ii++) printf "%s ",a[1] "." ii; printf "\n" }' txToAcc.tab | sort > tempSearch3.txt join tempSearch2.txt tempSearch3.txt | sort > knownGene.txt ixIxx knownGene.txt knownGene.ix knownGene.ixx rm -rf /gbdb/$tempDb/knownGene.ix /gbdb/$tempDb/knownGene.ixx ln -s $dir/knownGene.ix /gbdb/$tempDb/knownGene.ix ln -s $dir/knownGene.ixx /gbdb/$tempDb/knownGene.ixx hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from hgFixed.refLink" $db > refToLl.txt hgMapToGene -tempDb=$tempDb $db refGene knownGene knownToLocusLink -lookup=refToLl.txt knownToVisiGene $tempDb -probesDb=$db awk '{OFS="\t"} {print $4,$4}' ucscGenes.bed | sort > knownToEnsembl.tab cp knownToEnsembl.tab knownToGencode${GENCODE_VERSION}.tab #awk '{OFS="\t"} {print $2,$1}' tmp1 | sort > knownToEnsembl.tab #tawk '{print $2,$1}' tmp1 | sort > knownToGencode${GENCODE_VERSION}.tab hgLoadSqlTab -notOnServer $tempDb knownToEnsembl $kent/src/hg/lib/knownTo.sql knownToEnsembl.tab hgLoadSqlTab -notOnServer $tempDb knownToGencode${GENCODE_VERSION} $kent/src/hg/lib/knownTo.sql knownToGencode${GENCODE_VERSION}.tab hgMapToGene -tempDb=$tempDb $db gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12' if ($db =~ hg*) then #hgMapToGene -exclude=abGenes.txt -tempDb=$tempDb $db HInvGeneMrna knownGene knownToHInv #hgMapToGene -exclude=abGenes.txt -tempDb=$tempDb $db affyU133Plus2 knownGene knownToU133Plus2 hgMapToGene -tempDb=$tempDb $db affyU133 knownGene knownToU133 hgMapToGene -tempDb=$tempDb $db affyU95 knownGene knownToU95 mkdir hprd cd hprd wget "http://www.hprd.org/edownload/HPRD_FLAT_FILES_041310" tar xvf HPRD_FLAT_FILES_041310 knownToHprd $tempDb FLAT_FILES_072010/HPRD_ID_MAPPINGS.txt # hgsql $tempDb -e "delete k from knownToHprd k, kgXref x where k.name = x.kgID and x.geneSymbol = 'abParts'" endif if ($db =~ hg*) then cd $dir time hgExpDistance $tempDb hgFixed.gnfHumanU95MedianRatio \ hgFixed.gnfHumanU95Exps gnfU95Distance -lookup=knownToU95 time hgExpDistance $tempDb hgFixed.gnfHumanAtlas2MedianRatio \ hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \ -lookup=knownToGnfAtlas2 endif if ($db =~ mm*) then hgMapToGene -exclude=abGenes.txt -tempDb=$tempDb $db affyGnf1m knownGene knownToGnf1m hgMapToGene -exclude=abGenes.txt -tempDb=$tempDb $db gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12' hgMapToGene -exclude=abGenes.txt -tempDb=$tempDb $db affyU74 knownGene knownToU74 hgMapToGene -exclude=abGenes.txt -tempDb=$tempDb $db affyMOE430 knownGene knownToMOE430 hgMapToGene -exclude=abGenes.txt -tempDb=$tempDb $db affyMOE430 -prefix=A: knownGene knownToMOE430A hgExpDistance $tempDb $db.affyGnfU74A affyGnfU74AExps affyGnfU74ADistance -lookup=knownToU74 hgExpDistance $tempDb $db.affyGnfU74B affyGnfU74BExps affyGnfU74BDistance -lookup=knownToU74 hgExpDistance $tempDb $db.affyGnfU74C affyGnfU74CExps affyGnfU74CDistance -lookup=knownToU74 hgExpDistance $tempDb hgFixed.gnfMouseAtlas2MedianRatio \ hgFixed.gnfMouseAtlas2MedianExps gnfAtlas2Distance -lookup=knownToGnf1m endif cd $dir # Create Human P2P protein-interaction Gene Sorter columns if ($db =~ hg*) then hgLoadNetDist $genomes/hg19/p2p/hprd/hprd.pathLengths $tempDb humanHprdP2P \ -sqlRemap="select distinct value, name from knownToHprd" hgLoadNetDist $genomes/hg19/p2p/vidal/humanVidal.pathLengths $tempDb humanVidalP2P -sqlRemap="select distinct locusLinkID, kgID from hgFixed.refLink,kgXref where hgFixed.refLink.mrnaAcc = kgXref.refSeq" hgLoadNetDist $genomes/hg19/p2p/wanker/humanWanker.pathLengths $tempDb humanWankerP2P -sqlRemap="select distinct locusLinkID, kgID from hgFixed.refLink,kgXref where hgFixed.refLink.mrnaAcc = kgXref.refSeq" endif # Run nice Perl script to make all protein blast runs for # Gene Sorter and Known Genes details page. Takes about # 45 minutes to run. # check to make sure we have all the fasta for i in $tempFa $xdbFa $ratFa $fishFa $flyFa $wormFa $yeastFa do if test ! -f $i then echo $i not found fi done rm -rf $dir/hgNearBlastp mkdir $dir/hgNearBlastp cd $dir/hgNearBlastp tcsh cat << _EOF_ > config.ra # Latest human vs. other Gene Sorter orgs: # mouse, rat, zebrafish, worm, yeast, fly targetGenesetPrefix known targetDb $tempDb queryDbs $xdb $ratDb $fishDb $flyDb $wormDb $yeastDb ${tempDb}Fa $tempFa ${xdb}Fa $xdbFa ${ratDb}Fa $ratFa ${fishDb}Fa $fishFa ${flyDb}Fa $flyFa ${wormDb}Fa $wormFa ${yeastDb}Fa $yeastFa buildDir $dir/hgNearBlastp scratchDir $scratchDir/brHgNearBlastp _EOF_ # exit tcsh rm -rf $scratchDir/brHgNearBlastp doHgNearBlastp.pl -noLoad -clusterHub=ku -distrHost=hgwdev -dbHost=hgwdev -workhorse=hgwdev config.ra > do.log 2>&1 & # Load self cd $dir/hgNearBlastp/run.$tempDb.$tempDb # builds knownBlastTab ./loadPairwise.csh # Load mouse and rat cd $dir/hgNearBlastp/run.$tempDb.$xdb hgLoadBlastTab $tempDb $xBlastTab -maxPer=1 out/*.tab cd $dir/hgNearBlastp/run.$tempDb.$ratDb hgLoadBlastTab $tempDb $rnBlastTab -maxPer=1 out/*.tab # Remove non-syntenic hits for mouse and rat # Takes a few minutes mkdir -p /gbdb/$tempDb/liftOver rm -f /gbdb/$tempDb/liftOver/${tempDb}To$RatDb.over.chain.gz /gbdb/$tempDb/liftOver/${tempDb}To$Xdb.over.chain.gz ln -s $genomes/$db/bed/liftOver/${db}To$RatDb.over.chain.gz \ /gbdb/$tempDb/liftOver/${tempDb}To$RatDb.over.chain.gz ln -s $genomes/$db/bed/liftOver/${db}To${Xdb}.over.chain.gz \ /gbdb/$tempDb/liftOver/${tempDb}To$Xdb.over.chain.gz # delete non-syntenic genes from rat and mouse blastp tables cd $dir/hgNearBlastp synBlastp.csh $tempDb $xdb #old number of unique query values: 100823 #old number of unique target values 27503 #new number of unique query values: 93781 #new number of unique target values 26806 export oldDb=hg38 hgsql -e "select count(*) from mmBlastTab\G" $oldDb | tail -n +2 # count(*): 110864 hgsql -e "select count(*) from mmBlastTab\G" $tempDb | tail -n +2 # count(*): 93781 synBlastp.csh $tempDb $ratDb knownGene ensGene # old number of unique query values: 99755 # old number of unique target values 19904 # new number of unique query values: 90425 # new number of unique target values 19032 hgsql -e "select count(*) from rnBlastTab\G" $oldDb | tail -n +2 # count(*): 88508 hgsql -e "select count(*) from rnBlastTab\G" $tempDb | tail -n +2 # count(*): 90425 # Make reciprocal best subset for the blastp pairs that are too # Far for synteny to help # Us vs. fish cd $dir/hgNearBlastp export aToB=run.$tempDb.$fishDb export bToA=run.$fishDb.$tempDb cat $aToB/out/*.tab > $aToB/all.tab cat $bToA/out/*.tab > $bToA/all.tab blastRecipBest $aToB/all.tab $bToA/all.tab $aToB/recipBest.tab $bToA/recipBest.tab hgLoadBlastTab $tempDb drBlastTab $aToB/recipBest.tab hgsql -e "select count(*) from drBlastTab\G" $oldDb | tail -n +2 # count(*): 13313 hgsql -e "select count(*) from drBlastTab\G" $tempDb | tail -n +2 # count(*): 13507 # Us vs. fly cd $dir/hgNearBlastp export aToB=run.$tempDb.$flyDb export bToA=run.$flyDb.$tempDb cat $aToB/out/*.tab > $aToB/all.tab cat $bToA/out/*.tab > $bToA/all.tab blastRecipBest $aToB/all.tab $bToA/all.tab $aToB/recipBest.tab $bToA/recipBest.tab hgLoadBlastTab $tempDb dmBlastTab $aToB/recipBest.tab hgsql -e "select count(*) from dmBlastTab\G" $oldDb | tail -n +2 # count(*): 6031 hgsql -e "select count(*) from dmBlastTab\G" $tempDb | tail -n +2 # count(*): 6034 # Us vs. worm cd $dir/hgNearBlastp export aToB=run.$tempDb.$wormDb export bToA=run.$wormDb.$tempDb cat $aToB/out/*.tab > $aToB/all.tab cat $bToA/out/*.tab > $bToA/all.tab blastRecipBest $aToB/all.tab $bToA/all.tab $aToB/recipBest.tab $bToA/recipBest.tab hgLoadBlastTab $tempDb ceBlastTab $aToB/recipBest.tab hgsql -e "select count(*) from ceBlastTab\G" $oldDb | tail -n +2 # count(*): 4381 hgsql -e "select count(*) from ceBlastTab\G" $tempDb | tail -n +2 # count(*): 4386 # Us vs. yeast cd $dir/hgNearBlastp export aToB=run.$tempDb.$yeastDb export bToA=run.$yeastDb.$tempDb cat $aToB/out/*.tab > $aToB/all.tab cat $bToA/out/*.tab > $bToA/all.tab blastRecipBest $aToB/all.tab $bToA/all.tab $aToB/recipBest.tab $bToA/recipBest.tab hgLoadBlastTab $tempDb scBlastTab $aToB/recipBest.tab hgsql -e "select count(*) from scBlastTab\G" $oldDb | tail -n +2 # count(*): 2361 hgsql -e "select count(*) from scBlastTab\G" $tempDb | tail -n +2 # count(*): 2361 # Clean up cd $dir/hgNearBlastp cat run.$tempDb.$tempDb/out/*.tab | gzip -c > run.$tempDb.$tempDb/all.tab.gz gzip run.*/all.tab # load malacards table hgsql -e "select geneSymbol,kgId from kgXref" --skip-column-names hg38 | awk '{if (NF == 2) print}' | sort > geneSymbolToKgId.txt hgsql -e "select geneSymbol from malacards" --skip-column-names hg38 | sort > malacardExists.txt join malacardExists.txt geneSymbolToKgId.txt | awk 'BEGIN {OFS="\t"} {print $2, $1}' > knownToMalacard.txt hgLoadSqlTab -notOnServer $tempDb knownToMalacards $kent/src/hg/lib/knownTo.sql knownToMalacard.txt # make knownToLynx mkdir -p $dir/lynx cd $dir/lynx wget "http://lynx.ci.uchicago.edu/downloads/LYNX_GENES.tab" awk '{print $2}' LYNX_GENES.tab | sort > lynxExists.txt hgsql -e "select geneSymbol,kgId from kgXref" --skip-column-names $tempDb | awk '{if (NF == 2) print}' | sort > geneSymbolToKgId.txt join lynxExists.txt geneSymbolToKgId.txt | awk 'BEGIN {OFS="\t"} {print $2,$1}' | sort > knownToLynx.tab hgLoadSqlTab -notOnServer $tempDb knownToLynx $kent/src/hg/lib/knownTo.sql knownToLynx.tab hgsql -e "select count(*) from knownToLynx\G" $oldDb | tail -n +2 # count(*): 174798 hgsql -e "select count(*) from knownToLynx\G" $tempDb | tail -n +2 # count(*): 205303 # make knownToNextProt mkdir -p $dir/nextProt cd $dir/nextProt wget "ftp://ftp.nextprot.org/pub/current_release/ac_lists/nextprot_ac_list_all.txt" awk '{print $0, $0}' nextprot_ac_list_all.txt | sed 's/NX_//' | sort > displayIdToNextProt.txt hgsql -e "select spID,kgId from kgXref" --skip-column-names $tempDb | awk '{if (NF == 2) print}' | sort > displayIdToKgId.txt join displayIdToKgId.txt displayIdToNextProt.txt | awk 'BEGIN {OFS="\t"} {print $2,$3}' > knownToNextProt.tab hgLoadSqlTab -notOnServer $tempDb knownToNextProt $kent/src/hg/lib/knownTo.sql knownToNextProt.tab hgsql -e "select count(*) from knownToNextProt\G" $oldDb | tail -n +2 # count(*): 45890 hgsql -e "select count(*) from knownToNextProt\G" $tempDb | tail -n +2 # count(*): 45773 # make knownToWikipedia #STOPPED HERE mkdir $dir/wikipedia cd $dir/wikipedia hgsql $tempDb -e "select geneSymbol,name from knownGene g, kgXref x where g.name=x.kgId " | sort > $tempDb.symbolToId.txt join -t $'\t' /hive/groups/browser/wikipediaScrape/symbolToPage.txt $tempDb.symbolToId.txt | tawk '{print $3,$2}' | sort | uniq > $tempDb.idToPage.txt hgLoadSqlTab $tempDb knownToWikipedia $HOME/kent/src/hg/lib/knownTo.sql $tempDb.idToPage.txt # THIS HAS TO BE DONE AFTER MOVING tempDb to hg38 # MAKE FOLDUTR TABLES # First set up directory structure and extract UTR sequence on hgwdev cd $dir mkdir -p rnaStruct cd rnaStruct mkdir -p utr3/split utr5/split utr3/fold utr5/fold # these commands take some significant time utrFa $db knownGene utr3 utr3/utr.fa utrFa $db knownGene utr5 utr5/utr.fa # Split up files and make files that define job. faSplit sequence utr3/utr.fa 10000 utr3/split/s faSplit sequence utr5/utr.fa 10000 utr5/split/s ls -1 utr3/split > utr3/in.lst ls -1 utr5/split > utr5/in.lst cd utr3 cat << _EOF_ > template #LOOP rnaFoldBig split/\$(path1) fold #ENDLOOP _EOF_ gensub2 in.lst single template jobList cp template ../utr5 cd ../utr5 gensub2 in.lst single template jobList # Do cluster runs for UTRs ssh $cpuFarm "cd $dir/rnaStruct/utr3; para make jobList" ssh $cpuFarm "cd $dir/rnaStruct/utr5; para make jobList" ssh $cpuFarm "cd $dir/rnaStruct/utr3; para time" ssh $cpuFarm "cd $dir/rnaStruct/utr5; para time" # Load database cd $dir/rnaStruct/utr5 hgLoadRnaFold $db foldUtr5 fold cd ../utr3 hgLoadRnaFold -warnEmpty $db foldUtr3 fold # Clean up rm -r split fold err batch.bak cd ../utr5 rm -r split fold err batch.bak # Make pfam run. Actual cluster run is about 6 hours. #mkdir -p /hive/data/outside/pfam/Pfam27.0 #cd /hive/data/outside/pfam/Pfam27.0 #wget ftp://ftp.sanger.ac.uk/pub/databases/Pfam/current_release/Pfam-A.hmm.gz #gunzip Pfam-A.hmm.gz #set pfamScratch = $scratchDir/pfamBR #ssh $cpuFarm mkdir -p $pfamScratch #ssh $cpuFarm cp /hive/data/outside/pfam/Pfam26.0/Pfam-A.hmm $pfamScratch mkdir -p $dir/pfam cd $dir/pfam rm -rf splitProt mkdir splitProt faSplit sequence $dir/ucscGenes.faa 10000 splitProt/ mkdir -p result ls -1 splitProt > prot.list # /hive/data/outside/pfam/hmmpfam -E 0.1 /hive/data/outside/pfam/current/Pfam_fs \ cat << '_EOF_' > doPfam #!/bin/csh -ef /hive/data/outside/pfam/Pfam29.0/PfamScan/hmmer-3.1b2-linux-intel-x86_64/binaries/hmmsearch --domtblout /scratch/tmp/pfam.$2.pf --noali -o /dev/null -E 0.1 /hive/data/outside/pfam/Pfam29.0/Pfam-A.hmm splitProt/$1 mv /scratch/tmp/pfam.$2.pf $3 _EOF_ # << happy emacs chmod +x doPfam cat << '_EOF_' > template #LOOP doPfam $(path1) $(root1) {check out line+ result/$(root1).pf} #ENDLOOP _EOF_ gensub2 prot.list single template jobList ssh $cpuFarm "cd $dir/pfam; para make jobList" ssh $cpuFarm "cd $dir/pfam; para time > run.time" cat run.time # Completed: 9427 of 9427 jobs # CPU time in finished jobs: 2274331s 37905.52m 631.76h 26.32d 0.072 y # IO & Wait Time: 480321s 8005.35m 133.42h 5.56d 0.015 y # Average job time: 292s 4.87m 0.08h 0.00d # Longest finished job: 370s 6.17m 0.10h 0.00d # Submission to last job: 2927s 48.78m 0.81h 0.03d # Make up pfamDesc.tab by converting pfam to a ra file first cat << '_EOF_' > makePfamRa.awk /^NAME/ {print} /^ACC/ {print} /^DESC/ {print} /^TC/ {print $1,$3; printf("\n");} _EOF_ awk -f makePfamRa.awk /hive/data/outside/pfam/Pfam29.0/Pfam-A.hmm > pfamDesc.ra raToTab -cols=ACC,NAME,DESC,TC pfamDesc.ra stdout | awk -F '\t' '{printf("%s\t%s\t%s\t%g\n", $1, $2, $3, $4);}' | sort > pfamDesc.tab # Convert output to tab-separated file. cd $dir/pfam catDir result | sed '/^#/d' > allResults.tab awk 'BEGIN {OFS="\t"} { print $5,$1,$18-1,$19,$4,$14}' allResults.tab | sort > allUcscPfam.tab join -t $'\t' -j 1 allUcscPfam.tab pfamDesc.tab | tawk '{if ($6 > $9) print $2, $3, $4, $5, $6, $1}' > ucscPfam.tab cd $dir # Convert output to knownToPfam table tawk '{print $1, gensub(/\.[0-9]+/, "", "g", $6)}' pfam/ucscPfam.tab | sort -u > knownToPfam.tab hgLoadSqlTab -notOnServer $tempDb knownToPfam $kent/src/hg/lib/knownTo.sql knownToPfam.tab tawk '{print gensub(/\.[0-9]+/, "", "g", $1), $2, $3}' pfam/pfamDesc.tab| hgLoadSqlTab -notOnServer $tempDb pfamDesc $kent/src/hg/lib/pfamDesc.sql stdin cd $dir/pfam genePredToFakePsl $tempDb knownGene knownGene.psl cdsOut.tab sort cdsOut.tab | sed 's/\.\./ /' > sortCdsOut.tab sort ucscPfam.tab> sortPfam.tab awk '{print $10, $11}' knownGene.psl > gene.sizes join sortCdsOut.tab sortPfam.tab | awk '{print $1, $2 - 1 + 3 * $4, $2 - 1 + 3 * $5, $6}' | bedToPsl gene.sizes stdin domainToGene.psl pslMap domainToGene.psl knownGene.psl stdout | pslToBed stdin stdout | bedOrBlocks -useName stdin domainToGenome.bed hgLoadBed $tempDb ucscGenePfam domainToGenome.bed # Do scop run. Takes about 6 hours #mkdir /hive/data/outside/scop/1.75 #cd /hive/data/outside/scop/1.75 #wget "ftp://license:SlithyToves@supfam.org/models/hmmlib_1.75.gz" #gunzip hmmlib_1.75.gz #wget "ftp://license:SlithyToves@supfam.org/models/model.tab.gz" #gunzip model.tab.gz mkdir -p $dir/scop cd $dir/scop rm -rf result mkdir result ls -1 ../pfam/splitProt > prot.list cat << '_EOF_' > doScop #!/bin/csh -ef /hive/data/outside/pfam/Pfam29.0/PfamScan/hmmer-3.1b2-linux-intel-x86_64/binaries/hmmsearch --domtblout /scratch/tmp/scop.$2.pf --noali -o /dev/null -E 0.1 /hive/data/outside/scop/1.75/hmmlib_1.75 ../pfam/splitProt/$1 mv /scratch/tmp/scop.$2.pf $3 _EOF_ # << happy emacs chmod +x doScop cat << '_EOF_' > template #LOOP doScop $(path1) $(root1) {check out line+ result/$(root1).pf} #ENDLOOP _EOF_ # << happy emacs gensub2 prot.list single template jobList ssh $cpuFarm "cd $dir/scop; para make jobList" ssh $cpuFarm "cd $dir/scop; para time > run.time" cat run.time #Completed: 9425 of 9425 jobs #CPU time in finished jobs: 2111474s 35191.24m 586.52h 24.44d 0.067 y #IO & Wait Time: 457978s 7632.96m 127.22h 5.30d 0.015 y #Average job time: 273s 4.54m 0.08h 0.00d #Longest finished job: 489s 8.15m 0.14h 0.01d #Submission to last job: 39656s 660.93m 11.02h 0.46d # Convert scop output to tab-separated files cd $dir catDir scop/result | sed '/^#/d' | awk 'BEGIN {OFS="\t"} {if ($7 <= 0.0001) print $1,$18-1,$19,$4, $7,$8}' | sort > scopPlusScore.tab sort -k 2 /hive/data/outside/scop/1.75/model.tab > scop.model.tab scopCollapse scopPlusScore.tab scop.model.tab ucscScop.tab \ scopDesc.tab knownToSuper.tab hgLoadSqlTab -notOnServer $tempDb scopDesc $kent/src/hg/lib/scopDesc.sql scopDesc.tab hgLoadSqlTab $tempDb knownToSuper $kent/src/hg/lib/knownToSuper.sql knownToSuper.tab #hgsql $tempDb -e "delete k from knownToSuper k, kgXref x where k.gene = x.kgID and x.geneSymbol = 'abParts'" hgLoadSqlTab $tempDb ucscScop $kent/src/hg/lib/ucscScop.sql ucscScop.tab # Regenerate ccdsKgMap table $kent/src/hg/makeDb/genbank/bin/x86_64/mkCcdsGeneMap -db=$tempDb -loadDb $db.ccdsGene knownGene ccdsKgMap # DIDN'T DO mkdir -p retroTmp hgsql -Ne "select kgName from ucscRetroInfo9" hg38 | sort -u > retroTmp/0 tawk '{print $1,$1}' retroTmp/0 | sed 's/\.[0-9]*//' | sort -u > retroTmp/1 hgsql hg38 -Ne "select alignId,name from knownGene" | sed 's/\.[0-9]*//' | sort -u > retroTmp/2 join retroTmp/1 retroTmp/2 | awk 'BEGIN {OFS="\t"} {print $2,$3}' > retroTmp/3 hgsql -Ne "select * from ucscRetroInfo9" hg38 | tawk '{print $44,"noKg"}' | sort -u > retroTmp/4 cat retroTmp/3 retroTmp/4 | tawk '{if (!found[$1]) print; found[$1]=1}' > retroTmp/5 hgsql -Ne "select * from ucscRetroInfo9" hg38 | subColumn 44 stdin retroTmp/5 stdout | sort -k1,1 -k2,2n > newUcscRetroInfo9.txt rm -rf retroTmp hgsql hg38 -Ne "create table newUcscRetroInfo9 like ucscRetroInfo9" hgsql hg38 -Ne "load data local infile 'newUcscRetroInfo9.txt' into table newUcscRetroInfo9;" hgsql hg38 -Ne "rename table ucscRetroInfo9 to oldUcscRetroInfo9" hgsql hg38 -Ne "rename table newUcscRetroInfo9 to ucscRetroInfo9" # Do BioCyc Pathways build mkdir /hive/data/outside/bioCyc/190905 cd /hive/data/outside/bioCyc/190905 mkdir download cd download wget --timestamping --no-directories --recursive --user=biocyc-flatfiles --password=data-20541 "http://brg-files.ai.sri.com/public/dist/humancyc.tar.gz" tar xvzf humancyc.tar.gz export bioCycDir=/hive/data/outside/bioCyc/190905/download/humancyc/21.0/data mkdir $dir/bioCyc cd $dir/bioCyc grep -E -v "^#" $bioCycDir/genes.col > genes.tab grep -E -v "^#" $bioCycDir/pathways.col | awk -F'\t' '{if (140 == NF) { printf "%s\t\t\n", $0; } else { print $0}}' > pathways.tab kgBioCyc1 -noEnsembl genes.tab pathways.tab $tempDb bioCycPathway.tab bioCycMapDesc.tab hgLoadSqlTab $tempDb bioCycPathway ~/kent/src/hg/lib/bioCycPathway.sql ./bioCycPathway.tab hgLoadSqlTab $tempDb bioCycMapDesc ~/kent/src/hg/lib/bioCycMapDesc.sql ./bioCycMapDesc.tab # Do KEGG Pathways build (borrowing Fan Hus's strategy from hg38.txt) mkdir -p $dir/kegg cd $dir/kegg # Make the keggMapDesc table, which maps KEGG pathway IDs to descriptive names cp /cluster/data/hg19/bed/ucsc.14.3/kegg/map_title.tab . # wget --timestamping ftp://ftp.genome.jp/pub/kegg/pathway/map_title.tab cat map_title.tab | sed -e 's/\t/\thsa\t/' > j.tmp cut -f 2 j.tmp >j.hsa cut -f 1,3 j.tmp >j.1 paste j.hsa j.1 |sed -e 's/\t//' > keggMapDesc.tab rm j.hsa j.1 j.tmp hgLoadSqlTab -notOnServer $tempDb keggMapDesc $kent/src/hg/lib/keggMapDesc.sql keggMapDesc.tab # Following in two-step process, build/load a table that maps UCSC Gene IDs # to LocusLink IDs and to KEGG pathways. First, make a table that maps # LocusLink IDs to KEGG pathways from the downloaded data. Store it temporarily # in the keggPathway table, overloading the schema. cp /cluster/data/hg19/bed/ucsc.14.3/kegg/hsa_pathway.list . cat hsa_pathway.list| sed -e 's/path://'|sed -e 's/:/\t/' > j.tmp hgLoadSqlTab -notOnServer $tempDb keggPathway $kent/src/hg/lib/keggPathway.sql j.tmp # Next, use the temporary contents of the keggPathway table to join with # knownToLocusLink, creating the real content of the keggPathway table. # Load this data, erasing the old temporary content hgsql $tempDb -B -N -e 'select distinct name, locusID, mapID from keggPathway p, knownToLocusLink l where p.locusID=l.value' > keggPathway.tab hgLoadSqlTab -notOnServer $tempDb \ keggPathway $kent/src/hg/lib/keggPathway.sql keggPathway.tab # Finally, update the knownToKeggEntrez table from the keggPathway table. hgsql $tempDb -B -N -e 'select kgId, mapID, mapID, "+", locusID from keggPathway' |sort -u| sed -e 's/\t+\t/+/' > knownToKeggEntrez.tab hgLoadSqlTab -notOnServer $tempDb knownToKeggEntrez $kent/src/hg/lib/knownToKeggEntrez.sql knownToKeggEntrez.tab #hgsql $tempDb -e "delete k from knownToKeggEntrez k, kgXref x where k.name = x.kgID and x.geneSymbol = 'abParts'" # Make spMrna table cd $dir #hgsql $db -N -e "select spDisplayID,kgID from kgXref where spDisplayID != ''" > spMrna.tab; hgsql $tempDb -N -e "select spDisplayID,kgID from kgXref where spDisplayID != ''" > spMrna.tab; hgLoadSqlTab $tempDb spMrna $kent/src/hg/lib/spMrna.sql spMrna.tab # Do CGAP tables mkdir -p $dir/cgap cd $dir/cgap wget --timestamping -O Hs_GeneData.dat "ftp://ftp1.nci.nih.gov/pub/CGAP/Hs_GeneData.dat" hgCGAP Hs_GeneData.dat cat cgapSEQUENCE.tab cgapSYMBOL.tab cgapALIAS.tab|sort -u > cgapAlias.tab hgLoadSqlTab -notOnServer $tempDb cgapAlias $kent/src/hg/lib/cgapAlias.sql ./cgapAlias.tab hgLoadSqlTab -notOnServer $tempDb cgapBiocPathway $kent/src/hg/lib/cgapBiocPathway.sql ./cgapBIOCARTA.tab cat cgapBIOCARTAdesc.tab|sort -u > cgapBIOCARTAdescSorted.tab hgLoadSqlTab -notOnServer $tempDb cgapBiocDesc $kent/src/hg/lib/cgapBiocDesc.sql cgapBIOCARTAdescSorted.tab cd $dir # Make PCR target for UCSC Genes, Part 1. # 1. Get a set of IDs that consist of the UCSC Gene accession concatenated with the # gene symbol, e.g. uc010nxr.1__DDX11L1 hgsql $db -N -e 'select kgId,geneSymbol from kgXref' \ | perl -wpe 's/^(\S+)\t(\S+)/$1\t${1}__$2/ || die;' \ | sort -u > idSub.txt # 2. Get a file of per-transcript fasta sequences that contain the sequences of each UCSC Genes transcript, with this new ID in the place of the UCSC Genes accession. Convert that file to TwoBit format and soft-link it into /gbdb/hg38/targetDb/ awk '{if (!found[$4]) print; found[$4]=1 }' ucscGenes.bed > nodups.bed subColumn 4 nodups.bed idSub.txt ucscGenesIdSubbed.bed sequenceForBed -keepName -db=$db -bedIn=ucscGenesIdSubbed.bed -fastaOut=stdout | faToTwoBit stdin ${db}KgSeq${curVer}.2bit mkdir -p /gbdb/$db/targetDb/ rm -f /gbdb/$db/targetDb/${db}KgSeq${curVer}.2bit ln -s $dir/${db}KgSeq${curVer}.2bit /gbdb/$db/targetDb/ # Load the table kgTargetAli, which shows where in the genome these targets are. cut -f 1-10 knownGene.gp | genePredToFakePsl $tempDb stdin kgTargetAli.psl /dev/null hgLoadPsl $tempDb kgTargetAli.psl # # At this point we should save a list of the tables in tempDb!!! cd $dir echo "show tables" | hgsql $tempDb > tablesInKnownGene.lst cp ../ucsc.19.1/hg38.knownGene.tables.txt . #cp ../ucsc.17.1/hg38.all.knownGene.tables.txt . hgsql $tempDb -e "show tables like 'knownTo%'" | tail -n +2 | sort > $tempDb.knownTo.txt join -v 2 $tempDb.knownTo.txt hg38.knownGene.tables.txt | grep knownTo #nothing #cat hg38.knownTo.txt hg38.knownGene.tables.txt | sort -u > hg38.all.knownGene.tables.txt # added ccdsKgMap gnfAtlas2Distance gnfU95Distance for i in `cat hg38.knownGene.tables.txt`; do echo "desc $i;" | hgsql $tempDb; done 2>&1 | grep exist | awk '{print $8}' > missing.tables.txt join -v 1 tablesInKnownGene.lst hg38.all.knownGene.tables.txt #kg10ToKg11 #kgXrefOld10 #knownAttrs #knownCds #knownGeneOld10 #knownToEnsembl #knownToGencodeV26 #knownToMrna #knownToMrnaSingle #knownToTag join -v 2 tablesInKnownGene.lst hg38.all.knownGene.tables.txt TBD 'tmpFoo67.ceBlastTab' TBD 'tmpFoo67.dmBlastTab' TBD 'tmpFoo67.drBlastTab' TBD 'tmpFoo67.foldUtr3' TBD 'tmpFoo67.foldUtr5' X 'tmpFoo67.kg7ToKg8' X 'tmpFoo67.kgProtMap2' TBD 'tmpFoo67.kgTargetAli' X 'tmpFoo67.kgTxInfo' Y 'tmpFoo67.knownAlt' Y'tmpFoo67.knownGeneMrna' X 'tmpFoo67.knownGeneTxMrna' X 'tmpFoo67.knownGeneTxPep' TBD 'tmpFoo67.scBlastTab' #kg7ToKg8 #kgProtMap2 #kgTxInfo #knownGeneTxMrna #knownGeneTxPep #knownToGencodeV20 #knownToGnf1h #knownToWikipedia # Create backup database hgsqladmin create ${db}Backup4 # Swap in new tables, moving old tables to backup database. for i in `cat $lastVer.table.lst` do echo "rename table $db.$i to ${db}Backup4.$i;" done > /tmp/1 cat /tmp/1 | hgsql hg38 # Drop tempDb history table and chromInfo, we don't want to swap them in! hgsql -e "drop table history" $tempDb hgsql -e "drop table chromInfo" $tempDb echo "show tables" | hgsql -N $tempDb > tablesInKnownGene.lst for i in `cat tablesInKnownGene.lst` do echo "rename table $tempDb.$i to ${db}.$i;" done > /tmp/1 cat /tmp/1 | hgsql hg38 ERROR 1146 (42S02) at line 1: Table 'tmpFoo87.Tables_in_tmpFoo87' doesn't exist ERROR 1050 (42S01) at line 1: Table 'gnfAtlas2Distance' already exists ERROR 1050 (42S01) at line 1: Table 'gnfU95Distance' already exists ERROR 1050 (42S01) at line 1: Table 'kg10ToKg11' already exists ERROR 1050 (42S01) at line 1: Table 'kgXrefOld10' already exists ERROR 1050 (42S01) at line 1: Table 'knownAttrs' already exists ERROR 1050 (42S01) at line 1: Table 'knownCds' already exists ERROR 1050 (42S01) at line 1: Table 'knownGeneExt' already exists ERROR 1050 (42S01) at line 1: Table 'knownGeneOld10' already exists ERROR 1050 (42S01) at line 1: Table 'knownToEnsembl' already exists ERROR 1050 (42S01) at line 1: Table 'knownToGnfAtlas2' already exists ERROR 1050 (42S01) at line 1: Table 'knownToMrna' already exists ERROR 1050 (42S01) at line 1: Table 'knownToMrnaSingle' already exists ERROR 1050 (42S01) at line 1: Table 'knownToTag' already exists ERROR 1050 (42S01) at line 1: Table 'knownToU133' already exists ERROR 1050 (42S01) at line 1: Table 'knownToU95' already exists #ERROR 1050 (42S01) at line 1: Table 'chromInfo' already exists #ERROR 1050 (42S01) at line 1: Table 'history' already exists #ERROR 1050 (42S01) at line 1: Table 'knownAttrs' already exists #ERROR 1050 (42S01) at line 1: Table 'knownCds' already exists #ERROR 1050 (42S01) at line 1: Table 'knownToEnsembl' already exists #ERROR 1050 (42S01) at line 1: Table 'knownToMrna' already exists #ERROR 1050 (42S01) at line 1: Table 'knownToMrnaSingle' already exists #ERROR 1050 (42S01) at line 1: Table 'knownToTag' already exists hgsqladmin flush-tables # Make full text index. Takes a minute or so. After this the genome browser # tracks display will work including the position search. The genes details # NOW SWAP IN TABLES FROM TEMP DATABASE TO MAIN DATABASE. # You'll need superuser powers for this step..... cd $dir cat << _EOF_ > moveTablesIntoPlace # Save old known genes and kgXref tables sudo ~kent/bin/copyMysqlTable $db knownGene $tempDb knownGeneOld$lastVer sudo ~kent/bin/copyMysqlTable $db kgXref $tempDb kgXrefOld$lastVer # Create backup database hgsqladmin create ${db}BackupBraney2 # Swap in new tables, moving old tables to backup database. sudo ~kent/bin/swapInMysqlTempDb $tempDb $db ${db}BackupBraney2 _EOF_ # Update database links. sudo rm /var/lib/mysql/uniProt sudo ln -s /var/lib/mysql/$spDb /var/lib/mysql/uniProt sudo rm /var/lib/mysql/proteome sudo ln -s /var/lib/mysql/$pbDb /var/lib/mysql/proteome hgsqladmin flush-tables # Make full text index. Takes a minute or so. After this the genome browser # tracks display will work including the position search. The genes details # page, gene sorter, and proteome browser still need more tables. mkdir -p $dir/index cd $dir/index hgKgGetText $db knownGene.text ixIxx knownGene.text knownGene.ix knownGene.ixx rm -f /gbdb/$db/knownGene.ix /gbdb/$db/knownGene.ixx ln -s $dir/index/knownGene.ix /gbdb/$db/knownGene.ix ln -s $dir/index/knownGene.ixx /gbdb/$db/knownGene.ixx # 3. Ask cluster-admin to start an untranslated, -stepSize=5 gfServer on # /gbdb/$db/targetDb/kgTargetSeq${curVer}.2bit # 4. On hgwdev, insert new records into blatServers and targetDb, using the # host (field 2) and port (field 3) specified by cluster-admin. Identify the # blatServer by the keyword "$db"Kg with the version number appended # untrans gfServer for hg38KgSeq12 on host blat1b, port 17897 hgsql hgcentraltest -e \ 'INSERT into blatServers values ("hg38KgSeq12", "blat1b", 17897, 0, 1);' hgsql hgcentraltest -e \ 'INSERT into targetDb values("hg38KgSeq12", "UCSC Genes", \ "hg38", "kgTargetAli", "", "", \ "/gbdb/hg38/targetDb/hg38KgSeq12.2bit", 1, now(), "");' # ## ## WRAP-UP # # add database to the db's in kent/src/hg/visiGene/vgGetText cd $dir # # Finally, need to wait until after testing, but update databases in other organisms # with blastTabs # Load blastTabs cd $dir/hgNearBlastp hgLoadBlastTab $xdb $blastTab run.$xdb.$tempDb/out/*.tab hgLoadBlastTab $ratDb $blastTab run.$ratDb.$tempDb/out/*.tab hgLoadBlastTab $flyDb $blastTab run.$flyDb.$tempDb/recipBest.tab hgLoadBlastTab $wormDb $blastTab run.$wormDb.$tempDb/recipBest.tab hgLoadBlastTab $yeastDb $blastTab run.$yeastDb.$tempDb/recipBest.tab hgLoadBlastTab $fishDb $blastTab run.$fishDb.$tempDb/recipBest.tab # Do synteny on mouse/human/rat synBlastp.csh $xdb $db # old number of unique query values: 61250 # old number of unique target values 27574 # new number of unique query values: 52518 # new number of unique target values 25367 synBlastp.csh $ratDb $db ensGene knownGene # old number of unique query values: 28159 # old number of unique target values 19155 # new number of unique query values: 23777 # new number of unique target values 17885 # Last step in setting up isPCR: after the new UCSC Genes with the new Known Gene isPcr # is released, take down the old isPcr gfServer ####################### ### The following is the process Briam Lee used to pull out only # the genes from knownToLocusLink for which there are Wikipedia articles. ### get the full knownToLocusLinkTable # hgsql -Ne 'select value from knownToLocusLink' hg38 | sort -u >> knToLocusLink ### query Wikipedia for each to if there is an article # for i in $(cat knToLocusLink); do lynx -dump "http://genewiki.sulab.org/map/wiki/"$i | grep -m 1 "no results" >trash ; echo $? $i | grep "1 "| awk '{print $2}'>> workingLinks; done ### pull out all isoforms that have permitted LocusLinkIds # for i in $(cat workingLinks); do hgsql -Ne 'select * from knownToLocusLink where value like "'$i'"' hg38 >> knownToWikipediaNew; done ### then load the table as knownToWikipedia using the knowToLocusLink INDICES. # This Section marked as not done... not done again! # Make up and load kgColor table. Takes about a minute. $txGeneColor $spDb ucscGenes.info ucscGenes.picks ucscGenes.color $hgLoadSqlTab -notOnServer $tempDb kgColor $kent/src/hg/lib/kgColor.sql ucscGenes.color # Load up kgProtMap2 table that says where exons are in terms of CDS $txGeneCdsMap weeded.bed weeded.info pick.picks refTweaked.psl \ refToPep.tab $genomes/$db/chrom.sizes cdsToRna.psl \ rnaToGenome.psl # Missed 337 of 40856 refSeq protein mappings. A small number of RefSeqs just map to genome in the UTR. $pslMap cdsToRna.psl rnaToGenome.psl cdsToGenome.psl $hgLoadPsl $tempDb ucscProtMap.psl -table=kgProtMap2 # TODO: Create knownToTreefam table. $mkdir -p $dir/treeFam $cd $dir/treeFam $wget "http://www.treefam.org/static/download/treefam_family_data.tar.gz" $tar xfz treefam_family_data.tar.gz $cd treefam_family_data $egrep -a ">ENSP[0-9]+" * | cut -d/ -f2 | tr -d : | sed -e 's/.aa.fasta//' |sed -e 's/.cds.fasta//' | grep -v accession | grep -v ^\> | tr '>' '\t' | tawk '{print $2,$1}' > ../ensToTreefam.tab $cd .. # hgMapToGene -exclude=abGenes.txt -tempDb=$tempDb $db ensGene knownGene knownToEnsembl -noLoad #awk '{print $2,$1}' ../knownToEnsembl.tab | sort | uniq > ensTransUcsc.tab $hgsql $db -e "select value,name from knownToEnsembl" | sort | uniq > ensTransUcsc.tab $echo "select transcript,protein from ensGtp" | hgsql hg38 | sort | uniq | awk '{if (NF==2) print}' > ensTransProt.tab $join ensTransUcsc.tab ensTransProt.tab | awk '{if (NF==3)print $3, $2}' | sort | uniq > ensProtToUc.tab $join ensProtToUc.tab ensToTreefam.tab | sort -u | awk 'BEGIN {OFS="\t"} {print $2,$3}' | sort -u > knownToTreefam.tab $hgLoadSqlTab $tempDb knownToTreefam $kent/src/hg/lib/knownTo.sql knownToTreefam.tab #end section not done # make bigKnownGene.bb cd $dir makeBigKnown hg38 rm -f /gbdb/hg38/knownGene32.bb ln -s `pwd`/hg38.knownGene.bb /gbdb/hg38/knownGene32.bb # Build knownToMupit mkdir mupit cd mupit # mupit-pdbids.txt was emailed from Kyle Moad (kmoad@insilico.us.com) # wc -l cp /hive/data/outside/mupit/mupit-pdbids.txt . # get knownGene IDs and associated PDB IDS # the extDb{Ref} parts come from hg/hgGene/domains.c:domainsPrint() hgsql -Ne "select kgID, extAcc1 from $db.kgXref x \ inner join sp180404.extDbRef sp on x.spID = sp.acc \ inner join sp180404.extDb e on sp.extDb=e.id \ where x.spID != '' and e.val='PDB' order by kgID" \ > $db.knownToPdb.txt; # filter out pdbIds not found in mupit cat mupit-pdbids.txt | tr '[a-z]' '[A-Z]' | \ grep -Fwf - $db.knownToPdb.txt > knownToMupit.txt; # check that it filtered correctly: # cut -f2 $db.knownToMuipit.txt | sort -u | wc -l; # load new table for hgGene/hgc hgLoadSqlTab $db knownToMupit ~/kent/src/hg/lib/knownTo.sql knownToMupit.txt ############################################################################# # hgPal downloads mkdir $dir/pal cd $dir/pal cat /hive/data/genomes/hg38/bed/multiz100way/species.list | tr '[ ]' '[\n]' > order.list export mz=multiz100way export gp=knownGene export db=hg38 export I=0 mkdir exonAA exonNuc for C in `sort -nk2 /cluster/data/hg38/chrom.sizes | cut -f1` do I=`echo $I | awk '{print $1+1}'` echo "mafGene -chrom=$C -exons -noTrans $db $mz $gp order.list stdout | gzip -c > exonNuc/$C.exonNuc.fa.gz &" echo "mafGene -chrom=$C -exons $db $mz $gp order.list stdout | gzip -c > exonAA/$C.exonAA.fa.gz &" if [ $I -gt 6 ]; then echo "date" echo "wait" I=0 fi done > $gp.jobs echo "date" >> $gp.jobs echo "wait" >> $gp.jobs time sh -x ./$gp.jobs > $gp.jobs.log 2>&1 & # real 139m22.735s time zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz # real 4m45.035s time zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz # real 16m29.138s export mz=multiz100way export gp=knownGene export db=hg38 export pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments mkdir -p $pd md5sum *.fa.gz > md5sum.txt rm -f $pd/$gp.exonAA.fa.gz rm -f $pd/$gp.exonNuc.fa.gz ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz rm -rf exonAA exonNuc ### And knownCanonical cd $dir/pal export mz=multiz100way export gp=knownCanonical export db=hg38 mkdir exonAA exonNuc knownCanonical time cut -f1 /cluster/data/hg38/chrom.sizes | while read C do echo $C 1>&2 hgsql hg38 -N -e "select chrom, chromStart, chromEnd, transcript from knownCanonical where chrom='$C'" > knownCanonical/$C.known.bed done # real 0m15.897s ls knownCanonical/*.known.bed | while read F do if [ -s $F ]; then echo $F | sed -e 's#knownCanonical/##; s/.known.bed//' fi done | while read C do echo "date" echo "mafGene -geneBeds=knownCanonical/$C.known.bed -exons -noTrans $db $mz knownGene order.list stdout | \ gzip -c > exonNuc/$C.exonNuc.fa.gz" echo "mafGene -geneBeds=knownCanonical/$C.known.bed -exons $db $mz knownGene order.list stdout | \ gzip -c > exonAA/$C.exonAA.fa.gz" done > $gp.$mz.jobs time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 # 267m58.813s rm *.known.bed export mz=multiz100way export gp=knownCanonical export db=hg38 zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz & zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz & # about 6 minutes rm -rf exonAA exonNuc export mz=multiz100way export gp=knownCanonical export db=hg38 export pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments mkdir -p $pd rm -f $pd/$gp.exonAA.fa.gz rm -f $pd/$gp.exonNuc.fa.gz ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz cd $pd md5sum *.fa.gz > md5sum.txt + +#myGene2 +mkdir $dir/myGene2 +cd $dir/myGene2 + +# copy list of genes from https://mygene2.org/MyGene2/genes +awk '{print $1}' | sort > genes.lst +hgsql hg38 -Ne "select geneSymbol, kgId from kgXref" | sort > ids.txt +join genes.lst ids.txt | awk '{print $2,$1}' | sort > knownToMyGene2.txt +hgLoadSqlTab $db knownToMyGene2 ~/kent/src/hg/lib/knownTo.sql knownToMyGene2.txt