# ---------------------------------------------------------------------------
# NOTE(review): what follows is a git commit diff whose line breaks have been
# collapsed. It is commit cd1c11ae..., author "braney", message "tweaks to
# gencode V36", against src/hg/makeDb/doc/ucscGenes/hg38.gencodeV36.sh.
# A token starting with "+" is an added line, "-" a removed line, and
# "@@ -a,b +c,d @@" opens a hunk; the script text between hunks is not shown.
#
# Hunk @@ -1,27 +1,30 @@
#   Replaces "export curVer=14" with exports of oldGeneDir
#   (/cluster/data/hg38/bed/ucsc.20.1), oldGeneBed ($oldGeneDir/ucscGenes.bed),
#   lastVer=12 and curVer=13.
# Hunk @@ -33,30 +36,42 @@
#   After "cd $dir": prints the line count of $oldGeneDir/$lastVer.table.lst,
#   then writes $curVer.table.lst = that list minus lines matching
#   "ToKg$lastVer", "XrefOld", "knownGeneOld" or "knownToGencode", plus
#   kg${lastVer}ToKg${curVer}, knownGeneOld$lastVer and kgXrefOld$lastVer,
#   sorted.
# Hunk @@ -119,30 +134,33 @@
#   Runs txGeneExplainUpdate2 on $oldGeneBed and ucscGenes.bed to produce
#   kgOldToNew.tab and loads it into $tempDb as kg${lastVer}ToKg${curVer}
#   using kg1ToKg2.sql.
# Hunk @@ -627,15 +645,77 @@
#   Adds two hgExpDistance runs (gnfU95Distance, gnfAtlas2Distance), the
#   knownToMupit, knownToMyGene2 and knownToNextProt tables, and the
#   gencode$GENCODE_VERSION.ix/.ixx search index with symlinks under
#   /gbdb/$tempDb/gencode.
#
# Review notes (to be confirmed against the real, un-collapsed file):
#  - "#ifdef NOTNOW" / "#endif" are plain shell comments, so the hgsql/join
#    commands between them are not disabled if this text is run as shell;
#    likewise "//genePredToBigGenePred ..." starts with "//", which is not a
#    shell comment.
#  - The knownToMupit query hard-codes sp180404.extDbRef / sp180404.extDb
#    although spDb=sp180404 is defined at the top; presumably $spDb was meant.
#  - knownToMupit and knownToMyGene2 are loaded into $db and read kgXref from
#    $db / hg38, while the neighbouring loads target $tempDb - TODO confirm
#    this is intentional.
#  - The commented-out check names "$db.knownToMuipit.txt"; the file actually
#    written by the preceding grep is "knownToMupit.txt".
#  - "awk '{print $1}' | sort > genes.lst" names no input file, so awk reads
#    stdin; presumably the gene list from the mygene2.org page is pasted in.
#  - "mkdir mupit; cd mupit" is relative and follows "cd hprd" with no visible
#    cd in between, whereas myGene2 and nextProt use $dir/... paths.
#  - The search-index commands come after "cd $dir/nextProt" with no visible
#    cd back: txToAcc.tab and knownAttrs.tab are read, and the .ix/.ixx files
#    written, relative to the current directory, but the symlinks point at
#    $dir/gencode$GENCODE_VERSION.ix(x) - TODO confirm the working directory.
#  - The comment "this should be done AFTER moving the new tables into hg38"
#    precedes commands that still use $tempDb.
# ---------------------------------------------------------------------------
cd1c11ae2a758a6c3a8902472a3017f35bd85220 braney Sat Feb 6 12:15:53 2021 -0800 tweaks to gencode V36 diff --git src/hg/makeDb/doc/ucscGenes/hg38.gencodeV36.sh src/hg/makeDb/doc/ucscGenes/hg38.gencodeV36.sh index 615dfe2..10e5c32 100644 --- src/hg/makeDb/doc/ucscGenes/hg38.gencodeV36.sh +++ src/hg/makeDb/doc/ucscGenes/hg38.gencodeV36.sh @@ -1,27 +1,30 @@ # This doc assumes that the gencode* tables have been built on $db db=hg38 GENCODE_VERSION=V36 PREV_GENCODE_VERSION=V35 dir=/hive/data/genomes/$db/bed/gencode$GENCODE_VERSION/build genomes=/hive/data/genomes tempDb=knownGeneV36 oldDb=knownGeneV35 kent=$HOME/kent spDb=sp180404 cpuFarm=ku -export curVer=14 +export oldGeneDir=/cluster/data/hg38/bed/ucsc.20.1 +export oldGeneBed=$oldGeneDir/ucscGenes.bed +export lastVer=12 +export curVer=13 export Db=Hg38 export xdb=mm10 export Xdb=Mm10 export ydb=canFam3 export zdb=rheMac8 export ratDb=rn6 export RatDb=Rn6 export fishDb=danRer11 export flyDb=dm6 export wormDb=ce11 export yeastDb=sacCer3 export tempFa=$dir/ucscGenes.faa export genomes=/hive/data/genomes export xdbFa=$genomes/$xdb/bed/ucsc.18.1/ucscGenes.faa export ratFa=$genomes/$ratDb/bed/ensGene.95/ensembl.faa @@ -33,30 +36,42 @@ export wormFa=$genomes/$wormDb/bed/ws245Genes/ws245Pep.faa #export wormFa=$genomes/$wormDb/bed/blastp/wormPep190.faa export yeastFa=$genomes/$yeastDb/bed/sgdAnnotations/blastTab/sacCer3.sgd.faa export scratchDir=/hive/users/braney/scratch export blastTab=hgBlastTab export xBlastTab=mmBlastTab export rnBlastTab=rnBlastTab export dbHost=hgwdev export ramFarm=ku export cpuFarm=ku mkdir -p $dir cd $dir +# first get list of tables from push request in $lastVer.table.lst +# here's the redmine http://redmine.soe.ucsc.edu/issues/21644 +wc -l $oldGeneDir/$lastVer.table.lst +# 62 + +( +cat $oldGeneDir/$lastVer.table.lst | grep -v "ToKg$lastVer" | grep -v "XrefOld" | grep -v "knownGeneOld" | grep -v "knownToGencode" +echo kg${lastVer}ToKg${curVer} +echo knownGeneOld$lastVer +echo kgXrefOld$lastVer +) | 
sort > $curVer.table.lst + echo "create database $tempDb" | hgsql "" echo "create table chromInfo like $db.chromInfo" | hgsql $tempDb echo "insert into chromInfo select * from $db.chromInfo" | hgsql $tempDb hgsql -e "select * from gencodeAnnot$GENCODE_VERSION" --skip-column-names $db | cut -f 2-16 | genePredToBed stdin stdout | sort -k1,1 -k2,2n | gzip -c > gencode${GENCODE_VERSION}.bed.gz touch oldToNew.tab zcat gencode${GENCODE_VERSION}.bed.gz > ucscGenes.bed txBedToGraph ucscGenes.bed ucscGenes ucscGenes.txg txgAnalyze ucscGenes.txg $genomes/$db/$db.2bit stdout | sort | uniq | bedClip stdin /cluster/data/hg38/chrom.sizes ucscSplice.bed hgLoadBed $tempDb knownAlt ucscSplice.bed zcat gencode${GENCODE_VERSION}.bed.gz | awk '{print $4}' | sort > newGencodeName.txt hgsql $oldDb -Ne "select name,alignId from knownGene" | sort > oldGenToUcsc.txt @@ -119,30 +134,33 @@ tawk '{split($2,a,"."); for(ii = 1; ii <= a[2]; ii++) print $1,a[1] "." ii }' txToAcc.tab >> foo.alias tawk '{split($1,a,"."); for(ii = 1; ii <= a[2] - 1; ii++) print $1,a[1] "." 
ii }' txToAcc.tab >> foo.alias sort foo.alias | uniq > ucscGenes.alias sort foo.protAlias | uniq > ucscGenes.protAlias rm foo.alias foo.protAlias hgLoadSqlTab -notOnServer $tempDb kgAlias $kent/src/hg/lib/kgAlias.sql ucscGenes.alias hgLoadSqlTab -notOnServer $tempDb kgProtAlias $kent/src/hg/lib/kgProtAlias.sql ucscGenes.protAlias hgsql $tempDb -N -e 'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' > kgSpAlias_0.tmp hgsql $tempDb -N -e 'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID' >> kgSpAlias_0.tmp cat kgSpAlias_0.tmp|sort -u > kgSpAlias.tab rm kgSpAlias_0.tmp hgLoadSqlTab -notOnServer $tempDb kgSpAlias $kent/src/hg/lib/kgSpAlias.sql kgSpAlias.tab +txGeneExplainUpdate2 $oldGeneBed ucscGenes.bed kgOldToNew.tab +hgLoadSqlTab -notOnServer $tempDb kg${lastVer}ToKg${curVer} $kent/src/hg/lib/kg1ToKg2.sql kgOldToNew.tab + #ifdef NOTNOW hgsql $db -Ne "select * from gencodeToUniProt$GENCODE_VERSION" | tawk '{print $1,$2}'| sort > uniProt.txt hgsql $db -Ne "select * from gencodeAnnot$GENCODE_VERSION" | tawk '{print $1,$12}' | sort > gene.txt join -a 1 gene.txt uniProt.txt > geneNames.txt #endif //genePredToBigGenePred -score=knownScore.txt -colors=colors.txt -geneNames=geneNames.txt -known gencodeAnnot$GENCODE_VERSION.txt gencodeAnnot$GENCODE_VERSION.bgpInput #hgsql $db -Ne "select * from gencodeAnnot$GENCODE_VERSION" | cut -f 2- > gencodeAnnot$GENCODE_VERSION.txt #genePredToBigGenePred -colors=colors.txt gencodeAnnot$GENCODE_VERSION.txt stdout | sort -k1,1 -k2,2n > gencodeAnnot$GENCODE_VERSION.bgpInput hgsql $tempDb -Ne "select kgId, geneSymbol, spID from kgXref" > geneNames.txt #hgsql $tempDb -Ne "select * from knownCds" > knownCds.txt #genePredToBigGenePred -colors=colors.txt -known knownGene.gp stdout | sort -k1,1 -k2,2n > gencodeAnnot$GENCODE_VERSION.bgpInput genePredToBigGenePred -colors=colors.txt -geneNames=geneNames.txt -known -cds=knownCds.tab knownGene.gp stdout | sort -k1,1 
-k2,2n > gencodeAnnot$GENCODE_VERSION.bgpInput @@ -627,15 +645,77 @@ sort -k 2 /hive/data/outside/scop/1.75/model.tab > scop.model.tab scopCollapse scopPlusScore.tab scop.model.tab ucscScop.tab \ scopDesc.tab knownToSuper.tab hgLoadSqlTab -notOnServer $tempDb scopDesc $kent/src/hg/lib/scopDesc.sql scopDesc.tab hgLoadSqlTab $tempDb knownToSuper $kent/src/hg/lib/knownToSuper.sql knownToSuper.tab #hgsql $tempDb -e "delete k from knownToSuper k, kgXref x where k.gene = x.kgID and x.geneSymbol = 'abParts'" hgMapToGene -geneTableType=genePred -tempDb=$tempDb $db affyU133 knownGene knownToU133 hgMapToGene -geneTableType=genePred -tempDb=$tempDb $db affyU95 knownGene knownToU95 mkdir hprd cd hprd wget "http://www.hprd.org/edownload/HPRD_FLAT_FILES_041310" tar xvf HPRD_FLAT_FILES_041310 knownToHprd $tempDb FLAT_FILES_072010/HPRD_ID_MAPPINGS.txt + + time hgExpDistance $tempDb hgFixed.gnfHumanU95MedianRatio \ + hgFixed.gnfHumanU95Exps gnfU95Distance -lookup=knownToU95 + time hgExpDistance $tempDb hgFixed.gnfHumanAtlas2MedianRatio \ + hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \ + -lookup=knownToGnfAtlas2 + +# Build knownToMupit + +mkdir mupit +cd mupit + +# mupit-pdbids.txt was emailed from Kyle Moad (kmoad@insilico.us.com) +# wc -l +cp /hive/data/outside/mupit/mupit-pdbids.txt . 
+# get knownGene IDs and associated PDB IDS +# the extDb{Ref} parts come from hg/hgGene/domains.c:domainsPrint() +hgsql -Ne "select kgID, extAcc1 from $db.kgXref x \ + inner join sp180404.extDbRef sp on x.spID = sp.acc \ + inner join sp180404.extDb e on sp.extDb=e.id \ + where x.spID != '' and e.val='PDB' order by kgID" \ + > $db.knownToPdb.txt; +# filter out pdbIds not found in mupit +cat mupit-pdbids.txt | tr '[a-z]' '[A-Z]' | \ + grep -Fwf - $db.knownToPdb.txt > knownToMupit.txt; +# check that it filtered correctly: +# cut -f2 $db.knownToMuipit.txt | sort -u | wc -l; +# load new table for hgGene/hgc +hgLoadSqlTab $db knownToMupit ~/kent/src/hg/lib/knownTo.sql knownToMupit.txt + +#myGene2 +mkdir $dir/myGene2 +cd $dir/myGene2 + +# copy list of genes from https://mygene2.org/MyGene2/genes +awk '{print $1}' | sort > genes.lst +hgsql hg38 -Ne "select geneSymbol, kgId from kgXref" | sort > ids.txt +join -t $'\t' genes.lst ids.txt | tawk '{print $2,$1}' | sort > knownToMyGene2.txt +hgLoadSqlTab $db knownToMyGene2 ~/kent/src/hg/lib/knownTo.sql knownToMyGene2.txt + +# make knownToNextProt +mkdir -p $dir/nextProt +cd $dir/nextProt + +wget "ftp://ftp.nextprot.org/pub/current_release/ac_lists/nextprot_ac_list_all.txt" +awk '{print $0, $0}' nextprot_ac_list_all.txt | sed 's/NX_//' | sort > displayIdToNextProt.txt +hgsql -e "select spID,kgId from kgXref" --skip-column-names $tempDb | awk '{if (NF == 2) print}' | sort > displayIdToKgId.txt +join displayIdToKgId.txt displayIdToNextProt.txt | awk 'BEGIN {OFS="\t"} {print $2,$3}' > knownToNextProt.tab +hgLoadSqlTab -notOnServer $tempDb knownToNextProt $kent/src/hg/lib/knownTo.sql knownToNextProt.tab + +# this should be done AFTER moving the new tables into hg38 +hgKgGetText $tempDb tempSearch.txt +sort tempSearch.txt > tempSearch2.txt +tawk '{split($2,a,"."); printf "%s\t", $1;for(ii = 1; ii <= a[2]; ii++) printf "%s ",a[1] "." 
ii; printf "\n" }' txToAcc.tab | sort > tempSearch3.txt +tawk '{split($2,a,"."); printf "%s\t%s ", $1,a[1];for(ii = 1; ii <= a[2]; ii++) printf "%s ",a[1] "." ii; printf "\n" }' knownAttrs.tab | sort > tempSearch4.txt +join tempSearch2.txt tempSearch3.txt | join /dev/stdin tempSearch4.txt | sort > gencode$GENCODE_VERSION.txt +ixIxx gencode$GENCODE_VERSION.txt gencode$GENCODE_VERSION.ix gencode$GENCODE_VERSION.ixx + rm -rf /gbdb/$tempDb/gencode/gencode$GENCODE_VERSION.ix /gbdb/$tempDb/gencode/gencode$GENCODE_VERSION.ixx +ln -s $dir/gencode$GENCODE_VERSION.ix /gbdb/$tempDb/gencode/gencode$GENCODE_VERSION.ix +ln -s $dir/gencode$GENCODE_VERSION.ixx /gbdb/$tempDb/gencode/gencode$GENCODE_VERSION.ixx + +