d0aad7f66c2fee0fa9366c3b3caf4d761750c055 braney Tue Aug 17 10:27:07 2021 -0700 first version of Gencode genes (knownGene version) for mm39 diff --git src/hg/utils/otto/knownGene/buildPfamScop.sh src/hg/utils/otto/knownGene/buildPfamScop.sh new file mode 100755 index 0000000..66ac877 --- /dev/null +++ src/hg/utils/otto/knownGene/buildPfamScop.sh @@ -0,0 +1,101 @@ +#!/bin/sh -ex +cd $dir +{ +mkdir -p $dir/pfam +cd $dir/pfam +rm -rf splitProt +mkdir splitProt +faSplit sequence $dir/ucscGenes.faa 10000 splitProt/ +mkdir -p result +ls -1 splitProt > prot.list +# /hive/data/outside/pfam/hmmpfam -E 0.1 /hive/data/outside/pfam/current/Pfam_fs \ +#/hive/data/outside/pfam/Pfam29.0/PfamScan/hmmer-3.1b2-linux-intel-x86_64/binaries/hmmsearch --domtblout /scratch/tmp/pfam.$2.pf --noali -o /dev/null -E 0.1 /hive/data/outside/pfam/Pfam29.0/Pfam-A.hmm splitProt/$1 +cat << '_EOF_' > doPfam +#!/bin/csh -ef +/hive/data/outside/pfam/Pfam34.0/hmmer-3.3.2/src/hmmsearch --domtblout /scratch/tmp/pfam.$2.pf --noali -o /dev/null -E 0.1 /hive/data/outside/pfam/Pfam34.0/Pfam-A.hmm splitProt/$1 +mv /scratch/tmp/pfam.$2.pf $3 +_EOF_ + # << happy emacs +chmod +x doPfam +cat << '_EOF_' > template +#LOOP +doPfam $(path1) $(root1) {check out line+ result/$(root1).pf} +#ENDLOOP +_EOF_ +gensub2 prot.list single template jobList + +ssh $cpuFarm "cd $dir/pfam; para make jobList" +ssh $cpuFarm "cd $dir/pfam; para time > run.time" +cat run.time + +# Make up pfamDesc.tab by converting pfam to a ra file first +cat << '_EOF_' > makePfamRa.awk +/^NAME/ {print} +/^ACC/ {print} +/^DESC/ {print} +/^TC/ {print $1,$3; printf("\n");} +_EOF_ +awk -f makePfamRa.awk /hive/data/outside/pfam/Pfam34.0/Pfam-A.hmm > pfamDesc.ra +raToTab -cols=ACC,NAME,DESC,TC pfamDesc.ra stdout | awk -F '\t' '{printf("%s\t%s\t%s\t%g\n", $1, $2, $3, $4);}' | sort > pfamDesc.tab + +# Convert output to tab-separated file. +cd $dir/pfam +catDir result | sed '/^#/d' > allResults.tab +awk 'BEGIN {OFS="\t"} { print $5,$1,$18-1,$19,$4,$14}' allResults.tab | sort > allUcscPfam.tab +join -t $'\t' -j 1 allUcscPfam.tab pfamDesc.tab | tawk '{if ($6 > $9) print $2, $3, $4, $5, $6, $1}' > ucscPfam.tab +cd $dir + +# Convert output to knownToPfam table +tawk '{print $1, gensub(/\.[0-9]+/, "", "g", $6)}' pfam/ucscPfam.tab | sort -u > knownToPfam.tab +hgLoadSqlTab -notOnServer $tempDb knownToPfam $kent/src/hg/lib/knownTo.sql knownToPfam.tab +tawk '{print gensub(/\.[0-9]+/, "", "g", $1), $2, $3}' pfam/pfamDesc.tab| hgLoadSqlTab -notOnServer $tempDb pfamDesc $kent/src/hg/lib/pfamDesc.sql stdin + +cd $dir/pfam +genePredToFakePsl $tempDb knownGene knownGene.psl cdsOut.tab +sort cdsOut.tab | sed 's/\.\./ /' > sortCdsOut.tab +sort ucscPfam.tab> sortPfam.tab +awk '{print $10, $11}' knownGene.psl > gene.sizes +join sortCdsOut.tab sortPfam.tab | awk '{print $1, $2 - 1 + 3 * $4, $2 - 1 + 3 * $5, $6}' | bedToPsl gene.sizes stdin domainToGene.psl +pslMap domainToGene.psl knownGene.psl stdout | pslToBed stdin stdout | bedOrBlocks -useName stdin domainToGenome.bed +hgLoadBed $tempDb ucscGenePfam domainToGenome.bed + +# do SCOP run now +mkdir -p $dir/scop +cd $dir/scop +rm -rf result +mkdir result +ls -1 ../pfam/splitProt > prot.list +cat << '_EOF_' > doScop +#!/bin/csh -ef +/hive/data/outside/pfam/Pfam34.0/hmmer-3.3.2/src/hmmsearch --domtblout /scratch/tmp/scop.$2.pf --noali -o /dev/null -E 0.1 /hive/data/outside/scop/1.75/hmmlib_1.75 ../pfam/splitProt/$1 +mv /scratch/tmp/scop.$2.pf $3 +_EOF_ + # << happy emacs +chmod +x doScop +cat << '_EOF_' > template +#LOOP +doScop $(path1) $(root1) {check out line+ result/$(root1).pf} +#ENDLOOP +_EOF_ + # << happy emacs +gensub2 prot.list single template jobList + +ssh $cpuFarm "cd $dir/scop; para make jobList" +ssh $cpuFarm "cd $dir/scop; para time > run.time" +cat run.time + +# Convert scop output to tab-separated files +cd $dir +catDir scop/result | sed '/^#/d' | awk 'BEGIN {OFS="\t"} {if ($7 <= 0.0001) print $1,$18-1,$19,$4, $7,$8}' | sort > scopPlusScore.tab +sort -k 2 /hive/data/outside/scop/1.75/model.tab > scop.model.tab +scopCollapse scopPlusScore.tab scop.model.tab ucscScop.tab \ + scopDesc.tab knownToSuper.tab +hgLoadSqlTab -notOnServer $tempDb scopDesc $kent/src/hg/lib/scopDesc.sql scopDesc.tab +hgLoadSqlTab $tempDb knownToSuper $kent/src/hg/lib/knownToSuper.sql knownToSuper.tab +#hgsql $tempDb -e "delete k from knownToSuper k, kgXref x where k.gene = x.kgID and x.geneSymbol = 'abParts'" + +hgLoadSqlTab $tempDb ucscScop $kent/src/hg/lib/ucscScop.sql ucscScop.tab +echo "BuildPfamScop successfully finished" + + +} > doPfamScop.log < /dev/null 2>&1