d0aad7f66c2fee0fa9366c3b3caf4d761750c055
braney
  Tue Aug 17 10:27:07 2021 -0700
first version of Gencode genes (knownGene version) for mm39

diff --git src/hg/utils/otto/knownGene/buildPfamScop.sh src/hg/utils/otto/knownGene/buildPfamScop.sh
new file mode 100755
index 0000000..66ac877
--- /dev/null
+++ src/hg/utils/otto/knownGene/buildPfamScop.sh
@@ -0,0 +1,101 @@
+#!/bin/sh -ex
+cd $dir
+{
+mkdir -p $dir/pfam
+cd $dir/pfam
+rm -rf  splitProt
+mkdir  splitProt
+faSplit sequence $dir/ucscGenes.faa 10000 splitProt/
+mkdir -p result
+ls -1 splitProt > prot.list
+# /hive/data/outside/pfam/hmmpfam -E 0.1 /hive/data/outside/pfam/current/Pfam_fs \
+#/hive/data/outside/pfam/Pfam29.0/PfamScan/hmmer-3.1b2-linux-intel-x86_64/binaries/hmmsearch   --domtblout /scratch/tmp/pfam.$2.pf --noali -o /dev/null -E 0.1 /hive/data/outside/pfam/Pfam29.0/Pfam-A.hmm     splitProt/$1 
+cat << '_EOF_' > doPfam
+#!/bin/csh -ef  
+/hive/data/outside/pfam/Pfam34.0/hmmer-3.3.2/src/hmmsearch   --domtblout /scratch/tmp/pfam.$2.pf --noali -o /dev/null -E 0.1 /hive/data/outside/pfam/Pfam34.0/Pfam-A.hmm     splitProt/$1 
+mv /scratch/tmp/pfam.$2.pf $3
+_EOF_
+    # << happy emacs
+chmod +x doPfam
+cat << '_EOF_' > template
+#LOOP
+doPfam $(path1) $(root1) {check out line+ result/$(root1).pf}
+#ENDLOOP
+_EOF_
+gensub2 prot.list single template jobList
+
+ssh $cpuFarm "cd $dir/pfam; para make jobList"
+ssh $cpuFarm "cd $dir/pfam; para time > run.time"
+cat run.time
+
+# Make up pfamDesc.tab by converting pfam to a ra file first
+cat << '_EOF_' > makePfamRa.awk
+/^NAME/ {print}
+/^ACC/ {print}
+/^DESC/ {print}
+/^TC/ {print $1,$3; printf("\n");}
+_EOF_
+awk -f makePfamRa.awk  /hive/data/outside/pfam/Pfam34.0/Pfam-A.hmm  > pfamDesc.ra
+raToTab -cols=ACC,NAME,DESC,TC pfamDesc.ra stdout |  awk -F '\t' '{printf("%s\t%s\t%s\t%g\n", $1, $2, $3, $4);}' | sort > pfamDesc.tab
+
+# Convert output to tab-separated file. 
+cd $dir/pfam
+catDir result | sed '/^#/d' > allResults.tab
+awk 'BEGIN {OFS="\t"} { print $5,$1,$18-1,$19,$4,$14}' allResults.tab | sort > allUcscPfam.tab
+join  -t $'\t' -j 1  allUcscPfam.tab pfamDesc.tab | tawk '{if ($6 > $9) print $2, $3, $4, $5, $6, $1}' > ucscPfam.tab
+cd $dir
+
+# Convert output to knownToPfam table
+tawk '{print $1, gensub(/\.[0-9]+/, "", "g", $6)}' pfam/ucscPfam.tab | sort -u > knownToPfam.tab
+hgLoadSqlTab -notOnServer $tempDb knownToPfam $kent/src/hg/lib/knownTo.sql knownToPfam.tab
+tawk '{print gensub(/\.[0-9]+/, "", "g", $1), $2, $3}' pfam/pfamDesc.tab| hgLoadSqlTab -notOnServer $tempDb pfamDesc $kent/src/hg/lib/pfamDesc.sql stdin
+
+cd $dir/pfam
+genePredToFakePsl $tempDb knownGene knownGene.psl cdsOut.tab
+sort cdsOut.tab | sed 's/\.\./   /' > sortCdsOut.tab
+sort ucscPfam.tab> sortPfam.tab
+awk '{print $10, $11}' knownGene.psl > gene.sizes
+join sortCdsOut.tab sortPfam.tab |  awk '{print $1, $2 - 1 + 3 * $4, $2 - 1 + 3 * $5, $6}' | bedToPsl gene.sizes stdin domainToGene.psl
+pslMap domainToGene.psl knownGene.psl stdout | pslToBed stdin stdout | bedOrBlocks -useName stdin domainToGenome.bed 
+hgLoadBed $tempDb ucscGenePfam domainToGenome.bed
+
+# do SCOP run now
+mkdir -p $dir/scop
+cd $dir/scop
+rm -rf result
+mkdir  result
+ls -1 ../pfam/splitProt > prot.list
+cat << '_EOF_' > doScop
+#!/bin/csh -ef
+/hive/data/outside/pfam/Pfam34.0/hmmer-3.3.2/src/hmmsearch   --domtblout /scratch/tmp/scop.$2.pf --noali -o /dev/null -E 0.1 /hive/data/outside/scop/1.75/hmmlib_1.75  ../pfam/splitProt/$1 
+mv /scratch/tmp/scop.$2.pf $3
+_EOF_
+    # << happy emacs
+chmod +x doScop
+cat << '_EOF_' > template
+#LOOP
+doScop $(path1) $(root1) {check out line+ result/$(root1).pf}
+#ENDLOOP
+_EOF_
+    # << happy emacs
+gensub2 prot.list single template jobList
+
+ssh $cpuFarm "cd $dir/scop; para make jobList"
+ssh $cpuFarm "cd $dir/scop; para time > run.time"
+cat run.time
+
+# Convert scop output to tab-separated files
+cd $dir
+catDir scop/result | sed '/^#/d' | awk 'BEGIN {OFS="\t"} {if ($7 <= 0.0001) print $1,$18-1,$19,$4, $7,$8}' | sort > scopPlusScore.tab
+sort -k 2 /hive/data/outside/scop/1.75/model.tab > scop.model.tab
+scopCollapse scopPlusScore.tab scop.model.tab ucscScop.tab \
+	scopDesc.tab knownToSuper.tab
+hgLoadSqlTab -notOnServer $tempDb scopDesc $kent/src/hg/lib/scopDesc.sql scopDesc.tab
+hgLoadSqlTab $tempDb knownToSuper $kent/src/hg/lib/knownToSuper.sql knownToSuper.tab
+#hgsql $tempDb -e "delete k from knownToSuper k, kgXref x where k.gene = x.kgID and x.geneSymbol = 'abParts'"
+
+hgLoadSqlTab $tempDb ucscScop $kent/src/hg/lib/ucscScop.sql ucscScop.tab
+echo "BuildPfamScop successfully finished"
+
+
+} > doPfamScop.log < /dev/null 2>&1