src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh 8446663b24ef2f8db0695d970f7c6b4a03a9949f

8446663b24ef2f8db0695d970f7c6b4a03a9949f
angie
  Mon Apr 26 15:33:26 2021 -0700
Use matUtils annotate new options -m and -D to get mutations and details files.

diff --git src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
index ccc5640..64b644c 100755
--- src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
+++ src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
@@ -21,31 +21,31 @@
 ncbiDir=$ottoDir/ncbi.latest
 cogUkDir=$ottoDir/cogUk.latest
 cncbDir=$ottoDir/cncb.latest
 gisaidDir=/hive/users/angie/gisaid
 minReal=20000
 ref2bit=/hive/data/genomes/wuhCor1/wuhCor1.2bit
 epiToPublic=$gisaidDir/epiToPublicAndDate.latest
 scriptDir=$(dirname "${BASH_SOURCE[0]}")
 source $scriptDir/util.sh
 
 today=$(date +%F)
 cd $ottoDir/$today
 
 prevProtobufMasked=$ottoDir/$prevDate/gisaidAndPublic.$prevDate.masked.pb
 
-usherDir=~angie/github/yatish_usher
+usherDir=~angie/github/usher
 usher=$usherDir/build/usher
 matUtils=$usherDir/build/matUtils
 
 # Make lists of sequences already in the tree.
 # matUtils creates full paths for output files, defaulting to current directory, so can't
 # output to named pipe because it becomes $cwd//dev/fd/63 ...
 $matUtils summary -i $prevProtobufMasked -s prevSamples
 tail -n+2 prevSamples | cut -f 1 > prevNames
 awk -F\| '{if ($3 == "") { print $1; } else { print $2; } }' prevNames \
 | grep -E '^[A-Z]{2}[0-9]{6}\.[0-9]' > prevGbAcc
 awk -F\| '{if ($3 == "") { print $1; } else { print $2; } }' prevNames \
 | grep -E '^(England|Northern|Scotland|Wales)' > prevCogUk
 awk -F\| '{if ($3 == "") { print $1; } else { print $2; } }' prevNames \
 | grep -E '^EPI_ISL_' > prevGisaid
 # Add public sequences that have been mapped to GISAID sequences to prevGisaid.
@@ -199,57 +199,59 @@
 $matUtils extract -i gisaidAndPublic.$today.masked.pb -u samples.$today
 sampleCountComma=$(echo $(wc -l < samples.$today) \
                    | sed -re 's/([0-9]+)([0-9]{3})$/\1,\2/; s/([0-9]+)([0-9]{3},[0-9]{3})$/\1,\2/;')
 echo "$sampleCountComma genomes from GISAID, GenBank, COG-UK and CNCB ($today); sarscov2phylo 13-11-20 tree with newer sequences added by UShER" \
     > hgPhyloPlace.plusGisaid.description.txt
 
 # Add nextclade annotations to protobuf
 zcat gisaidAndPublic.$today.metadata.tsv.gz \
 | tail -n+2 | tawk '$8 != "" {print $8, $1;}' \
 | sed -re 's/^20E \(EU1\)/20E.EU1/;' \
     > cladeToName
 time $matUtils annotate -T 50 \
     -l \
     -i gisaidAndPublic.$today.masked.pb \
     -c cladeToName \
+    -m mutations.nextclade \
+    -D details.nextclade \
     -o gisaidAndPublic.$today.masked.nextclade.pb \
     >& annotate.nextclade.out
 
 # Add pangolin lineage annotations to protobuf.  Use pangoLEARN training samples;
 # convert EPI IDs to public to match tree IDs.
 tail -n+2 ~angie/github/pangoLEARN/pangoLEARN/data/lineages.metadata.csv \
 | awk -F, '{print $7 "\t" $2;}' \
 | sort > epiExemplarToLineage
 subColumn -miss=/dev/null 1 epiExemplarToLineage \
     <(cut -f 1,2 $gisaidDir/epiToPublicAndDate.$today) stdout \
 | sort > idExemplarToLineage
 cut -f 2 $renaming | grep -Fwf <(cut -f 1 idExemplarToLineage) \
 | awk -F\| '{ if ($3 == "") { print $1 "\t" $0 } else { print $2 "\t" $0; } }' \
 | sort > idExemplarToName
 join -t$'\t' idExemplarToName idExemplarToLineage \
 | tawk '{print $3, $2;}' \
 | sort > lineageToName
 linFile=lineageToName
 
 # Until pangoLEARN training is done, use pango-designation file for now.
 #linFile=/hive/users/angie/matLineages/linToPublicPlusGisaidName.2021-03-16
 
-#***
-##***time $matUtils annotate -T 50 \
-time ~angie/github/usher/build/matUtils annotate -T 50 \
+time $matUtils annotate -T 50 \
     -i gisaidAndPublic.$today.masked.nextclade.pb \
     -c $linFile \
+    -m mutations.pangolin \
+    -D details.pangolin \
     -o gisaidAndPublic.$today.masked.nextclade.pangolin.pb \
     >& annotate.pangolin.out
 
 mv gisaidAndPublic.$today.masked{,.unannotated}.pb
 ln gisaidAndPublic.$today.masked.nextclade.pangolin.pb gisaidAndPublic.$today.masked.pb
 
 # Extract public samples from tree
 $matUtils summary -i gisaidAndPublic.$today.masked.pb -s newSamples
 tail -n+2 newSamples | cut -f 1 > newNames
 grep -v EPI_ISL_ newNames > newPublicNames
 $matUtils extract -i gisaidAndPublic.$today.masked.pb \
     -s newPublicNames \
     -O -o public.$today.masked.pb
 
 for dir in /usr/local/apache/cgi-bin{-angie,-beta,}/hgPhyloPlaceData/wuhCor1; do