921f73e5adb3b42171efedb00a795fa4651655a7
angie
  Tue Nov 2 13:07:42 2021 -0700
Use the new --clade-mutations option for matUtils annotate (but adjust mutations to be rooted in lineage A instead of B).  The simplified trees always win, so don't bother testing unsimplified trees, saves ~1.5 hours.

diff --git src/hg/utils/otto/sarscov2phylo/updateLineageTreePb.sh src/hg/utils/otto/sarscov2phylo/updateLineageTreePb.sh
index afbf71f..36125ad 100755
--- src/hg/utils/otto/sarscov2phylo/updateLineageTreePb.sh
+++ src/hg/utils/otto/sarscov2phylo/updateLineageTreePb.sh
@@ -20,81 +20,76 @@
 
 usherDir=~angie/github/usher
 matUtils=$usherDir/build/matUtils
 
 today=$(date +%F)
 cd $ottoDir/$buildDate
 
 # Get full sample name for lineage A sequence used as reference/root by Pangolin:
 wh4SampleName=$(grep LR757995 samples.$buildDate)
 
 # Reroot protobuf:
 $matUtils extract -i gisaidAndPublic.$buildDate.masked.pb \
     --reroot "$wh4SampleName" \
     -o gisaidAndPublic.$buildDate.masked.reroot.pb
 
+# Reroot pango.clade-mutations.tsv:
+grep -w ^A $scriptDir/pango.clade-mutations.tsv \
+| sed -re 's/T28144C( > )?//;  s/C8782T( > )?//;' \
+    > pango.clade-mutations.reroot.tsv
+grep -vw ^A $scriptDir/pango.clade-mutations.tsv \
+| sed -re 's/\t/\tT8782C > C28144T > /;' \
+    >> pango.clade-mutations.reroot.tsv
 # Assign updated lineages on the rerooted tree, pango-only for pangolin:
 time $matUtils annotate -T 50 \
     -i gisaidAndPublic.$buildDate.masked.reroot.pb \
+    -M pango.clade-mutations.reroot.tsv \
     -l \
     -c lineageToName \
     -f 0.95 \
     -u mutations.pangoOnly \
     -D details.pangoOnly \
     -o gisaidAndPublic.$buildDate.masked.reroot.pangoOnly.pb \
     >& annotate.pangoOnly.out
 
 set +o pipefail
 grep 'Could not' annotate.pangoOnly.out | cat
 set -o pipefail
 
 # Make a bunch of smaller trees and see how they do.
-mkdir /hive/users/angie/lineageTreeUpdate.$today
+mkdir -p /hive/users/angie/lineageTreeUpdate.$today
 cd /hive/users/angie/lineageTreeUpdate.$today
 for i in 0 1 2 3 4 5 6 7 8 9; do
     echo test.50.$i
     $matUtils extract -i $ottoDir/$buildDate/gisaidAndPublic.$buildDate.masked.reroot.pangoOnly.pb \
         -r 50 -o test.50.$i.pb
     $matUtils mask -i test.50.$i.pb -S -o test.50.$i.simp.pb
 done
 
 # Run pangolin on each subtree
 # 7.5-14.5m each job, ~3.5hrs total:
 conda activate pangolin
 for i in 0 1 2 3 4 5 6 7 8 9; do
-    echo test.50.$i
-    time pangolin -t 50 --usher-tree test.50.$i.pb \
-        --skip-designation-hash --no-temp --outdir subset_10000_0.pusher.test.50.$i.out \
-        ../pangolin_eval/subset_10000_0.fa
     echo test.50.$i.simp
     time pangolin -t 50 --usher-tree test.50.$i.simp.pb \
         --skip-designation-hash --no-temp --outdir subset_10000_0.pusher.test.50.$i.simp.out \
         ../pangolin_eval/subset_10000_0.fa
 done
 
 # Summarize results
 for i in 0 1 2 3 4 5 6 7 8 9; do
-    echo test.50.$i
-    tail -n+2 subset_10000_0.pusher.test.50.$i.out/lineage_report.csv \
-    | awk -F, '{print $1 "\t" $2;}' \
-    | sort \
-        > subset_10000_0.pusher.test.50.$i
-    join -t$'\t' ../pangolin_eval/subset_10000_0.cogNameToLin subset_10000_0.pusher.test.50.$i \
-    | tawk '$2 != $3' \
-        > subset_10000_0.pusher.test.50.$i.diff
-    cut -f 2,3 subset_10000_0.pusher.test.50.$i.diff \
-    | sort | uniq -c | sort -nr > subset_10000_0.pusher.test.50.$i.diffrank
+    echo test.50.$i.simp
     tail -n+2 subset_10000_0.pusher.test.50.$i.simp.out/lineage_report.csv \
     | awk -F, '{print $1 "\t" $2;}' \
     | sort \
         > subset_10000_0.pusher.test.50.$i.simp
     join -t$'\t' ../pangolin_eval/subset_10000_0.cogNameToLin subset_10000_0.pusher.test.50.$i.simp \
     | tawk '$2 != $3' \
         > subset_10000_0.pusher.test.50.$i.simp.diff
     cut -f 2,3 subset_10000_0.pusher.test.50.$i.simp.diff \
     | sort | uniq -c | sort -nr > subset_10000_0.pusher.test.50.$i.simp.diffrank
 done
 
 wc -l subset_10000_0.*.diff | sort -n
 
 # Pick the subtree with the fewest differences
 set +o pipefail