d1ab093492f89b1fe6130371898d1ac2d47ee85d
angie
  Mon Nov 29 15:56:05 2021 -0800
Reroot to lineage A internal node, not leaf sequence.  Mask entire untranslated beginning & end of genome like Pangolin does.

diff --git src/hg/utils/otto/sarscov2phylo/updateLineageTreePb.sh src/hg/utils/otto/sarscov2phylo/updateLineageTreePb.sh
index 36125ad..a7ae840e 100755
--- src/hg/utils/otto/sarscov2phylo/updateLineageTreePb.sh
+++ src/hg/utils/otto/sarscov2phylo/updateLineageTreePb.sh
@@ -12,59 +12,75 @@
 if [ $# != 1 ]; then
   usage
   exit 1
 fi
 
 buildDate=$1
 
 ottoDir=/hive/data/outside/otto/sarscov2phylo
 
 usherDir=~angie/github/usher
 matUtils=$usherDir/build/matUtils
 
 today=$(date +%F)
 cd $ottoDir/$buildDate
 
-# Get full sample name for lineage A sequence used as reference/root by Pangolin:
-wh4SampleName=$(grep LR757995 samples.$buildDate)
+# Get node ID for root of lineage A, used as reference/root by Pangolin:
+if [ ! -s clade-paths ]; then
+    $matUtils extract -i gisaidAndPublic.$buildDate.masked.pb -C clade-paths
+fi
+lineageARoot=$(grep ^A$'\t' clade-paths | cut -f 2)
 
-# Reroot protobuf:
+# Reroot protobuf to lineage A:
 $matUtils extract -i gisaidAndPublic.$buildDate.masked.pb \
-    --reroot "$wh4SampleName" \
+    --reroot $lineageARoot \
     -o gisaidAndPublic.$buildDate.masked.reroot.pb
 
 # Reroot pango.clade-mutations.tsv:
 grep -w ^A $scriptDir/pango.clade-mutations.tsv \
 | sed -re 's/T28144C( > )?//;  s/C8782T( > )?//;' \
     > pango.clade-mutations.reroot.tsv
 grep -vw ^A $scriptDir/pango.clade-mutations.tsv \
 | sed -re 's/\t/\tT8782C > C28144T > /;' \
     >> pango.clade-mutations.reroot.tsv
-# Assign updated lineages on the rerooted tree, pango-only for pangolin:
+
+# Mask additional bases at the beginning and end of the genome that pangolin masks after
+# aligning input sequences.
+for ((i=56;  $i <= 265;  i++)); do
+    echo -e "N${i}N"
+done > maskPangoEnds
+for ((i=29674;  $i < 29804;  i++)); do
+    echo -e "N${i}N"
+done >> maskPangoEnds
+$matUtils mask -i gisaidAndPublic.$buildDate.masked.reroot.pb \
+    -m maskPangoEnds -o gisaidAndPublic.$buildDate.masked.reroot.pangoMasked.pb
+
+# Assign updated lineages on the rerooted & pango-masked tree, pango-only for pangolin:
 time $matUtils annotate -T 50 \
-    -i gisaidAndPublic.$buildDate.masked.reroot.pb \
+    -i gisaidAndPublic.$buildDate.masked.reroot.pangoMasked.pb \
     -M pango.clade-mutations.reroot.tsv \
     -l \
     -c lineageToName \
     -f 0.95 \
     -u mutations.pangoOnly \
     -D details.pangoOnly \
     -o gisaidAndPublic.$buildDate.masked.reroot.pangoOnly.pb \
     >& annotate.pangoOnly.out
 
 set +o pipefail
 grep 'Could not' annotate.pangoOnly.out | cat
+grep skip annotate.pangoOnly.out | cat
 set -o pipefail
 
 # Make a bunch of smaller trees and see how they do.
 mkdir -p /hive/users/angie/lineageTreeUpdate.$today
 cd /hive/users/angie/lineageTreeUpdate.$today
 for i in 0 1 2 3 4 5 6 7 8 9; do
     echo test.50.$i
     $matUtils extract -i $ottoDir/$buildDate/gisaidAndPublic.$buildDate.masked.reroot.pangoOnly.pb \
         -r 50 -o test.50.$i.pb
     $matUtils mask -i test.50.$i.pb -S -o test.50.$i.simp.pb
 done
 
 # Run pangolin on each subtree
 # 7.5-14.5m each job, ~3.5hrs total:
 conda activate pangolin