03e6d9865fa98c05dda6bbf5d780b0a2b41d389c
angie
  Tue Feb 22 11:11:08 2022 -0800
Remove B.1.1.529 from lineageTree.pb to match pangoLEARN behavior & avoid applying Omicron label to sequences of very poor quality -- requested by Rachel Colquhoun & Andrew Rambaut.

diff --git src/hg/utils/otto/sarscov2phylo/updateLineageTreePb.sh src/hg/utils/otto/sarscov2phylo/updateLineageTreePb.sh
index a7ae840e..23a3144 100755
--- src/hg/utils/otto/sarscov2phylo/updateLineageTreePb.sh
+++ src/hg/utils/otto/sarscov2phylo/updateLineageTreePb.sh
@@ -23,36 +23,37 @@
 
 today=$(date +%F)
 cd $ottoDir/$buildDate
 
 # Get node ID for root of lineage A, used as reference/root by Pangolin:
 if [ ! -s clade-paths ]; then
     $matUtils extract -i gisaidAndPublic.$buildDate.masked.pb -C clade-paths
 fi
 lineageARoot=$(grep ^A$'\t' clade-paths | cut -f 2)
 
 # Reroot protobuf to lineage A:
 $matUtils extract -i gisaidAndPublic.$buildDate.masked.pb \
     --reroot $lineageARoot \
     -o gisaidAndPublic.$buildDate.masked.reroot.pb
 
-# Reroot pango.clade-mutations.tsv:
+# Reroot pango.clade-mutations.tsv; also remove B.1.1.529 to match pangoLEARN behavior (requested by Rachel Colquhoun & Andrew Rambaut):
 grep -w ^A $scriptDir/pango.clade-mutations.tsv \
 | sed -re 's/T28144C( > )?//;  s/C8782T( > )?//;' \
     > pango.clade-mutations.reroot.tsv
 grep -vw ^A $scriptDir/pango.clade-mutations.tsv \
 | sed -re 's/\t/\tT8782C > C28144T > /;' \
+| grep -vF B.1.1.529 \
     >> pango.clade-mutations.reroot.tsv
 
 # Mask additional bases at the beginning and end of the genome that pangolin masks after
 # aligning input sequences.
 for ((i=56;  $i <= 265;  i++)); do
     echo -e "N${i}N"
 done > maskPangoEnds
 for ((i=29674;  $i < 29804;  i++)); do
     echo -e "N${i}N"
 done >> maskPangoEnds
 $matUtils mask -i gisaidAndPublic.$buildDate.masked.reroot.pb \
     -m maskPangoEnds -o gisaidAndPublic.$buildDate.masked.reroot.pangoMasked.pb
 
 # Assign updated lineages on the rerooted & pango-masked tree, pango-only for pangolin:
 time $matUtils annotate -T 50 \