00842797b9fda733465b9bf732648a492d133cac
angie
  Wed Jan 12 10:00:10 2022 -0800
Also mask 21302, 21304 and 21305 in Delta.  Mask convergent deletion of 11291-11296 plus 3 bases to the left in Alpha, Beta and Gamma to prevent spurious 'SNVs' at deletion site from messing up the tree; may need to add this for Omicron eventually.

diff --git src/hg/utils/otto/sarscov2phylo/maskDelta.sh src/hg/utils/otto/sarscov2phylo/maskDelta.sh
index 30cba66..251119d 100755
--- src/hg/utils/otto/sarscov2phylo/maskDelta.sh
+++ src/hg/utils/otto/sarscov2phylo/maskDelta.sh
@@ -20,33 +20,59 @@
 
 usherDir=~angie/github/usher
 matUtils=$usherDir/build/matUtils
 matOptimize=$usherDir/build/matOptimize
 
 # I wish there were a less hacky method to identify the node for Delta, but since mutation
 # paths can change as new samples are added, this is the most stable method I have at the moment:
 # make sample-paths, grep for basal sample IND/GBRC714b/2021 (USA/WI-CDC-FG-038252/2021 would also
 # work in case that one goes away for any reason), use the final node in path.
 samplePaths=$treeInPb.sample-paths
 $matUtils extract -i $treeInPb -S $samplePaths
 deltaNode=$(grep IND/GBRC714b/2021 $samplePaths | awk '{print $NF;}' | sed -re 's/:.*//;')
 
 # Delta has deletions at S:157-158 (22029-22034), ORF8:119-120 (28248-28253) and 28271.
 # Mask those locations and some adjacent bases where we get a ton of spurious "mutations".
-maskFile=$(basename $treeInPb .pb).maskForDelta.tsv
+maskFile=$(basename $treeInPb .pb).branchSpecificMask.tsv
 set +x
 for ((i=22027;  $i <= 22034;  i++)); do
     echo -e "N${i}N\t$deltaNode"
 done > $maskFile
 for ((i=28246;  $i <= 28253;  i++)); do
     echo -e "N${i}N\t$deltaNode"
 done >> $maskFile
 echo -e "N28271N\t$deltaNode" >> $maskFile
 set -x
 
 # S:95 (21846) is also very unreliably detected in Delta.  Mask it off to avoid tree trouble,
 # like split AY.100.
 echo -e "N21846N\t$deltaNode" >> $maskFile
 
+# These three sites are recommended for caution in the Problematic Sites set, and seem to have
+# created a false lineage (AY.89) from samples that probably should be AY.4.  AY.89 is being
+# withdrawn (https://github.com/cov-lineages/pango-designation/issues/398); mask sites in Delta.
+echo -e "N21302N\t$deltaNode" >> $maskFile
+echo -e "N21304N\t$deltaNode" >> $maskFile
+echo -e "N21305N\t$deltaNode" >> $maskFile
+
+# OK, not just Delta -- Alpha, Beta, and Gamma have a deletion that causes spurious "mutations",
+# especially at 11296 and 11291, somewhat also at 11288.
+# Omicron has ~ the same deletion but it aligns 5 bases to the left, probably because it was
+# combined with an SNV (https://github.com/cov-lineages/pango-designation/issues/361).
+alphaNode=$(grep Italy/TAA-1900553896/2021 $samplePaths \
+            | awk '{print $(NF-1);}' | sed -re 's/:.*//;')
+betaNode=$(grep SouthAfrica/CERI-KRISP-K012031/2021 $samplePaths \
+           | awk '{print $NF;}' | sed -re 's/:.*//;')
+gammaNode=$(grep France/PAC-IHU-5193-N1/2021 $samplePaths | awk '{print $NF;}' | sed -re 's/:.*//;')
+set +x
+for node in $alphaNode $betaNode $gammaNode; do
+    for ((i=11288;  $i <= 11296;  i++)); do
+        echo -e "N${i}N\t$node"
+    done
+done >> $maskFile
+set -x
+
 time $matUtils mask -i $treeInPb \
     -m $maskFile \
     -o $treeOutPb
+
+rm $samplePaths