b96a4b4d3d3b9d7dd2e7a1934ff4e4e57fb1348b
angie
  Tue Feb 22 11:07:42 2022 -0800
Mask two new problematic-in-Delta positions (https://github.com/cov-lineages/pango-designation/issues/435)

diff --git src/hg/utils/otto/sarscov2phylo/maskDelta.sh src/hg/utils/otto/sarscov2phylo/maskDelta.sh
index 251119d..3bc7072 100755
--- src/hg/utils/otto/sarscov2phylo/maskDelta.sh
+++ src/hg/utils/otto/sarscov2phylo/maskDelta.sh
@@ -1,78 +1,83 @@
 #!/bin/bash
 set -beEu -x -o pipefail
 
 #	Do not modify this script, modify the source tree copy:
 #	kent/src/hg/utils/otto/sarscov2phylo/maskDelta.sh
 
 usage() {
     echo "usage: $0 treeIn.pb treeOut.pb"
 }
 
 if [ $# != 2 ]; then
   usage
   exit 1
 fi
 
 treeInPb=$1
 treeOutPb=$2
 
 ottoDir=/hive/data/outside/otto/sarscov2phylo
 
 usherDir=~angie/github/usher
 matUtils=$usherDir/build/matUtils
 matOptimize=$usherDir/build/matOptimize
 
 # I wish there were a less hacky method to identify the node for Delta, but since mutation
 # paths can change as new samples are added, this is the most stable method I have at the moment:
 # make sample-paths, grep for basal sample IND/GBRC714b/2021 (USA/WI-CDC-FG-038252/2021 would also
 # work in case that one goes away for any reason), use the final node in path.
 samplePaths=$treeInPb.sample-paths
 $matUtils extract -i $treeInPb -S $samplePaths
 deltaNode=$(grep IND/GBRC714b/2021 $samplePaths | awk '{print $NF;}' | sed -re 's/:.*//;')
 
 # Delta has deletions at S:157-158 (22029-22034), ORF8:119-120 (28248-28253) and 28271.
 # Mask those locations and some adjacent bases where we get a ton of spurious "mutations".
 maskFile=$(basename $treeInPb .pb).branchSpecificMask.tsv
 set +x
 for ((i=22027;  $i <= 22034;  i++)); do
     echo -e "N${i}N\t$deltaNode"
 done > $maskFile
 for ((i=28246;  $i <= 28253;  i++)); do
     echo -e "N${i}N\t$deltaNode"
 done >> $maskFile
 echo -e "N28271N\t$deltaNode" >> $maskFile
 set -x
 
 # S:95 (21846) is also very unreliably detected in Delta.  Mask it off to avoid tree trouble,
 # like split AY.100.
 echo -e "N21846N\t$deltaNode" >> $maskFile
 
 # These three sites are recommended for caution in the Problematic Sites set, and seem to have
 # create a false lineage (AY.89) from samples that probably should be AY.4.  AY.89 is being
 # withdrawn (https://github.com/cov-lineages/pango-designation/issues/398); mask sites in Delta.
 echo -e "N21302N\t$deltaNode" >> $maskFile
 echo -e "N21304N\t$deltaNode" >> $maskFile
 echo -e "N21305N\t$deltaNode" >> $maskFile
 
+# Mask flaky positions 28254 (ORF8:121) and 28461 (N:63) so that AY.96 is merged into AY.46
+# https://github.com/cov-lineages/pango-designation/issues/435
+echo -e "N28254N\t$deltaNode" >> $maskFile
+echo -e "N28461N\t$deltaNode" >> $maskFile
+
 # OK, not just Delta -- Alpha, Beta, and Gamma have a deletion that causes spurious "mutations",
 # especially at 11296 and 11291, somewhat also at 11288.
 # Omicron has ~ the same deletion but it aligns 5 bases to the left, probably because it was
 # combined with an SNV (https://github.com/cov-lineages/pango-designation/issues/361).
 alphaNode=$(grep Italy/TAA-1900553896/2021 $samplePaths \
             | awk '{print $(NF-1);}' | sed -re 's/:.*//;')
 betaNode=$(grep SouthAfrica/CERI-KRISP-K012031/2021 $samplePaths \
            | awk '{print $NF;}' | sed -re 's/:.*//;')
 gammaNode=$(grep France/PAC-IHU-5193-N1/2021 $samplePaths | awk '{print $NF;}' | sed -re 's/:.*//;')
 set +x
 for node in $alphaNode $betaNode $gammaNode; do
     for ((i=11288;  $i <= 11296;  i++)); do
         echo -e "N${i}N\t$node"
     done
 done >> $maskFile
 set -x
 
 time $matUtils mask -i $treeInPb \
     -m $maskFile \
     -o $treeOutPb
 
 rm $samplePaths