cabd81a0d3b5aecaa7ebc0f7ed86dd541c5330f4
angie
  Mon Nov 29 16:13:42 2021 -0800
Mask deleted bases (and adjacent bases) in the Delta branch to avoid bogus reported SNVs at those locations from erroneous genome assembly pipelines.

diff --git src/hg/utils/otto/sarscov2phylo/maskDelta.sh src/hg/utils/otto/sarscov2phylo/maskDelta.sh
new file mode 100755
index 0000000..6d767e5
--- /dev/null
+++ src/hg/utils/otto/sarscov2phylo/maskDelta.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+set -beEu -x -o pipefail
+
+#	Do not modify this script, modify the source tree copy:
+#	kent/src/hg/utils/otto/sarscov2phylo/maskDelta.sh
+
+usage() {
+    echo "usage: $0 treeIn.pb treeOut.pb"
+}
+
+if [ $# != 2 ]; then
+  usage
+  exit 1
+fi
+
+treeInPb=$1
+treeOutPb=$2
+
+ottoDir=/hive/data/outside/otto/sarscov2phylo
+
+usherDir=~angie/github/usher
+matUtils=$usherDir/build/matUtils
+matOptimize=$usherDir/build/matOptimize
+
+# I wish there were a less hacky method to identify the node for Delta, but since mutation
+# paths can change as new samples are added, this is the most stable method I have at the moment:
+# make sample-paths, grep for basal sample IND/GBRC714b/2021 (USA/WI-CDC-FG-038252/2021 would also
+# work in case that one goes away for any reason), use the final node in path.
+samplePaths=$treeInPb.sample-paths
+$matUtils extract -i $treeInPb -S $samplePaths
+deltaNode=$(grep IND/GBRC714b/2021 $samplePaths | awk '{print $NF;}' | sed -re 's/:.*//;')
+
+# Delta has deletions at S:157-158 (22029-22034), ORF8:119-120 (28248-28253) and 28271.
+# Mask those locations and some adjacent bases where we get a ton of spurious "mutations".
+maskFile=$(basename $treeInPb .pb).maskForDelta.tsv
+set +x
+for ((i=22027;  $i <= 22034;  i++)); do
+    echo -e "N${i}N\t$deltaNode"
+done > $maskFile
+for ((i=28246;  $i <= 28253;  i++)); do
+    echo -e "N${i}N\t$deltaNode"
+done >> $maskFile
+echo -e "N28271N\t$deltaNode" >> $maskFile
+set -x
+
+time $matUtils mask -i $treeInPb \
+    -m $maskFile \
+    -o $treeOutPb