3ed65430d066888a23f7f97ebbc11eac3269586b
angie
  Fri Dec 2 11:13:14 2022 -0800
Mask more noisy UTR positions in BA.2 onward.

diff --git src/hg/utils/otto/sarscov2phylo/maskDelta.sh src/hg/utils/otto/sarscov2phylo/maskDelta.sh
index 27f84cb..056d44b 100755
--- src/hg/utils/otto/sarscov2phylo/maskDelta.sh
+++ src/hg/utils/otto/sarscov2phylo/maskDelta.sh
@@ -146,31 +146,37 @@
 for ((i=21765;  $i <= 21770;  i++)); do
     echo -e "N${i}N\t$BA4Node"
     echo -e "N${i}N\t$BA5Node"
 done >> $maskFile
 
 # 28877 and 28878 together are highly homoplasic in all of B.1.1 (28881-28883).  They seem to be
 # found very consistently in P.1*, but pop up in many places in Alpha and Omicrons.  I haven't
 # looked closely at the Alpha instances but they definitely cause some mini-Omicrons
 # (https://github.com/cov-lineages/pango-designation/issues/988).  Mask in BA.2 (which also covers
 # BA.4 and BA.5); possibly could also mask in B.1.1.7 and BA.1 but those are old news.
 for i in 28877 28878; do
     echo -e "N${i}N\t$BA2Node"
 done >> $maskFile
 
 # Also noticing a lot of noise on these 5'UTR locations (now I see why pangolin masks entire UTRs):
-for i in 76 77 78 79 80 81 83 84 85 86 88 89 91 92 93 94 96 97 98 99 100 123 124 126 127 129 130 131 132 133 134 135 136 139 140 141 143 144 145 146 147 148 151 152 154 157 158 159 162 164 179 180; do
+for i in 76 77 78 79 80 81 83 84 85 86 88 89 91 92 93 94 96 97 98 99 100 101 103 105 106 110 119 121 123 124 126 127 129 130 131 132 133 134 135 136 139 140 141 143 144 145 146 147 148 151 152 154 157 158 159 162 164 179 180; do
+    echo -e "N${i}N\t$BA2Node"
+done >> $maskFile
+
+# 3'UTR trouble spots (in addition to 29766 mentioned above):
+for i in 29760 29762 29764 29767 29769 29770 29771 29772 29773 29774 29775 29776 29777 29778 29779 \
+    29781 29782 29784 29786 29793 29800 29803 ; do
     echo -e "N${i}N\t$BA2Node"
 done >> $maskFile
 
 # BA.4-specific deletion:
 for ((i=686;  $i <= 694;  i++)); do
     echo -e "N${i}N\t$BA4Node"
 done >> $maskFile
 
 # BA.2.75 has an awful lot of amplicon dropout problematic sites as of Sep. 2022.
 BA275Node=$(grep India/WB-INSACOG-1931503209307/2022 $samplePaths \
           | awk '{print $NF;}' | sed -re 's/:.*//;')
 # These sites I would mask in all of BA.2 if it weren't for wanting to find legit recombinants:
 for backMut in G670T T2790C T3037C T4321C G9424A T9534C T9866C T10029C T10198C G18163A T19955C \
     G20055A T21618C G22200T A22578G T22674C C22679T T22686C G22688A A22775G T22813G A22992G A22995C \
     C23013A G23055A T23063A C23075T G23403A T23525C G23599T A23604C T23948G T24424A A24469T T25000C \