644aca2e8d7af3d037a6fc29a400c5d79484756e
angie
  Wed Feb 8 20:49:24 2023 -0800
Data-driven branch-specific masking: replaced the contents of maskDelta.sh
with a YAML specification of branches and sites/ranges/reversions to mask
and a python script that reads the spec and then makes a matUtils mask file
and runs matUtils mask as before.  Did this so I could point people to the
YAML spec instead of a non-portable bash script when they asked what sites
are masked in what branches.  See https://github.com/yatisht/usher/issues/324

diff --git src/hg/utils/otto/sarscov2phylo/branchSpecificMask.yml src/hg/utils/otto/sarscov2phylo/branchSpecificMask.yml
new file mode 100644
index 0000000..e21ebc9
--- /dev/null
+++ src/hg/utils/otto/sarscov2phylo/branchSpecificMask.yml
@@ -0,0 +1,107 @@
+# Trying out a more data-driven approach to identifying nodes, sites and specific mutations
+# for branch-specific masking.
+
+version: 0    # NOTE(review): presumably the format version of this spec -- confirm with the script that reads it
+
+B.1.1.7:
+    representative: Italy/TAA-1900553896/2021    # sample name; its node is the starting point for locating this branch
+    representativeBacktrack: 1    # use the parent node of representative's node
+    ranges: [ [ 11288, 11296] ]   # [ start, end ] pair; recurring deletion found in many VoCs
+
+B.1.351:
+    representative: SouthAfrica/CERI-KRISP-K012031/2021
+    ranges: [ [ 11288, 11296] ]   # same range as the recurring deletion masked for B.1.1.7
+
+P.1:
+    representative: FRA/IHUCOVID-005193-N1/2021
+    ranges: [ [ 11288, 11296] ]   # same range as the recurring deletion masked for B.1.1.7
+
+B.1.617.2:
+    representative: IND/GBRC714b/2021
+    # Deletions at S:157-158 (22029-22034) and ORF8:119-120 (28248-28253), widened to mask adjacent noisy bases too
+    ranges: [ [ 22027, 22034], [28246, 28254] ]
+    sites: [ 21302, 21304, 21305, # https://github.com/cov-lineages/pango-designation/issues/398
+             21846,               # S:95 noisy -- caused split AY.100
+             28461,               # https://github.com/cov-lineages/pango-designation/issues/435
+             28271 ]              # deletion
+
+BA.1:
+    representative: England/DHSC-CYBJ4Y8/2022
+    ranges: [ [ 6513, 6515 ],
+              [ 11283, 11291 ],   # https://github.com/cov-lineages/pango-designation/issues/361
+              [ 21765, 21770 ],
+              [ 21988, 21995 ],
+              [ 22194, 22217 ],   # deletion 22194-22196, insertion after 22204, big mess after that
+              [ 28362, 28370 ] ]
+    sites: [ 22813, 22898, 22882, 23854 ]   # NOTE(review): BA.2 labels 22882 and 23854 as amplicon dropout; presumably the same here -- confirm
+
+BA.1.1:
+    # On the BA.1 branch, so it inherits BA.1 masking.
+    representative: England/ALDP-2BEB0A0/2021
+    ranges: [ [ 76, 77 ] ]        # within the 5' UTR stretch that BA.2 masks as [ 76, 81 ]
+    reversions: [ G26530A ]       # False reversions of this mutation really messed up the BA.1.1.1 branch.
+                                  # XD should have this reversion, oh well.
+
+BA.2:
+    # I'm including some 5' UTR and 3' UTR sites now; in retrospect could have started earlier.
+    # I would mask 210 too but it's useful for finding breakpoints of Delta/BA.2 recombinants.
+    # 212 is also noisy but I'm leaving it in as a red flag for 210 mutations that might be noise.
+    representative: England/DHSC-CYBAB7G/2022
+    representativeBacktrack: 1
+    ranges: [ # 5' UTR
+              [ 76, 81 ], [83, 86 ], [ 88, 89 ], [ 91, 94 ], [ 96, 101 ], [ 105, 106 ],
+              [ 123, 124 ], [ 126, 127 ], [ 129, 136 ], [ 139, 141 ], [ 143, 148 ], [ 151, 152 ],
+              [ 157, 159 ], [ 179, 180 ], [ 197, 201 ], [ 203, 204], [ 206, 207 ], [ 216, 219 ],
+              [ 221, 225 ], [ 230, 233 ], [ 241, 243 ], [ 245, 246 ],
+              # deletions
+              [ 11288, 11296 ], [ 21633, 21641 ], [ 28361, 28371 ], [ 29734, 29759 ],
+              # 28877 and 28878 together are highly homoplasic in all of B.1.1 (28881-28883).
+              # They seem to be found very consistently in P.1*, but pop up in many places in Alpha
+              # and Omicrons.  I haven't looked closely at the Alpha instances but they caused
+              # some mini-Omicrons (https://github.com/cov-lineages/pango-designation/issues/988).
+              # Possibly could also mask in B.1.1.7 and BA.1 but those are old news.
+              [ 28877, 28878 ],
+              # 3' UTR
+              [ 29769, 29779 ], [ 29781, 29782 ] ]
+    sites: [ # 5' UTR
+             103, 110, 119, 121, 154, 162, 164, 214, 228, 239,
+             # amplicon dropout (there are so many more; omitting to avoid recombinant trouble)
+             22786, 22882, 23854,
+             # 3' UTR
+             29760, 29762, 29764,
+             29766,                   # only Luxembourg, made a mini-BA.2
+             29767, 29784, 29786, 29793, 29800, 29803 ]
+
+BA.2.75:
+    # Inherits BA.2 masking but has so many problems with false reversions I'm adding a ton here.
+    representative: India/WB-INSACOG-1931503209307/2022
+    reversions: [ # BA.2-level but not masked BA.2-wide to avoid messing up recombinants:
+                  G670T, T2790C, T3037C, T4321C, G9424A, T9534C, T9866C, T10029C, T10198C,
+                  G18163A, T19955C, G20055A, T21618C, G22200T, A22578G, T22674C, C22679T,
+                  T22686C, G22688A, A22775G, T22813G, A22992G, A22995C, C23013A, G23055A,
+                  T23063A, C23075T, G23403A, T23525C, G23599T, A23604C, T23948G, T24424A,
+                  A24469T, T25000C, T26270C, G26577C, T27807C, T28271A, C29510A,
+                  # BA.2.75-defining, very dropout-prone:
+                  T4586C, G22001A, C22016T, A22033C, G22190A, C22577G, G26275A,
+                  # BA.2.75-defining, bad but maybe not quite as bad (each reversion is listed only once):
+                  T3796C, T3927C, T5183C, G12444A, A15451G, A22331G, A22898G,
+                  G22942T ]       # G22190A and C23013A are already listed above, so not repeated here
+
+BA.4:
+    # BA.4 is placed on the BA.2 branch so it inherits all the BA.2 sites.
+    representative: SouthAfrica/NICD-N41664/2022
+    ranges: [ [ 686, 694 ], [ 21765, 21770 ] ]    # [ 21765, 21770 ] is also masked for BA.1 and BA.5
+
+BA.5:
+    # BA.5 is placed on the BA.2 branch so it inherits all the BA.2 sites.
+    representative: England/PHEP-YYFJPAM/2022
+    ranges: [ [ 21765, 21770 ] ]    # same range is also masked for BA.1 and BA.4
+    # Some of these should be reverted in recombinants, but we're pretty much past the point of
+    # simultaneous Delta/Omicron and the noise from false reversions is so intolerable that we'll
+    # just have to watch out for missing reversions when working with recombinants.
+    reversions: [ G670T, T2790C, T3037C, A4184G, T4321C, G9424A, T9534C, T10198C, A12160G,
+                  T15714C, T17410C, G18163A, T19955C, G20055A, T21618C, G22200T, A22578G,
+                  T22674C, C22679T, T22686C, G22688A, A22775G, T22813G, G22882T, G22917T,
+                  A22992G, A22995C, C23013A, G23018T, G23055A, T23063A, C23075T, G23403A,
+                  T23525C, G23599T, A23604C, T25000C, A26529G, G26577C, A26709G, T27807C,
+                  T27889C, T28271A, C29510A ]