644aca2e8d7af3d037a6fc29a400c5d79484756e angie Wed Feb 8 20:49:24 2023 -0800 Data-driven branch-specific masking: replaced the contents of maskDelta.sh with a YAML specification of branches and sites/ranges/reversions to mask and a python script that reads the spec and then makes a matUtils mask file and runs matUtils mask as before. Did this so I could point people to the YAML spec instead of a non-portable bash script when they asked what sites are masked in what branches. See https://github.com/yatisht/usher/issues/324 diff --git src/hg/utils/otto/sarscov2phylo/branchSpecificMask.yml src/hg/utils/otto/sarscov2phylo/branchSpecificMask.yml new file mode 100644 index 0000000..e21ebc9 --- /dev/null +++ src/hg/utils/otto/sarscov2phylo/branchSpecificMask.yml @@ -0,0 +1,107 @@ +# Trying out a more data-driven approach to identifying nodes, sites and specific mutations +# for branch-specific masking. + +version: 0 + +B.1.1.7: + representative: Italy/TAA-1900553896/2021 + representativeBacktrack: 1 # use the parent node of representative's node + ranges: [ [ 11288, 11296] ] # recurring deletion found in many VoCs + +B.1.351: + representative: SouthAfrica/CERI-KRISP-K012031/2021 + ranges: [ [ 11288, 11296] ] + +P.1: + representative: FRA/IHUCOVID-005193-N1/2021 + ranges: [ [ 11288, 11296] ] + +B.1.617.2: + representative: IND/GBRC714b/2021 + # Deletions at S:157-158 (22029-22034), ORF8:119-120 (28248-28253); mask adjacent noisy bases too + ranges: [ [ 22027, 22034], [28246, 28254] ] + sites: [ 21302, 21304, 21305, # https://github.com/cov-lineages/pango-designation/issues/398 + 21846, # S:95 noisy -- caused split AY.100 + 28461, # https://github.com/cov-lineages/pango-designation/issues/435 + 28271 ] # deletion + +BA.1: + representative: England/DHSC-CYBJ4Y8/2022 + ranges: [ [ 6513, 6515 ], + [ 11283, 11291 ], # https://github.com/cov-lineages/pango-designation/issues/361 + [ 21765, 21770 ], + [ 21988, 21995 ], + [ 22194, 22217 ], # deletion 22194-22196, insertion after 
22204, big mess after that + [ 28362, 28370 ] ] + sites: [ 22813, 22898, 22882, 23854 ] + +BA.1.1: + # On the BA.1 branch, so it inherits BA.1 masking. + representative: England/ALDP-2BEB0A0/2021 + ranges: [ [ 76, 77 ] ] + reversions: [ G26530A ] # false reversions on this really messed up the BA.1.1.1 branch. + # XD should have this reversion, oh well. + +BA.2: + # I'm including some 5' UTR and 3' UTR sites now; in retrospect could have started earlier. + # I would mask 210 too but it's useful for finding breakpoints of Delta/BA.2 recombinants. + # 212 is also noisy but I'm leaving it in as a red flag for 210 mutations that might be noise. + representative: England/DHSC-CYBAB7G/2022 + representativeBacktrack: 1 + ranges: [ # 5' UTR + [ 76, 81 ], [83, 86 ], [ 88, 89 ], [ 91, 94 ], [ 96, 101 ], [ 105, 106 ], + [ 123, 124 ], [ 126, 127 ], [ 129, 136 ], [ 139, 141 ], [ 143, 148 ], [ 151, 152 ], + [ 157, 159 ], [ 179, 180 ], [ 197, 201 ], [ 203, 204], [ 206, 207 ], [ 216, 219 ], + [ 221, 225 ], [ 230, 233 ], [ 241, 243 ], [ 245, 246 ], + # deletions + [ 11288, 11296 ], [ 21633, 21641 ], [ 28361, 28371 ], [ 29734, 29759 ], + # 28877 and 28878 together are highly homoplasic in all of B.1.1 (28881-28883). + # They seem to be found very consistently in P.1*, but pop up in many places in Alpha + # and Omicrons. I haven't looked closely at the Alpha instances but they caused + # some mini-Omicrons (https://github.com/cov-lineages/pango-designation/issues/988). + # Possibly could also mask in B.1.1.7 and BA.1 but those are old news. 
+ [ 28877, 28878 ], + # 3' UTR + [ 29769, 29779 ], [ 29781, 29782 ] ] + sites: [ # 5' UTR + 103, 110, 119, 121, 154, 162, 164, 214, 228, 239, + # amplicon dropout (there are so many more; omitting to avoid recombinant trouble) + 22786, 22882, 23854, + # 3' UTR + 29760, 29762, 29764, + 29766, # only Luxembourg, made a mini-BA.2 + 29767, 29784, 29786, 29793, 29800, 29803 ] + +BA.2.75: + # Inherits BA.2 masking but has so many problems with false reversions I'm adding a ton here. + representative: India/WB-INSACOG-1931503209307/2022 + reversions: [ # BA.2-level but not masked BA.2-wide to avoid messing up recombinants: + G670T, T2790C, T3037C, T4321C, G9424A, T9534C, T9866C, T10029C, T10198C, + G18163A, T19955C, G20055A, T21618C, G22200T, A22578G, T22674C, C22679T, + T22686C, G22688A, A22775G, T22813G, A22992G, A22995C, C23013A, G23055A, + T23063A, C23075T, G23403A, T23525C, G23599T, A23604C, T23948G, T24424A, + A24469T, T25000C, T26270C, G26577C, T27807C, T28271A, C29510A, + # BA.2.75-defining, very dropout-prone: + T4586C, G22001A, C22016T, A22033C, G22190A, C22577G, G26275A, + # BA.2.75-defining, bad but maybe not quite as bad: + T3796C, T3927C, T5183C, G12444A, A15451G, G22190A, A22331G, A22898G, + G22942T, C23013A ] + +BA.4: + # BA.4 is placed on the BA.2 branch so it inherits all the BA.2 sites. + representative: SouthAfrica/NICD-N41664/2022 + ranges: [ [ 686, 694 ], [ 21765, 21770 ] ] + +BA.5: + # BA.5 is placed on the BA.2 branch so it inherits all the BA.2 sites. + representative: England/PHEP-YYFJPAM/2022 + ranges: [ [ 21765, 21770 ] ] + # Some of these should be reverted in recombinants, but we're pretty much past the point of + # simultaneous Delta/Omicron and the noise from false reversions is so intolerable that we'll + # just have to watch out for missing reversions when working with recombinants. 
+ reversions: [ G670T, T2790C, T3037C, A4184G, T4321C, G9424A, T9534C, T10198C, A12160G, + T15714C, T17410C, G18163A, T19955C, G20055A, T21618C, G22200T, A22578G, + T22674C, C22679T, T22686C, G22688A, A22775G, T22813G, G22882T, G22917T, + A22992G, A22995C, C23013A, G23018T, G23055A, T23063A, C23075T, G23403A, + T23525C, G23599T, A23604C, T25000C, A26529G, G26577C, A26709G, T27807C, + T27889C, T28271A, C29510A ]