af90beb812e8f96e2283e62bd5082948ce455c58 angie Tue Oct 31 09:54:12 2023 -0700 Updating default column offsets to match more recent nextclade versions diff --git src/hg/utils/otto/sarscov2phylo/findDropoutContam.pl src/hg/utils/otto/sarscov2phylo/findDropoutContam.pl index 5a4bd4b..3786875 100755 --- src/hg/utils/otto/sarscov2phylo/findDropoutContam.pl +++ src/hg/utils/otto/sarscov2phylo/findDropoutContam.pl @@ -4,42 +4,42 @@ # that have only a subset of Omicron mutations so we can keep them from screwing up # the base of the Omicron branch. # Some bad sequences are assigned 19A, 20A, 20B but have a suspicious number of Omicron muts. # Others are assigned Omicron (21K, 21L, 21M, 22*) but have a lot of reversions. use warnings; use strict; my $maxOmicronMuts = 5; my $maxReversions = 5; # Column offsets: #0 seqName #1 clade -#18 privateNucMutations.reversionSubstitutions -#19 privateNucMutations.labeledSubstitutions -#30 nonACGTNs +#22 privateNucMutations.reversionSubstitutions +#23 privateNucMutations.labeledSubstitutions +#34 nonACGTNs # Example values for a seq assigned to 21J (Delta) but with a suspicious number of Omicron muts # (and Delta back-muts): # reversions example: T4181G,T7124C,T8986C,T9053G,T16466C,G21618C,C27638T,T27752C,T29402G # labeled example: T5386G|21K,G8393A|21K,C10449A|21K&21L&21M,A11537G|21K,T13195C|21K&21M,A17236G|21J,A18163G|21K&21L,C21762T|21K&21D&21M,C23525T|21K&20J&21L&21M,T23599G|21K&21L&21M,G23604A|20I&21K&21H&21L&21E&21M,G23948T|21K&21L,C24130A|21K&21M,C24503T|21K,A26530G|21K&21M,C26577G|21K&21L&21M,T27291C|21J,-28271T|21K&21G&21L&21M,C28311T|21K&21F&21G&21L&21M,T28881A|20I&21K&20B&20J&20F&20D&21G&21L&21E&21M -my $reversionsIx = 18; -my $labeledIx = 19; -my $ambigIx = 30; +my $reversionsIx = 22; +my $labeledIx = 23; +my $ambigIx = 34; sub cladeIsOmicron($) { my ($clade) = @_; return $clade =~ /^(21[KLM]|2[2-9]|recombinant)/; } sub reversionCount($$) { # Exclude ambiguous bases from reversions. 
Aside from that: # Just return the number of mutations in the comma-separated list, no second-guessing, although # I've seen cases where a sequence is placed out at the end of a long branch and half of the # long branch mutations are counted against it as reversions -- even though in the big tree, that # long branch is broken up many times and breaking it up would be usher's approach. However, # in Nextclade's little tree, Omicron root is on a long branch, and in that case we do want # to count reversions against sequences that break up that particular long branch. # That's why I'm only looking at reversions (below) when the sequence is assigned to Omicron.