fcaec96345053d29cc56fa0aa8b13f07ecd8951d
angie
  Mon Feb 14 13:43:12 2022 -0800
Use Nextclade's new output columns privateNucMutations.{reversionSubstitutions,labeledSubstitutions} to identify partial or contaminated Omicron sequences for removal.

diff --git src/hg/utils/otto/sarscov2phylo/findDropoutContam.pl src/hg/utils/otto/sarscov2phylo/findDropoutContam.pl
new file mode 100755
index 0000000..c0bb35f
--- /dev/null
+++ src/hg/utils/otto/sarscov2phylo/findDropoutContam.pl
@@ -0,0 +1,122 @@
+#!/usr/bin/perl
+
+# Parse a few columns out of Nextclade's voluminous output to help identify sequences
+# that have only a subset of Omicron mutations so we can keep them from screwing up
+# the base of the Omicron branch.
+
+# Some bad sequences are assigned 19A, 20A, 20B but have a suspicious number of Omicron muts.
+# Others are assigned Omicron (21K, 21L, 21M) but have a lot of reversions.
+
+use warnings;
+use strict;
+
+my $maxOmicronMuts = 5;
+my $maxReversions = 5;
+
+# Column offsets:
+#0       seqName
+#1       clade
+#17      privateNucMutations.reversionSubstitutions
+#18      privateNucMutations.labeledSubstitutions
+
+# Examples values for a seq assigned to 21J (Delta) but with a suspicious number of Omicron muts
+# (and Delta back-muts):
+# reversions example: T4181G,T7124C,T8986C,T9053G,T16466C,G21618C,C27638T,T27752C,T29402G
+# labeled example: T5386G|21K,G8393A|21K,C10449A|21K&21L&21M,A11537G|21K,T13195C|21K&21M,A17236G|21J,A18163G|21K&21L,C21762T|21K&21D&21M,C23525T|21K&20J&21L&21M,T23599G|21K&21L&21M,G23604A|20I&21K&21H&21L&21E&21M,G23948T|21K&21L,C24130A|21K&21M,C24503T|21K,A26530G|21K&21M,C26577G|21K&21L&21M,T27291C|21J,-28271T|21K&21G&21L&21M,C28311T|21K&21F&21G&21L&21M,T28881A|20I&21K&20B&20J&20F&20D&21G&21L&21E&21M
+
+my $reversionsIx = 17;
+my $labeledIx = 18;
+my $ambigIx = 29;
+
+sub cladeIsOmicron($) {
+  my ($clade) = @_;
+  return $clade =~ /^21[KLM]/;
+}
+
+sub reversionCount($$) {
+  # Exclude ambiguous bases from reversions.  Aside from that:
+  # Just return the number of mutations in the comma-sep list, no second-guessing, although
+  # I've seen cases where a sequence is placed out at the end of a long branch and half of the
+  # long branch muts are counted against it as reversions -- even though in the big tree, that
+  # long branch is broken up many times and breaking it up would be usher's approach.  However,
+  # in Nextclade's little tree, Omicron root is on a long branch, and in that case we do want
+  # to count reversions against sequences that break up that particular long branch.
+  # That's why I'm only looking at reversions (below) when the sequence is assigned to Omicron.
+  my ($reversionStr, $ambigStr) = @_;
+  $ambigStr =~ s/[A-Z]://g;
+  my @revs = split(/,/, $reversionStr);
+  my %ambigLocs = map {$_ => 1} split(/,/, $ambigStr);
+  my $count = 0;
+  foreach my $rev (@revs) {
+    my $revLoc = $rev;
+    $revLoc =~ s/^[A-Z](\d+)[A-Z]$/$1/;
+    next if (exists $ambigLocs{$revLoc});
+    $count++;
+  }
+  return $count;
+}
+
+sub offLabelCount($$) {
+  my ($clade, $labeledStr) = @_;
+  my $count = 0;
+  my @mutStrs = split(',', $labeledStr);
+  foreach my $mutStr (@mutStrs) {
+    my (undef, $labelStr) = split(/\|/, $mutStr);
+    if (index($labelStr, $clade) >= 0) {
+      # The mutation is "private" by placement in nextclade's tree, but associated with this clade,
+      # so I ignore it.  https://github.com/nextstrain/nextclade/issues/711
+      next;
+    }
+    $count++;
+  }
+  return $count;
+}
+
+sub privateOmicronCount($$) {
+  my ($clade, $labeledStr) = @_;
+  my $count = 0;
+  my @mutStrs = split(',', $labeledStr);
+  foreach my $mutStr (@mutStrs) {
+    my (undef, $labelStr) = split(/\|/, $mutStr);
+    if (index($labelStr, $clade) >= 0) {
+      # The mutation is "private" by placement in nextclade's tree, but associated with this clade,
+      # so I ignore it.  https://github.com/nextstrain/nextclade/issues/711
+      next;
+    }
+    if (index($labelStr, "21K") >= 0 || index($labelStr, "21L") >= 0 ||
+        index($labelStr, "21M") >= 0) {
+      $count++;
+    }
+  }
+  return $count;
+}
+
+while (<>) {
+  chomp;
+  s/"//g;
+  my @w = split("\t");
+  my ($seqName, $clade, $reversionStr, $labeledStr, $ambigStr) =
+    ($w[0], $w[1], $w[$reversionsIx], $w[$labeledIx], $w[$ambigIx]);
+  if ($seqName eq 'seqName') {
+    # Just in case they tweak the column order, if this looks like a header line, get the
+    # indices from it:
+    for (my $ix = 1;  $ix < @w;  $ix++) {
+      if ($w[$ix] eq 'privateNucMutations.reversionSubstitutions') {
+        $reversionsIx = $ix;
+      } elsif ($w[$ix] eq 'privateNucMutations.labeledSubstitutions') {
+        $labeledIx = $ix;
+      } elsif ($w[$ix] eq 'nonACGTNs') {
+        $ambigIx = $ix;
+      }
+    }
+  } else {
+    $clade =~ s/ .*//;
+    my $isOmicron = cladeIsOmicron($clade);
+    my $numReversions = reversionCount($reversionStr, $ambigStr);
+    my $numPrivateOmicron = privateOmicronCount($clade, $labeledStr);
+    if ((! $isOmicron && $numPrivateOmicron > $maxOmicronMuts) ||
+        ($isOmicron && $numReversions > $maxReversions)) {
+      print join("\t", $seqName, $clade, $numReversions, $numPrivateOmicron) . "\n";
+    }
+  }
+}