a087918aa0bbba12853825bd8a588ab26b0b70bd
angie
  Sun Feb 4 09:24:02 2024 -0800
Update to nextclade v3 output column indices.

diff --git src/hg/utils/otto/sarscov2phylo/findDropoutContam.pl src/hg/utils/otto/sarscov2phylo/findDropoutContam.pl
index 3786875..508ef29 100755
--- src/hg/utils/otto/sarscov2phylo/findDropoutContam.pl
+++ src/hg/utils/otto/sarscov2phylo/findDropoutContam.pl
@@ -1,42 +1,45 @@
 #!/usr/bin/perl
 
 # Parse a few columns out of Nextclade's voluminous output to help identify sequences
 # that have only a subset of Omicron mutations so we can keep them from screwing up
 # the base of the Omicron branch.
 
 # Some bad sequences are assigned 19A, 20A, 20B but have a suspicious number of Omicron muts.
 # Others are assigned Omicron (21K, 21L, 21M, 22*) but have a lot of reversions.
 
 use warnings;
 use strict;
 
 my $maxOmicronMuts = 5;
 my $maxReversions = 5;
 
-# Column offsets:
+# nextclade v2 column offsets
+# (NOTE: nextclade v3 adds a first column 'index' so these are all off by at least one):
 #0       seqName
 #1       clade
 #22      privateNucMutations.reversionSubstitutions
 #23      privateNucMutations.labeledSubstitutions
 #34      nonACGTNs
 
 # Examples values for a seq assigned to 21J (Delta) but with a suspicious number of Omicron muts
 # (and Delta back-muts):
 # reversions example: T4181G,T7124C,T8986C,T9053G,T16466C,G21618C,C27638T,T27752C,T29402G
 # labeled example: T5386G|21K,G8393A|21K,C10449A|21K&21L&21M,A11537G|21K,T13195C|21K&21M,A17236G|21J,A18163G|21K&21L,C21762T|21K&21D&21M,C23525T|21K&20J&21L&21M,T23599G|21K&21L&21M,G23604A|20I&21K&21H&21L&21E&21M,G23948T|21K&21L,C24130A|21K&21M,C24503T|21K,A26530G|21K&21M,C26577G|21K&21L&21M,T27291C|21J,-28271T|21K&21G&21L&21M,C28311T|21K&21F&21G&21L&21M,T28881A|20I&21K&20B&20J&20F&20D&21G&21L&21E&21M
 
+my $seqNameIx = 0;
+my $cladeIx = 1;
 my $reversionsIx = 22;
 my $labeledIx = 23;
 my $ambigIx = 34;
 
 sub cladeIsOmicron($) {
   my ($clade) = @_;
   return $clade =~ /^(21[KLM]|2[2-9]|recombinant)/;
 }
 
 sub reversionCount($$) {
   # Exclude ambiguous bases from reversions.  Aside from that:
   # Just return the number of mutations in the comma-sep list, no second-guessing, although
   # I've seen cases where a sequence is placed out at the end of a long branch and half of the
   # long branch muts are counted against it as reversions -- even though in the big tree, that
   # long branch is broken up many times and breaking it up would be usher's approach.  However,
@@ -80,41 +83,56 @@
   foreach my $mutStr (@mutStrs) {
     my (undef, $labelStr) = split(/\|/, $mutStr);
     if (index($labelStr, $clade) >= 0) {
       # The mutation is "private" by placement in nextclade's tree, but associated with this clade,
       # so I ignore it.  https://github.com/nextstrain/nextclade/issues/711
       next;
     }
     if (index($labelStr, "21K") >= 0 || index($labelStr, "21L") >= 0 ||
         index($labelStr, "21M") >= 0) {
       $count++;
     }
   }
   return $count;
 }
 
+my $foundHeader = 0;
 while (<>) {
   chomp;
   s/"//g;
   my @w = split("\t");
+  # Rough way to detect nextclade v3 without header: look for numeric first column (index)
+  if (! $foundHeader && $w[0] =~ /^\d+$/) {
+    shift @w;
+    $seqNameIx = 0;
+    $cladeIx = 1;
+    $reversionsIx = 31;
+    $labeledIx = 32;
+    $ambigIx = 40;
+  }
   my ($seqName, $clade, $reversionStr, $labeledStr, $ambigStr) =
-    ($w[0], $w[1], $w[$reversionsIx], $w[$labeledIx], $w[$ambigIx]);
-  if ($seqName eq 'seqName') {
+    ($w[$seqNameIx], $w[$cladeIx], $w[$reversionsIx], $w[$labeledIx], $w[$ambigIx]);
+  if ($seqName eq 'seqName' || $seqName eq 'index') {
     # Just in case they tweak the column order, if this looks like a header line, get the
     # indices from it:
+    $foundHeader = 1;
     for (my $ix = 1;  $ix < @w;  $ix++) {
-      if ($w[$ix] eq 'privateNucMutations.reversionSubstitutions') {
+      if ($w[$ix] eq 'seqName') {
+        $seqNameIx = $ix;
+      } elsif ($w[$ix] eq 'clade') {
+        $cladeIx = $ix;
+      } elsif ($w[$ix] eq 'privateNucMutations.reversionSubstitutions') {
         $reversionsIx = $ix;
       } elsif ($w[$ix] eq 'privateNucMutations.labeledSubstitutions') {
         $labeledIx = $ix;
       } elsif ($w[$ix] eq 'nonACGTNs') {
         $ambigIx = $ix;
       }
     }
   } else {
     $clade =~ s/ .*//;
     my $isOmicron = cladeIsOmicron($clade);
     my $numReversions = reversionCount($reversionStr, $ambigStr);
     my $numPrivateOmicron = privateOmicronCount($clade, $labeledStr);
     if ((! $isOmicron && $numPrivateOmicron > $maxOmicronMuts) ||
         ($isOmicron && $numReversions > $maxReversions)) {
       print join("\t", $seqName, $clade, $numReversions, $numPrivateOmicron) . "\n";