3c022c2be4485ff1f369617943da75bf51e9da70
angie
  Tue Feb 22 10:52:54 2022 -0800
Identify sequences with an implausibly low number of substitutions given sampling month so we can exclude them from the tree.

diff --git src/hg/utils/otto/sarscov2phylo/findRefBackfill.pl src/hg/utils/otto/sarscov2phylo/findRefBackfill.pl
new file mode 100755
index 0000000..e77424b
--- /dev/null
+++ src/hg/utils/otto/sarscov2phylo/findRefBackfill.pl
@@ -0,0 +1,29 @@
+#!/usr/bin/perl
+
+# Given 3 tab-separated columns of input (name/acc, date, number of substitutions),
+# output name/acc, minimum acceptable substitutions and observed substitutions for each
+# SARS-CoV-2 sequence that has suspiciously few substitutions for its date.
+
+use warnings;
+use strict;
+
+while (my $line = <>) {
+  chomp $line;
+  my ($name, $date, $substCount) = split(/\t/, $line);
+  # split drops trailing empty fields, so an empty count column arrives as undef, not "".
+  next if (! defined $substCount || $substCount eq "");
+  # Accept YYYY, YYYY-MM or YYYY-MM-DD for 2019 and 2020-2029; skip any other date.
+  next unless ($date =~ /^20(19|2\d)(?:-(\d\d))?(?:-\d\d)?$/);
+  my ($y, $m) = ($1, $2);
+  # A missing month, or "00" (which is a true string in Perl), is treated as January.
+  $m = 1 if (! defined $m || $m == 0);
+  # Months since the epidemic started: 2019-12 is 0, 2020-01 is 1, and so on.
+  my $epiMonth = 12 * ($y - 20) + $m;
+  next if ($epiMonth < 0);
+  # Minimum acceptable substitutions (it might be good to consider # of ambigs too):
+  # 0.75 for every month that $epiMonth exceeds 2, never below 0.
+  my $min = ($epiMonth - 2) * 0.75;
+  $min = 0 if ($min < 0);
+  print join("\t", $name, $min, $substCount) . "\n" if ($substCount < $min);
+}
+