#!/usr/bin/perl

# Given 3 tab-separated columns of input (name/acc, date, number of substitutions),
# output sequence name/acc that has suspiciously few mutations for a SARS-CoV-2
# sequence from that date, so that such sequences can be excluded from the tree.
#
# Output is 3 tab-separated columns: name/acc, minimum acceptable number of
# substitutions for the sampling month, and the observed number of substitutions.
# Input lines with a missing or empty substitution count, or with a date that is not
# YYYY, YYYY-MM or YYYY-MM-DD (years 2019 and 2020-2029), or with a date earlier than
# December 2019, produce no output.
#
# Provenance: src/hg/utils/otto/sarscov2phylo/findRefBackfill.pl, added in commit
# 3c022c2be4485ff1f369617943da75bf51e9da70 (angie, Tue Feb 22 10:52:54 2022 -0800).

use warnings;
use strict;

# Return the minimum acceptable number of substitutions for a sequence sampled on
# $date, or nothing (undef in scalar context) if $date is undefined, is not of the
# form YYYY, YYYY-MM or YYYY-MM-DD, or falls before December 2019.
sub minSubstsForDate {
    my ($date) = @_;
    return unless (defined $date && $date =~ /^20(19|2\d)(-(\d\d))?(-\d\d)?$/);
    # Parse date and convert to number of months since epidemic started:
    # December 2019 is month 0, January 2020 is month 1, and so on.
    my ($y, $m) = ($1, $3);
    # A date with no month component is treated as January of that year.
    $m = 1 if (! $m);
    $y -= 20;
    my $epiMonth = 12*$y + $m;
    return if ($epiMonth < 0);
    # Calculate minimum acceptable substitutions (it might be good to consider # of
    # ambigs too): 0.75 per month beyond the first two months, never below zero.
    my $min = ($epiMonth - 2) * 0.75;
    $min = 0 if ($min < 0);
    return $min;
}

# Given one chomped input line, return the newline-terminated report line
# (name/acc, minimum, observed count) if the observed count is below the minimum for
# the line's date; otherwise return nothing (undef in scalar context).
sub findRefBackfillLine {
    my ($line) = @_;
    my ($name, $date, $substCount) = split(/\t/, $line);
    # split drops trailing empty fields, so an empty or absent third column leaves
    # $substCount undefined rather than ""; test definedness first to avoid
    # "uninitialized value" warnings on such lines.
    return if (! defined $substCount || $substCount eq "");
    my $min = minSubstsForDate($date);
    return unless (defined $min);
    return unless ($substCount < $min);
    return join("\t", $name, $min, $substCount) . "\n";
}

# Filter all input (files named on the command line, or stdin) to stdout.
sub main {
    while (my $line = <>) {
        chomp $line;
        my $report = findRefBackfillLine($line);
        print $report if (defined $report);
    }
}

# Run only when executed as a script, so that the helper subs can be loaded
# (e.g. by a test) without reading any input.
main() unless caller;