src/hg/utils/otto/sarscov2phylo/gbMetadataAddBioSample.pl 99599c7130790107ff0de9f043930da6aa7fddf1

99599c7130790107ff0de9f043930da6aa7fddf1
angie
  Mon Nov 16 16:35:58 2020 -0800
Scripts for automating SARS-CoV-2 Phylogeny tracks (refs #26530): fetching
sequences and metadata from several public sources, mapping GISAID IDs to
public seq IDs, downloading the latest release of the phylogenetic tree from
github.com/roblanf/sarscov2phylo/ , making VCFs from GISAID and public
sequences, and using github.com/yatisht/usher to resolve ambiguous alleles,
make protobuf files for hgPhyloPlace, and add public sequences that have not
been mapped to GISAID sequences to the sarscov2phylo tree for a comprehensive
public tree+VCF.

This is still not fully otto-mated because certain crucial inputs like
GISAID sequences still must be downloaded using a web browser, but the goal
is to automate as much as possible and maybe someday have it fully cron-driven.

There are two main top-level scripts which call other scripts, which may in turn
call scripts, in this hierarchy:

updateIdMapping.sh
getCogUk.sh
getNcbi.sh
searchAllSarsCov2BioSample.sh
bioSampleIdToText.sh
bioSampleTextToTab.pl
gbMetadataAddBioSample.pl
fixNcbiFastaNames.pl

updateSarsCov2Phylo.sh
getRelease.sh
processRelease.sh
cladeLineageColors.pl
mapPublic.sh
extractUnmappedPublic.sh
addUnmappedPublic.sh

many of the above:
util.sh

publicCredits.sh will hopefully be folded into updateSarsCov2Phylo.sh when I
figure out how to automate fetching of author/institution metadata from NCBI
and COG-UK.

diff --git src/hg/utils/otto/sarscov2phylo/gbMetadataAddBioSample.pl src/hg/utils/otto/sarscov2phylo/gbMetadataAddBioSample.pl
new file mode 100755
index 0000000..a36967d
--- /dev/null
+++ src/hg/utils/otto/sarscov2phylo/gbMetadataAddBioSample.pl
@@ -0,0 +1,97 @@
+#!/usr/bin/env perl
+
+use warnings;
+use strict;
+
+sub usage() {
+  print STDERR "usage: $0 biosample.tab [gbMetadata.tab]\n";
+  exit 1;
+}
+
+# Read in and store distilled BioSample metadata; stream through GenBank metadata
+# (from NCBI Virus / NCBI Datasets) and add in collection date and isolate name
+# from BioSample when missing from GenBank.  Report any conflicting dates.
+
+my @months = qw(jan feb mar apr may jun jul aug sep oct nov dec);
+
+sub normalizeDate($) {
+  # Convert "25-Jan-2020" to 2020-01-25, "19-MAR-2020" to 2020-03-19...
+  my ($dateIn) = @_;
+  my $dateOut = "";
+  if ($dateIn) {
+    if ($dateIn =~ /^\d\d\d\d(-\d\d)?(-\d\d)?$/) {
+      $dateOut = $dateIn;
+    } elsif ($dateIn =~ /^((\d\d?)-)?(\w\w\w)-(\d\d\d\d)$/) {
+      my ($day, $month, $year) = ($2, lc($3), $4);
+      my ($mIx) = grep { $months[$_] eq $month } (0 .. @months-1);
+      if (! defined $mIx) {
+        die "Unrecognized month '$month' in '$dateIn'";
+      }
+      $month = $mIx + 1;
+      if ($day) {
+        $dateOut = sprintf("%04d-%02d-%02d", $year, $month, $day);
+      } else {
+        $dateOut = sprintf("%04d-%02d", $year, $month);
+      }
+    } else {
+      die "Unrecognized date format '$dateIn'";
+    }
+  }
+  return $dateOut;
+}
+
+my $biosampleFile = shift @ARGV;
+
+open(my $BIOSAMPLE, "<$biosampleFile") || die "Can't open $biosampleFile: %!\n";
+
+my %b2Name = ();
+my %b2Date = ();
+my %b2Country = ();
+while (<$BIOSAMPLE>) {
+  my (undef, $bAcc, $name, $date, undef, undef, $country) = split("\t");
+  $b2Name{$bAcc} = $name;
+  $b2Date{$bAcc} = $date;
+  $b2Country{$bAcc} = $country;
+}
+close($BIOSAMPLE);
+
+my $missingCount = 0;
+while (<>) {
+  my ($gbAcc, $bAcc, $gbDate, $gbGeo, $host, $gbName, $completeness, $len) = split("\t");
+  if ($bAcc) {
+    if (exists $b2Name{$bAcc}) {
+      my ($bName, $bDate, $bCountry) = ($b2Name{$bAcc}, normalizeDate($b2Date{$bAcc}),
+                                        $b2Country{$bAcc});
+      if (! $gbDate) {
+        $gbDate = $bDate;
+      } elsif ($bDate && $gbDate ne $bDate) {
+        print STDERR "CONFLICT: Genbank date ($gbAcc $gbName) = $gbDate, " .
+          "BioSample date ($bAcc $bName) = $bDate\n";
+      }
+      if (! $gbName) {
+        $gbName = $bName;
+      } elsif (($gbName eq '1' || $gbName eq 'NA') && length($bName) > length($gbName)) {
+        $gbName = $bName;
+      } elsif ($gbName eq 'nasopharyngeal' && $bName =~ m/\d/) {
+        $gbName = $bName;
+      }
+      if (! $gbGeo) {
+        $gbGeo = $bCountry;
+      }
+      print join("\t", $gbAcc, $bAcc, $gbDate, $gbGeo, $host, $gbName, $completeness, $len);
+    } else {
+      # BioSample file doesn't have info for this BioSample accession
+      print STDERR "Missing BioSample info for $bAcc\n";
+      $missingCount++;
+      if ($missingCount >= 100) {
+        die "Too many missing BioSamples, quitting.\n";
+      }
+      # Pass through as-is
+      print;
+    }
+  } else {
+    # No associated BioSample, just pass through as-is
+    print;
+  }
+}
+