99599c7130790107ff0de9f043930da6aa7fddf1
angie
  Mon Nov 16 16:35:58 2020 -0800
Scripts for automating SARS-CoV-2 Phylogeny tracks (refs #26530): fetching
sequences and metadata from several public sources, mapping GISAID IDs to
public seq IDs, downloading the latest release of the phylogenetic tree from
github.com/roblanf/sarscov2phylo/ , making VCFs from GISAID and public
sequences, and using github.com/yatisht/usher to resolve ambiguous alleles,
make protobuf files for hgPhyloPlace, and add public sequences that have not
been mapped to GISAID sequences to the sarscov2phylo tree for a comprehensive
public tree+VCF.

This is still not fully otto-mated because certain crucial inputs like
GISAID sequences still must be downloaded using a web browser, but the goal
is to automate as much as possible and maybe someday have it fully cron-driven.

There are two main top-level scripts which call other scripts, which may in turn
call scripts, in this hierarchy:

updateIdMapping.sh
getCogUk.sh
getNcbi.sh
searchAllSarsCov2BioSample.sh
bioSampleIdToText.sh
bioSampleTextToTab.pl
gbMetadataAddBioSample.pl
fixNcbiFastaNames.pl

updateSarsCov2Phylo.sh
getRelease.sh
processRelease.sh
cladeLineageColors.pl
mapPublic.sh
extractUnmappedPublic.sh
addUnmappedPublic.sh

many of the above:
util.sh

publicCredits.sh will hopefully be folded into updateSarsCov2Phylo.sh when I
figure out how to automate fetching of author/institution metadata from NCBI
and COG-UK.

diff --git src/hg/utils/otto/sarscov2phylo/fixNcbiFastaNames.pl src/hg/utils/otto/sarscov2phylo/fixNcbiFastaNames.pl
new file mode 100755
index 0000000..ccfe3bb
--- /dev/null
+++ src/hg/utils/otto/sarscov2phylo/fixNcbiFastaNames.pl
@@ -0,0 +1,119 @@
+#!/usr/bin/env perl
+
+use warnings;
+use strict;
+
+sub usage() {
+  print STDERR "usage: $0 ncbi_dataset.plusBioSample.tsv [fasta]\n";
+  exit 1;
+}
+
+# Read in metadata for GenBank virus sequences, then stream through fasta; if header already
+# has a well-formed country/isolate/year name after the accession then keep that, otherwise
+# add from metadata.
+
+sub makeName($$$$) {
+  my ($host, $country, $isolate, $year) = @_;
+  my @components = ();
+  if ($host) {
+    push @components, $host;
+  }
+  if ($country) {
+    push @components, $country;
+  }
+  if ($isolate) {
+    push @components, $isolate;
+  }
+  if ($year) {
+    push @components, $year;
+  }
+  return join('/', @components);
+}
+
+# Replace non-human host scientific names with common names
+my %sciToCommon = ( 'Canis lupus familiaris' => 'canine',
+                    'Felis catus' => 'cat',
+                    'Mustela lutreola' => 'mink', # Netherlands
+                    'Neovison vison' => 'mink',   # Denmark
+                    'Panthera leo' => 'lion',
+                    'Panthera tigris' => 'tiger',
+                    'Panthera tigris jacksoni' => 'tiger'
+                  );
+
+my $gbMetadataFile = shift @ARGV;
+
+open(my $GBMETA, "<$gbMetadataFile") || die "Can't open $gbMetadataFile: $!\n";
+
+my %accToMeta = ();
+while (<$GBMETA>) {
+  my ($acc, undef, $date, $geoLoc, $host, $isoName) = split("\t");
+  # Trim to just the country
+  $geoLoc =~ s/:.*//;
+  $geoLoc =~ s/ //g;
+  if ($host eq "Homo sapiens") {
+    $host = '';
+  } elsif (exists $sciToCommon{$host}) {
+    $host = $sciToCommon{$host};
+  }
+  $isoName =~ s@^SARS?[- ]Co[Vv]-?2/@@;
+  $isoName =~ s@^hCo[Vv]-19/@@;
+  $isoName =~ s@^BetaCoV/@@;
+  $isoName =~ s@^human/@@;
+  $isoName =~ s@/ENV/@/env/@;
+  $accToMeta{$acc} = [$date, $geoLoc, $host, $isoName];
+}
+close($GBMETA);
+
+while (<>) {
+  if (/^>([A-Z]+\d+\.\d+)\s*(\S+.*)?\s*$/) {
+    my ($acc, $fName) = ($1, $2);
+    if (exists $accToMeta{$acc}) {
+      my ($mDate, $mCountry, $mHost, $mName) = @{$accToMeta{$acc}};
+      my $mYear;
+      if ($mDate =~ /^(\d\d\d\d)/) {
+        $mYear = $1;
+      }
+      my $name = $fName;
+      my $year = $mYear;
+      if (! $fName) {
+        $name = makeName($mHost, $mCountry, $mName, $mYear);
+      } else {
+        # If fasta name contains host, country, isolate name, and/or year, use those,
+        # otherwise take from metadata.
+        if ($fName =~ m@^((\w+)/)?(\w+)/[^/]+/(\d+)$@) {
+          # Well-formed; use it.
+          $name = $fName;
+        } else {
+          if ($fName =~ m@/(\d\d\d\d)$@) {
+            if ($1 && $mYear && $1 ne $mYear) {
+              print $STDERR "Year mismatch for $acc: name $name, metadata $mDate";
+            }
+            $fName =~ s@/(\d\d\d\d)$@@;
+            $year = $1;
+          }
+          if ($fName =~ m@^[A-Z]{3}/@) {
+            # Not really well-formed, but at least it starts with a country code so
+            # don't mess it up further.
+            $name = $fName;
+          } else {
+            $name = makeName($mHost, $mCountry, $fName, $year);
+          }
+        }
+      }
+      print ">$acc |$name\n";
+    } else {
+      print STDERR "No metadata for $acc\n";
+      s/ /\|/;
+      print;
+    }
+  } elsif (/^[A-Z]+$/) {
+    print;
+  } else {
+    if (/^>/) {
+      s/ /\|/;
+    } else {
+      warn "Passing through weird line:\n$_";
+    }
+    print;
+  }
+}