99599c7130790107ff0de9f043930da6aa7fddf1
angie
  Mon Nov 16 16:35:58 2020 -0800
Scripts for automating SARS-CoV-2 Phylogeny tracks (refs #26530): fetching
sequences and metadata from several public sources, mapping GISAID IDs to
public seq IDs, downloading the latest release of the phylogenetic tree from
github.com/roblanf/sarscov2phylo/ , making VCFs from GISAID and public
sequences, and using github.com/yatisht/usher to resolve ambiguous alleles,
make protobuf files for hgPhyloPlace, and add public sequences that have not
been mapped to GISAID sequences to the sarscov2phylo tree for a comprehensive
public tree+VCF.

This is still not fully otto-mated because certain crucial inputs like
GISAID sequences still must be downloaded using a web browser, but the goal
is to automate as much as possible and maybe someday have it fully cron-driven.

There are two main top-level scripts which call other scripts, which may in turn
call scripts, in this hierarchy:

updateIdMapping.sh
getCogUk.sh
getNcbi.sh
searchAllSarsCov2BioSample.sh
bioSampleIdToText.sh
bioSampleTextToTab.pl
gbMetadataAddBioSample.pl
fixNcbiFastaNames.pl

updateSarsCov2Phylo.sh
getRelease.sh
processRelease.sh
cladeLineageColors.pl
mapPublic.sh
extractUnmappedPublic.sh
addUnmappedPublic.sh

many of the above:
util.sh

publicCredits.sh will hopefully be folded into updateSarsCov2Phylo.sh when I
figure out how to automate fetching of author/institution metadata from NCBI
and COG-UK.

diff --git src/hg/utils/otto/sarscov2phylo/bioSampleTextToTab.pl src/hg/utils/otto/sarscov2phylo/bioSampleTextToTab.pl
new file mode 100755
index 0000000..7e5a424f
--- /dev/null
+++ src/hg/utils/otto/sarscov2phylo/bioSampleTextToTab.pl
@@ -0,0 +1,129 @@
+#!/usr/bin/env perl
+
+use warnings;
+use strict;
+
+sub isReal($) {
+  my ($val) = @_;
+  return ($val &&
+          (lc($val) ne "missing") &&
+          (lc($val) ne "unknown") &&
+          (lc($val) ne "not applicable") &&
+          (lc($val) ne "not collected") &&
+          (lc($val) ne "not provided") &&
+          (lc($val) ne "restricted access"));
+}
+
+my %attribs = ();
+
+while (<>) {
+  chomp;
+  if (/^Identifiers: BioSample: (\w+)(; Sample name: ([^;]+))?(; SRA: (\w+))?/) {
+    my ($acc, $sampleName, $sra) = ($1, $3, $5);
+    $attribs{__acc} = $acc;
+    $attribs{__sampleName} = $sampleName if ($sampleName);
+    $attribs{__sra} = $sra if ($sra);
+  } elsif (/^Identifiers: /) {
+    die "Can't parse Identifiers line $.:\n$_\t";
+  } elsif (/^    \/([^=]+)="([^"]+)"$/) {
+    my ($attr, $val) = ($1, $2);
+    if (isReal($val)) {
+      $attribs{$attr} = $val;
+    }
+  } elsif (/^    \//) {
+    die "Can't parse attribute line $.:\n$_\t";
+  } elsif (/^(EPI_ISL_\d+)/) {
+    $attribs{__epi} = $1;
+  } elsif (/^Accession: (\w+)\sID: (\d+)/) {
+    # Last line of record; reconcile whatever attributes were accumulated with the columns that
+    # we want to define.
+    my ($acc, $gi) = ($1, $2);
+    die "acc mismatch '$acc' vs. '$attribs{__acc}'" if ($acc ne $attribs{__acc});
+    my $name = "";
+    if (exists $attribs{"sample name"}) {
+      $name = $attribs{"sample name"};
+    } elsif (exists $attribs{"Submitter Id"}) {
+      $name = $attribs{"Submitter Id"};
+    } elsif (exists $attribs{strain}) {
+      $name = $attribs{strain};
+    } elsif (exists $attribs{isolate}) {
+      $name = $attribs{isolate};
+    } elsif (exists $attribs{title}) {
+      $name = $attribs{title};
+    } elsif (exists $attribs{"virus identifier"}) {
+      $name = $attribs{"virus identifier"};
+    } elsif (exists $attribs{__sampleName}) {
+      $name = $attribs{__sampleName};
+    }
+    $name =~ s@^SARS-Co[Vv]-2/@@;
+    $name =~ s@^hCo[Vv]-19/@@;
+    $name =~ s@^[Hh]uman/@@;
+    $name =~ s@^Severe acute respiratory syndrome coronavirus 2/@@;
+    $name =~ s@^/North America/@@;
+    my $date = "";
+    if (exists $attribs{"receipt date"} && exists $attribs{"collection date"}) {
+      # Use the longer of the two in case one is just "2020"
+      if (length($attribs{"receipt date"}) > length($attribs{"collection date"})) {
+        $date = $attribs{"receipt date"};
+      } else {
+        $date = $attribs{"collection date"};
+      }
+    } elsif (exists $attribs{"receipt date"}) {
+      $date = $attribs{"receipt date"};
+    } elsif (exists $attribs{"collection date"}) {
+      $date = $attribs{"collection date"};
+    }
+    my $lab = "";
+    if (exists $attribs{"collecting institution"}) {
+      $lab = $attribs{"collecting institution"};
+    } elsif (exists $attribs{"collected by"}) {
+      $lab = $attribs{"collected by"};
+    } elsif (exists $attribs{"INSDC center name"}) {
+      $lab = $attribs{"INSDC center name"};
+    }
+    my $author = "";
+    if (exists $attribs{"collector name"}) {
+      $author = $attribs{"collector name"};
+    }
+    my $country = "";
+    if (exists $attribs{"geographic location"}) {
+      $country = $attribs{"geographic location"};
+    }
+    my $locale = "";
+    if (exists $attribs{"geographic location (region and locality)"}) {
+      $locale = $attribs{"geographic location (region and locality)"};
+    } elsif ($country =~ m/(.*):\s*(.*)/) {
+      ($country, $locale) = ($1, $2);
+    }
+    my $hostId = "";
+    if (exists $attribs{"host subject id"}) {
+      $hostId = $attribs{"host subject id"};
+    }
+    my $sraId = "";
+    if (exists $attribs{__sra}) {
+      $sraId = $attribs{__sra};
+    }
+    my $epiId = "";
+    if (exists $attribs{gisaid_accession}) {
+      $epiId = $attribs{gisaid_accession};
+    } elsif (exists $attribs{gisaid_accession_id}) {
+      $epiId = $attribs{gisaid_accession_id};
+    } elsif (exists $attribs{gisaid}) {
+      $epiId = $attribs{gisaid};
+    } elsif (exists $attribs{"gisaid id"}) {
+      $epiId = $attribs{"gisaid id"};
+    } elsif (exists $attribs{"gisaid accession id"}) {
+      $epiId = $attribs{"gisaid accession id"};
+    } elsif (exists $attribs{subgroup}) {
+      $epiId = $attribs{subgroup};
+    } elsif (exists $attribs{__epi}) {
+      $epiId = $attribs{__epi};
+    }
+
+    print join("\t", $gi, $acc, $name, $date, $lab, $author, $country, $locale,
+               $hostId, $sraId, $epiId) ."\n";
+    %attribs = ();
+  } elsif (/^Accession: /) {
+    die "Can't parse Accession line $.:\n$_\t";
+  }
+}