99599c7130790107ff0de9f043930da6aa7fddf1 angie Mon Nov 16 16:35:58 2020 -0800 Scripts for automating SARS-CoV-2 Phylogeny tracks (refs #26530): fetching sequences and metadata from several public sources, mapping GISAID IDs to public seq IDs, downloading the latest release of the phylogenetic tree from github.com/roblanf/sarscov2phylo/ , making VCFs from GISAID and public sequences, and using github.com/yatisht/usher to resolve ambiguous alleles, make protobuf files for hgPhyloPlace, and add public sequences that have not been mapped to GISAID sequences to the sarscov2phylo tree for a comprehensive public tree+VCF. This is still not fully otto-mated because certain crucial inputs like GISAID sequences still must be downloaded using a web browser, but the goal is to automate as much as possible and maybe someday have it fully cron-driven. There are two main top-level scripts which call other scripts, which may in turn call scripts, in this hierarchy: updateIdMapping.sh getCogUk.sh getNcbi.sh searchAllSarsCov2BioSample.sh bioSampleIdToText.sh bioSampleTextToTab.pl gbMetadataAddBioSample.pl fixNcbiFastaNames.pl updateSarsCov2Phylo.sh getRelease.sh processRelease.sh cladeLineageColors.pl mapPublic.sh extractUnmappedPublic.sh addUnmappedPublic.sh many of the above: util.sh publicCredits.sh will hopefully be folded into updateSarsCov2Phylo.sh when I figure out how to automate fetching of author/institution metadata from NCBI and COG-UK. diff --git src/hg/utils/otto/sarscov2phylo/gbMetadataAddBioSample.pl src/hg/utils/otto/sarscov2phylo/gbMetadataAddBioSample.pl new file mode 100755 index 0000000..a36967d --- /dev/null +++ src/hg/utils/otto/sarscov2phylo/gbMetadataAddBioSample.pl @@ -0,0 +1,97 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +sub usage() { + print STDERR "usage: $0 biosample.tab [gbMetadata.tab]\n"; + exit 1; +} + +# Read in and store distilled BioSample metadata; stream through GenBank metadata +# (from NCBI Virus / NCBI Datasets) and add in collection date and isolate name +# from BioSample when missing from GenBank. Report any conflicting dates. + +my @months = qw(jan feb mar apr may jun jul aug sep oct nov dec); + +sub normalizeDate($) { + # Convert "25-Jan-2020" to 2020-01-25, "19-MAR-2020" to 2020-03-19... + my ($dateIn) = @_; + my $dateOut = ""; + if ($dateIn) { + if ($dateIn =~ /^\d\d\d\d(-\d\d)?(-\d\d)?$/) { + $dateOut = $dateIn; + } elsif ($dateIn =~ /^((\d\d?)-)?(\w\w\w)-(\d\d\d\d)$/) { + my ($day, $month, $year) = ($2, lc($3), $4); + my ($mIx) = grep { $months[$_] eq $month } (0 .. @months-1); + if (! defined $mIx) { + die "Unrecognized month '$month' in '$dateIn'"; + } + $month = $mIx + 1; + if ($day) { + $dateOut = sprintf("%04d-%02d-%02d", $year, $month, $day); + } else { + $dateOut = sprintf("%04d-%02d", $year, $month); + } + } else { + die "Unrecognized date format '$dateIn'"; + } + } + return $dateOut; +} + +my $biosampleFile = shift @ARGV; + +open(my $BIOSAMPLE, "<$biosampleFile") || die "Can't open $biosampleFile: %!\n"; + +my %b2Name = (); +my %b2Date = (); +my %b2Country = (); +while (<$BIOSAMPLE>) { + my (undef, $bAcc, $name, $date, undef, undef, $country) = split("\t"); + $b2Name{$bAcc} = $name; + $b2Date{$bAcc} = $date; + $b2Country{$bAcc} = $country; +} +close($BIOSAMPLE); + +my $missingCount = 0; +while (<>) { + my ($gbAcc, $bAcc, $gbDate, $gbGeo, $host, $gbName, $completeness, $len) = split("\t"); + if ($bAcc) { + if (exists $b2Name{$bAcc}) { + my ($bName, $bDate, $bCountry) = ($b2Name{$bAcc}, normalizeDate($b2Date{$bAcc}), + $b2Country{$bAcc}); + if (! $gbDate) { + $gbDate = $bDate; + } elsif ($bDate && $gbDate ne $bDate) { + print STDERR "CONFLICT: Genbank date ($gbAcc $gbName) = $gbDate, " . + "BioSample date ($bAcc $bName) = $bDate\n"; + } + if (! $gbName) { + $gbName = $bName; + } elsif (($gbName eq '1' || $gbName eq 'NA') && length($bName) > length($gbName)) { + $gbName = $bName; + } elsif ($gbName eq 'nasopharyngeal' && $bName =~ m/\d/) { + $gbName = $bName; + } + if (! $gbGeo) { + $gbGeo = $bCountry; + } + print join("\t", $gbAcc, $bAcc, $gbDate, $gbGeo, $host, $gbName, $completeness, $len); + } else { + # BioSample file doesn't have info for this BioSample accession + print STDERR "Missing BioSample info for $bAcc\n"; + $missingCount++; + if ($missingCount >= 100) { + die "Too many missing BioSamples, quitting.\n"; + } + # Pass through as-is + print; + } + } else { + # No associated BioSample, just pass through as-is + print; + } +} +