99599c7130790107ff0de9f043930da6aa7fddf1 angie Mon Nov 16 16:35:58 2020 -0800 Scripts for automating SARS-CoV-2 Phylogeny tracks (refs #26530): fetching sequences and metadata from several public sources, mapping GISAID IDs to public seq IDs, downloading the latest release of the phylogenetic tree from github.com/roblanf/sarscov2phylo/ , making VCFs from GISAID and public sequences, and using github.com/yatisht/usher to resolve ambiguous alleles, make protobuf files for hgPhyloPlace, and add public sequences that have not been mapped to GISAID sequences to the sarscov2phylo tree for a comprehensive public tree+VCF. This is still not fully otto-mated because certain crucial inputs like GISAID sequences still must be downloaded using a web browser, but the goal is to automate as much as possible and maybe someday have it fully cron-driven. There are two main top-level scripts which call other scripts, which may in turn call scripts, in this hierarchy: updateIdMapping.sh getCogUk.sh getNcbi.sh searchAllSarsCov2BioSample.sh bioSampleIdToText.sh bioSampleTextToTab.pl gbMetadataAddBioSample.pl fixNcbiFastaNames.pl updateSarsCov2Phylo.sh getRelease.sh processRelease.sh cladeLineageColors.pl mapPublic.sh extractUnmappedPublic.sh addUnmappedPublic.sh many of the above: util.sh publicCredits.sh will hopefully be folded into updateSarsCov2Phylo.sh when I figure out how to automate fetching of author/institution metadata from NCBI and COG-UK. diff --git src/hg/utils/otto/sarscov2phylo/fixNcbiFastaNames.pl src/hg/utils/otto/sarscov2phylo/fixNcbiFastaNames.pl new file mode 100755 index 0000000..ccfe3bb --- /dev/null +++ src/hg/utils/otto/sarscov2phylo/fixNcbiFastaNames.pl @@ -0,0 +1,119 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +sub usage() { + print STDERR "usage: $0 ncbi_dataset.plusBioSample.tsv [fasta]\n"; + exit 1; +} + +# Read in metadata for GenBank virus sequences, then stream through fasta; if header already +# has a well-formed country/isolate/year name after the accession then keep that, otherwise +# add from metadata. + +sub makeName($$$$) { + my ($host, $country, $isolate, $year) = @_; + my @components = (); + if ($host) { + push @components, $host; + } + if ($country) { + push @components, $country; + } + if ($isolate) { + push @components, $isolate; + } + if ($year) { + push @components, $year; + } + return join('/', @components); +} + +# Replace non-human host scientific names with common names +my %sciToCommon = ( 'Canis lupus familiaris' => 'canine', + 'Felis catus' => 'cat', + 'Mustela lutreola' => 'mink', # Netherlands + 'Neovison vison' => 'mink', # Denmark + 'Panthera leo' => 'lion', + 'Panthera tigris' => 'tiger', + 'Panthera tigris jacksoni' => 'tiger' + ); + +my $gbMetadataFile = shift @ARGV; + +open(my $GBMETA, "<$gbMetadataFile") || die "Can't open $gbMetadataFile: $!\n"; + +my %accToMeta = (); +while (<$GBMETA>) { + my ($acc, undef, $date, $geoLoc, $host, $isoName) = split("\t"); + # Trim to just the country + $geoLoc =~ s/:.*//; + $geoLoc =~ s/ //g; + if ($host eq "Homo sapiens") { + $host = ''; + } elsif (exists $sciToCommon{$host}) { + $host = $sciToCommon{$host}; + } + $isoName =~ s@^SARS?[- ]Co[Vv]-?2/@@; + $isoName =~ s@^hCo[Vv]-19/@@; + $isoName =~ s@^BetaCoV/@@; + $isoName =~ s@^human/@@; + $isoName =~ s@/ENV/@/env/@; + $accToMeta{$acc} = [$date, $geoLoc, $host, $isoName]; +} +close($GBMETA); + +while (<>) { + if (/^>([A-Z]+\d+\.\d+)\s*(\S+.*)?\s*$/) { + my ($acc, $fName) = ($1, $2); + if (exists $accToMeta{$acc}) { + my ($mDate, $mCountry, $mHost, $mName) = @{$accToMeta{$acc}}; + my $mYear; + if ($mDate =~ /^(\d\d\d\d)/) { + $mYear = $1; + } + my $name = $fName; + my $year = $mYear; + if (! $fName) { + $name = makeName($mHost, $mCountry, $mName, $mYear); + } else { + # If fasta name contains host, country, isolate name, and/or year, use those, + # otherwise take from metadata. + if ($fName =~ m@^((\w+)/)?(\w+)/[^/]+/(\d+)$@) { + # Well-formed; use it. + $name = $fName; + } else { + if ($fName =~ m@/(\d\d\d\d)$@) { + if ($1 && $mYear && $1 ne $mYear) { + print $STDERR "Year mismatch for $acc: name $name, metadata $mDate"; + } + $fName =~ s@/(\d\d\d\d)$@@; + $year = $1; + } + if ($fName =~ m@^[A-Z]{3}/@) { + # Not really well-formed, but at least it starts with a country code so + # don't mess it up further. + $name = $fName; + } else { + $name = makeName($mHost, $mCountry, $fName, $year); + } + } + } + print ">$acc |$name\n"; + } else { + print STDERR "No metadata for $acc\n"; + s/ /\|/; + print; + } + } elsif (/^[A-Z]+$/) { + print; + } else { + if (/^>/) { + s/ /\|/; + } else { + warn "Passing through weird line:\n$_"; + } + print; + } +}