2ae2c57f81bed91b2474771b896de8bc885b6b21 angie Wed Sep 8 12:04:46 2021 -0700 Don't prepend country and append year if isolate name (e.g. from BioSample) already includes those. diff --git src/hg/utils/otto/sarscov2phylo/fixNcbiFastaNames.pl src/hg/utils/otto/sarscov2phylo/fixNcbiFastaNames.pl index 099f6d3..e1a96a3 100755 --- src/hg/utils/otto/sarscov2phylo/fixNcbiFastaNames.pl +++ src/hg/utils/otto/sarscov2phylo/fixNcbiFastaNames.pl @@ -6,39 +6,43 @@ sub usage() { print STDERR "usage: $0 ncbi_dataset.plusBioSample.tsv [fasta]\n"; exit 1; } # Read in metadata for GenBank virus sequences, then stream through fasta; if header already # has a well-formed country/isolate/year name after the accession then keep that, otherwise # add from metadata. sub makeName($$$$) { my ($host, $country, $isolate, $year) = @_; my @components = (); if ($host) { push @components, $host; } + if ($isolate =~ m@^[A-Za-z]+/.*/\d+$@) { + push @components, $isolate; + } else { if ($country) { push @components, $country; } if ($isolate) { push @components, $isolate; } if ($year) { push @components, $year; } + } return join('/', @components); } # Replace non-human host scientific names with common names my %sciToCommon = ( 'Canis lupus familiaris' => 'canine', 'Felis catus' => 'cat', 'Mustela lutreola' => 'mink', # Netherlands 'Neovison vison' => 'mink', # Denmark 'Panthera leo' => 'lion', 'Panthera tigris' => 'tiger', 'Panthera tigris jacksoni' => 'tiger' ); my $gbMetadataFile = shift @ARGV;