#!/usr/bin/env perl

# Post-commit state of src/hg/utils/otto/sarscov2phylo/fixNcbiFastaNames.pl as of commit
# 1af2f775e2a9c148f0cbff583910f1d68faf2598 (angie, Fri Aug 26 11:54:35 2022 -0700):
# "Do a little more cleanup and always construct name from components; fasta names are
# not always updated", with review fixes applied.

use warnings;
use strict;

# Read in metadata for GenBank virus sequences, then stream through fasta and rewrite each
# header as ">accession |name".  The name is always constructed from the metadata
# components (host/country/isolate/year); any name already present in the fasta header is
# ignored because fasta names are not always updated.

# Replace non-human host scientific names with common names
my %sciToCommon = ( 'Canis lupus familiaris' => 'canine',
                    'Felis catus' => 'cat',
                    'Mustela lutreola' => 'mink',  # Netherlands
                    'Neovison vison' => 'mink',    # Denmark
                    'Panthera leo' => 'lion',
                    'Panthera tigris' => 'tiger',
                    'Panthera tigris jacksoni' => 'tiger'
                  );

# Scientific names ordered longest first, so that a name which extends another
# ('Panthera tigris jacksoni' vs. 'Panthera tigris') is substituted as a whole instead of
# leaving a dangling suffix such as 'tiger jacksoni'.
my @sciNamesLongestFirst = sort { length($b) <=> length($a) || $a cmp $b } keys %sciToCommon;

# Print a usage message to STDERR and exit with status 1.
sub usage() {
  print STDERR "usage: $0 ncbi_dataset.plusBioSample.tsv [fasta]\n";
  exit 1;
}

# Build a '/'-joined sequence name from metadata components.
#   $host:    common host name, or empty for human; omitted if $isolate already starts
#             with "$host/".
#   $country: country name; used only when $isolate is not already a full name.
#   $isolate: isolate name; if it already looks like [host/]country/.../digits it is used
#             as-is and $country and $year are not added.
#   $year:    collection year; used only when $isolate is not already a full name.
# Empty/false components are skipped.  Returns the joined name (possibly empty).
sub makeName($$$$) {
  my ($host, $country, $isolate, $year) = @_;
  $isolate = '' if (! defined $isolate);
  my @components = ();
  # \Q...\E: the host is data, not a pattern; metacharacters in it must match literally.
  if ($host && $isolate !~ m@^\Q$host\E/@) {
    push @components, $host;
  }
  if ($isolate =~ m@^([A-Za-z ]+/)?[A-Za-z]+/.*/\d+$@) {
    push @components, $isolate;
  } else {
    if ($country) {
      push @components, $country;
    }
    if ($isolate) {
      push @components, $isolate;
    }
    if ($year) {
      push @components, $year;
    }
  }
  return join('/', @components);
}

# Reduce a geographic location string to the part before the first ':' with spaces
# removed; for "United Kingdom:X" the leading "United Kingdom:" is dropped first so X is
# what remains.  Undefined input yields ''.
sub countryFromGeoLoc {
  my ($geoLoc) = @_;
  return '' if (! defined $geoLoc);
  $geoLoc =~ s/United Kingdom:(.*)/$1/;
  $geoLoc =~ s/:.*//;
  $geoLoc =~ s/ //g;
  return $geoLoc;
}

# Map a host scientific name to the name used in sequence names: '' for Homo sapiens (or
# undefined), the common name if listed in %sciToCommon, otherwise the input unchanged.
sub hostCommonName {
  my ($host) = @_;
  return '' if (! defined $host || $host eq 'Homo sapiens');
  return exists $sciToCommon{$host} ? $sciToCommon{$host} : $host;
}

# Clean up an isolate name from metadata: strip leading virus-name prefixes
# (SARS-CoV-2/, hCoV-19/, BetaCoV/), a leading human/ component and a leading COG-UK/,
# lowercase /ENV/, and replace host scientific names with common names.
# Undefined input yields ''.
sub normalizeIsolateName {
  my ($isoName) = @_;
  return '' if (! defined $isoName);
  $isoName =~ s@^SARS?[- ]Co[Vv]-?2/@@;
  $isoName =~ s@^hCo[Vv]-19/@@;
  $isoName =~ s@^BetaCoV/@@;
  $isoName =~ s@^humans?,?/@@;
  $isoName =~ s@/ENV/@/env/@;
  foreach my $sciName (@sciNamesLongestFirst) {
    my $commonName = $sciToCommon{$sciName};
    $isoName =~ s/\Q$sciName\E/$commonName/;
  }
  $isoName =~ s@^COG-UK/@@;
  return $isoName;
}

# Read the tab-separated metadata file and return a hash ref mapping accession to
# [date, country, host, isolate name].  Columns used: 1 = accession, 3 = date,
# 4 = geographic location, 5 = host, 6 = isolate name.  Dies if the file can't be opened.
sub readMetadata {
  my ($gbMetadataFile) = @_;
  open(my $GBMETA, '<', $gbMetadataFile)
    || die "Can't open $gbMetadataFile: $!\n";
  my %accToMeta = ();
  while (my $line = <$GBMETA>) {
    # Without chomp, a row that ends at the isolate name column keeps its newline.
    chomp $line;
    my ($acc, undef, $date, $geoLoc, $host, $isoName) = split("\t", $line);
    next if (! defined $acc || $acc eq '');
    $accToMeta{$acc} = [ (defined $date ? $date : ''),
                         countryFromGeoLoc($geoLoc),
                         hostCommonName($host),
                         normalizeIsolateName($isoName) ];
  }
  close($GBMETA);
  return \%accToMeta;
}

# Load metadata from the file named by the first command line argument, then stream fasta
# (remaining arguments or STDIN) to STDOUT with rewritten header lines.
sub main {
  my $gbMetadataFile = shift @ARGV;
  usage() if (! defined $gbMetadataFile);
  my $accToMeta = readMetadata($gbMetadataFile);
  while (<>) {
    if (/^>([A-Z]+\d+\.\d+)\s*(?:\S+.*)?\s*$/) {
      my $acc = $1;
      if (exists $accToMeta->{$acc}) {
        my ($mDate, $mCountry, $mHost, $mName) = @{$accToMeta->{$acc}};
        my $mYear;
        if ($mDate =~ /^(\d\d\d\d)/) {
          $mYear = $1;
        }
        my $name = makeName($mHost, $mCountry, $mName, $mYear);
        print ">$acc |$name\n";
      } else {
        print STDERR "No metadata for $acc\n";
        s/ / \|/;
        print;
      }
    } elsif (/^[A-Z]+$/) {
      # Sequence line; pass through unchanged.
      print;
    } else {
      if (/^>/) {
        # Header without a recognizable accession: just add the '|' separator.
        s/ / \|/;
      } else {
        warn "Passing through weird line:\n$_";
      }
      print;
    }
  }
}

# Run only when executed as a script, so the subs above can be loaded without side effects.
main() unless caller;