d4caa9d01a63434e9859ed47971217ff9a20edb9 angie Sat Dec 23 19:14:20 2023 -0800 Add country and year to isolate names that don't have them, analogous to fixNcbiFastaNames.pl. diff --git src/hg/utils/otto/sarscov2phylo/gbMetadataAddBioSample.pl src/hg/utils/otto/sarscov2phylo/gbMetadataAddBioSample.pl index 88dca79..df6b74b 100755 --- src/hg/utils/otto/sarscov2phylo/gbMetadataAddBioSample.pl +++ src/hg/utils/otto/sarscov2phylo/gbMetadataAddBioSample.pl @@ -65,30 +65,37 @@ $gbDate = $bDate; } elsif ($bDate && $gbDate ne $bDate) { print STDERR join("\t", "dateMismatch", $gbAcc, $gbName, $gbDate, $bAcc, $bName, $bDate) . "\n"; } if (! $gbName) { $gbName = $bName; } elsif (($gbName eq '1' || $gbName eq 'NA') && length($bName) > length($gbName)) { $gbName = $bName; } elsif ($gbName eq 'nasopharyngeal' && $bName =~ m/\d/) { $gbName = $bName; } if (! $gbGeo) { $gbGeo = $bCountry; } + if ($gbName !~ m@/@ && $gbGeo ne "" && $gbDate =~ /^\d{4}/) { + my $country = $gbGeo; + $country =~ s/:.*//; $country =~ s/ //g; + my $year = $gbDate; + $year =~ s/^(\d{4}).*/$1/; + $gbName = "$country/$gbName/$year"; + } print join("\t", $gbAcc, $bAcc, $gbDate, $gbGeo, $host, $gbName, $completeness, $len); } else { # BioSample file doesn't have info for this BioSample accession print STDERR "Missing BioSample info for $bAcc\n"; $missingCount++; if ($missingCount > $maxMissing) { die "Too many missing BioSamples (> $maxMissing), quitting.\n"; } # Pass through as-is print; } } else { # No associated BioSample, just pass through as-is print; }