de878cd5e29e32002ebc64b66d7cf6989091f8ce angie Mon Oct 10 15:46:16 2022 -0700 Another regex tweak for fluff that ends up in GenBank names. Oh, those crazy [Hh]umans. diff --git src/hg/utils/otto/sarscov2phylo/fixNcbiFastaNames.pl src/hg/utils/otto/sarscov2phylo/fixNcbiFastaNames.pl index 91b9a45..1b42933 100755 --- src/hg/utils/otto/sarscov2phylo/fixNcbiFastaNames.pl +++ src/hg/utils/otto/sarscov2phylo/fixNcbiFastaNames.pl @@ -51,31 +51,31 @@ my %accToMeta = (); while (<$GBMETA>) { my ($acc, undef, $date, $geoLoc, $host, $isoName) = split("\t"); # Trim to just the country $geoLoc =~ s/United Kingdom:(.*)/$1/; $geoLoc =~ s/:.*//; $geoLoc =~ s/ //g; if ($host eq "Homo sapiens") { $host = ''; } elsif (exists $sciToCommon{$host}) { $host = $sciToCommon{$host}; } $isoName =~ s@^SARS?[- ]Co[Vv]-?2/@@; $isoName =~ s@^hCo[Vv]-19/@@; $isoName =~ s@^BetaCoV/@@; - $isoName =~ s@^humans?,?/@@; + $isoName =~ s@^[Hh]umans?,?/@@; $isoName =~ s@/ENV/@/env/@; $isoName =~ s@Canis lupus familiaris@canine@; $isoName =~ s@Felis catus@cat@; $isoName =~ s@Mustela lutreola@mink@; $isoName =~ s@Neovison vison@mink@; $isoName =~ s@Panthera leo@lion@; $isoName =~ s@Panthera tigris@tiger@; $isoName =~ s@Panthera tigris jacksoni@tiger@; $isoName =~ s@^COG-UK/@@; $accToMeta{$acc} = [$date, $geoLoc, $host, $isoName]; } close($GBMETA); while (<>) { if (/^>([A-Z]+\d+\.\d+)\s*(\S+.*)?\s*$/) {