# Provenance: commit 1a2aa31bfa14f809ee4093cea7b436e331b86ff4, hiram,
# Thu Feb 16 14:56:34 2023 -0800:
#   "improved common names for fungi and bacteria an lower limits for valid
#    number of tracks in bacteria and plants refs #29545"
# File: src/hg/makeDb/doc/asmHubs/commonNames.pl (index 2f60686..f0a052a).
# What follows is the post-commit text of the hunk that starts at line 14 of
# that file; the lines above the hunk are not part of this view.
#
# Purpose: for each assembly identifier listed in the file given as the first
# argument, read the NCBI assembly report and print
#   <asmId> TAB <common name> (<cultivar/isolate/year>)

use strict;
use warnings;

# Root of the local NCBI genomes hierarchy; reports are looked up at
# $ncbiSrc/<GCx>/<nnn>/<nnn>/<nnn>/<asmId>/<asmId>_assembly_report.txt
my $ncbiSrc = "/hive/data/outside/ncbi/genomes";

# readReport - return a reference to the lines of the report at $path with
# carriage returns and newlines removed.  An unreadable report produces a
# warning and an empty list, so every field derived from it is empty and the
# assembly still gets an output line.
sub readReport {
  my ($path) = @_;
  my @lines;
  if (open(my $fh, "<", $path)) {
    while (my $line = <$fh>) {
      $line =~ s/[\r\n]+//g;
      push @lines, $line;
    }
    close($fh);
  } else {
    warn "can not read $path: $!\n";
  }
  return \@lines;
}

# firstLine - return the first line in @$lines that contains $tag, compared
# case-insensitively, or "" when no line matches.
sub firstLine {
  my ($lines, $tag) = @_;
  for my $line (@$lines) {
    return $line if ($line =~ m/\Q$tag\E/i);
  }
  return "";
}

# abbreviateGenus - turn "Genus species extra" into "G.species extra".
# Returns "" for an empty name.
sub abbreviateGenus {
  my ($sciName) = @_;
  my @words = split(' ', $sciName);
  return "" if (scalar(@words) < 1);
  my $genus = shift @words;
  # join() keeps every remaining word; concatenating the array slice
  # directly would evaluate it in scalar context and keep only the last one.
  return uc(substr($genus, 0, 1)) . "." . join(" ", @words);
}

my $listFile = shift;
defined($listFile) or die "missing argument: file listing assembly identifiers\n";
open(my $listFh, "<", $listFile) or die "can not open $listFile: $!";
while (my $asmId = <$listFh>) {
  next if ($asmId =~ m/^#/);
  $asmId =~ s/\s+.*//;          # keep only the first column
  chomp $asmId;
  next if (length($asmId) < 1);

  # the identifier is cut at fixed offsets to build the directory path
  my $gcx = substr($asmId, 0, 3);
  my $id0 = substr($asmId, 4, 3);
  my $id1 = substr($asmId, 7, 3);
  my $id2 = substr($asmId, 10, 3);
  my $srcDir = sprintf "%s/%s/%s/%s/%s/%s", $ncbiSrc, $gcx, $id0, $id1, $id2, $asmId;
  my $asmRpt = "$srcDir/${asmId}_assembly_report.txt";
  my $report = readReport($asmRpt);

  # scientific name: text after "Organism name:" minus a trailing "(...)"
  my $orgLine = firstLine($report, "Organism name:");
  my $sciName = $orgLine;
  $sciName =~ s/.*ism name:\s+//i;
  $sciName =~ s/\s+\(.*\)$//;

  # year: last field of the "Date:" line, cut at the first hyphen
  my $yearDate = "";
  my @dateFields = split(' ', firstLine($report, "Date:"));
  if (scalar(@dateFields)) {
    $yearDate = $dateFields[-1];
    $yearDate =~ s/-.*//;
  }

  my $isolate = firstLine($report, "Isolate:");
  if (length($isolate)) {
    $isolate =~ s/.*solate: *//;
  }

  # keep only the value following cultivar=, ecotype=, strain= or breed=
  my $cultivar = firstLine($report, "Infraspecific name:");
  if (length($cultivar)) {
    for my $key (qw(cultivar ecotype strain breed)) {
      $cultivar =~ s/.*$key=//;
    }
  }

  my $extraStrings = "";
  if (length($isolate) && length($cultivar)) {
    $extraStrings = "$cultivar $isolate $yearDate";
  } elsif (length($isolate)) {
    $extraStrings = "$isolate $yearDate";
  } elsif (length($cultivar)) {
    $extraStrings = "$cultivar $yearDate";
  }
  if (length($extraStrings) < 1) {
    $extraStrings = "$yearDate";
  }

  # common name: the text inside the last "(...)" of the organism line
  my $orgName = $orgLine;
  $orgName =~ s/.*\(//;
  $orgName =~ s/\)//;

  if ($orgName =~ m/firmicutes|proteobacteria|high G|enterobacteria|agent of/) {
    # these group labels are replaced by the unabbreviated scientific name
    $orgName = $sciName;
  } elsif ($orgName =~ m/ascomycete|basidiomycete|budding|microsporidian|smut|fungi/) {
    my ($order, undef) = split('\s', $orgName, 2);
    $order = "budding yeast" if ($order =~ m/budding/);
    $order = "smut fungi" if ($order =~ m/smut/);
    $order = "ascomycetes" if ($order =~ m/ascomycete/);
    $order = "basidiomycetes" if ($order =~ m/basidiomycete/);
    $orgName = "$order " . abbreviateGenus($sciName);
  } elsif ($orgName eq "viruses") {
    $orgName = $sciName;
  }

  if (length($extraStrings)) {
    # drop from the name any word that is repeated in the parenthesized
    # extras; \Q...\E makes each word match literally, and split(' ') never
    # yields an empty word (an empty pattern would reuse the previous regex)
    for my $word (split(' ', $extraStrings)) {
      $orgName =~ s/\Q$word\E//;
    }
    $orgName =~ s/=//g;
    $orgName =~ s/ {2,}/ /g;    # close the gaps left by removed words
    $orgName =~ s/ +$//;
    printf "%s\t%s (%s)\n", $asmId, $orgName, $extraStrings;
  } else {
    printf "%s\t%s\n", $asmId, $orgName;
  }
}
close($listFh);

# GCA_003369685.2_UOA_Angus_1_assembly_report.txt
# Organism name:
# GCF_010993605.1_kPetMar1.pri
# GCF_900246225.1_fAstCal1.2