dd97b4a2ecd10761d802b7f63bf64f147399a609 hiram Sun Oct 16 19:23:31 2022 -0700 better names on viruses and add additional info to others no redmine diff --git src/hg/makeDb/doc/asmHubs/commonNames.pl src/hg/makeDb/doc/asmHubs/commonNames.pl index 7dd46d1..3e7e528 100755 --- src/hg/makeDb/doc/asmHubs/commonNames.pl +++ src/hg/makeDb/doc/asmHubs/commonNames.pl @@ -14,29 +14,65 @@ my $ncbiSrc="/hive/data/outside/ncbi/genomes"; my $listFile = shift; open (FH, "<$listFile") or die "can not open $listFile"; while (my $asmId = <FH>) { next if ($asmId =~ m/^#/); $asmId =~ s/\s+.*//; chomp $asmId; next if (length($asmId) < 1); my $gcx = substr($asmId, 0, 3); my $id0 = substr($asmId, 4, 3); my $id1 = substr($asmId, 7, 3); my $id2 = substr($asmId, 10, 3); my $srcDir = sprintf "%s/%s/%s/%s/%s/%s", $ncbiSrc, $gcx, $id0, $id1, $id2, $asmId; my $asmRpt = "$srcDir/${asmId}_assembly_report.txt"; + my $yearDate = `grep -i -m 1 "Date:" "${asmRpt}" | tr -d " " | awk '{print \$NF}' | sed -e 's/-.*//;'`; + chomp $yearDate; + my $isolate = `grep -i -m 1 "Isolate:" "${asmRpt}" | tr -d " "`; + chomp $isolate; + if (length($isolate)) { + $isolate =~ s/.*solate: *//; + } + my $cultivar = `grep -i -m 1 "Infraspecific name:" "${asmRpt}" | tr -d " "`; + chomp $cultivar; + if (length($cultivar)) { + $cultivar =~ s/.*cultivar=//; + $cultivar =~ s/.*ecotype=//; + $cultivar =~ s/.*strain=//; + $cultivar =~ s/.*breed=//; + } + my $extraStrings = ""; + if (length($isolate) && length($cultivar)) { + $extraStrings = "$cultivar $isolate $yearDate"; + } elsif (length($isolate)) { + $extraStrings = "$isolate $yearDate"; + } elsif (length($cultivar)) { + $extraStrings = "$cultivar $yearDate"; + } + if ( "x${extraStrings}y" eq "xy" ) { + $extraStrings = "$yearDate"; + } my $orgName = `grep -i -m 1 "Organism name:" "${asmRpt}" | tr -d " "`; $orgName =~ s/.*\(//; $orgName =~ s/\)//; chomp $orgName; + if ($orgName eq "viruses") { + $orgName = `grep -i -m 1 "Organism name:" "${asmRpt}" | tr -d " "`; + chomp 
$orgName; + $orgName =~ s/.*ism name:\s+//i; + $orgName =~ s/\s+\(.*\)$//; + } + if (length($extraStrings)) { + printf "%s\t%s (%s)\n", $asmId, $orgName, $extraStrings; + } else { printf "%s\t%s\n", $asmId, $orgName; } +} close (FH); # GCA_003369685.2_UOA_Angus_1_assembly_report.txt # Organism name: # GCF_010993605.1_kPetMar1.pri # GCF_900246225.1_fAstCal1.2