cfd801899529a9a9b0db3cc408a8f5e3c6ecb670 hiram Thu May 15 13:24:49 2025 -0700 implement a scientific name override system to fixup names for F1 hybrid diploid assemblies refs #34917 diff --git src/hg/makeDb/doc/asmHubs/trackData.pl src/hg/makeDb/doc/asmHubs/trackData.pl index d4e1ef695b7..0f9a312e7e9 100755 --- src/hg/makeDb/doc/asmHubs/trackData.pl +++ src/hg/makeDb/doc/asmHubs/trackData.pl @@ -8,30 +8,50 @@ use commonHtml; use File::stat; my $argc = scalar(@ARGV); if ($argc < 3) { printf STDERR "usage: trackData.pl Name asmHubName [two column name list] > trackData.html\n"; printf STDERR "e.g.: trackData.pl Mammals mammals mammals.asmId.commonName.tsv > trackData.html\n"; printf STDERR "the name list is found in \$HOME/kent/src/hg/makeDb/doc/asmHubs/\n"; printf STDERR "\nthe two columns are 1: asmId (accessionId_assemblyName)\n"; printf STDERR "column 2: common name for species, columns separated by tab\n"; exit 255; } my $home = $ENV{'HOME'}; my $toolsDir = "$home/kent/src/hg/makeDb/doc/asmHubs"; +my $sciNameOverrideFile = "$toolsDir/sciNameOverride.txt"; +my %sciNameOverride; # key is accession, value is corrected scientific name +my %taxIdOverride; # key is accession, value is corrected taxId + # keys for both of those can also be the asmId + +if ( -s "${sciNameOverrideFile}" ) { + open (my $sn, "<", "${sciNameOverrideFile}") or die "can not read ${sciNameOverrideFile}"; + while (my $line = <$sn>) { + next if ($line =~ m/^#/); + next if (length($line) < 2); + chomp $line; + my ($accO, $asmIdO, $sciNameO, $taxIdO) = split('\t', $line); + $sciNameOverride{$accO} = $sciNameO; + $sciNameOverride{$asmIdO} = $sciNameO; + $taxIdOverride{$accO} = $taxIdO; + $taxIdOverride{$asmIdO} = $taxIdO; + } + close ($sn); +} + my $testOutput = 0; my $spliceOut = -1; if ($argc > 2) { for (my $i = 0; $i < $argc; ++$i) { if ($ARGV[$i] =~ /-test/) { $testOutput = 1; $spliceOut = $i; } } } if ($spliceOut != -1) { splice @ARGV, $spliceOut, 1; } @@ -470,30 +490,31 @@ printf "</tr>\n"; 
next; } if ( ! -s "$faSizeTxt" ) { printf STDERR "twoBitToFa $twoBit stdout | faSize stdin > $faSizeTxt\n"; print `twoBitToFa $twoBit stdout | faSize stdin > $faSizeTxt`; } my ($gapSize, $maskPerCent, $sizeNoGaps) = maskStats($faSizeTxt); $overallGapSize += $gapSize; my ($seqCount, $totalSize) = asmCounts($chromSizes); $overallSeqCount += $seqCount; $overallNucleotides += $totalSize; my $gapCount = gapStats($buildDir, $asmId); $overallGapCount += $gapCount; my $sciName = "notFound"; + $sciName = $sciNameOverride{$accessionId} if (defined($sciNameOverride{$accessionId})); my $commonName = "notFound"; my $asmDate = "notFound"; my $itemsFound = 0; open (FH, "<$asmReport") or die "can not read $asmReport"; while (my $line = <FH>) { last if ($itemsFound > 5); chomp $line; $line =~ s/ //g;; $line =~ s/\s+$//g;; if ($line =~ m/Date:/) { if ($asmDate =~ m/notFound/) { ++$itemsFound; $asmDate = $line; $asmDate =~ s/.*:\s+//; }