cfd801899529a9a9b0db3cc408a8f5e3c6ecb670
hiram
  Thu May 15 13:24:49 2025 -0700
implement a scientific name override system to fix up names for F1 hybrid diploid assemblies refs #34917

diff --git src/hg/makeDb/doc/asmHubs/trackData.pl src/hg/makeDb/doc/asmHubs/trackData.pl
index d4e1ef695b7..0f9a312e7e9 100755
--- src/hg/makeDb/doc/asmHubs/trackData.pl
+++ src/hg/makeDb/doc/asmHubs/trackData.pl
@@ -8,30 +8,50 @@
 use commonHtml;
 use File::stat;
 
 my $argc = scalar(@ARGV);
 if ($argc < 3) {
   printf STDERR "usage: trackData.pl Name asmHubName [two column name list] > trackData.html\n";
   printf STDERR "e.g.: trackData.pl Mammals mammals mammals.asmId.commonName.tsv > trackData.html\n";
   printf STDERR "the name list is found in \$HOME/kent/src/hg/makeDb/doc/asmHubs/\n";
   printf STDERR "\nthe two columns are 1: asmId (accessionId_assemblyName)\n";
   printf STDERR "column 2: common name for species, columns separated by tab\n";
   exit 255;
 }
 
 my $home = $ENV{'HOME'};
 my $toolsDir = "$home/kent/src/hg/makeDb/doc/asmHubs";
+my $sciNameOverrideFile = "$toolsDir/sciNameOverride.txt";
+my %sciNameOverride;	# key is accession, value is corrected scientific name
+my %taxIdOverride;	# key is accession, value is corrected taxId
+			# keys for both of those can also be the asmId
+if ( -s "${sciNameOverrideFile}" ) {
+  open (my $sn, "<", "${sciNameOverrideFile}") or die "can not read ${sciNameOverrideFile}: $!";
+  while (my $line = <$sn>) {
+    next if ($line =~ m/^#/);	# comment line
+    $line =~ s/[\r\n]+$//;	# strip line ending, including DOS style
+    # tab separated columns: accession, asmId, scientific name, taxId
+    my ($accO, $asmIdO, $sciNameO, $taxIdO) = split('\t', $line);
+    # blank or short lines would create undef/empty hash keys, skip them
+    next if (grep { !defined($_) || !length($_) } ($accO, $asmIdO, $sciNameO));
+    foreach my $key ($accO, $asmIdO) {
+      $sciNameOverride{$key} = $sciNameO;
+      $taxIdOverride{$key} = $taxIdO if (defined($taxIdO));
+    }
+  }
+  close ($sn);
+}
+
 
 my $testOutput = 0;
 my $spliceOut = -1;
 
 if ($argc > 2) {
   for (my $i = 0; $i < $argc; ++$i) {
     if ($ARGV[$i] =~ /-test/) {
       $testOutput = 1;
       $spliceOut = $i;
     }
   }
 }
 if ($spliceOut != -1) {
   splice @ARGV, $spliceOut, 1;
 }
@@ -470,30 +490,31 @@
       printf "</tr>\n";
       next;
     }
     if ( ! -s "$faSizeTxt" ) {
        printf STDERR "twoBitToFa $twoBit stdout | faSize stdin > $faSizeTxt\n";
        print `twoBitToFa $twoBit stdout | faSize stdin > $faSizeTxt`;
     }
     my ($gapSize, $maskPerCent, $sizeNoGaps) = maskStats($faSizeTxt);
     $overallGapSize += $gapSize;
     my ($seqCount, $totalSize) = asmCounts($chromSizes);
     $overallSeqCount += $seqCount;
     $overallNucleotides += $totalSize;
     my $gapCount = gapStats($buildDir, $asmId);
     $overallGapCount += $gapCount;
     my $sciName = "notFound";
+    $sciName = $sciNameOverride{$accessionId} if (defined($sciNameOverride{$accessionId}));
     my $commonName = "notFound";
     my $asmDate = "notFound";
     my $itemsFound = 0;
     open (FH, "<$asmReport") or die "can not read $asmReport";
     while (my $line = <FH>) {
       last if ($itemsFound > 5);
       chomp $line;
       $line =~ s/
//g;;
       $line =~ s/\s+$//g;;
       if ($line =~ m/Date:/) {
         if ($asmDate =~ m/notFound/) {
            ++$itemsFound;
            $asmDate = $line;
            $asmDate =~ s/.*:\s+//;
         }