cfd801899529a9a9b0db3cc408a8f5e3c6ecb670
hiram
  Thu May 15 13:24:49 2025 -0700
implement a scientific name override system to fixup names for F1 hybrid diploid assemblies refs #34917

diff --git src/hg/makeDb/doc/asmHubs/mkAsmStats.pl src/hg/makeDb/doc/asmHubs/mkAsmStats.pl
index 5f4e598d77b..510ad1cd6d7 100755
--- src/hg/makeDb/doc/asmHubs/mkAsmStats.pl
+++ src/hg/makeDb/doc/asmHubs/mkAsmStats.pl
@@ -8,30 +8,49 @@
 use commonHtml;
 use File::stat;
 
 my $argc = scalar(@ARGV);
 if ($argc != 3) {
   printf STDERR "mkAsmStats Name asmHubName [two column name list]\n";
   printf STDERR "e.g.: mkAsmStats Mammals mammals mammals.asmId.commonName.tsv\n";
   printf STDERR "the name list is found in \$HOME/kent/src/hg/makeDb/doc/asmHubs/\n";
   printf STDERR "\nthe two columns are 1: asmId (accessionId_assemblyName)\n";
   printf STDERR "column 2: common name for species, columns separated by tab\n";
   exit 255;
 }
 
 my $home = $ENV{'HOME'};
 my $toolsDir = "$home/kent/src/hg/makeDb/doc/asmHubs";
+my $sciNameOverrideFile = "$toolsDir/sciNameOverride.txt";
+my %sciNameOverride;	# key is accession, value is corrected scientific name
+my %taxIdOverride;	# key is accession, value is corrected taxId
+			# keys for both of those can also be the asmId
+
+# optional tab-separated file: accession, asmId, scientific name, taxId
+if ( -s "${sciNameOverrideFile}" ) {
+  open (my $sn, "<", "${sciNameOverrideFile}") or die "can not read ${sciNameOverrideFile}: $!";
+  while (my $line = <$sn>) {
+    next if ($line =~ m/^#/ || length($line) < 2);	# comment or empty line
+    chomp $line;
+    my ($accO, $asmIdO, $sciNameO, $taxIdO) = split('\t', $line);
+    next if (!defined($sciNameO));	# short line: avoid undef keys/values
+    $sciNameOverride{$accO} = $sciNameOverride{$asmIdO} = $sciNameO;
+    # taxId column may be missing (undef); lookups are guarded by defined()
+    $taxIdOverride{$accO} = $taxIdOverride{$asmIdO} = $taxIdO;
+  }
+  close ($sn);
+}
 
 my $Name = shift;
 my $asmHubName = shift;
 my $inputList = shift;
 my $orderList = $inputList;
 if ( ! -s "$orderList" ) {
-  $orderList = $toolsDir/$inputList;
+  $orderList = "$toolsDir/$inputList";	# quoted path; a bare '/' is numeric division
 }
 
 my @orderList;	# asmId of the assemblies in order from the orderList file
 my %commonName;	# key is asmId, value is a common name, perhaps more appropriate
                 # than found in assembly_report file
 my $vgpIndex = 0;
 $vgpIndex = 1 if ($Name =~ m/vgp/i);
 my $hprcIndex = 0;
@@ -341,30 +360,31 @@
        printf STDERR "twoBitToFa $twoBit stdout | faSize stdin > $faSizeTxt\n";
        print `twoBitToFa $twoBit stdout | faSize stdin > $faSizeTxt`;
     }
     my ($gapSize, $maskPerCent) = maskStats($faSizeTxt);
     $overallGapSize += $gapSize;
     my ($seqCount, $totalSize) = asmCounts($chromSizes);
     $overallSeqCount += $seqCount;
     $overallNucleotides += $totalSize;
     my $gapCount = gapStats($buildDir, $asmId);
     $overallGapCount += $gapCount;
     my $sciName = "notFound";
     my $commonName = "notFound";
     my $bioSample = "notFound";
     my $bioProject = "notFound";
     my $taxId = "notFound";
+    $taxId = $taxIdOverride{$accessionId} if (defined($taxIdOverride{$accessionId}));
     my $asmDate = "notFound";
     my $itemsFound = 0;
     open (FH, "<$asmReport") or die "can not read $asmReport";
     while (my $line = <FH>) {
       last if ($itemsFound > 5);
       chomp $line;
       $line =~ s/
//g;;
       $line =~ s/\s+$//g;;
       if ($line =~ m/Date:/) {
         if ($asmDate =~ m/notFound/) {
            ++$itemsFound;
            $asmDate = $line;
            $asmDate =~ s/.*:\s+//;
         }
       } elsif ($line =~ m/BioSample:/) {
@@ -377,30 +397,31 @@
         if ($bioProject =~ m/notFound/) {
            ++$itemsFound;
            $bioProject = $line;
            $bioProject =~ s/.*:\s+//;
         }
       } elsif ($line =~ m/Organism name:/) {
         if ($sciName =~ m/notFound/) {
            ++$itemsFound;
            $commonName = $line;
            $sciName = $line;
            $commonName =~ s/.*\(//;
            $commonName =~ s/\)//;
            $commonName = $commonName{$asmId} if (exists($commonName{$asmId}));
            $sciName =~ s/.*:\s+//;
            $sciName =~ s/\s+\(.*//;
+           $sciName = $sciNameOverride{$accessionId} if (defined($sciNameOverride{$accessionId}));
         }
       } elsif ($line =~ m/Taxid:/) {
         if ($taxId =~ m/notFound/) {
            ++$itemsFound;
            $taxId = $line;
            $taxId =~ s/.*:\s+//;
         }
       }
     }
     close (FH);
     my $hubUrl = "https://hgdownload.soe.ucsc.edu/hubs/$accessionDir/$accessionId";
     my $browserName = $commonName;
     my $browserUrl = "https://genome.ucsc.edu/h/$accessionId";
     if ($asmId !~ m/^GC/) {
        $hubUrl = "https://hgdownload.soe.ucsc.edu/goldenPath/$asmId/bigZips";