cfd801899529a9a9b0db3cc408a8f5e3c6ecb670
hiram
  Thu May 15 13:24:49 2025 -0700
implement a scientific name override system to fix up names for F1 hybrid diploid assemblies refs #34917

diff --git src/hg/makeDb/doc/asmHubs/mkHubIndex.pl src/hg/makeDb/doc/asmHubs/mkHubIndex.pl
index c7aa2647bfb..f9b6b2feab6 100755
--- src/hg/makeDb/doc/asmHubs/mkHubIndex.pl
+++ src/hg/makeDb/doc/asmHubs/mkHubIndex.pl
@@ -11,30 +11,49 @@
 use commonHtml;
 
 my $argc = scalar(@ARGV);
 if ($argc != 4) {
   printf STDERR "mkHubIndex.pl Name asmName defaultAsmId [two column name list] > index.html\n";
   printf STDERR "e.g.: mkHubIndex Primates primates GCF_000001405.39_GRCh38.p13 primates.commonName.asmId.orderList.tsv\n";
   printf STDERR "the name list is found in \$HOME/kent/src/hg/makeDb/doc/asmHubs/\n";
   printf STDERR "\nthe two columns are 1: asmId (accessionId_assemblyName)\n";
   printf STDERR "column 2: common name for species, columns separated by tab\n";
   printf STDERR "The result prints to stdout the index.html page for this set of assemblies\n";
   exit 255;
 }
 
 my $home = $ENV{'HOME'};
 my $toolsDir = "$home/kent/src/hg/makeDb/doc/asmHubs";
+my $sciNameOverrideFile = "$toolsDir/sciNameOverride.txt";
+my %sciNameOverride;	# key is accession, value is corrected scientific name
+my %taxIdOverride;	# key is accession, value is corrected taxId
+			# keys for both of those can also be the asmId
+
+if ( -s "${sciNameOverrideFile}" ) {
+  # four tab separated columns: accession, asmId, scientific name, taxId
+  open (my $sn, "<", "${sciNameOverrideFile}") or die "can not read ${sciNameOverrideFile}: $!";
+  while (my $line = <$sn>) {
+    next if ($line =~ m/^#/);	# comment line
+    $line =~ s/[\r\n]+$//;	# strip line ending, tolerate DOS files too
+    my ($accO, $asmIdO, $sciNameO, $taxIdO) = split(/\t/, $line);
+    # blank or incomplete line: skip it rather than store undef keys
+    next if (! defined($sciNameO));
+    @sciNameOverride{$accO, $asmIdO} = ($sciNameO) x 2;
+    @taxIdOverride{$accO, $asmIdO} = ($taxIdO) x 2 if (defined($taxIdO));
+  }
+  close ($sn);
+}
 
 my $Name = shift;
 my $asmHubName = shift;
 my $defaultAssembly = shift;
 my $inputList = shift;
 my $orderList = $inputList;
 if ( ! -s "$orderList" ) {
-  $orderList = $toolsDir/$inputList;
+  $orderList = "$toolsDir/$inputList";	# must be quoted: a bare / is numeric division
 }
-my %cladeId;	# value is asmId, value is clade, useful for 'legacy' index page
+my %cladeId;	# key is asmId, value is clade, useful for 'legacy' index page
 
 printf STDERR "# mkHubIndex %s %s %s %s\n", $Name, $asmHubName, $defaultAssembly, $orderList;
 my $hprcIndex = 0;
 my $ccgpIndex = 0;
 my $vgpIndex = 0;
@@ -341,30 +360,31 @@
     my $asmReport="$buildDir/download/${asmId}_assembly_report.txt";
     if ($asmId =~ m/^GCA/) {
      $buildDir = "/hive/data/genomes/asmHubs/genbankBuild/$accessionDir/$asmId";
      $asmReport="$buildDir/download/${asmId}_assembly_report.txt";
     } elsif ($asmId !~ m/^GC/) {
        $buildDir="/hive/data/outside/ncbi/genomes/$accessionDir/${accessionId}_${asmName}";
        $asmReport="$buildDir/${accessionId}_${asmName}_assembly_report.txt";
     }
     my $trackDb="$buildDir/${asmId}.trackDb.txt";
 #    next if (! -s "$trackDb");	# assembly build not complete
     my $commonName = "notFound(${asmId})";
     my $sciName = "notFound";
     my $bioSample = "notFound";
     my $bioProject = "notFound";
     my $taxId = "notFound";
+    $taxId = $taxIdOverride{$accessionId} if (defined($taxIdOverride{$accessionId}));
     my $asmDate = "notFound";
     my $itemsFound = 0;
     if ( -s "${asmReport}" ) {
         open (FH, "<$asmReport") or die "can not read $asmReport";
         while (my $line = <FH>) {
           last if ($itemsFound > 5);
           chomp $line;
           $line =~ s/
//g;;
           $line =~ s/\s+$//g;;
           if ($line =~ m/Date:/) {
             if ($asmDate =~ m/notFound/) {
                ++$itemsFound;
                $line =~ s/.*:\s+//;
                my @a = split('-', $line);
                $asmDate = sprintf("%04d-%02d-%02d", $a[0], $a[1], $a[2]);
@@ -379,54 +399,63 @@
             if ($bioProject =~ m/notFound/) {
                ++$itemsFound;
                $bioProject = $line;
                $bioProject =~ s/.*:\s+//;
             }
           } elsif ($line =~ m/Organism name:/) {
             if ($sciName =~ m/notFound/) {
                ++$itemsFound;
                $commonName = $line;
                $sciName = $line;
                $commonName =~ s/.*\(//;
                $commonName =~ s/\)//;
                $commonName = $commonName{$asmId} if (exists($commonName{$asmId}));
                $sciName =~ s/.*:\s+//;
                $sciName =~ s/\s+\(.*//;
+               $sciName = $sciNameOverride{$accessionId} if (defined($sciNameOverride{$accessionId}));
             }
           } elsif ($line =~ m/Taxid:/) {
             if ($taxId =~ m/notFound/) {
                ++$itemsFound;
                $taxId = $line;
                $taxId =~ s/.*:\s+//;
             }
           }
         }
         close (FH);
     } elsif ( -s "${configRa}" ) {	#	if ( -s "${asmReport}" )
 # ncbiAssemblyName Sscrofa10.2
 # genBankAccessionID GCA_000003025.4
 # ncbiBioProject 13421
 # assemblyDate Aug. 2011
 
        $asmName = `grep ^ncbiAssemblyName "${configRa}" | cut -d' ' -f2`;
        chomp $asmName;
-       $taxId = `grep ^taxId "${configRa}" | cut -d' ' -f2`;
-       chomp $taxId;
        $commonName = `grep ^commonName "${configRa}" | cut -d' ' -f2-`;
        chomp $commonName;
+       if (defined($taxIdOverride{$accessionId})) {
+         $taxId = $taxIdOverride{$accessionId}
+       } else {
+         $taxId = `grep ^taxId "${configRa}" | cut -d' ' -f2`;
+         chomp $taxId;
+       }
+       if (defined($sciNameOverride{$accessionId})) {
+         $sciName = $sciNameOverride{$accessionId}
+       } else {
          $sciName = `grep ^scientificName "${configRa}" | cut -d' ' -f2-`;
          chomp $sciName;
+       }
        $asmDate = `grep ^assemblyDate "${configRa}" | cut -d' ' -f2-`;
        chomp $asmDate;
        $bioProject = `grep ^ncbiBioProject "${configRa}" | cut -d' ' -f2-`;
        chomp $bioProject;
        $bioSample = `grep ^ncbiBioSample "${configRa}" | cut -d' ' -f2-`;
        chomp $bioSample;
        $ncbiFtpLink = "https://ftp.ncbi.nlm.nih.gov/genomes/all/$accessionDir/${accessionId}_${asmName}";
     }
     my $hubUrl = "https://hgdownload.soe.ucsc.edu/hubs/$accessionDir/$accessionId";
     my $gbdbUrl = "/gbdb/genark/$accessionDir/$accessionId";
     my $browserName = $commonName;
     my $browserUrl = "https://genome.ucsc.edu/cgi-bin/hgTracks?genome=$accessionId&hubUrl=$gbdbUrl/hub.txt";
     if ($asmId !~ m/^GC/) {
        $hubUrl = "https://hgdownload.soe.ucsc.edu/goldenPath/$asmId/bigZips";
        $browserUrl = "https://genome.ucsc.edu/cgi-bin/hgTracks?db=$asmId";