cfd801899529a9a9b0db3cc408a8f5e3c6ecb670 hiram Thu May 15 13:24:49 2025 -0700 implement a scientific name override system to fixup names for F1 hybrid diploid assemblies refs #34917 diff --git src/hg/makeDb/doc/asmHubs/mkHubIndex.pl src/hg/makeDb/doc/asmHubs/mkHubIndex.pl index c7aa2647bfb..f9b6b2feab6 100755 --- src/hg/makeDb/doc/asmHubs/mkHubIndex.pl +++ src/hg/makeDb/doc/asmHubs/mkHubIndex.pl @@ -11,30 +11,49 @@ use commonHtml; my $argc = scalar(@ARGV); if ($argc != 4) { printf STDERR "mkHubIndex.pl Name asmName defaultAsmId [two column name list] > index.html\n"; printf STDERR "e.g.: mkHubIndex Primates primates GCF_000001405.39_GRCh38.p13 primates.commonName.asmId.orderList.tsv\n"; printf STDERR "the name list is found in \$HOME/kent/src/hg/makeDb/doc/asmHubs/\n"; printf STDERR "\nthe two columns are 1: asmId (accessionId_assemblyName)\n"; printf STDERR "column 2: common name for species, columns separated by tab\n"; printf STDERR "The result prints to stdout the index.html page for this set of assemblies\n"; exit 255; } my $home = $ENV{'HOME'}; my $toolsDir = "$home/kent/src/hg/makeDb/doc/asmHubs"; +my $sciNameOverrideFile = "$toolsDir/sciNameOverride.txt"; +my %sciNameOverride; # key is accession, value is corrected scientific name +my %taxIdOverride; # key is accession, value is corrected taxId + # keys for both of those can also be the asmId + +if ( -s "${sciNameOverrideFile}" ) { + open (my $sn, "<", "${sciNameOverrideFile}") or die "can not read ${sciNameOverrideFile}"; + while (my $line = <$sn>) { + next if ($line =~ m/^#/); + next if (length($line) < 2); + chomp $line; + my ($accO, $asmIdO, $sciNameO, $taxIdO) = split('\t', $line); + $sciNameOverride{$accO} = $sciNameO; + $sciNameOverride{$asmIdO} = $sciNameO; + $taxIdOverride{$accO} = $taxIdO; + $taxIdOverride{$asmIdO} = $taxIdO; + } + close ($sn); +} my $Name = shift; my $asmHubName = shift; my $defaultAssembly = shift; my $inputList = shift; my $orderList = $inputList; if ( ! 
-s "$orderList" ) { $orderList = $toolsDir/$inputList; } my %cladeId; # value is asmId, value is clade, useful for 'legacy' index page printf STDERR "# mkHubIndex %s %s %s %s\n", $Name, $asmHubName, $defaultAssembly, $orderList; my $hprcIndex = 0; my $ccgpIndex = 0; my $vgpIndex = 0; @@ -341,30 +360,31 @@ my $asmReport="$buildDir/download/${asmId}_assembly_report.txt"; if ($asmId =~ m/^GCA/) { $buildDir = "/hive/data/genomes/asmHubs/genbankBuild/$accessionDir/$asmId"; $asmReport="$buildDir/download/${asmId}_assembly_report.txt"; } elsif ($asmId !~ m/^GC/) { $buildDir="/hive/data/outside/ncbi/genomes/$accessionDir/${accessionId}_${asmName}"; $asmReport="$buildDir/${accessionId}_${asmName}_assembly_report.txt"; } my $trackDb="$buildDir/${asmId}.trackDb.txt"; # next if (! -s "$trackDb"); # assembly build not complete my $commonName = "notFound(${asmId})"; my $sciName = "notFound"; my $bioSample = "notFound"; my $bioProject = "notFound"; my $taxId = "notFound"; + $taxId = $taxIdOverride{$accessionId} if (defined($taxIdOverride{$accessionId})); my $asmDate = "notFound"; my $itemsFound = 0; if ( -s "${asmReport}" ) { open (FH, "<$asmReport") or die "can not read $asmReport"; while (my $line = <FH>) { last if ($itemsFound > 5); chomp $line; $line =~ s/ //g;; $line =~ s/\s+$//g;; if ($line =~ m/Date:/) { if ($asmDate =~ m/notFound/) { ++$itemsFound; $line =~ s/.*:\s+//; my @a = split('-', $line); $asmDate = sprintf("%04d-%02d-%02d", $a[0], $a[1], $a[2]); @@ -379,54 +399,63 @@ if ($bioProject =~ m/notFound/) { ++$itemsFound; $bioProject = $line; $bioProject =~ s/.*:\s+//; } } elsif ($line =~ m/Organism name:/) { if ($sciName =~ m/notFound/) { ++$itemsFound; $commonName = $line; $sciName = $line; $commonName =~ s/.*\(//; $commonName =~ s/\)//; $commonName = $commonName{$asmId} if (exists($commonName{$asmId})); $sciName =~ s/.*:\s+//; $sciName =~ s/\s+\(.*//; + $sciName = $sciNameOverride{$accessionId} if (defined($sciNameOverride{$accessionId})); } } elsif ($line =~ 
m/Taxid:/) { if ($taxId =~ m/notFound/) { ++$itemsFound; $taxId = $line; $taxId =~ s/.*:\s+//; } } } close (FH); } elsif ( -s "${configRa}" ) { # if ( -s "${asmReport}" ) # ncbiAssemblyName Sscrofa10.2 # genBankAccessionID GCA_000003025.4 # ncbiBioProject 13421 # assemblyDate Aug. 2011 $asmName = `grep ^ncbiAssemblyName "${configRa}" | cut -d' ' -f2`; chomp $asmName; - $taxId = `grep ^taxId "${configRa}" | cut -d' ' -f2`; - chomp $taxId; $commonName = `grep ^commonName "${configRa}" | cut -d' ' -f2-`; chomp $commonName; + if (defined($taxIdOverride{$accessionId})) { + $taxId = $taxIdOverride{$accessionId} + } else { + $taxId = `grep ^taxId "${configRa}" | cut -d' ' -f2`; + chomp $taxId; + } + if (defined($sciNameOverride{$accessionId})) { + $sciName = $sciNameOverride{$accessionId} + } else { $sciName = `grep ^scientificName "${configRa}" | cut -d' ' -f2-`; chomp $sciName; + } $asmDate = `grep ^assemblyDate "${configRa}" | cut -d' ' -f2-`; chomp $asmDate; $bioProject = `grep ^ncbiBioProject "${configRa}" | cut -d' ' -f2-`; chomp $bioProject; $bioSample = `grep ^ncbiBioSample "${configRa}" | cut -d' ' -f2-`; chomp $bioSample; $ncbiFtpLink = "https://ftp.ncbi.nlm.nih.gov/genomes/all/$accessionDir/${accessionId}_${asmName}"; } my $hubUrl = "https://hgdownload.soe.ucsc.edu/hubs/$accessionDir/$accessionId"; my $gbdbUrl = "/gbdb/genark/$accessionDir/$accessionId"; my $browserName = $commonName; my $browserUrl = "https://genome.ucsc.edu/cgi-bin/hgTracks?genome=$accessionId&hubUrl=$gbdbUrl/hub.txt"; if ($asmId !~ m/^GC/) { $hubUrl = "https://hgdownload.soe.ucsc.edu/goldenPath/$asmId/bigZips"; $browserUrl = "https://genome.ucsc.edu/cgi-bin/hgTracks?db=$asmId";