cfd801899529a9a9b0db3cc408a8f5e3c6ecb670 hiram Thu May 15 13:24:49 2025 -0700 implement a scientific name override system to fixup names for F1 hybrid diploid assemblies refs #34917 diff --git src/hg/makeDb/doc/asmHubs/mkAsmStats.pl src/hg/makeDb/doc/asmHubs/mkAsmStats.pl index 5f4e598d77b..510ad1cd6d7 100755 --- src/hg/makeDb/doc/asmHubs/mkAsmStats.pl +++ src/hg/makeDb/doc/asmHubs/mkAsmStats.pl @@ -8,30 +8,49 @@ use commonHtml; use File::stat; my $argc = scalar(@ARGV); if ($argc != 3) { printf STDERR "mkAsmStats Name asmHubName [two column name list]\n"; printf STDERR "e.g.: mkAsmStats Mammals mammals mammals.asmId.commonName.tsv\n"; printf STDERR "the name list is found in \$HOME/kent/src/hg/makeDb/doc/asmHubs/\n"; printf STDERR "\nthe two columns are 1: asmId (accessionId_assemblyName)\n"; printf STDERR "column 2: common name for species, columns separated by tab\n"; exit 255; } my $home = $ENV{'HOME'}; my $toolsDir = "$home/kent/src/hg/makeDb/doc/asmHubs"; +my $sciNameOverrideFile = "$toolsDir/sciNameOverride.txt"; +my %sciNameOverride; # key is accession, value is corrected scientific name +my %taxIdOverride; # key is accession, value is corrected taxId + # keys for both of those can also be the asmId + +if ( -s "${sciNameOverrideFile}" ) { + open (my $sn, "<", "${sciNameOverrideFile}") or die "can not read ${sciNameOverrideFile}"; + while (my $line = <$sn>) { + next if ($line =~ m/^#/); + next if (length($line) < 2); + chomp $line; + my ($accO, $asmIdO, $sciNameO, $taxIdO) = split('\t', $line); + $sciNameOverride{$accO} = $sciNameO; + $sciNameOverride{$asmIdO} = $sciNameO; + $taxIdOverride{$accO} = $taxIdO; + $taxIdOverride{$asmIdO} = $taxIdO; + } + close ($sn); +} my $Name = shift; my $asmHubName = shift; my $inputList = shift; my $orderList = $inputList; if ( ! 
-s "$orderList" ) { $orderList = $toolsDir/$inputList; } my @orderList; # asmId of the assemblies in order from the orderList file my %commonName; # key is asmId, value is a common name, perhaps more appropriate # than found in assembly_report file my $vgpIndex = 0; $vgpIndex = 1 if ($Name =~ m/vgp/i); my $hprcIndex = 0; @@ -341,30 +360,31 @@ printf STDERR "twoBitToFa $twoBit stdout | faSize stdin > $faSizeTxt\n"; print `twoBitToFa $twoBit stdout | faSize stdin > $faSizeTxt`; } my ($gapSize, $maskPerCent) = maskStats($faSizeTxt); $overallGapSize += $gapSize; my ($seqCount, $totalSize) = asmCounts($chromSizes); $overallSeqCount += $seqCount; $overallNucleotides += $totalSize; my $gapCount = gapStats($buildDir, $asmId); $overallGapCount += $gapCount; my $sciName = "notFound"; my $commonName = "notFound"; my $bioSample = "notFound"; my $bioProject = "notFound"; my $taxId = "notFound"; + $taxId = $taxIdOverride{$accessionId} if (defined($taxIdOverride{$accessionId})); my $asmDate = "notFound"; my $itemsFound = 0; open (FH, "<$asmReport") or die "can not read $asmReport"; while (my $line = <FH>) { last if ($itemsFound > 5); chomp $line; $line =~ s/ //g;; $line =~ s/\s+$//g;; if ($line =~ m/Date:/) { if ($asmDate =~ m/notFound/) { ++$itemsFound; $asmDate = $line; $asmDate =~ s/.*:\s+//; } } elsif ($line =~ m/BioSample:/) { @@ -377,30 +397,31 @@ if ($bioProject =~ m/notFound/) { ++$itemsFound; $bioProject = $line; $bioProject =~ s/.*:\s+//; } } elsif ($line =~ m/Organism name:/) { if ($sciName =~ m/notFound/) { ++$itemsFound; $commonName = $line; $sciName = $line; $commonName =~ s/.*\(//; $commonName =~ s/\)//; $commonName = $commonName{$asmId} if (exists($commonName{$asmId})); $sciName =~ s/.*:\s+//; $sciName =~ s/\s+\(.*//; + $sciName = $sciNameOverride{$accessionId} if (defined($sciNameOverride{$accessionId})); } } elsif ($line =~ m/Taxid:/) { if ($taxId =~ m/notFound/) { ++$itemsFound; $taxId = $line; $taxId =~ s/.*:\s+//; } } } close (FH); my $hubUrl = 
"https://hgdownload.soe.ucsc.edu/hubs/$accessionDir/$accessionId"; my $browserName = $commonName; my $browserUrl = "https://genome.ucsc.edu/h/$accessionId"; if ($asmId !~ m/^GC/) { $hubUrl = "https://hgdownload.soe.ucsc.edu/goldenPath/$asmId/bigZips";