d4fdeadb1c734b4bf4d9885f462fa93c79df55b7 hiram Mon Feb 21 16:02:14 2022 -0800 add tooltip doc on column headers refs #28930 diff --git src/hg/gar/garTable.pl src/hg/gar/garTable.pl index 1db498f..c5f7856 100755 --- src/hg/gar/garTable.pl +++ src/hg/gar/garTable.pl @@ -71,53 +71,57 @@ return sprintf("%d", $n) if ($n < 1000); my $m = $n/1000; return sprintf("%.2fK", $m) if ($m < 1000); $m = $n/1000000; return sprintf("%.2fM", $m) if ($m < 1000); $m = $n/1000000000; return sprintf("%.3fG", $m); } ############################################################################### ############################################################################### # output a table cell for an N50 measurement sub n50Cell($$$) { my ($size, $count, $fh) = @_; if ($size > 0) { - printf "<td style='display:none; text-align:right;' sorttable_customkey='%d'>%s (%s)</td>", $size, gmk($size), $count; + printf "<td style='display:none; text-align:right;' sorttable_customkey='%d'>%s (%s)</td>", $size, gmk($size), commify($count); printf $fh "\t%d (%d)", $size, $count; # output to clade.tableData.txt } else { printf "<td style='display:none;'> </td>"; printf $fh "\tn/a (n/a)"; # output to clade.tableData.txt } } ############################################################################### my @clades = qw( primates mammals birds fish vertebrate invertebrates plants fungi ); # my @clades = qw( primates mammals birds ); # to help weed out some of the noise # key is clade, value is minimal size to count as a whole genome +# these are actually pretty low to allow in some alternate haplotype +# assemblies that don't seem to be the whole assembly. +# The assemblies are also filtered by NCBI status 'full/partial' to only +# allow in the 'full' genomes meaning representation of the whole genome my %minimalGenomeSize = ( primates => 1000000000, - mammals => 200000000, + mammals => 20000000, birds => 200000000, - fish => 1000000, - vertebrate => 400000000, - invertebrates => 10000000, - plants => 10000000, - fungi => 1000000, + fish => 100000, + vertebrate => 4000000, + invertebrates => 10000, + plants => 100000, + fungi => 50000, ); ######################################################################### ## read in list of current GenArk assemblies my %genArkAsm; # key is asmId, value is string with: # accession<tab>assembly<tab>scientific name<tab>common name<tab>taxonId my $genArkCount = 0; printf STDERR "# reading UCSC_GI.assemblyHubList.txt\n"; open (FH, "<UCSC_GI.assemblyHubList.txt") or die "can not read UCSC_GI.assemblyHubList.txt"; while (my $line = <FH>) { next if ($line =~ m/^#/); chomp $line; @@ -548,43 +552,39 @@ $asmId =~ s/\//_/g; $asmId =~ s/\#/_/g; # $asmId =~ s/[.:%+/#]/_/g; $asmId =~ s/[()]//g; $asmId =~ s/__/_/g; ++$shouldBeGenArk if (defined($genArkAsm{$asmId})); ++$shouldBeUcsc if (defined($rrGcaGcfList{$asmId})); next if (defined ($skipPartialGenome{$asmId})); next if (defined ($asmSuppressed{$asmId})); next if (defined ($alreadyDone{$asmId})); # something wrong with these two # GCA_900609255.1_Draft_mitochondrial_genome_of_wild_rice_W1683 # GCA_900609265.1_Draft_mitochondrial_genome_of_wild_rice_W1679 next if ($asmId =~ m/GCA_900609255.1|GCA_900609265.1/); # verify this asmId will pass the asmSize limit -### if (defined($metaInfo{$asmId})) { -### my ($asmSize, $asmContigCount, $n50Size, $n50Count, $n50ContigSize, $n50ContigCount, $n50ScaffoldSize, $n50ScaffoldCount) = split('\t', $metaInfo{$asmId}); -### next if ($asmSize < $minimalGenomeSize{$clade}); # too small -### } if (defined($metaInfo{$asmId})) { my ($asmSize, $asmContigCount, $n50Size, $n50Count, $n50ContigSize, $n50ContigCount, $n50ScaffoldSize, $n50ScaffoldCount) = split('\t', $metaInfo{$asmId}); # if asmSize is below the minimum, don't use it if ($asmSize < $minimalGenomeSize{$clade}) { printf STDERR "# %s underSized 0 %d %s %s < %s\n", $clade, ++$underSized{$clade}, $asmId, commify($asmSize), commify($minimalGenomeSize{$clade}); printf STDERR "# ACK would be genArk assembly %s\n", $asmId if (defined($genArkAsm{$asmId})); printf STDERR "# ACK would be UCSC RR %s\n", $asmId if (defined($rrGcaGcfList{$asmId})); printf STDERR "# ACK metaInfo: %s '%s'\n", $asmId, $metaInfo{$asmId}; -### XXX next; + next; } } $alreadyDone{$asmId} = 1; if (defined($genArkClade{$asmId})) { die "ERROR: duplicate asmId today $asmId '$clade' '$genArkClade{$asmId}'"; } ++$checkedAsmIds; my $iucnSciName = ""; if (defined($sciNames{$asmId})) { ++$ncbiSpeciesRecorded{$sciNames{$asmId}}; next if ($ncbiSpeciesRecorded{$sciNames{$asmId}} > $sciNameDisplayLimit); $iucnSciName = $sciNames{$asmId}; $iucnSciName = $ncbiToIucnNames{$sciNames{$asmId}} if (defined($ncbiToIucnNames{$sciNames{$asmId}})); ++$iucnSpeciesRecorded{$iucnSciName}; } else { @@ -611,31 +611,31 @@ } if ($assembliesAvailable > 1) { my $bPtr = $sciNameAsmList{$sciNames{$asmId}}; foreach my $aId (@$bPtr) { next if (defined ($alreadyDone{$aId})); $alreadyDone{$aId} = 1; if ($aId ne $asmId) { if (defined($metaInfo{$aId})) { my ($asmSize, $asmContigCount, $n50Size, $n50Count, $n50ContigSize, $n50ContigCount, $n50ScaffoldSize, $n50ScaffoldCount) = split('\t', $metaInfo{$aId}); # if asmSize is below the minimum, don't use it if ($asmSize < $minimalGenomeSize{$clade}) { printf STDERR "# %s underSized 1 %d %s %s < %s\n", $clade, ++$underSized{$clade}, $aId, commify($asmSize), commify($minimalGenomeSize{$clade}); printf STDERR "# ACK would be genArk assembly %s\n", $aId if (defined($genArkAsm{$aId})); printf STDERR "# ACK would be UCSC RR %s\n", $aId if (defined($rrGcaGcfList{$aId})); printf STDERR "# ACK metaInfo: %s '%s'\n", $aId, $metaInfo{$aId}; -### XXX next; + next; } } ++$ncbiSpeciesRecorded{$sciNames{$aId}}; # the defined($sciName{$aId}) indicates it is a GenArk genome # always accept those even if it goes beyond the limit if ( ($ncbiSpeciesRecorded{$sciNames{$aId}} <= $sciNameDisplayLimit) || defined($sciName{$aId}) ) { push (@$cPtr, $aId); ++$acceptedAsmIds; ++$goodToGoCount; ++$cladeCounts{$clade}; } # under limit count or is GenArk assembly } # if ($aId ne $asmId) } # foreach my $aId (@$bPtr) } # if ($assembliesAvailable > 1) } # if (defined($sciNames{$asmId})) @@ -699,49 +699,30 @@ printf "<td><a href='#pageTop'>top of page</a></td>\n"; printf "</table>\n"; } } # if ( 1 == 0 ) # count all assemblies in all clades my $totalAssemblies = 0; foreach my $c (@clades) { $totalAssemblies += $cladeCounts{$c}; } printf "<!-- to get the pull-down menu items centered together -->\n"; printf "<div style='text-align: center;'><!-- this will cause the next div to center -->\n"; printf " <div style='display: inline-block'>\n"; - -printf "<div class='legendMenu'>\n"; -printf " <span id='legendAnchor'>show legend for column definitions</span>\n"; -printf " <div class='legendContent'>\n"; -printf "<ul>\n"; -printf "<li>Click on the column headers to sort the table by that column</li>\n"; -printf "<li>The <em>common name</em> column will either open a genome browser on that assembly when it exists, or it will submit a request to have this assembly added to the GenArk collection when it has not yet been built.</li>\n"; -# printf "<li>Use the <em>select columns of information to display</em> menu to show more or less information in the tables.</li>\n"; -# printf "<li>Use the <em>select assembly type display</em> menu to refine the types of assemblies to show in the tables.</li>\n"; -# printf "<li>Use the <em>show/hide species clade sets</em> menu to refine the species clades to display.</li>\n"; -printf "<li>Use the <em>hide all</em> selection to hide everything, or restore all items back when selections have been partially hidden.</li>\n"; -printf "<li>The <em>scientific name</em> link opens google image search for that species. A name followed by a number in (parens) indicates number of assemblies available for this species when there are too many to include here.</li>\n"; -printf "<li>IUCN status column is the status from the <a href='https://www.iucnredlist.org/' target=_blank>IUCN (2021) Red List</a> of Threatened species, Version 2021-3, accessed on 17 Dec 2021 <span style='color:%s;'>CR - Critical</span> / <span style='color:%s;'>EN - Endangered</span> / <span style='color:%s;'>VU - Vulnerable</span> The link goes to the IUCN web site for more information for that species.</li>\n", $criticalColor, $endangeredColor, $vulnerableColor; -printf "<li>The <em>NCBI taxID</em> links to the <a href='https://www.ncbi.nlm.nih.gov/taxonomy' target='_blank'>NCBI taxonomy</a> database.</li>\n"; -printf "</ul>\n"; -printf " </div>\n"; -printf "</div>\n"; - - printf "<div class='pullDownMenu'>\n"; printf " <span id='speciesSelectAnchor'>choose clades to view/hide</span>\n"; printf " <div class='pullDownMenuContent'>\n"; printf " <ul id='checkBoxSpeciesSelect'>\n"; printf " <li><label><input class='hideAll' type='checkbox' onchange='gar.visCheckBox(this)' id='allCheckBox' value='all' checked><span class='hideAllLabel'> hide all</span></label></li>\n"; printf " <li><label><input class='hideShow' type='checkbox' onchange='gar.visCheckBox(this)' id='primatesCheckBox' value='primates' checked><span id='primatesLabel'> primates</span></label></li>\n"; printf " <li><label><input class='hideShow' type='checkbox' onchange='gar.visCheckBox(this)' id='mammalsCheckBox' value='mammals' checked><span id='mammalsLabel'> mammals</span></label></li>\n"; printf " <li><label><input class='hideShow' type='checkbox' onchange='gar.visCheckBox(this)' id='birdsCheckBox' value='birds' checked><span id='birdsLabel'> birds</span></label></li>\n"; printf " <li><label><input class='hideShow' type='checkbox' onchange='gar.visCheckBox(this)' id='fishCheckBox' value='fish' checked><span id='fishLabel'> fish</span></label></li>\n"; printf " <li><label><input class='hideShow' type='checkbox' onchange='gar.visCheckBox(this)' id='vertebrateCheckBox' value='vertebrate' checked><span id='vertebrateLabel'> vertebrate</span></label></li>\n"; printf " <li><label><input class='hideShow' type='checkbox' onchange='gar.visCheckBox(this)' id='invertebratesCheckBox' value='invertebrates' checked><span id='invertebratesLabel'> invertebrates</span></label></li>\n"; printf " <li><label><input class='hideShow' type='checkbox' onchange='gar.visCheckBox(this)' id='plantsCheckBox' value='plants' checked><span id='plantsLabel'> plants<span></label></li>\n"; printf " <li><label><input class='hideShow' type='checkbox' onchange='gar.visCheckBox(this)' id='fungiCheckBox' value='fungi' checked><span id='fungiLabel'> fungi</span></label></li>\n"; printf " </ul>\n"; printf " </div>\n"; @@ -805,42 +786,42 @@ printf "<col id='sciName' span='1' class=colGSciName>\n"; printf "<col id='asmId' span='1' class=colGAsmId>\n"; printf "<col id='asmSize' span='1' class=colGAsmSize>\n"; printf "<col id='seqCount' span='1' class=colGAsmSeqCount>\n"; printf "<col id='scafN50' span='1' class=colGScafN50>\n"; printf "<col id='ctgN50' span='1' class=colGContigN50>\n"; printf "<col id='IUCN' span='1' class=colGIUCN>\n"; printf "<col id='taxId' span='1' class=colGTaxId>\n"; printf "<col id='asmDate' span='1' class=colGAsmDate>\n"; printf "<col id='submitter' span='1' class=colGSubmitter>\n"; printf "<col id='clade' span='1' class=colGClade>\n"; printf "</colgroup>\n"; printf "<thead>\n"; printf "<tr>\n"; -printf " <th class='colComName'>common name<br>display in browser<br>or request assembly</th>\n"; -printf " <th class='colSciName'>scientific name (count)<br>google image search</th>\n"; -printf " <th class='colAsmId'>NCBI assembly</th>\n"; -printf " <th class='colAsmSize'>assembly<br>size</th>\n"; -printf " <th class='colAsmSeqCount'>sequence<br>count</th>\n"; -printf " <th class='colScafN50'>scaffold N50<br>size (L50)</th>\n"; -printf " <th class='colContigN50'>contig N50<br>size (L50)</th>\n"; -printf " <th class='colIUCN'>I<br>U<br>C<br>N</th>\n"; -printf " <th class='colTaxId'>NCBI taxID</th>\n"; -printf " <th class='colAsmDate'>assembly<br>date</th>\n"; -printf " <th class='colSubmitter'>submitter of assembly<br>link to NCBI assembly</th>\n"; -printf " <th class='colClade'>clade</th>\n"; +printf " <th class='colComName'><div class='tooltip'>common name<span class='tooltiptext'>The <em>common name</em> links to a genome browser when it exists, or provides a request form to ask for the assembly to be added to the system</span></div></th>\n"; +printf " <th class='colSciName'><div class='tooltip'>scientific name (count)<span class='tooltiptext'>The numbers in parenthesis (1234) following the name indicates the number of assemblies available for this species. The link does a google image search for that name.</span></div></th>\n"; +printf " <th class='colAsmId'><div class='tooltip'>NCBI assembly<span class='tooltiptext'>The <em>NCBI assembly</em> provides a link to the NCBI resource record for this assembly</span></div></th>\n"; +printf " <th class='colAsmSize'><div class='tooltip'>assembly<br>size<span class='tooltiptext'>The <em>assembly size</em> is the total number of nucleotides in the assembly.</span></div></th>\n"; +printf " <th class='colAsmSeqCount'><div class='tooltip'>sequence<br>count<span class='tooltiptext'>The <em>sequence count</em> is the number of sequences in this assembly.</span></div></th>\n"; +printf " <th class='colScafN50'><div class='tooltip'>scaffold N50<br>size (L50)<span class='tooltiptext'>The <em>scaffold N50</em> is the <a href='https://en.wikipedia.org/wiki/N50,_L50,_and_related_statistics' target=_blank>N50 measurement</a> for this assembly.</span></div> </th>\n"; +printf " <th class='colContigN50'><div class='tooltip'>contig N50<br>size (L50)<span class='tooltiptext'>The <em>contig N50</em> is the <a href='https://en.wikipedia.org/wiki/N50,_L50,_and_related_statistics' target=_blank>N50 measurement</a> for this assembly when there are contigs to measure.</span></div></th>\n"; +printf " <th class='colIUCN'><div class='tooltip'>I<br>U<br>C<br>N<span class='tooltiptext'>IUCN status column is the status from the <a href='https://www.iucnredlist.org/' target=_blank>IUCN (2021) Red List</a> of Threatened species, Version 2021-3, accessed on 17 Dec 2021 <span style='color:%s;'>CR - Critical</span> / <span style='color:%s;'>EN - Endangered</span> / <span style='color:%s;'>VU - Vulnerable</span> The link goes to the IUCN web site for more information for that species.</span></div></th>\n", $statusColors{"CR"}, $statusColors{"EN"}, $statusColors{"VU"}; +printf " <th class='colTaxId'><div class='tooltip'>NCBI taxID<span class='tooltiptext'>The <em>taxID</em> links to the <a href='https://www.ncbi.nlm.nih.gov/taxonomy' target='_blank'>NCBI taxonomy</a> database.</span></div></th>\n"; +printf " <th class='colAsmDate'><div class='tooltip'>assembly<br>date<span class='tooltiptext'>The submission date for this assembly to the <a href='https://www.ncbi.nlm.nih.gov/assembly' target=_blank>NCBI assembly</a> system.</span></div></th>\n"; +printf " <th class='colSubmitter sorttable_alpha'><div class='tooltip'>submitter of assembly<span class='tooltiptextright'>The name of the organization that produced this assembly.</span></div></th>\n"; +printf " <th class='colClade'><div class='tooltip'>clade<span class='tooltiptextright'>Indicating the <em>clade</em> of this organism. Note: the <em>invertebrate</em> clade is a catch all category that includes organisims not typically classified as <em>invertebrate</em></span></div></th>\n"; printf "</tr>\n"; printf "</thead><tbody>\n"; my %equivalentNamesUsed; # key is NCBI sciName, value is IUCN sciName my $pageSectionCount = 0; my %checkDupAsmId; # key is asmId, value is count of times seen my %cladeSciNameCounts; # key is clade, value is number of different # scientific names my %gcfGcaCounts; # key is GCF or GCA, value is count of each my $asmCountInTable = 0; # counting the rows output my %statusCounts; # key is status: CR EN VU, value is count my $totalAssemblySize = 0; # sum of all assembly sizes @@ -961,31 +942,31 @@ ($n50ScaffoldSize, $n50ScaffoldCount) = readN50($scaffoldN50); } } else { printf STDERR "# chromInfo missing: %s\n", $asmId; printf STDERR "# %s\n", $chromInfo; } die "ERROR duplicate newMetaInfo{$asmId}" if (defined($newMetaInfo{$asmId})); die "ERROR duplicate metaInfo{$asmId} for newMetaInfo{$asmId}" if (defined($metaInfo{$asmId})); $newMetaInfo{$asmId} = join("\t", ($asmSize, $asmContigCount, $n50Size, $n50Count, $n50ContigSize, $n50ContigCount, $n50ScaffoldSize, $n50ScaffoldCount) ); } # else if (defined($metaInfo{$asmId})) # if asmSize is below the minimum, don't use it if ($asmSize < $minimalGenomeSize{$clade}) { printf STDERR "# %s underSized 2 %d %s %s < %s\n", $clade, ++$underSized, $asmId, commify($asmSize), commify($minimalGenomeSize{$clade}); printf STDERR "# ACK would be genArk assembly %s\n", $asmId if (defined($genArkAsm{$asmId})); printf STDERR "# ACK would be UCSC RR %s\n", $asmId if (defined($rrGcaGcfList{$asmId})); -### XXX next; + next; } my $iucnStatus = " "; my $iucnLink = ""; if (defined($sciNames{$asmId})) { my $iucnSciName = $sciNames{$asmId}; $iucnSciName = $ncbiToIucnNames{$sciNames{$asmId}} if (defined($ncbiToIucnNames{$sciNames{$asmId}})); $iucnLink = "https://www.iucnredlist.org/species/$iucnLink{$iucnSciName}" if (defined($iucnLink{$iucnSciName})); if ($iucnSciName ne $sciNames{$asmId}) { $equivalentNamesUsed{$sciNames{$asmId}} = $iucnSciName; } if (defined($iucnSciNames{$iucnSciName})) { $iucnStatus = $iucnSciNames{$iucnSciName}; } } ++$asmCountInTable; @@ -1139,34 +1120,36 @@ printf PC "\t%s", "n/a"; # output to clade.tableData.txt } ############# tenth column, assembly date ################ if (defined($asmDate{$asmId})) { printf "<td style='display:none;'>%s</td>", $asmDate{$asmId}; printf PC "\t%s", $asmDate{$asmId}; # output to clade.tableData.txt } else { printf "<td style='display:none;'>n/a</td>"; printf PC "\t%s", "n/a"; # output to clade.tableData.txt } ############# eleventh column, submitter ################ $asmUrl = "https://www.ncbi.nlm.nih.gov/assembly/$accessionId"; if (defined($asmSubmitter{$asmId})) { - printf "<td style='display:none;'><a href='%s' target=_blank>%s</a></td>", $asmUrl, $asmSubmitter{$asmId}; + my $submitterSortKey = lc($asmSubmitter{$asmId}); + $submitterSortKey =~ s/ //g; + printf "<td sorttable_customkey='%s' style='display:none;'>%s</td>", substr($submitterSortKey,0,20), $asmSubmitter{$asmId}; printf PC "\t%s", $asmSubmitter{$asmId}; # output to clade.tableData.txt } else { - printf "<td style='display:none;'>n/a</td>"; + printf "<td sorttable_customkey='n/a' style='display:none;'>n/a</td>"; printf PC "\t%s", "n/a"; # output to clade.tableData.txt } ############# twelveth column, clade ################ printf "<td style='display:none;'>%s</td>\n", $clade; printf PC "\t%s", $clade; printf PC "\n"; # finished a line output to clade.tableData.txt printf "</tr>\n"; } # foreach my $asmId (@$cPtr) close (PC); # finished with clade.tableData.txt output } # foreach my $clade (@clades) ########################################################################## ## single table is finished, output the end of tbody and the tfoot row