d4fdeadb1c734b4bf4d9885f462fa93c79df55b7
hiram
  Mon Feb 21 16:02:14 2022 -0800
add tooltip doc on column headers refs #28930

diff --git src/hg/gar/garTable.pl src/hg/gar/garTable.pl
index 1db498f..c5f7856 100755
--- src/hg/gar/garTable.pl
+++ src/hg/gar/garTable.pl
@@ -71,53 +71,57 @@
   return sprintf("%d", $n) if ($n < 1000);
   my $m = $n/1000;
   return sprintf("%.2fK", $m) if ($m < 1000);
   $m = $n/1000000;
   return sprintf("%.2fM", $m) if ($m < 1000);
   $m = $n/1000000000;
   return sprintf("%.3fG", $m);
 }
 ###############################################################################
 
 ###############################################################################
 # output a table cell for an N50 measurement
 sub n50Cell($$$) {
   my ($size, $count, $fh) = @_;
   if ($size > 0) {
-    printf "<td style='display:none; text-align:right;' sorttable_customkey='%d'>%s&nbsp;(%s)</td>", $size, gmk($size), $count;
+    printf "<td style='display:none; text-align:right;' sorttable_customkey='%d'>%s&nbsp;(%s)</td>", $size, gmk($size), commify($count);
     printf $fh "\t%d (%d)", $size, $count;	# output to clade.tableData.txt
   } else {
     printf "<td style='display:none;'>&nbsp;</td>";
     printf $fh "\tn/a (n/a)";	# output to clade.tableData.txt
   }
 }
 ###############################################################################
 
 my @clades = qw( primates mammals birds fish vertebrate invertebrates plants fungi );
 # my @clades = qw( primates mammals birds );
 
 # to help weed out some of the noise
 # key is clade, value is minimal size to count as a whole genome
+# These thresholds are deliberately low so that some alternate haplotype
+# assemblies, which do not appear to cover the whole genome, are allowed in.
+# The assemblies are also filtered by NCBI status 'full/partial' so that only
+# 'full' genomes, i.e. those representing the whole genome, are allowed in.
 my %minimalGenomeSize = (
   primates => 1000000000,
-  mammals => 200000000,
+  mammals => 20000000,
   birds => 200000000,
-  fish => 1000000,
-  vertebrate => 400000000,
-  invertebrates => 10000000,
-  plants => 10000000,
-  fungi => 1000000,
+  fish => 100000,
+  vertebrate => 4000000,
+  invertebrates => 10000,
+  plants => 100000,
+  fungi => 50000,
 );
 
 #########################################################################
 ## read in list of current GenArk assemblies
 
 my %genArkAsm;	# key is asmId, value is string with:
 
 # accession<tab>assembly<tab>scientific name<tab>common name<tab>taxonId
 
 my $genArkCount = 0;
 printf STDERR "# reading UCSC_GI.assemblyHubList.txt\n";
 open (FH, "<UCSC_GI.assemblyHubList.txt") or die "can not read UCSC_GI.assemblyHubList.txt";
 while (my $line = <FH>) {
   next if ($line =~ m/^#/);
   chomp $line;
@@ -548,43 +552,39 @@
      $asmId =~ s/\//_/g;
      $asmId =~ s/\#/_/g;
 #      $asmId =~ s/[.:%+/#]/_/g;
      $asmId =~ s/[()]//g;
      $asmId =~ s/__/_/g;
      ++$shouldBeGenArk if (defined($genArkAsm{$asmId}));
      ++$shouldBeUcsc if (defined($rrGcaGcfList{$asmId}));
      next if (defined ($skipPartialGenome{$asmId}));
      next if (defined ($asmSuppressed{$asmId}));
      next if (defined ($alreadyDone{$asmId}));
      # something wrong with these two
 # GCA_900609255.1_Draft_mitochondrial_genome_of_wild_rice_W1683
 # GCA_900609265.1_Draft_mitochondrial_genome_of_wild_rice_W1679
      next if ($asmId =~ m/GCA_900609255.1|GCA_900609265.1/);
      # verify this asmId will pass the asmSize limit
-###     if (defined($metaInfo{$asmId})) {
-###       my ($asmSize, $asmContigCount, $n50Size, $n50Count, $n50ContigSize, $n50ContigCount, $n50ScaffoldSize, $n50ScaffoldCount) = split('\t', $metaInfo{$asmId});
-###       next if ($asmSize < $minimalGenomeSize{$clade});	# too small
-###     }
      if (defined($metaInfo{$asmId})) {
       my ($asmSize, $asmContigCount, $n50Size, $n50Count, $n50ContigSize, $n50ContigCount, $n50ScaffoldSize, $n50ScaffoldCount) = split('\t', $metaInfo{$asmId});
       # if asmSize is below the minimum, don't use it
        if ($asmSize < $minimalGenomeSize{$clade}) {
          printf STDERR "# %s underSized 0 %d %s %s < %s\n", $clade, ++$underSized{$clade}, $asmId, commify($asmSize), commify($minimalGenomeSize{$clade});
      printf STDERR "# ACK would be genArk assembly %s\n", $asmId if (defined($genArkAsm{$asmId}));
      printf STDERR "# ACK would be UCSC RR %s\n", $asmId if (defined($rrGcaGcfList{$asmId}));
 printf STDERR "# ACK metaInfo: %s '%s'\n", $asmId, $metaInfo{$asmId};
-### XXX         next;
+         next;
        }
      }
      $alreadyDone{$asmId} = 1;
      if (defined($genArkClade{$asmId})) {
       die "ERROR: duplicate asmId today $asmId '$clade' '$genArkClade{$asmId}'";
      }
      ++$checkedAsmIds;
      my $iucnSciName = "";
      if (defined($sciNames{$asmId})) {
        ++$ncbiSpeciesRecorded{$sciNames{$asmId}};
        next if ($ncbiSpeciesRecorded{$sciNames{$asmId}} > $sciNameDisplayLimit);
        $iucnSciName = $sciNames{$asmId};
  $iucnSciName = $ncbiToIucnNames{$sciNames{$asmId}} if (defined($ncbiToIucnNames{$sciNames{$asmId}}));
        ++$iucnSpeciesRecorded{$iucnSciName};
      } else {
@@ -611,31 +611,31 @@
        }
        if ($assembliesAvailable > 1) {
           my $bPtr = $sciNameAsmList{$sciNames{$asmId}};
           foreach my $aId (@$bPtr) {
              next if (defined ($alreadyDone{$aId}));
              $alreadyDone{$aId} = 1;
              if ($aId ne $asmId) {
                if (defined($metaInfo{$aId})) {
                  my ($asmSize, $asmContigCount, $n50Size, $n50Count, $n50ContigSize, $n50ContigCount, $n50ScaffoldSize, $n50ScaffoldCount) = split('\t', $metaInfo{$aId});
                  # if asmSize is below the minimum, don't use it
                  if ($asmSize < $minimalGenomeSize{$clade}) {
                    printf STDERR "# %s underSized 1 %d %s %s < %s\n", $clade, ++$underSized{$clade}, $aId, commify($asmSize), commify($minimalGenomeSize{$clade});
      printf STDERR "# ACK would be genArk assembly %s\n", $aId if (defined($genArkAsm{$aId}));
      printf STDERR "# ACK would be UCSC RR %s\n", $aId if (defined($rrGcaGcfList{$aId}));
 printf STDERR "# ACK metaInfo: %s '%s'\n", $aId, $metaInfo{$aId};
-### XXX                   next;
+                   next;
                  }
                }
                ++$ncbiSpeciesRecorded{$sciNames{$aId}};
                # the defined($sciName{$aId}) indicates it is a GenArk genome
                # always accept those even if it goes beyond the limit
                if ( ($ncbiSpeciesRecorded{$sciNames{$aId}} <= $sciNameDisplayLimit) || defined($sciName{$aId}) ) {
                  push (@$cPtr, $aId);
                  ++$acceptedAsmIds;
                  ++$goodToGoCount;
                  ++$cladeCounts{$clade};
                }	#	under limit count or is GenArk assembly
              }	#	if ($aId ne $asmId)
           }	#	foreach my $aId (@$bPtr)
        }	#	if ($assembliesAvailable > 1)
      }	#	if (defined($sciNames{$asmId}))
@@ -699,49 +699,30 @@
   printf "<td><a href='#pageTop'>top of page</a></td>\n";
   printf "</table>\n";
 }
 }	#	if ( 1 == 0 )
 
 # count all assemblies in all clades
 my $totalAssemblies = 0;
 foreach my $c (@clades) {
   $totalAssemblies += $cladeCounts{$c};
 }
 
 printf "<!-- to get the pull-down menu items centered together -->\n";
 printf "<div style='text-align: center;'><!-- this will cause the next div to center -->\n";
 printf "  <div style='display: inline-block'>\n";
 
-
-printf "<div class='legendMenu'>\n";
-printf "  <span id='legendAnchor'>show legend for column definitions</span>\n";
-printf "  <div class='legendContent'>\n";
-printf "<ul>\n";
-printf "<li>Click on the column headers to sort the table by that column</li>\n";
-printf "<li>The <em>common name</em> column will either open a genome browser on that assembly when it exists, or it will submit a request to have this assembly added to the GenArk collection when it has not yet been built.</li>\n";
-# printf "<li>Use the <em>select columns of information to display</em> menu to show more or less information in the tables.</li>\n";
-# printf "<li>Use the <em>select assembly type display</em> menu to refine the types of assemblies to show in the tables.</li>\n";
-# printf "<li>Use the <em>show/hide species clade sets</em> menu to refine the species clades to display.</li>\n";
-printf "<li>Use the <em>hide all</em> selection to hide everything, or restore all items back when selections have been partially hidden.</li>\n";
-printf "<li>The <em>scientific name</em> link opens google image search for that species.  A name followed by a number in (parens) indicates number of assemblies available for this species when there are too many to include here.</li>\n";
-printf "<li>IUCN status column is the status from the <a href='https://www.iucnredlist.org/' target=_blank>IUCN (2021) Red List</a> of Threatened species, Version 2021-3, accessed on 17 Dec 2021 <span style='color:%s;'>CR - Critical</span> / <span style='color:%s;'>EN - Endangered</span> / <span style='color:%s;'>VU - Vulnerable</span> The link goes to the IUCN web site for more information for that species.</li>\n", $criticalColor, $endangeredColor, $vulnerableColor;
-printf "<li>The <em>NCBI taxID</em> links to the <a href='https://www.ncbi.nlm.nih.gov/taxonomy' target='_blank'>NCBI taxonomy</a> database.</li>\n";
-printf "</ul>\n";
-printf "  </div>\n";
-printf "</div>\n";
-
-
 printf "<div class='pullDownMenu'>\n";
 printf "  <span id='speciesSelectAnchor'>choose clades to view/hide</span>\n";
 printf "  <div class='pullDownMenuContent'>\n";
 printf "  <ul id='checkBoxSpeciesSelect'>\n";
 printf "    <li><label><input class='hideAll' type='checkbox' onchange='gar.visCheckBox(this)' id='allCheckBox' value='all' checked><span class='hideAllLabel'> hide all</span></label></li>\n";
 printf "    <li><label><input class='hideShow' type='checkbox' onchange='gar.visCheckBox(this)' id='primatesCheckBox' value='primates' checked><span id='primatesLabel'> primates</span></label></li>\n";
 printf "    <li><label><input class='hideShow' type='checkbox' onchange='gar.visCheckBox(this)' id='mammalsCheckBox' value='mammals' checked><span id='mammalsLabel'> mammals</span></label></li>\n";
 printf "    <li><label><input class='hideShow' type='checkbox' onchange='gar.visCheckBox(this)' id='birdsCheckBox' value='birds' checked><span id='birdsLabel'> birds</span></label></li>\n";
 printf "    <li><label><input class='hideShow' type='checkbox' onchange='gar.visCheckBox(this)' id='fishCheckBox' value='fish' checked><span id='fishLabel'> fish</span></label></li>\n";
 printf "    <li><label><input class='hideShow' type='checkbox' onchange='gar.visCheckBox(this)' id='vertebrateCheckBox' value='vertebrate' checked><span id='vertebrateLabel'> vertebrate</span></label></li>\n";
 printf "    <li><label><input class='hideShow' type='checkbox' onchange='gar.visCheckBox(this)' id='invertebratesCheckBox' value='invertebrates' checked><span id='invertebratesLabel'> invertebrates</span></label></li>\n";
 printf "    <li><label><input class='hideShow' type='checkbox' onchange='gar.visCheckBox(this)' id='plantsCheckBox' value='plants' checked><span id='plantsLabel'> plants<span></label></li>\n";
 printf "    <li><label><input class='hideShow' type='checkbox' onchange='gar.visCheckBox(this)' id='fungiCheckBox' value='fungi' checked><span id='fungiLabel'> fungi</span></label></li>\n";
 printf "  </ul>\n";
 printf "  </div>\n";
@@ -805,42 +786,42 @@
 printf "<col id='sciName' span='1' class=colGSciName>\n";
 printf "<col id='asmId' span='1' class=colGAsmId>\n";
 printf "<col id='asmSize' span='1' class=colGAsmSize>\n";
 printf "<col id='seqCount' span='1' class=colGAsmSeqCount>\n";
 printf "<col id='scafN50' span='1' class=colGScafN50>\n";
 printf "<col id='ctgN50' span='1' class=colGContigN50>\n";
 printf "<col id='IUCN' span='1' class=colGIUCN>\n";
 printf "<col id='taxId' span='1' class=colGTaxId>\n";
 printf "<col id='asmDate' span='1' class=colGAsmDate>\n";
 printf "<col id='submitter' span='1' class=colGSubmitter>\n";
 printf "<col id='clade' span='1' class=colGClade>\n";
 printf "</colgroup>\n";
 
 printf "<thead>\n";
 printf "<tr>\n";
-printf "  <th class='colComName'>common name<br>display in browser<br>or request assembly</th>\n";
-printf "  <th class='colSciName'>scientific name (count)<br>google image search</th>\n";
-printf "  <th class='colAsmId'>NCBI assembly</th>\n";
-printf "  <th class='colAsmSize'>assembly<br>size</th>\n";
-printf "  <th class='colAsmSeqCount'>sequence<br>count</th>\n";
-printf "  <th class='colScafN50'>scaffold N50<br>size (L50)</th>\n";
-printf "  <th class='colContigN50'>contig N50<br>size (L50)</th>\n";
-printf "  <th class='colIUCN'>I<br>U<br>C<br>N</th>\n";
-printf "  <th class='colTaxId'>NCBI taxID</th>\n";
-printf "  <th class='colAsmDate'>assembly<br>date</th>\n";
-printf "  <th class='colSubmitter'>submitter of assembly<br>link to NCBI assembly</th>\n";
-printf "  <th class='colClade'>clade</th>\n";
+printf "  <th class='colComName'><div class='tooltip'>common name<span class='tooltiptext'>The <em>common name</em> links to a genome browser when it exists, or provides a request form to ask for the assembly to be added to the system</span></div></th>\n";
+printf "  <th class='colSciName'><div class='tooltip'>scientific name (count)<span class='tooltiptext'>The number in parentheses (1234) following the name indicates the number of assemblies available for this species.  The link does a google image search for that name.</span></div></th>\n";
+printf "  <th class='colAsmId'><div class='tooltip'>NCBI assembly<span class='tooltiptext'>The <em>NCBI assembly</em> provides a link to the NCBI resource record for this assembly</span></div></th>\n";
+printf "  <th class='colAsmSize'><div class='tooltip'>assembly<br>size<span class='tooltiptext'>The <em>assembly size</em> is the total number of nucleotides in the assembly.</span></div></th>\n";
+printf "  <th class='colAsmSeqCount'><div class='tooltip'>sequence<br>count<span class='tooltiptext'>The <em>sequence count</em> is the number of sequences in this assembly.</span></div></th>\n";
+printf "  <th class='colScafN50'><div class='tooltip'>scaffold N50<br>size (L50)<span class='tooltiptext'>The <em>scaffold N50</em> is the <a href='https://en.wikipedia.org/wiki/N50,_L50,_and_related_statistics' target=_blank>N50 measurement</a> for this assembly.</span></div> </th>\n";
+printf "  <th class='colContigN50'><div class='tooltip'>contig N50<br>size (L50)<span class='tooltiptext'>The <em>contig N50</em> is the <a href='https://en.wikipedia.org/wiki/N50,_L50,_and_related_statistics' target=_blank>N50 measurement</a> for this assembly when there are contigs to measure.</span></div></th>\n";
+printf "  <th class='colIUCN'><div class='tooltip'>I<br>U<br>C<br>N<span class='tooltiptext'>IUCN status column is the status from the <a href='https://www.iucnredlist.org/' target=_blank>IUCN (2021) Red List</a> of Threatened species, Version 2021-3, accessed on 17 Dec 2021 <span style='color:%s;'>CR - Critical</span> / <span style='color:%s;'>EN - Endangered</span> / <span style='color:%s;'>VU - Vulnerable</span> The link goes to the IUCN web site for more information for that species.</span></div></th>\n", $statusColors{"CR"}, $statusColors{"EN"}, $statusColors{"VU"};
+printf "  <th class='colTaxId'><div class='tooltip'>NCBI taxID<span class='tooltiptext'>The <em>taxID</em> links to the <a href='https://www.ncbi.nlm.nih.gov/taxonomy' target='_blank'>NCBI taxonomy</a> database.</span></div></th>\n";
+printf "  <th class='colAsmDate'><div class='tooltip'>assembly<br>date<span class='tooltiptext'>The submission date for this assembly to the <a href='https://www.ncbi.nlm.nih.gov/assembly' target=_blank>NCBI assembly</a> system.</span></div></th>\n";
+printf "  <th class='colSubmitter sorttable_alpha'><div class='tooltip'>submitter of assembly<span class='tooltiptextright'>The name of the organization that produced this assembly.</span></div></th>\n";
+printf "  <th class='colClade'><div class='tooltip'>clade<span class='tooltiptextright'>Indicating the <em>clade</em> of this organism.  Note: the <em>invertebrate</em> clade is a catch-all category that includes organisms not typically classified as <em>invertebrate</em></span></div></th>\n";
 printf "</tr>\n";
 printf "</thead><tbody>\n";
 
 my %equivalentNamesUsed;	# key is NCBI sciName, value is IUCN sciName
 my $pageSectionCount = 0;
 
 my %checkDupAsmId;	# key is asmId, value is count of times seen
 
 my %cladeSciNameCounts;	# key is clade, value is number of different
 			# scientific names
 
 my %gcfGcaCounts;	# key is GCF or GCA, value is count of each
 my $asmCountInTable = 0;	# counting the rows output
 my %statusCounts;	# key is status: CR EN VU, value is count
 my $totalAssemblySize = 0;	# sum of all assembly sizes
@@ -961,31 +942,31 @@
       ($n50ScaffoldSize, $n50ScaffoldCount) = readN50($scaffoldN50);
     }
   } else {
     printf STDERR "# chromInfo missing: %s\n", $asmId;
     printf STDERR "# %s\n", $chromInfo;
   }
    die "ERROR duplicate newMetaInfo{$asmId}" if (defined($newMetaInfo{$asmId}));
    die "ERROR duplicate metaInfo{$asmId} for newMetaInfo{$asmId}" if (defined($metaInfo{$asmId}));
    $newMetaInfo{$asmId} = join("\t", ($asmSize, $asmContigCount, $n50Size, $n50Count, $n50ContigSize, $n50ContigCount, $n50ScaffoldSize, $n50ScaffoldCount) );
   }	# else if (defined($metaInfo{$asmId}))
   # if asmSize is below the minimum, don't use it
   if ($asmSize < $minimalGenomeSize{$clade}) {
     printf STDERR "# %s underSized 2 %d %s %s < %s\n", $clade, ++$underSized, $asmId, commify($asmSize), commify($minimalGenomeSize{$clade});
      printf STDERR "# ACK would be genArk assembly %s\n", $asmId if (defined($genArkAsm{$asmId}));
      printf STDERR "# ACK would be UCSC RR %s\n", $asmId if (defined($rrGcaGcfList{$asmId}));
-### XXX    next;
+     next;
   }
   my $iucnStatus = "&nbsp;";
   my $iucnLink = "";
   if (defined($sciNames{$asmId})) {
      my $iucnSciName = $sciNames{$asmId};
  $iucnSciName = $ncbiToIucnNames{$sciNames{$asmId}} if (defined($ncbiToIucnNames{$sciNames{$asmId}}));
      $iucnLink = "https://www.iucnredlist.org/species/$iucnLink{$iucnSciName}" if (defined($iucnLink{$iucnSciName}));
      if ($iucnSciName ne $sciNames{$asmId}) {
        $equivalentNamesUsed{$sciNames{$asmId}} = $iucnSciName;
      }
      if (defined($iucnSciNames{$iucnSciName})) {
        $iucnStatus = $iucnSciNames{$iucnSciName};
      }
   }
   ++$asmCountInTable;
@@ -1139,34 +1120,36 @@
     printf PC "\t%s", "n/a";	# output to clade.tableData.txt
   }
 
   ############# tenth column,  assembly date ################
   if (defined($asmDate{$asmId})) {
     printf "<td style='display:none;'>%s</td>", $asmDate{$asmId};
     printf PC "\t%s", $asmDate{$asmId};	# output to clade.tableData.txt
   } else {
     printf "<td style='display:none;'>n/a</td>";
     printf PC "\t%s", "n/a";	# output to clade.tableData.txt
   }
 
   ############# eleventh column,  submitter ################
   $asmUrl = "https://www.ncbi.nlm.nih.gov/assembly/$accessionId";
   if (defined($asmSubmitter{$asmId})) {
-    printf "<td style='display:none;'><a href='%s' target=_blank>%s</a></td>", $asmUrl, $asmSubmitter{$asmId};
+    my $submitterSortKey = lc($asmSubmitter{$asmId});
+    $submitterSortKey =~ s/ //g;
+    printf "<td sorttable_customkey='%s' style='display:none;'>%s</td>", substr($submitterSortKey,0,20), $asmSubmitter{$asmId};
     printf PC "\t%s", $asmSubmitter{$asmId};	# output to clade.tableData.txt
   } else {
-    printf "<td style='display:none;'>n/a</td>";
+    printf "<td sorttable_customkey='n/a' style='display:none;'>n/a</td>";
     printf PC "\t%s", "n/a";	# output to clade.tableData.txt
   }
 
   ############# twelveth column,  clade ################
   printf "<td style='display:none;'>%s</td>\n", $clade;
   printf PC "\t%s", $clade;
 
   printf PC "\n";	# finished a line output to clade.tableData.txt
   printf "</tr>\n";
   }	#	foreach my $asmId (@$cPtr)
   close (PC);	# finished with clade.tableData.txt output
 }	#	foreach my $clade (@clades)
 
 ##########################################################################
 ## single table is finished, output the end of tbody and the tfoot row