e56876e545b21501152e96f655141eb3b229cf0c
hiram
  Fri Feb 25 16:19:26 2022 -0800
add in bioSample and bioProject columns and control of IUCN display refs #28930

diff --git src/hg/gar/garTable.pl src/hg/gar/garTable.pl
index 627318a..bf34033 100755
--- src/hg/gar/garTable.pl
+++ src/hg/gar/garTable.pl
@@ -502,30 +502,41 @@
     die "ERROR: duplicate asmId data $id '$country' '$gcxCountry{$id}'";
   }
   printf STDERR "# undefined collectDate for $id" if (!defined($collectDate));
   printf STDERR "# undefined submitter for $id" if (!defined($submitter));
   $country =~ s/"//g;
   $collectDate =~ s/"//g;
   $submitter =~ s/"//g;
   $gcxCountry{$id} = $country;
   $gcxDate{$id} = $collectDate;
   $gcxSubmitter{$id} = $submitter;
   ++$asmIdCount;
 }
 close (FH);
 printf STDERR "# asmId count: %s from asmId.country.date.by.tsv\n", commify($asmIdCount);
 
+my %asmReportData;	# key is asmId, value is the remaining tsv columns:
+#     sciName commonName bioSample bioProject taxId asmDate
+# obtained from scanning all the assembly report files
+open (FH, "<../asmReport.data.tsv") or die "can not read ../asmReport.data.tsv: $!";
+while (my $line = <FH>) {
+  chomp $line;
+  my ($id, $rest) = split('\t', $line, 2);	# limit 2: columns after the id stay one string
+  $asmReportData{$id} = $rest if (defined($id) && length($id));	# skip blank lines
+}
+close (FH);
+
 ### This cladeToGo set of lists is the main driver of the table
 ### An assembly needs to be in this set in order to get into the table
 my %cladeToGo;	# key is clade name, value is array pointer for asmId list
 my $totalGoodToGo = 0;
 my %cladeCounts;	# key is clade name, value is count of assemblies used
 
 my $maxDupAsm = 0;
 my %ncbiSpeciesRecorded;	# key is NCBI sciName, value is count of those
 my $ncbiSpeciesUsed = 0;	# a unique count of NCBI sciName
 my %iucnSpeciesRecorded;	# key is IUCN sciName, value is count of those
 my $iucnSpeciesUsed = 0;	# a unique count of IUCN sciName
 
 my $sciNameDisplayLimit = 5;    # do not display more than 5 instances
 
 my $checkedAsmIds = 0;
@@ -728,48 +739,51 @@
 printf "    <li><label><input class='hideShow' type='checkbox' onchange='gar.visCheckBox(this)' id='plantsCheckBox' value='plants' checked><span id='plantsLabel'> plants<span></label></li>\n";
 printf "    <li><label><input class='hideShow' type='checkbox' onchange='gar.visCheckBox(this)' id='fungiCheckBox' value='fungi' checked><span id='fungiLabel'> fungi</span></label></li>\n";
 printf "  </ul>\n";
 printf "  </div>\n";
 printf "</div>\n";
 
 printf "<div style='width: 260px;' class='pullDownMenu'>\n";
 printf "  <span style='text-align: center;' id='assemblyTypeAnchor'>select assembly type to display</span>\n";
 printf "  <div class='pullDownMenuContent'>\n";
 printf "  <ul id='checkBoxAssemblyType'>\n";
 printf "    <li><label><input class='hideAll' type='checkbox' onchange='gar.visCheckBox(this)' id='allCheckBox' value='all' checked><span class='hideAllLabel'> hide all</span></label></li>\n";
 printf "    <li><label><input class='hideShow' type='checkbox' onchange='gar.visCheckBox(this)' id='gakCheckBox' value='gak' checked><span id='gakLabel'> Existing browser</span></label></li>\n";
 printf "    <li><label><input class='hideShow' type='checkbox' onchange='gar.visCheckBox(this)' id='garCheckBox' value='gar' checked><span id='garLabel'> Request browser</span></label></li>\n";
 printf "    <li><label><input class='hideShow' type='checkbox' onchange='gar.visCheckBox(this)' id='gcaCheckBox' value='gca' checked><span id='gcaLabel'> GCA/GenBank</span></label></li>\n";
 printf "    <li><label><input class='hideShow' type='checkbox' onchange='gar.visCheckBox(this)' id='gcfCheckBox' value='gcf' checked><span id='gcfLabel'> GCF/RefSeq</span></label></li>\n";
+printf "    <li><label><input class='hideShow' type='checkbox' onchange='gar.visCheckBox(this)' id='iucnCheckBox' value='hasIucn' checked><span id='iucnLabel'> IUCN</span></label></li>\n";
 printf "   </ul>\n";
 printf "  </div>\n";
 printf "</div>\n";
 
 printf "<div style='width: 240px;' class='pullDownMenu'>\n";
 printf "  <span id='columnSelectAnchor'>show/hide columns</span>\n";
 printf "  <div class='pullDownMenuContent'>\n";
 printf "  <ul id='checkBoxColumnSelect'>\n";
 printf "    <li><label><input class='columnCheckBox' type='checkbox' onchange='gar.resetColumnVis(this)' id='comNameCheckBox' value='comName' checked> common name</label></li>\n";
 printf "    <li><label><input class='columnCheckBox' type='checkbox' onchange='gar.resetColumnVis(this)' id='sciNameCheckBox' value='sciName' checked> scientific name</label></li>\n";
 printf "    <li><label><input class='columnCheckBox' type='checkbox' onchange='gar.resetColumnVis(this)' id='asmIdCheckBox' value='asmId' checked> NCBI accession</label></li>\n";
 printf "    <li><label><input class='columnCheckBox' type='checkbox' onchange='gar.resetColumnVis(this)' id='asmSizeCheckBox' value='asmSize'> assembly size</label></li>\n";
 printf "    <li><label><input class='columnCheckBox' type='checkbox' onchange='gar.resetColumnVis(this)' id='seqCountCheckBox' value='seqCount'> sequence count</label></li>\n";
 printf "    <li><label><input class='columnCheckBox' type='checkbox' onchange='gar.resetColumnVis(this)' id='scafN50CheckBox' value='scafN50'> scaffold N50 length (L50)</label></li>\n";
 printf "    <li><label><input class='columnCheckBox' type='checkbox' onchange='gar.resetColumnVis(this)' id='ctgN50CheckBox' value='ctgN50'> contig N50 length (L50)</label></li>\n";
 printf "    <li><label><input class='columnCheckBox' type='checkbox' onchange='gar.resetColumnVis(this)' id='IUCNCheckBox' value='IUCN'> IUCN status</label></li>\n";
 printf "    <li><label><input class='columnCheckBox' type='checkbox' onchange='gar.resetColumnVis(this)' id='taxIdCheckBox' value='taxId'> NCBI taxonomy ID</label></li>\n";
 printf "    <li><label><input class='columnCheckBox' type='checkbox' onchange='gar.resetColumnVis(this)' id='asmDateCheckBox' value='asmDate'> assembly date</label></li>\n";
+printf "    <li><label><input class='columnCheckBox' type='checkbox' onchange='gar.resetColumnVis(this)' id='bioSampleCheckBox' value='bioSample'> BioSample</label></li>\n";
+printf "    <li><label><input class='columnCheckBox' type='checkbox' onchange='gar.resetColumnVis(this)' id='bioProjectCheckBox' value='bioProject'> BioProject</label></li>\n";
 printf "    <li><label><input class='columnCheckBox' type='checkbox' onchange='gar.resetColumnVis(this)' id='submitterCheckBox' value='submitter'> assembly submitter</label></li>\n";
 printf "    <li><label><input class='columnCheckBox' type='checkbox' onchange='gar.resetColumnVis(this)' id='cladeCheckBox' value='clade'> clade</label></li>\n";
 printf "  </ul>\n";
 printf "  </div>\n";
 printf "</div>\n\n";
 printf "  </div>        <!-- display inline-block to be 'text' centered -->\n";
 printf "</div>  <!-- this parent div is text-align: center to center children -->\n\n";
 
 printf "<table style='width: 100%%;' class='borderOne' id='loadingStripes'>\n";
 printf "<caption><h2>. . . please wait while page loads . . .</h2></caption>\n";
 printf "</table>\n\n";
 
 printf "<div style='text-align: center;'><!-- this will cause the next div to center -->\n";
 printf "  <div style='display: inline-block'>\n\n";
 
@@ -783,48 +797,52 @@
 ## is still not usable until much time later.
 ##############################################################################
 printf "<table style='display: hide;' class='sortable borderOne cladeTable' id='dataTable'>\n";
 
 printf "<colgroup id='colDefinitions'>\n";
 printf "<col id='comName' span='1' class=colGComName>\n";
 printf "<col id='sciName' span='1' class=colGSciName>\n";
 printf "<col id='asmId' span='1' class=colGAsmId>\n";
 printf "<col id='asmSize' span='1' class=colGAsmSize>\n";
 printf "<col id='seqCount' span='1' class=colGAsmSeqCount>\n";
 printf "<col id='scafN50' span='1' class=colGScafN50>\n";
 printf "<col id='ctgN50' span='1' class=colGContigN50>\n";
 printf "<col id='IUCN' span='1' class=colGIUCN>\n";
 printf "<col id='taxId' span='1' class=colGTaxId>\n";
 printf "<col id='asmDate' span='1' class=colGAsmDate>\n";
+printf "<col id='bioSample' span='1' class=colGBioSample>\n";
+printf "<col id='bioProject' span='1' class=colGBioProject>\n";
 printf "<col id='submitter' span='1' class=colGSubmitter>\n";
 printf "<col id='clade' span='1' class=colGClade>\n";
 printf "</colgroup>\n";
 
 printf "<thead>\n";
 printf "<tr>\n";
 printf "  <th class='colComName'><div class='tooltip'>common name<span onclick='event.stopPropagation()' class='tooltiptext'>Links to an existing assembly browser, Button opens an assembly request form.</span></div></th>\n";
 printf "  <th class='colSciName'><div class='tooltip'>scientific name (count)<span onclick='event.stopPropagation()' class='tooltiptext'>Links to Google image search. Count shows the number of assemblies available for this orgnism.</span></div></th>\n";
 printf "  <th class='colAsmId'><div class='tooltip'>NCBI assembly<span onclick='event.stopPropagation()' class='tooltiptext'>Links to NCBI resource record.</span></div></th>\n";
 printf "  <th class='colAsmSize'><div class='tooltip'>assembly<br>size<span onclick='event.stopPropagation()' class='tooltiptext'>Number of nucleotides in the assembly.</span></div></th>\n";
 printf "  <th class='colAsmSeqCount'><div class='tooltip'>sequence<br>count<span onclick='event.stopPropagation()' class='tooltiptext'>The number of sequences in this assembly.</span></div></th>\n";
 printf "  <th class='colScafN50'><div class='tooltip'>scaffold N50<br>length (L50)<span onclick='event.stopPropagation()' class='tooltiptext'><a href='https://en.wikipedia.org/wiki/N50,_L50,_and_related_statistics' target=_blank>N50 (L50)</a> length.</span></div> </th>\n";
 printf "  <th class='colContigN50'><div class='tooltip'>contig N50<br>length (L50)<span onclick='event.stopPropagation()' class='tooltiptext'><a href='https://en.wikipedia.org/wiki/N50,_L50,_and_related_statistics' target=_blank>N50 (L50)</a> length.</span></div></th>\n";
 printf "  <th class='colIUCN'><div class='tooltip'>IUCN<span onclick='event.stopPropagation()' class='tooltiptext'>Links to <a href='https://www.iucnredlist.org/' target=_blank>IUCN Red List</a> of Threatened Species (version 2021-3) <span style='color:%s;'>CR - Critical</span> / <span style='color:%s;'>EN - Endangered</span> / <span style='color:%s;'>VU - Vulnerable</span></span></div></th>\n", $statusColors{"CR"}, $statusColors{"EN"}, $statusColors{"VU"};
 printf "  <th class='colTaxId'><div class='tooltip'>NCBI taxID<span onclick='event.stopPropagation()' class='tooltiptext'>Links to <a href='https://www.ncbi.nlm.nih.gov/taxonomy' target='_blank'>NCBI Taxonomy</a> database.</span></div></th>\n";
 printf "  <th class='colAsmDate'><div class='tooltip'>assembly<br>date<span onclick='event.stopPropagation()' class='tooltiptext'>Date submitted to <a href='https://www.ncbi.nlm.nih.gov/assembly' target=_blank>NCBI assembly</a> database.</span></div></th>\n";
+printf "  <th class='colBioSample sorttable_alpha'><div class='tooltip'>BioSample<span onclick='event.stopPropagation()' class='tooltiptext'>BioSample ID at <a href='https://www.ncbi.nlm.nih.gov/biosample' target=_blank>NCBI</a>.</span></div></th>\n";
+printf "  <th class='colBioProject sorttable_alpha'><div class='tooltip'>BioProject<span onclick='event.stopPropagation()' class='tooltiptext'>BioProject ID at <a href='https://www.ncbi.nlm.nih.gov/bioproject' target=_blank>NCBI</a>.</span></div></th>\n";
 printf "  <th class='colSubmitter sorttable_alpha'><div class='tooltip'>Assembly submitter<span onclick='event.stopPropagation()' class='tooltiptextright'>Person or group who submitted to <a href='https://www.ncbi.nlm.nih.gov/assembly' target=_blank>NCBI Assembly</a> database.</span></div></th>\n";
-printf "  <th class='colClade'><div class='tooltip'>clade<span onclick='event.stopPropagation()' class='tooltiptextright'>Clade of this organism.  Note: the <em>invertebrate</em> clade is a catch all category that includes organisims not typically classified as <em>invertebrate</em></span></div></th>\n";
+printf "  <th class='colClade'><div class='tooltip'>clade<span onclick='event.stopPropagation()' class='tooltiptextright'>Clade of this organism.</span></div></th>\n";
 printf "</tr>\n";
 printf "</thead><tbody>\n";
 
 my %equivalentNamesUsed;	# key is NCBI sciName, value is IUCN sciName
 my $pageSectionCount = 0;
 
 my %checkDupAsmId;	# key is asmId, value is count of times seen
 
 my %cladeSciNameCounts;	# key is clade, value is number of different
 			# scientific names
 
 my %gcfGcaCounts;	# key is GCF or GCA, value is count of each
 my $asmCountInTable = 0;	# counting the rows output
 my %statusCounts;	# key is status: CR EN VU, value is count
 my $totalAssemblySize = 0;	# sum of all assembly sizes
@@ -899,30 +917,35 @@
   my $browserUrl = sprintf("https://genome.ucsc.edu/h/%s", $accessionId);
   my $arkDownload = sprintf("https://hgdownload.soe.ucsc.edu/hubs/%s/%s/%s/%s/%s/", $gcX, $d0, $d1, $d2, $accessionId);
   my $destDir = "/hive/data/outside/ncbi/genomes/sizes/$gcX/$d0/$d1/$d2";
   my $chromInfo = "/hive/data/outside/ncbi/genomes/sizes/$gcX/$d0/$d1/$d2/${asmId}.chromInfo.txt";
   my $n50Txt = "/hive/data/outside/ncbi/genomes/sizes/$gcX/$d0/$d1/$d2/${asmId}.n50.txt";
   my $contigN50 = "/hive/data/outside/ncbi/genomes/sizes/$gcX/$d0/$d1/$d2/${asmId}.contigs.n50.txt";
   my $scaffoldN50 = "/hive/data/outside/ncbi/genomes/sizes/$gcX/$d0/$d1/$d2/${asmId}.scaffolds.n50.txt";
   my $asmSize = 0;
   my $asmContigCount = 0;
   my $n50Size = 0;
   my $n50Count = 0;
   my $n50ContigSize = 0;
   my $n50ContigCount = 0;
   my $n50ScaffoldSize = 0;
   my $n50ScaffoldCount = 0;
+  my $bioSample = "";	# stays empty when asmReportData has no entry for this asmId
+  my $bioProject = "";	# stays empty when asmReportData has no entry for this asmId
+  if (defined($asmReportData{$asmId})) {	# value columns: sciName commonName bioSample bioProject taxId asmDate
+     (undef, undef, $bioSample, $bioProject, undef) = split('\t', $asmReportData{$asmId}, 5);
+  }	# only the third and fourth columns are kept, the others are discarded
   if (defined($metaInfo{$asmId})) {
    ($asmSize, $asmContigCount, $n50Size, $n50Count, $n50ContigSize, $n50ContigCount, $n50ScaffoldSize, $n50ScaffoldCount) = split('\t', $metaInfo{$asmId});
   } else {
   my ($dev, $ino, $mode, $nlink, $uid, $gid, $rdev, $size, $atime, $mtime, $ctime, $blksize, $blocks);
   my $fnaModTime = 0;
   if (-s "$asmFna") {
     ($dev, $ino, $mode, $nlink, $uid, $gid, $rdev, $size, $atime, $mtime, $ctime, $blksize, $blocks) = stat($asmFna);
     $fnaModTime = $mtime;
   }
   my $ciModTime = 0;
   if ( -s "$chromInfo" ) {
     ($dev, $ino, $mode, $nlink, $uid, $gid, $rdev, $size, $atime, $mtime, $ctime, $blksize, $blocks) = stat($chromInfo);
     $ciModTime = $mtime;
   }
   if ($fnaModTime > $ciModTime) {
@@ -992,30 +1015,31 @@
      $browserUrl = sprintf("https://genome.ucsc.edu/cgi-bin/hgTracks?db=%s", $rrGcaGcfList{$asmId});
   }
 
   printf PC "%d", $asmCountInTable;	# start a line output to clade.tableData.tsv
 
   ## count number of different scientific names used in this clade table
   if (!defined($cladeSciNameCounts{$clade})) {
     my %h;
     $cladeSciNameCounts{$clade} = \%h;
   }
   my $csnPtr = $cladeSciNameCounts{$clade};
   $csnPtr->{$sciNames{$asmId}} += 1 if (defined($sciNames{$asmId}));
   my $rowClass = "";
   my $gcaGcfClass = "gca";
   $gcaGcfClass = "gcf" if ($asmId =~ m/^GCF/);
+  $gcaGcfClass .= " hasIucn" if (length($iucnLink));
   if (defined($comName{$asmId})) {
     if (defined($rrGcaGcfList{$asmId})) {
       $rowClass = " class='ucscDb $gcaGcfClass $clade'"; # present in UCSC db
     } else {
       $rowClass = " class='gak $gcaGcfClass $clade'"; # present in GenArk
     }
   } else { # can be requested
     if (defined($rrGcaGcfList{$asmId})) {
       $rowClass = " class='ucscDb $gcaGcfClass $clade'"; # present in UCSC db
     } else {
       $rowClass = " class='gar $gcaGcfClass $clade'"; # available for request
     }
   }
 ### can override CSS settings here
 ###    $rowClass = " class='gar' style='display: none;'";
@@ -1123,30 +1147,49 @@
     printf PC "\t%s", $asmTaxId{$asmId};	# output to clade.tableData.txt
   } else {
     printf "<td style='display:none;'>n/a</td>";
     printf PC "\t%s", "n/a";	# output to clade.tableData.txt
   }
 
   ############# tenth column,  assembly date ################
   if (defined($asmDate{$asmId})) {
     printf "<td style='display:none;'>%s</td>", $asmDate{$asmId};
     printf PC "\t%s", $asmDate{$asmId};	# output to clade.tableData.txt
   } else {
     printf "<td style='display:none;'>n/a</td>";
     printf PC "\t%s", "n/a";	# output to clade.tableData.txt
   }
 
+  ############# eleventh column,  bioSample ################
+  if (length($bioSample) && $bioSample !~ m#n/a#) {	# empty or n/a: no link to show
+    printf "<td style='display:none; text-align:left;'><a href='https://www.ncbi.nlm.nih.gov/biosample/?term=%s' target=_blank>%s</a></td>", $bioSample, $bioSample;
+    printf PC "\t%s", $bioSample;	# output to clade.tableData.txt
+  } else {
+    printf "<td style='display:none; text-align:left;'>&nbsp;</td>";
+    printf PC "\t%s", "n/a";	# output to clade.tableData.txt
+  }
+
+  ############# twelfth column,  bioProject ################
+  if (length($bioProject) && $bioProject !~ m#n/a#) {	# empty or n/a: no link to show
+    printf "<td style='display:none; text-align:left;'><a href='https://www.ncbi.nlm.nih.gov/bioproject/?term=%s' target=_blank>%s</a></td>", $bioProject, $bioProject;
+    printf PC "\t%s", $bioProject;	# output to clade.tableData.txt
+
+  } else {
+    printf "<td style='display:none; text-align:left;'>&nbsp;</td>";
+    printf PC "\t%s", "n/a";	# output to clade.tableData.txt
+  }
+
   ############# eleventh column,  submitter ################
   $asmUrl = "https://www.ncbi.nlm.nih.gov/assembly/$accessionId";
   if (defined($asmSubmitter{$asmId})) {
     my $submitterSortKey = lc($asmSubmitter{$asmId});
     $submitterSortKey =~ s/ //g;
     $submitterSortKey =~ s/[^a-z0-9]//ig;
     printf "<td sorttable_customkey='%s' style='display:none;'>%s</td>", substr($submitterSortKey,0,20), $asmSubmitter{$asmId};
     printf PC "\t%s", $asmSubmitter{$asmId};	# output to clade.tableData.txt
   } else {
     printf "<td sorttable_customkey='n/a' style='display:none;'>n/a</td>";
     printf PC "\t%s", "n/a";	# output to clade.tableData.txt
   }
 
   ############# twelveth column,  clade ################
   printf "<td style='display:none;'>%s</td>\n", $clade;