e56876e545b21501152e96f655141eb3b229cf0c hiram Fri Feb 25 16:19:26 2022 -0800 add in bioSample and bioProject columns and control of IUCN display refs #28930 diff --git src/hg/gar/garTable.pl src/hg/gar/garTable.pl index 627318a..bf34033 100755 --- src/hg/gar/garTable.pl +++ src/hg/gar/garTable.pl @@ -502,30 +502,41 @@ die "ERROR: duplicate asmId data $id '$country' '$gcxCountry{$id}'"; } printf STDERR "# undefined collectDate for $id" if (!defined($collectDate)); printf STDERR "# undefined submitter for $id" if (!defined($submitter)); $country =~ s/"//g; $collectDate =~ s/"//g; $submitter =~ s/"//g; $gcxCountry{$id} = $country; $gcxDate{$id} = $collectDate; $gcxSubmitter{$id} = $submitter; ++$asmIdCount; } close (FH); printf STDERR "# asmId count: %s from asmId.country.date.by.tsv\n", commify($asmIdCount); +my %asmReportData; # key is asmId, value is tsv string for: +# sciName commonName bioSample bioProject taxId asmDate +# obtained from scanning all the assembly report files +open (FH, "<../asmReport.data.tsv") or die "can not read ../asmReport.data.tsv"; +while (my $line = <FH>) { + chomp $line; + my ($id, $rest) = split('\t', $line, 2); + $asmReportData{$id} = $rest; +} +close (FH); + ### This cladeToGo set of lists is the main driver of the table ### An assembly needs to be in this set in order to get into the table my %cladeToGo; # key is clade name, value is array pointer for asmId list my $totalGoodToGo = 0; my %cladeCounts; # key is clade name, value is count of assemblies used my $maxDupAsm = 0; my %ncbiSpeciesRecorded; # key is NCBI sciName, value is count of those my $ncbiSpeciesUsed = 0; # a unique count of NCBI sciName my %iucnSpeciesRecorded; # key is IUCN sciName, value is count of those my $iucnSpeciesUsed = 0; # a unique count of IUCN sciName my $sciNameDisplayLimit = 5; # do not display more than 5 instances my $checkedAsmIds = 0; @@ -728,48 +739,51 @@ printf " <li><label><input class='hideShow' type='checkbox' onchange='gar.visCheckBox(this)' 
id='plantsCheckBox' value='plants' checked><span id='plantsLabel'> plants<span></label></li>\n"; printf " <li><label><input class='hideShow' type='checkbox' onchange='gar.visCheckBox(this)' id='fungiCheckBox' value='fungi' checked><span id='fungiLabel'> fungi</span></label></li>\n"; printf " </ul>\n"; printf " </div>\n"; printf "</div>\n"; printf "<div style='width: 260px;' class='pullDownMenu'>\n"; printf " <span style='text-align: center;' id='assemblyTypeAnchor'>select assembly type to display</span>\n"; printf " <div class='pullDownMenuContent'>\n"; printf " <ul id='checkBoxAssemblyType'>\n"; printf " <li><label><input class='hideAll' type='checkbox' onchange='gar.visCheckBox(this)' id='allCheckBox' value='all' checked><span class='hideAllLabel'> hide all</span></label></li>\n"; printf " <li><label><input class='hideShow' type='checkbox' onchange='gar.visCheckBox(this)' id='gakCheckBox' value='gak' checked><span id='gakLabel'> Existing browser</span></label></li>\n"; printf " <li><label><input class='hideShow' type='checkbox' onchange='gar.visCheckBox(this)' id='garCheckBox' value='gar' checked><span id='garLabel'> Request browser</span></label></li>\n"; printf " <li><label><input class='hideShow' type='checkbox' onchange='gar.visCheckBox(this)' id='gcaCheckBox' value='gca' checked><span id='gcaLabel'> GCA/GenBank</span></label></li>\n"; printf " <li><label><input class='hideShow' type='checkbox' onchange='gar.visCheckBox(this)' id='gcfCheckBox' value='gcf' checked><span id='gcfLabel'> GCF/RefSeq</span></label></li>\n"; +printf " <li><label><input class='hideShow' type='checkbox' onchange='gar.visCheckBox(this)' id='iucnCheckBox' value='hasIucn' checked><span id='iucnLabel'> IUCN</span></label></li>\n"; printf " </ul>\n"; printf " </div>\n"; printf "</div>\n"; printf "<div style='width: 240px;' class='pullDownMenu'>\n"; printf " <span id='columnSelectAnchor'>show/hide columns</span>\n"; printf " <div class='pullDownMenuContent'>\n"; printf " <ul 
id='checkBoxColumnSelect'>\n"; printf " <li><label><input class='columnCheckBox' type='checkbox' onchange='gar.resetColumnVis(this)' id='comNameCheckBox' value='comName' checked> common name</label></li>\n"; printf " <li><label><input class='columnCheckBox' type='checkbox' onchange='gar.resetColumnVis(this)' id='sciNameCheckBox' value='sciName' checked> scientific name</label></li>\n"; printf " <li><label><input class='columnCheckBox' type='checkbox' onchange='gar.resetColumnVis(this)' id='asmIdCheckBox' value='asmId' checked> NCBI accession</label></li>\n"; printf " <li><label><input class='columnCheckBox' type='checkbox' onchange='gar.resetColumnVis(this)' id='asmSizeCheckBox' value='asmSize'> assembly size</label></li>\n"; printf " <li><label><input class='columnCheckBox' type='checkbox' onchange='gar.resetColumnVis(this)' id='seqCountCheckBox' value='seqCount'> sequence count</label></li>\n"; printf " <li><label><input class='columnCheckBox' type='checkbox' onchange='gar.resetColumnVis(this)' id='scafN50CheckBox' value='scafN50'> scaffold N50 length (L50)</label></li>\n"; printf " <li><label><input class='columnCheckBox' type='checkbox' onchange='gar.resetColumnVis(this)' id='ctgN50CheckBox' value='ctgN50'> contig N50 length (L50)</label></li>\n"; printf " <li><label><input class='columnCheckBox' type='checkbox' onchange='gar.resetColumnVis(this)' id='IUCNCheckBox' value='IUCN'> IUCN status</label></li>\n"; printf " <li><label><input class='columnCheckBox' type='checkbox' onchange='gar.resetColumnVis(this)' id='taxIdCheckBox' value='taxId'> NCBI taxonomy ID</label></li>\n"; printf " <li><label><input class='columnCheckBox' type='checkbox' onchange='gar.resetColumnVis(this)' id='asmDateCheckBox' value='asmDate'> assembly date</label></li>\n"; +printf " <li><label><input class='columnCheckBox' type='checkbox' onchange='gar.resetColumnVis(this)' id='bioSampleCheckBox' value='bioSample'> BioSample</label></li>\n"; +printf " <li><label><input class='columnCheckBox' 
type='checkbox' onchange='gar.resetColumnVis(this)' id='bioProjectCheckBox' value='bioProject'> BioProject</label></li>\n"; printf " <li><label><input class='columnCheckBox' type='checkbox' onchange='gar.resetColumnVis(this)' id='submitterCheckBox' value='submitter'> assembly submitter</label></li>\n"; printf " <li><label><input class='columnCheckBox' type='checkbox' onchange='gar.resetColumnVis(this)' id='cladeCheckBox' value='clade'> clade</label></li>\n"; printf " </ul>\n"; printf " </div>\n"; printf "</div>\n\n"; printf " </div> <!-- display inline-block to be 'text' centered -->\n"; printf "</div> <!-- this parent div is text-align: center to center children -->\n\n"; printf "<table style='width: 100%%;' class='borderOne' id='loadingStripes'>\n"; printf "<caption><h2>. . . please wait while page loads . . .</h2></caption>\n"; printf "</table>\n\n"; printf "<div style='text-align: center;'><!-- this will cause the next div to center -->\n"; printf " <div style='display: inline-block'>\n\n"; @@ -783,48 +797,52 @@ ## is still not usable until much time later. 
############################################################################## printf "<table style='display: hide;' class='sortable borderOne cladeTable' id='dataTable'>\n"; printf "<colgroup id='colDefinitions'>\n"; printf "<col id='comName' span='1' class=colGComName>\n"; printf "<col id='sciName' span='1' class=colGSciName>\n"; printf "<col id='asmId' span='1' class=colGAsmId>\n"; printf "<col id='asmSize' span='1' class=colGAsmSize>\n"; printf "<col id='seqCount' span='1' class=colGAsmSeqCount>\n"; printf "<col id='scafN50' span='1' class=colGScafN50>\n"; printf "<col id='ctgN50' span='1' class=colGContigN50>\n"; printf "<col id='IUCN' span='1' class=colGIUCN>\n"; printf "<col id='taxId' span='1' class=colGTaxId>\n"; printf "<col id='asmDate' span='1' class=colGAsmDate>\n"; +printf "<col id='bioSample' span='1' class=colGBioSample>\n"; +printf "<col id='bioProject' span='1' class=colGBioProject>\n"; printf "<col id='submitter' span='1' class=colGSubmitter>\n"; printf "<col id='clade' span='1' class=colGClade>\n"; printf "</colgroup>\n"; printf "<thead>\n"; printf "<tr>\n"; printf " <th class='colComName'><div class='tooltip'>common name<span onclick='event.stopPropagation()' class='tooltiptext'>Links to an existing assembly browser, Button opens an assembly request form.</span></div></th>\n"; printf " <th class='colSciName'><div class='tooltip'>scientific name (count)<span onclick='event.stopPropagation()' class='tooltiptext'>Links to Google image search. 
Count shows the number of assemblies available for this orgnism.</span></div></th>\n"; printf " <th class='colAsmId'><div class='tooltip'>NCBI assembly<span onclick='event.stopPropagation()' class='tooltiptext'>Links to NCBI resource record.</span></div></th>\n"; printf " <th class='colAsmSize'><div class='tooltip'>assembly<br>size<span onclick='event.stopPropagation()' class='tooltiptext'>Number of nucleotides in the assembly.</span></div></th>\n"; printf " <th class='colAsmSeqCount'><div class='tooltip'>sequence<br>count<span onclick='event.stopPropagation()' class='tooltiptext'>The number of sequences in this assembly.</span></div></th>\n"; printf " <th class='colScafN50'><div class='tooltip'>scaffold N50<br>length (L50)<span onclick='event.stopPropagation()' class='tooltiptext'><a href='https://en.wikipedia.org/wiki/N50,_L50,_and_related_statistics' target=_blank>N50 (L50)</a> length.</span></div> </th>\n"; printf " <th class='colContigN50'><div class='tooltip'>contig N50<br>length (L50)<span onclick='event.stopPropagation()' class='tooltiptext'><a href='https://en.wikipedia.org/wiki/N50,_L50,_and_related_statistics' target=_blank>N50 (L50)</a> length.</span></div></th>\n"; printf " <th class='colIUCN'><div class='tooltip'>IUCN<span onclick='event.stopPropagation()' class='tooltiptext'>Links to <a href='https://www.iucnredlist.org/' target=_blank>IUCN Red List</a> of Threatened Species (version 2021-3) <span style='color:%s;'>CR - Critical</span> / <span style='color:%s;'>EN - Endangered</span> / <span style='color:%s;'>VU - Vulnerable</span></span></div></th>\n", $statusColors{"CR"}, $statusColors{"EN"}, $statusColors{"VU"}; printf " <th class='colTaxId'><div class='tooltip'>NCBI taxID<span onclick='event.stopPropagation()' class='tooltiptext'>Links to <a href='https://www.ncbi.nlm.nih.gov/taxonomy' target='_blank'>NCBI Taxonomy</a> database.</span></div></th>\n"; printf " <th class='colAsmDate'><div class='tooltip'>assembly<br>date<span 
onclick='event.stopPropagation()' class='tooltiptext'>Date submitted to <a href='https://www.ncbi.nlm.nih.gov/assembly' target=_blank>NCBI assembly</a> database.</span></div></th>\n"; +printf " <th class='colBioSample sorttable_alpha'><div class='tooltip'>BioSample<span onclick='event.stopPropagation()' class='tooltiptext'>BioSample ID at <a href='https://www.ncbi.nlm.nih.gov/biosample' target=_blank>NCBI</a>.</span></div></th>\n"; +printf " <th class='colBioProject sorttable_alpha'><div class='tooltip'>BioProject<span onclick='event.stopPropagation()' class='tooltiptext'>BioProject ID at <a href='https://www.ncbi.nlm.nih.gov/bioproject' target=_blank>NCBI</a>.</span></div></th>\n"; printf " <th class='colSubmitter sorttable_alpha'><div class='tooltip'>Assembly submitter<span onclick='event.stopPropagation()' class='tooltiptextright'>Person or group who submitted to <a href='https://www.ncbi.nlm.nih.gov/assembly' target=_blank>NCBI Assembly</a> database.</span></div></th>\n"; -printf " <th class='colClade'><div class='tooltip'>clade<span onclick='event.stopPropagation()' class='tooltiptextright'>Clade of this organism. 
Note: the <em>invertebrate</em> clade is a catch all category that includes organisims not typically classified as <em>invertebrate</em></span></div></th>\n"; +printf " <th class='colClade'><div class='tooltip'>clade<span onclick='event.stopPropagation()' class='tooltiptextright'>Clade of this organism.</span></div></th>\n"; printf "</tr>\n"; printf "</thead><tbody>\n"; my %equivalentNamesUsed; # key is NCBI sciName, value is IUCN sciName my $pageSectionCount = 0; my %checkDupAsmId; # key is asmId, value is count of times seen my %cladeSciNameCounts; # key is clade, value is number of different # scientific names my %gcfGcaCounts; # key is GCF or GCA, value is count of each my $asmCountInTable = 0; # counting the rows output my %statusCounts; # key is status: CR EN VU, value is count my $totalAssemblySize = 0; # sum of all assembly sizes @@ -899,30 +917,35 @@ my $browserUrl = sprintf("https://genome.ucsc.edu/h/%s", $accessionId); my $arkDownload = sprintf("https://hgdownload.soe.ucsc.edu/hubs/%s/%s/%s/%s/%s/", $gcX, $d0, $d1, $d2, $accessionId); my $destDir = "/hive/data/outside/ncbi/genomes/sizes/$gcX/$d0/$d1/$d2"; my $chromInfo = "/hive/data/outside/ncbi/genomes/sizes/$gcX/$d0/$d1/$d2/${asmId}.chromInfo.txt"; my $n50Txt = "/hive/data/outside/ncbi/genomes/sizes/$gcX/$d0/$d1/$d2/${asmId}.n50.txt"; my $contigN50 = "/hive/data/outside/ncbi/genomes/sizes/$gcX/$d0/$d1/$d2/${asmId}.contigs.n50.txt"; my $scaffoldN50 = "/hive/data/outside/ncbi/genomes/sizes/$gcX/$d0/$d1/$d2/${asmId}.scaffolds.n50.txt"; my $asmSize = 0; my $asmContigCount = 0; my $n50Size = 0; my $n50Count = 0; my $n50ContigSize = 0; my $n50ContigCount = 0; my $n50ScaffoldSize = 0; my $n50ScaffoldCount = 0; + my $bioSample = ""; + my $bioProject = ""; + if (defined($asmReportData{$asmId})) { + (undef, undef, $bioSample, $bioProject, undef) = split('\t', $asmReportData{$asmId}, 5); + } if (defined($metaInfo{$asmId})) { ($asmSize, $asmContigCount, $n50Size, $n50Count, $n50ContigSize, $n50ContigCount, 
$n50ScaffoldSize, $n50ScaffoldCount) = split('\t', $metaInfo{$asmId}); } else { my ($dev, $ino, $mode, $nlink, $uid, $gid, $rdev, $size, $atime, $mtime, $ctime, $blksize, $blocks); my $fnaModTime = 0; if (-s "$asmFna") { ($dev, $ino, $mode, $nlink, $uid, $gid, $rdev, $size, $atime, $mtime, $ctime, $blksize, $blocks) = stat($asmFna); $fnaModTime = $mtime; } my $ciModTime = 0; if ( -s "$chromInfo" ) { ($dev, $ino, $mode, $nlink, $uid, $gid, $rdev, $size, $atime, $mtime, $ctime, $blksize, $blocks) = stat($chromInfo); $ciModTime = $mtime; } if ($fnaModTime > $ciModTime) { @@ -992,30 +1015,31 @@ $browserUrl = sprintf("https://genome.ucsc.edu/cgi-bin/hgTracks?db=%s", $rrGcaGcfList{$asmId}); } printf PC "%d", $asmCountInTable; # start a line output to clade.tableData.tsv ## count number of different scientific names used in this clade table if (!defined($cladeSciNameCounts{$clade})) { my %h; $cladeSciNameCounts{$clade} = \%h; } my $csnPtr = $cladeSciNameCounts{$clade}; $csnPtr->{$sciNames{$asmId}} += 1 if (defined($sciNames{$asmId})); my $rowClass = ""; my $gcaGcfClass = "gca"; $gcaGcfClass = "gcf" if ($asmId =~ m/^GCF/); + $gcaGcfClass .= " hasIucn" if (length($iucnLink)); if (defined($comName{$asmId})) { if (defined($rrGcaGcfList{$asmId})) { $rowClass = " class='ucscDb $gcaGcfClass $clade'"; # present in UCSC db } else { $rowClass = " class='gak $gcaGcfClass $clade'"; # present in GenArk } } else { # can be requested if (defined($rrGcaGcfList{$asmId})) { $rowClass = " class='ucscDb $gcaGcfClass $clade'"; # present in UCSC db } else { $rowClass = " class='gar $gcaGcfClass $clade'"; # available for request } } ### can override CSS settings here ### $rowClass = " class='gar' style='display: none;'"; @@ -1123,30 +1147,49 @@ printf PC "\t%s", $asmTaxId{$asmId}; # output to clade.tableData.txt } else { printf "<td style='display:none;'>n/a</td>"; printf PC "\t%s", "n/a"; # output to clade.tableData.txt } ############# tenth column, assembly date ################ if 
(defined($asmDate{$asmId})) { printf "<td style='display:none;'>%s</td>", $asmDate{$asmId}; printf PC "\t%s", $asmDate{$asmId}; # output to clade.tableData.txt } else { printf "<td style='display:none;'>n/a</td>"; printf PC "\t%s", "n/a"; # output to clade.tableData.txt } + ############# eleventh column, bioSample ################ + if (length($bioSample) && $bioSample !~ m#n/a#) { + printf "<td style='display:none; text-align:left;'><a href='https://www.ncbi.nlm.nih.gov/biosample/?term=%s' target=_blank>%s</a></td>", $bioSample, $bioSample; + printf PC "\t%s", $bioSample; # output to clade.tableData.txt + } else { + printf "<td style='display:none; text-align:left;'> </td>"; + printf PC "\t%s", "n/a"; # output to clade.tableData.txt + } + + ############# twelfth column, bioProject ################ + if (length($bioProject) && $bioProject !~ m#n/a#) { + printf "<td style='display:none; text-align:left;'><a href='https://www.ncbi.nlm.nih.gov/bioproject/?term=%s' target=_blank>%s</a></td>", $bioProject, $bioProject; + printf PC "\t%s", $bioProject; # output to clade.tableData.txt + + } else { + printf "<td style='display:none; text-align:left;'> </td>"; + printf PC "\t%s", "n/a"; # output to clade.tableData.txt + } + ############# eleventh column, submitter ################ $asmUrl = "https://www.ncbi.nlm.nih.gov/assembly/$accessionId"; if (defined($asmSubmitter{$asmId})) { my $submitterSortKey = lc($asmSubmitter{$asmId}); $submitterSortKey =~ s/ //g; $submitterSortKey =~ s/[^a-z0-9]//ig; printf "<td sorttable_customkey='%s' style='display:none;'>%s</td>", substr($submitterSortKey,0,20), $asmSubmitter{$asmId}; printf PC "\t%s", $asmSubmitter{$asmId}; # output to clade.tableData.txt } else { printf "<td sorttable_customkey='n/a' style='display:none;'>n/a</td>"; printf PC "\t%s", "n/a"; # output to clade.tableData.txt } ############# twelveth column, clade ################ printf "<td style='display:none;'>%s</td>\n", $clade;