d4fdeadb1c734b4bf4d9885f462fa93c79df55b7 hiram Mon Feb 21 16:02:14 2022 -0800 add tooltip doc on column headers refs #28930 diff --git src/hg/gar/garTable.pl src/hg/gar/garTable.pl index 1db498f..c5f7856 100755 --- src/hg/gar/garTable.pl +++ src/hg/gar/garTable.pl @@ -71,53 +71,57 @@ return sprintf("%d", $n) if ($n < 1000); my $m = $n/1000; return sprintf("%.2fK", $m) if ($m < 1000); $m = $n/1000000; return sprintf("%.2fM", $m) if ($m < 1000); $m = $n/1000000000; return sprintf("%.3fG", $m); } ############################################################################### ############################################################################### # output a table cell for an N50 measurement sub n50Cell($$$) { my ($size, $count, $fh) = @_; if ($size > 0) { - printf "%s (%s)", $size, gmk($size), $count; + printf "%s (%s)", $size, gmk($size), commify($count); printf $fh "\t%d (%d)", $size, $count; # output to clade.tableData.txt } else { printf " "; printf $fh "\tn/a (n/a)"; # output to clade.tableData.txt } } ############################################################################### my @clades = qw( primates mammals birds fish vertebrate invertebrates plants fungi ); # my @clades = qw( primates mammals birds ); # to help weed out some of the noise # key is clade, value is minimal size to count as a whole genome +# these are actually pretty low to allow in some alternate haplotype +# assemblies that don't seem to be the whole assembly. 
+# The assemblies are also filtered by NCBI status 'full/partial' to only +# allow in the 'full' genomes meaning representation of the whole genome my %minimalGenomeSize = ( primates => 1000000000, - mammals => 200000000, + mammals => 20000000, birds => 200000000, - fish => 1000000, - vertebrate => 400000000, - invertebrates => 10000000, - plants => 10000000, - fungi => 1000000, + fish => 100000, + vertebrate => 4000000, + invertebrates => 10000, + plants => 100000, + fungi => 50000, ); ######################################################################### ## read in list of current GenArk assemblies my %genArkAsm; # key is asmId, value is string with: # accessionassemblyscientific namecommon nametaxonId my $genArkCount = 0; printf STDERR "# reading UCSC_GI.assemblyHubList.txt\n"; open (FH, ") { next if ($line =~ m/^#/); chomp $line; @@ -548,43 +552,39 @@ $asmId =~ s/\//_/g; $asmId =~ s/\#/_/g; # $asmId =~ s/[.:%+/#]/_/g; $asmId =~ s/[()]//g; $asmId =~ s/__/_/g; ++$shouldBeGenArk if (defined($genArkAsm{$asmId})); ++$shouldBeUcsc if (defined($rrGcaGcfList{$asmId})); next if (defined ($skipPartialGenome{$asmId})); next if (defined ($asmSuppressed{$asmId})); next if (defined ($alreadyDone{$asmId})); # something wrong with these two # GCA_900609255.1_Draft_mitochondrial_genome_of_wild_rice_W1683 # GCA_900609265.1_Draft_mitochondrial_genome_of_wild_rice_W1679 next if ($asmId =~ m/GCA_900609255.1|GCA_900609265.1/); # verify this asmId will pass the asmSize limit -### if (defined($metaInfo{$asmId})) { -### my ($asmSize, $asmContigCount, $n50Size, $n50Count, $n50ContigSize, $n50ContigCount, $n50ScaffoldSize, $n50ScaffoldCount) = split('\t', $metaInfo{$asmId}); -### next if ($asmSize < $minimalGenomeSize{$clade}); # too small -### } if (defined($metaInfo{$asmId})) { my ($asmSize, $asmContigCount, $n50Size, $n50Count, $n50ContigSize, $n50ContigCount, $n50ScaffoldSize, $n50ScaffoldCount) = split('\t', $metaInfo{$asmId}); # if asmSize is below the minimum, don't use it if 
($asmSize < $minimalGenomeSize{$clade}) { printf STDERR "# %s underSized 0 %d %s %s < %s\n", $clade, ++$underSized{$clade}, $asmId, commify($asmSize), commify($minimalGenomeSize{$clade}); printf STDERR "# ACK would be genArk assembly %s\n", $asmId if (defined($genArkAsm{$asmId})); printf STDERR "# ACK would be UCSC RR %s\n", $asmId if (defined($rrGcaGcfList{$asmId})); printf STDERR "# ACK metaInfo: %s '%s'\n", $asmId, $metaInfo{$asmId}; -### XXX next; + next; } } $alreadyDone{$asmId} = 1; if (defined($genArkClade{$asmId})) { die "ERROR: duplicate asmId today $asmId '$clade' '$genArkClade{$asmId}'"; } ++$checkedAsmIds; my $iucnSciName = ""; if (defined($sciNames{$asmId})) { ++$ncbiSpeciesRecorded{$sciNames{$asmId}}; next if ($ncbiSpeciesRecorded{$sciNames{$asmId}} > $sciNameDisplayLimit); $iucnSciName = $sciNames{$asmId}; $iucnSciName = $ncbiToIucnNames{$sciNames{$asmId}} if (defined($ncbiToIucnNames{$sciNames{$asmId}})); ++$iucnSpeciesRecorded{$iucnSciName}; } else { @@ -611,31 +611,31 @@ } if ($assembliesAvailable > 1) { my $bPtr = $sciNameAsmList{$sciNames{$asmId}}; foreach my $aId (@$bPtr) { next if (defined ($alreadyDone{$aId})); $alreadyDone{$aId} = 1; if ($aId ne $asmId) { if (defined($metaInfo{$aId})) { my ($asmSize, $asmContigCount, $n50Size, $n50Count, $n50ContigSize, $n50ContigCount, $n50ScaffoldSize, $n50ScaffoldCount) = split('\t', $metaInfo{$aId}); # if asmSize is below the minimum, don't use it if ($asmSize < $minimalGenomeSize{$clade}) { printf STDERR "# %s underSized 1 %d %s %s < %s\n", $clade, ++$underSized{$clade}, $aId, commify($asmSize), commify($minimalGenomeSize{$clade}); printf STDERR "# ACK would be genArk assembly %s\n", $aId if (defined($genArkAsm{$aId})); printf STDERR "# ACK would be UCSC RR %s\n", $aId if (defined($rrGcaGcfList{$aId})); printf STDERR "# ACK metaInfo: %s '%s'\n", $aId, $metaInfo{$aId}; -### XXX next; + next; } } ++$ncbiSpeciesRecorded{$sciNames{$aId}}; # the defined($sciName{$aId}) indicates it is a GenArk genome # 
always accept those even if it goes beyond the limit if ( ($ncbiSpeciesRecorded{$sciNames{$aId}} <= $sciNameDisplayLimit) || defined($sciName{$aId}) ) { push (@$cPtr, $aId); ++$acceptedAsmIds; ++$goodToGoCount; ++$cladeCounts{$clade}; } # under limit count or is GenArk assembly } # if ($aId ne $asmId) } # foreach my $aId (@$bPtr) } # if ($assembliesAvailable > 1) } # if (defined($sciNames{$asmId})) @@ -699,49 +699,30 @@ printf "top of page\n"; printf "\n"; } } # if ( 1 == 0 ) # count all assemblies in all clades my $totalAssemblies = 0; foreach my $c (@clades) { $totalAssemblies += $cladeCounts{$c}; } printf "\n"; printf "
\n"; printf "
\n"; - -printf "
\n"; -printf " show legend for column definitions\n"; -printf "
\n"; -printf "
    \n"; -printf "
  • Click on the column headers to sort the table by that column
  • \n"; -printf "
  • The common name column will either open a genome browser on that assembly when it exists, or it will submit a request to have this assembly added to the GenArk collection when it has not yet been built.
  • \n"; -# printf "
  • Use the select columns of information to display menu to show more or less information in the tables.
  • \n"; -# printf "
  • Use the select assembly type display menu to refine the types of assemblies to show in the tables.
  • \n"; -# printf "
  • Use the show/hide species clade sets menu to refine the species clades to display.
  • \n"; -printf "
  • Use the hide all selection to hide everything, or restore all items back when selections have been partially hidden.
  • \n"; -printf "
  • The scientific name link opens google image search for that species. A name followed by a number in (parens) indicates number of assemblies available for this species when there are too many to include here.
  • \n"; -printf "
  • IUCN status column is the status from the IUCN (2021) Red List of Threatened species, Version 2021-3, accessed on 17 Dec 2021 CR - Critical / EN - Endangered / VU - Vulnerable The link goes to the IUCN web site for more information for that species.
  • \n", $criticalColor, $endangeredColor, $vulnerableColor; -printf "
  • The NCBI taxID links to the NCBI taxonomy database.
  • \n"; -printf "
\n"; -printf "
\n"; -printf "
\n"; - - printf "
\n"; printf " choose clades to view/hide\n"; printf "
\n"; printf "
    \n"; printf "
  • \n"; printf "
  • \n"; printf "
  • \n"; printf "
  • \n"; printf "
  • \n"; printf "
  • \n"; printf "
  • \n"; printf "
  • \n"; printf "
  • \n"; printf "
\n"; printf "
\n"; @@ -805,42 +786,42 @@ printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; -printf " common name
display in browser
or request assembly\n"; -printf " scientific name (count)
google image search\n"; -printf " NCBI assembly\n"; -printf " assembly
size\n"; -printf " sequence
count\n"; -printf " scaffold N50
size (L50)\n"; -printf " contig N50
size (L50)\n"; -printf " I
U
C
N\n"; -printf " NCBI taxID\n"; -printf " assembly
date\n"; -printf " submitter of assembly
link to NCBI assembly\n"; -printf " clade\n"; +printf "
common nameThe common name links to a genome browser when it exists, or provides a request form to ask for the assembly to be added to the system
\n"; +printf "
scientific name (count)The number in parentheses (1234) following the name indicates the number of assemblies available for this species. The link does a Google image search for that name.
\n"; +printf "
NCBI assemblyThe NCBI assembly provides a link to the NCBI resource record for this assembly
\n"; +printf "
assembly
sizeThe assembly size is the total number of nucleotides in the assembly.
\n"; +printf "
sequence
countThe sequence count is the number of sequences in this assembly.
\n"; +printf "
scaffold N50
size (L50)The scaffold N50 is the N50 measurement for this assembly.
\n"; +printf "
contig N50
size (L50)The contig N50 is the N50 measurement for this assembly when there are contigs to measure.
\n"; +printf "
I
U
C
NIUCN status column is the status from the IUCN (2021) Red List of Threatened species, Version 2021-3, accessed on 17 Dec 2021 CR - Critical / EN - Endangered / VU - Vulnerable The link goes to the IUCN web site for more information for that species.
\n", $statusColors{"CR"}, $statusColors{"EN"}, $statusColors{"VU"}; +printf "
NCBI taxIDThe taxID links to the NCBI taxonomy database.
\n"; +printf "
assembly
dateThe submission date for this assembly to the NCBI assembly system.
\n"; +printf "
submitter of assemblyThe name of the organization that produced this assembly.
\n"; +printf "
cladeIndicating the clade of this organism. Note: the invertebrate clade is a catch-all category that includes organisms not typically classified as invertebrate
\n"; printf "\n"; printf "\n"; my %equivalentNamesUsed; # key is NCBI sciName, value is IUCN sciName my $pageSectionCount = 0; my %checkDupAsmId; # key is asmId, value is count of times seen my %cladeSciNameCounts; # key is clade, value is number of different # scientific names my %gcfGcaCounts; # key is GCF or GCA, value is count of each my $asmCountInTable = 0; # counting the rows output my %statusCounts; # key is status: CR EN VU, value is count my $totalAssemblySize = 0; # sum of all assembly sizes @@ -961,31 +942,31 @@ ($n50ScaffoldSize, $n50ScaffoldCount) = readN50($scaffoldN50); } } else { printf STDERR "# chromInfo missing: %s\n", $asmId; printf STDERR "# %s\n", $chromInfo; } die "ERROR duplicate newMetaInfo{$asmId}" if (defined($newMetaInfo{$asmId})); die "ERROR duplicate metaInfo{$asmId} for newMetaInfo{$asmId}" if (defined($metaInfo{$asmId})); $newMetaInfo{$asmId} = join("\t", ($asmSize, $asmContigCount, $n50Size, $n50Count, $n50ContigSize, $n50ContigCount, $n50ScaffoldSize, $n50ScaffoldCount) ); } # else if (defined($metaInfo{$asmId})) # if asmSize is below the minimum, don't use it if ($asmSize < $minimalGenomeSize{$clade}) { printf STDERR "# %s underSized 2 %d %s %s < %s\n", $clade, ++$underSized, $asmId, commify($asmSize), commify($minimalGenomeSize{$clade}); printf STDERR "# ACK would be genArk assembly %s\n", $asmId if (defined($genArkAsm{$asmId})); printf STDERR "# ACK would be UCSC RR %s\n", $asmId if (defined($rrGcaGcfList{$asmId})); -### XXX next; + next; } my $iucnStatus = " "; my $iucnLink = ""; if (defined($sciNames{$asmId})) { my $iucnSciName = $sciNames{$asmId}; $iucnSciName = $ncbiToIucnNames{$sciNames{$asmId}} if (defined($ncbiToIucnNames{$sciNames{$asmId}})); $iucnLink = "https://www.iucnredlist.org/species/$iucnLink{$iucnSciName}" if (defined($iucnLink{$iucnSciName})); if ($iucnSciName ne $sciNames{$asmId}) { $equivalentNamesUsed{$sciNames{$asmId}} = $iucnSciName; } if (defined($iucnSciNames{$iucnSciName})) { $iucnStatus = 
$iucnSciNames{$iucnSciName}; } } ++$asmCountInTable; @@ -1139,34 +1120,36 @@ printf PC "\t%s", "n/a"; # output to clade.tableData.txt } ############# tenth column, assembly date ################ if (defined($asmDate{$asmId})) { printf "%s", $asmDate{$asmId}; printf PC "\t%s", $asmDate{$asmId}; # output to clade.tableData.txt } else { printf "n/a"; printf PC "\t%s", "n/a"; # output to clade.tableData.txt } ############# eleventh column, submitter ################ $asmUrl = "https://www.ncbi.nlm.nih.gov/assembly/$accessionId"; if (defined($asmSubmitter{$asmId})) { - printf "%s", $asmUrl, $asmSubmitter{$asmId}; + my $submitterSortKey = lc($asmSubmitter{$asmId}); + $submitterSortKey =~ s/ //g; + printf "%s", substr($submitterSortKey,0,20), $asmSubmitter{$asmId}; printf PC "\t%s", $asmSubmitter{$asmId}; # output to clade.tableData.txt } else { - printf "n/a"; + printf "n/a"; printf PC "\t%s", "n/a"; # output to clade.tableData.txt } ############# twelveth column, clade ################ printf "%s\n", $clade; printf PC "\t%s", $clade; printf PC "\n"; # finished a line output to clade.tableData.txt printf "\n"; } # foreach my $asmId (@$cPtr) close (PC); # finished with clade.tableData.txt output } # foreach my $clade (@clades) ########################################################################## ## single table is finished, output the end of tbody and the tfoot row