**===== summary counts =====**
number of assemblies	category of count	table data in TSV (tab-separated values) file format	minimum assembly size used to filter out projects that are not whole genomes
%s	total number of NCBI assemblies under consideration
%s	number of unique species in NCBI assemblies
%s	number of unique NCBI species matched to IUCN classification
%s	number of IUCN species with CR/EN/VU classification
%s	number of such IUCN species matched to NCBI assemblies
%s	total number of NCBI assemblies classified in these tables
%s	%s	%s	%s

===== summary counts =====

number of assemblies

category of count

table data in TSV
(tab-separated values)
file format

minimum assembly size
used to filter out projects that
are not whole genomes

total number of NCBI assemblies under consideration

number of unique species in NCBI assemblies

number of unique NCBI species matched to IUCN classification

number of IUCN species with CR/EN/VU classification

number of such IUCN species matched to NCBI assemblies

total number of NCBI assemblies classified in these tables

\n"; printf "

\n\n"; printf "\n\n", commify($totalAssemblies); ############################################################################## ## begin single table output, start the table and the header ## ## table starts out as display: none and will be reset to 'table' after ## page load. Saves a lot of time for Chrome browsers, however the page ## is still not usable until much time later. ############################################################################## printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf " \n"; printf " \n"; printf " \n"; printf " \n"; printf " \n"; printf " \n"; printf " \n"; printf " \n"; printf " \n", $statusColors{"CR"}, $statusColors{"EN"}, $statusColors{"VU"}; printf " \n"; printf " \n"; printf " \n"; printf " \n"; printf " \n"; printf " \n"; printf "\n"; printf "\n"; my %equivalentNamesUsed; # key is NCBI sciName, value is IUCN sciName my $pageSectionCount = 0; my %checkDupAsmId; # key is asmId, value is count of times seen my %cladeSciNameCounts; # key is clade, value is number of different # scientific names my %gcfGcaCounts; # key is GCF or GCA, value is count of each my $asmCountInTable = 0; # counting the rows output my %statusCounts; # key is status: CR EN VU, value is count my $totalAssemblySize = 0; # sum of all assembly sizes my $outputGenArkRR = 0; my %outputGenArk; # key is asmId, value is clade my %outputRR; # key is asmId, value is clade foreach my $clade (@clades) { my $cPtr = $cladeToGo{$clade}; my $countThisClade = scalar(@$cPtr); printf STDERR "# working on clade '%s', count: %s\n", $clade, commify($countThisClade); ## sectionDiv($clade); # starting new clade table ++$pageSectionCount; my $totalContigCounts = 0; my $underSized = 0; my $noCommonName = 0; my $suppressedCount = 0; ######################## rows of 
per clade table output here ################ my $tsvFile = sprintf("%s.tableData.txt", $clade); open (PC, ">$tsvFile") or die "can not write to $tsvFile"; foreach my $asmId (@$cPtr) { if (defined($checkDupAsmId{$asmId})) { printf STDERR "# what: duplicate asmId: %s in clade %s\n", $asmId, $clade; $checkDupAsmId{$asmId} += 1; } else { $checkDupAsmId{$asmId} = 1; } my $assembliesAvailable = 0; if (defined($sciNames{$asmId})) { if (defined($sciNameCount{$sciNames{$asmId}})) { $assembliesAvailable = $sciNameCount{$sciNames{$asmId}}; } } if (defined ($asmSuppressed{$asmId})) { if (!defined($rrGcaGcfList{$asmId})) { if (! defined($genArkAsm{$asmId}) && !defined($rrGcaGcfList{$asmId})) { ++$suppressedCount; printf STDERR "# suppressed $asmId\n" if ($suppressedCount < 5); next; } else { printf STDERR "# genArk/RR %s would have been suppressed\n", $asmId; } } } my $commonName = "n/a"; if (defined($ncbiCommonName{$asmId})) { $commonName = $ncbiCommonName{$asmId}; if ("n/a" eq $commonName) { printf STDERR "# getting n/a commonName from ncbiCommonName{%s}\n", $asmId; } } elsif (defined($comName{$asmId})) { $commonName = $comName{$asmId}; if ("n/a" eq $commonName) { printf STDERR "# getting n/a commonName from comName{%s}\n", $asmId; } } else { if (defined($genArkAsm{$asmId}) || defined($rrGcaGcfList{$asmId})) { printf STDERR "# ACK missed genArk/RR due to no common name for %s\n", $asmId; } ++$noCommonName; printf STDERR "# no commonName for %s in ncbiCommonName or comName\n", $asmId if ($noCommonName < 5); next; } # if (! 
defined($gcxCountry{$asmId})) { # printf STDERR "# no country for $asmId, date: %s, submitter: %s\n", $gcxDate{$asmId}, $gcxSubmitter{$asmId}; # next; # } # if ("n/a" eq $gcxCountry{$asmId}) { # printf STDERR "# country is n/a $asmId\n"; # next; # } my ($p0, $p1, $p2) = split('_', $asmId, 3); my $accessionId = "${p0}_${p1}"; my $gcX = substr($asmId, 0, 3); my $d0 = substr($asmId, 4, 3); my $d1 = substr($asmId, 7, 3); my $d2 = substr($asmId, 10, 3); my $buildDir = "/hive/data/outside/ncbi/genomes/$gcX/$d0/$d1/$d2/$asmId"; my $asmRpt = "$buildDir/${asmId}_assembly_report.txt"; my $asmFna = "$buildDir/${asmId}_genomic.fna.gz"; my $browserUrl = sprintf("/h/%s", $accessionId); my $arkDownload = sprintf("https://hgdownload.soe.ucsc.edu/hubs/%s/%s/%s/%s/%s/", $gcX, $d0, $d1, $d2, $accessionId); my $destDir = "/hive/data/outside/ncbi/genomes/sizes/$gcX/$d0/$d1/$d2"; my $chromInfo = "/hive/data/outside/ncbi/genomes/sizes/$gcX/$d0/$d1/$d2/${asmId}.chromInfo.txt"; my $n50Txt = "/hive/data/outside/ncbi/genomes/sizes/$gcX/$d0/$d1/$d2/${asmId}.n50.txt"; my $contigN50 = "/hive/data/outside/ncbi/genomes/sizes/$gcX/$d0/$d1/$d2/${asmId}.contigs.n50.txt"; my $scaffoldN50 = "/hive/data/outside/ncbi/genomes/sizes/$gcX/$d0/$d1/$d2/${asmId}.scaffolds.n50.txt"; my $asmSize = 0; my $asmContigCount = 0; my $n50Size = 0; my $n50Count = 0; my $n50ContigSize = 0; my $n50ContigCount = 0; my $n50ScaffoldSize = 0; my $n50ScaffoldCount = 0; my $bioSample = ""; my $bioProject = ""; if (defined($asmReportData{$asmId})) { (undef, undef, $bioSample, $bioProject, undef) = split('\t', $asmReportData{$asmId}, 5); } if (defined($metaInfo{$asmId})) { ($asmSize, $asmContigCount, $n50Size, $n50Count, $n50ContigSize, $n50ContigCount, $n50ScaffoldSize, $n50ScaffoldCount) = split('\t', $metaInfo{$asmId}); } else { my ($dev, $ino, $mode, $nlink, $uid, $gid, $rdev, $size, $atime, $mtime, $ctime, $blksize, $blocks); my $fnaModTime = 0; if (-s "$asmFna") { ($dev, $ino, $mode, $nlink, $uid, $gid, $rdev, $size, 
$atime, $mtime, $ctime, $blksize, $blocks) = stat($asmFna); $fnaModTime = $mtime; } my $ciModTime = 0; if ( -s "$chromInfo" ) { ($dev, $ino, $mode, $nlink, $uid, $gid, $rdev, $size, $atime, $mtime, $ctime, $blksize, $blocks) = stat($chromInfo); $ciModTime = $mtime; } if ($fnaModTime > $ciModTime) { printf STDERR "# new %s\n", $chromInfo; printf STDERR "# from %s\n", $asmFna; print `mkdir -p $destDir`; print `faSize -detailed $asmFna | sort -k2,2nr > $chromInfo`; print `touch -r $asmFna $chromInfo`; } if ( -s "$chromInfo" ) { ($asmSize, $asmContigCount) = sizeUpChromInfo($chromInfo); if ( ! -s "$n50Txt" ) { print `n50.pl "$chromInfo" > "$n50Txt" 2>&1`; } ($n50Size, $n50Count) = readN50($n50Txt); if ( -s "$contigN50" ) { ($n50ContigSize, $n50ContigCount) = readN50($contigN50); } if ( -s "$scaffoldN50" ) { ($n50ScaffoldSize, $n50ScaffoldCount) = readN50($scaffoldN50); } } else { printf STDERR "# chromInfo missing: %s\n", $asmId; printf STDERR "# %s\n", $chromInfo; } die "ERROR duplicate newMetaInfo{$asmId}" if (defined($newMetaInfo{$asmId})); die "ERROR duplicate metaInfo{$asmId} for newMetaInfo{$asmId}" if (defined($metaInfo{$asmId})); $newMetaInfo{$asmId} = join("\t", ($asmSize, $asmContigCount, $n50Size, $n50Count, $n50ContigSize, $n50ContigCount, $n50ScaffoldSize, $n50ScaffoldCount) ); } # else if (defined($metaInfo{$asmId})) # if asmSize is below the minimum, don't use it if ($asmSize < $minimalGenomeSize{$clade}) { printf STDERR "# %s underSized 2 %d %s %s < %s\n", $clade, ++$underSized, $asmId, commify($asmSize), commify($minimalGenomeSize{$clade}); printf STDERR "# ACK would be genArk assembly %s\n", $asmId if (defined($genArkAsm{$asmId})); printf STDERR "# ACK would be UCSC RR %s\n", $asmId if (defined($rrGcaGcfList{$asmId})); next; } my $iucnStatus = " "; my $iucnLink = ""; if (defined($sciNames{$asmId})) { my $iucnSciName = $sciNames{$asmId}; $iucnSciName = $ncbiToIucnNames{$sciNames{$asmId}} if (defined($ncbiToIucnNames{$sciNames{$asmId}})); $iucnLink = 
"https://www.iucnredlist.org/species/$iucnLink{$iucnSciName}" if (defined($iucnLink{$iucnSciName})); if ($iucnSciName ne $sciNames{$asmId}) { $equivalentNamesUsed{$sciNames{$asmId}} = $iucnSciName; } if (defined($iucnSciNames{$iucnSciName})) { $iucnStatus = $iucnSciNames{$iucnSciName}; } } ++$asmCountInTable; my $statusColor = ""; if ($iucnStatus ne " ") { $statusColor = $statusColors{$iucnStatus}; ++$statusCounts{$iucnStatus}; } ############# starting a table row ################################# ++$outputGenArkRR if (defined($rrGcaGcfList{$asmId}) || defined($genArkAsm{$asmId})); $outputGenArk{$asmId} = $clade if (defined($genArkAsm{$asmId})); $outputRR{$asmId} = $clade if (defined($rrGcaGcfList{$asmId})); if ($asmId =~ m/^GCF/) { $gcfGcaCounts{'GCF'} += 1; } elsif ($asmId =~ m/GCA/) { $gcfGcaCounts{'GCA'} += 1; } ### If equivalent to UCSC database browser, make reference to RR browser my $ucscDb = ""; $ucscDb = "/" . $rrGcaGcfList{$asmId} if (defined($rrGcaGcfList{$asmId})); if (length($ucscDb)) { $browserUrl = sprintf("/cgi-bin/hgTracks?db=%s", $rrGcaGcfList{$asmId}); } - printf PC "%d", $asmCountInTable; # start a line output to clade.tableData.tsv + printf PC "%s", $browserUrl; # start a line output to clade.tableData.tsv ## count number of different scientific names used in this clade table if (!defined($cladeSciNameCounts{$clade})) { my %h; $cladeSciNameCounts{$clade} = \%h; } my $csnPtr = $cladeSciNameCounts{$clade}; $csnPtr->{$sciNames{$asmId}} += 1 if (defined($sciNames{$asmId})); my $rowClass = ""; my $gcaGcfClass = "gca"; $gcaGcfClass = "gcf" if ($asmId =~ m/^GCF/); $gcaGcfClass .= " hasIucn" if (length($iucnLink)); if (defined($comName{$asmId})) { if (defined($rrGcaGcfList{$asmId})) { $rowClass = " class='ucscDb $gcaGcfClass $clade'"; # present in UCSC db } else { $rowClass = " class='gak $gcaGcfClass $clade'"; # present in GenArk } } else { # can be requested if (defined($rrGcaGcfList{$asmId})) { $rowClass = " class='ucscDb $gcaGcfClass $clade'"; # 
present in UCSC db } else { $rowClass = " class='gar $gcaGcfClass $clade'"; # available for request } } ### can override CSS settings here ### $rowClass = " class='gar' style='display: none;'"; # experiment with hiding all rows over 500 count to see if that helps # chrom browser initial loading performance # try out the table with out any count, just get the row started if (length($statusColor)) { my $statusClass = sprintf(" style='color:%s;", $statusColor); # let's see what nostatus looks like $statusClass = ""; if ($asmCountInTable > 500) { printf "", $browserUrl; printf PC "\tview"; # output to clade.tableData.tsv printf "", $commonName; } else { if (length($ucscDb)) { printf "", $browserUrl; printf PC "\tview"; # output to clade.tableData.tsv printf "", $commonName; } else { printf "", $asmId; printf PC "\trequest"; # output to clade.tableData.tsv printf "", $commonName; } } printf PC "\t%s", $commonName; # output to clade.tableData.tsv ############# second column, scientific name and google image search ######### if (defined($sciNames{$asmId})) { my $noSpace = $sciNames{$asmId}; $noSpace =~ s/ /+/g; my $imgSearchUrl="https://images.google.com/images?q=$noSpace&um=1&hl=en&safe=active&nfpr=1&tbs=il:cl"; if ($assembliesAvailable > 1) { printf "", $imgSearchUrl, $sciNames{$asmId}, commify($assembliesAvailable); } else { printf "", $imgSearchUrl, $sciNames{$asmId}; } printf PC "\t%s", $sciNames{$asmId}; # output to clade.tableData.txt } else { printf ""; printf PC "\t%s", "n/a"; # output to clade.tableData.txt } ############# third column, NCBI assembly and link to NCBI ############ my $asmUrl = "https://www.ncbi.nlm.nih.gov/assembly/$accessionId"; printf "", $asmUrl, $asmId, $ucscDb; printf PC "\t%s", $asmId; # output to clade.tableData.txt ############# fourth column, assembly size ################ if ($asmSize > 0) { $totalAssemblySize += $asmSize; printf "", commify($asmSize); printf PC "\t%d", $asmSize; # output to clade.tableData.txt } else { printf ""; 
printf PC "\t%s", "n/a"; # output to clade.tableData.txt } ############# fifth column, sequence count ################ if ($asmContigCount > 0) { $totalContigCounts += $asmContigCount; printf "", commify($asmContigCount); printf PC "\t%d", $asmContigCount; # output to clade.tableData.txt } else { printf ""; printf PC "\t%s", "n/a"; # output to clade.tableData.txt } ############# sixth, seventh columns, N50 for scaffold, contig ############## if ($n50ScaffoldSize > 0 ) { n50Cell($n50ScaffoldSize, $n50ScaffoldCount, \*PC); } else { # substitute assembly N50 when no scaffold N50 available n50Cell($n50Size, $n50Count, \*PC); } n50Cell($n50ContigSize, $n50ContigCount, \*PC); ############# eighth column, IUCN status and link ################ if (length($iucnLink) > 0) { printf "", $iucnStatus, $iucnLink, $iucnStatus; } else { printf "", $iucnStatus; } printf PC "\t%s", $iucnStatus; # output to clade.tableData.txt ############# ninth column, taxId and link to NCBI ################ if (defined($asmTaxId{$asmId})) { my $taxUrl = "https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=$asmTaxId{$asmId}"; printf "", $taxUrl, $asmTaxId{$asmId}; printf PC "\t%s", $asmTaxId{$asmId}; # output to clade.tableData.txt } else { printf ""; printf PC "\t%s", "n/a"; # output to clade.tableData.txt } ############# tenth column, assembly date ################ if (defined($asmDate{$asmId})) { printf "", $asmDate{$asmId}; printf PC "\t%s", $asmDate{$asmId}; # output to clade.tableData.txt } else { printf ""; printf PC "\t%s", "n/a"; # output to clade.tableData.txt } ############# eleventh column, bioSample ################ if (length($bioSample) && $bioSample !~ m#n/a#) { printf "", $bioSample, $bioSample; printf PC "\t%s", $bioSample; # output to clade.tableData.txt } else { printf ""; printf PC "\t%s", "n/a"; # output to clade.tableData.txt } ############# twelveth column, bioProject ################ if (length($bioProject) && $bioProject !~ m#n/a#) { printf "", $bioProject, 
$bioProject; printf PC "\t%s", $bioProject; # output to clade.tableData.txt } else { printf ""; printf PC "\t%s", "n/a"; # output to clade.tableData.txt } ############# eleventh column, submitter ################ $asmUrl = "https://www.ncbi.nlm.nih.gov/assembly/$accessionId"; if (defined($asmSubmitter{$asmId})) { my $submitterSortKey = lc($asmSubmitter{$asmId}); $submitterSortKey =~ s/ //g; $submitterSortKey =~ s/[^a-z0-9]//ig; printf "", substr($submitterSortKey,0,20), $asmSubmitter{$asmId}; printf PC "\t%s", $asmSubmitter{$asmId}; # output to clade.tableData.txt } else { printf ""; printf PC "\t%s", "n/a"; # output to clade.tableData.txt } ############# twelveth column, clade ################ printf "\n", $clade; printf PC "\t%s", $clade; printf PC "\n"; # finished a line output to clade.tableData.txt printf "\n"; } # foreach my $asmId (@$cPtr) close (PC); # finished with clade.tableData.txt output printf STDERR "# no commonName %s for clade %s\n", commify($noCommonName), $clade; printf STDERR "# suppressed %s for clade %s\n", commify($suppressedCount), $clade; } # foreach my $clade (@clades) printf STDERR "# output %s genArk or RR assemblies\n", commify($outputGenArkRR); ########################################################################## ## single table is finished, output the end of tbody and the tfoot row ########################################################################## if ($asmCountInTable > 1) { my $crCount = 0; my $enCount = 0; my $vuCount = 0; foreach my $statId (keys %statusCounts) { $crCount = $statusCounts{$statId} if ($statId eq "CR"); $enCount = $statusCounts{$statId} if ($statId eq "EN"); $vuCount = $statusCounts{$statId} if ($statId eq "VU"); } my $sciNameTotal = 0; foreach my $c (@clades) { my $csnPtr = $cladeSciNameCounts{$c}; my $sciNameTotal = 0; foreach my $cladeSciName (keys %$csnPtr) { ++$sciNameTotal; } } printf " \n"; } else { print "\n"; } printf "

\n"; printf "

Please note: in some cases the scientific name used in the NCBI assembly was translated to the equivalent IUCN scientific name in order to establish an equivalence between the two data sources. Please be aware of this translation when interpreting the IUCN status.

view/request ⓘ'view' opens the genome browser for an existing assembly, 'request' opens an assembly request form.	English common name ⓘEnglish common name	scientific name (count) ⓘLinks to Google image search. Count shows the number of assemblies available for this organism.	NCBI Assembly ⓘLinks to NCBI resource record.	assembly size ⓘNumber of nucleotides in the assembly.	sequence count ⓘThe number of sequences in this assembly.	scaffold N50 length (L50) ⓘN50 (L50) length.	contig N50 length (L50) ⓘN50 (L50) length.	IUCN ⓘLinks to IUCN Red List of Threatened Species (version 2021-3). CR - Critically Endangered / EN - Endangered / VU - Vulnerable	NCBI taxID ⓘLinks to NCBI Taxonomy database.	assembly date ⓘDate submitted to NCBI Assembly database.	BioSample ⓘBioSample ID at NCBI.	BioProject ⓘBioProject ID at NCBI.	Assembly submitter ⓘPerson or group who submitted to NCBI Assembly database.	clade ⓘClade of this organism.
view	%s	view	%s		%s	%s (%s)	%s	n/a	%s%s	%s		%s		%s	%s	%s	n/a	%s	n/a	%s		%s		%s	n/a	%s

**IUCN to NCBI scientific name translation**
NCBI assembly scientific name	translated to IUCN scientific name
%s	%s

IUCN to NCBI scientific name translation

NCBI assembly scientific name

translated to IUCN scientific name

. . . please wait while page loads . . .

%s total assemblies : use the selection menus to select subsets