452b689cab327a20bb7ac506eaf89a3f69d23feb hiram Tue Jan 17 08:23:41 2023 -0800 invertebrates has become invertebrate and master.run.list has changed columns no redmine diff --git src/hg/gar/garTable.pl src/hg/gar/garTable.pl index 247d28b..a803556 100755 --- src/hg/gar/garTable.pl +++ src/hg/gar/garTable.pl @@ -1,1431 +1,1430 @@ #!/usr/bin/env perl use strict; use warnings; # 1. reading iucnToNcbiEquivalent.txt to get ncbiToIucnNames hash # 2. reading iucn/IUCN.CR.txt # 3. reading iucn/IUCN.EN.txt # 4. reading iucn/IUCN.VU.txt # 5. reading iucn/links.csv # 6. reading asmId.sciName.commonName to get better common names # 7. reading accumulateMetaInfo.tsv to get better common names # 8. reading ../../../sizes/asmId.sciName.commonName.date.tsv # 9. reading assembly_summary_genbank.txt and assembly_summary_refseq.txt # to get asmId and scientific names for all assemblies # 10. reading asmId.suppressed.list # 11. reading asmId.date.taxId.asmSubmitter.tsv # 12. reading asmId.country.date.by.tsv # 13. now in a perClade loop, reading each all.*.clade.today listings # to get all refseq and genbank assemblies # 14. during that loop, it may use the sizes chromInfo and the n50.txt # files from # /hive/data/outside/ncbi/genomes/sizes/gcX/d0/d1/d2/asmId.chromInfo.txt"; ############################################################################### # from Perl Cookbook Recipe 2.17, print out large numbers with comma # delimiters: sub commify($) { my $text = reverse $_[0]; $text =~ s/(\d\d\d)(?=\d)(?!\d*\.)/$1,/g; return scalar reverse $text } ############################################################################### ############################################################################### # obtain assembly size and sequence count from chromInfo file sub sizeUpChromInfo($) { my ($ciFile) = @_; my $sizeCount=`ave -col=2 "$ciFile" | egrep -w "count|total" | awk '{printf "%d ", \$NF}'`; chomp $sizeCount; my ($count, $size, undef) = split('\s+', $sizeCount, 3); return ($size, $count); } ############################################################################### ############################################################################### # read an N50.txt output file from n50.pl to extract the size and count sub readN50($) { my ($n50File) = @_; my $size = 0; my $count = 0; open (TF, "grep -v '^#' $n50File|") or die "can not read $n50File"; while (my $line = ) { chomp $line; my @a = split('\s+', $line); last if ($size > $a[0]); if ($a[1] ne "one") { $size = $a[3]; $count = $a[1]; } } close (TF); return ($size, $count); } ############################################################################### ############################################################################### # given a number, return reduced significant digits for giga, mega, kilo or none sub gmk($) { my ($n) = @_; return sprintf("%d", $n) if ($n < 1000); my $m = $n/1000; return sprintf("%.2fK", $m) if ($m < 1000); $m = $n/1000000; return sprintf("%.2fM", $m) if ($m < 1000); $m = $n/1000000000; return sprintf("%.3fG", $m); } ############################################################################### ############################################################################### # output a table cell for an N50 measurement sub n50Cell($$$) { my ($size, $count, $fh) = @_; if ($size > 0) { printf "%s (%s)", $size, gmk($size), commify($count); printf $fh "\t%d (%d)", $size, $count; # output to clade.tableData.txt } else { printf " "; printf $fh "\tn/a (n/a)"; # output to clade.tableData.txt } } ############################################################################### -my @clades = qw( primates mammals birds fish vertebrate invertebrates plants fungi ); +my @clades = qw( primates mammals birds fish vertebrate invertebrate plants fungi ); # to help weed out some of the noise # key is clade, value is minimal size to count as a whole genome # these are actually pretty low to allow in some alternate haplotype # assemblies that don't seem to be the whole assembly. # The assemblies are also filtered by NCBI status 'full/partial' to only # allow in the 'full' genomes meaning representation of the whole genome my %minimalGenomeSize = ( primates => 1000000000, mammals => 20000000, birds => 200000000, fish => 100000, vertebrate => 4000000, - invertebrates => 10000, + invertebrate => 10000, plants => 100000, fungi => 50000, ); ######################################################################### ## read in list of current GenArk assemblies my %genArkAsm; # key is asmId, value is string with: my %genArkClade; # key is asmId, value is clade my %genArkCladeProfile; # key is clade, value is count my %genArkSciName; # key is asmId, value is sciName my %genArkComName; # key is asmId, value is sciName # accessionassemblyscientific namecommon nametaxonIdclade my $genArkCount = 0; printf STDERR "# reading UCSC_GI.assemblyHubList.txt\n"; open (FH, ") { next if ($line =~ m/^#/); chomp $line; my ($accession, $assembly, $sciName, $commonName, $taxonId, $gClade) = split('\t', $line); my $asmId = sprintf("%s_%s", $accession, $assembly); die "ERROR: duplicate accession from UCSC_GI.assemblyHubList.txt" if (defined($genArkAsm{$asmId})); $genArkAsm{$asmId} = sprintf("%s\t%s\t%s\t%s\t%s", $accession, $assembly, $sciName, $commonName, $taxonId); $gClade =~ s/\(L\)//; # remove the 'legacy' tag - $gClade = "invertebrates" if ($gClade =~ m/invertebrate/); $genArkClade{$asmId} = $gClade; ++$genArkCladeProfile{$gClade}; ++$genArkCount; } close (FH); # accession assembly scientific name common name taxonId # GCA_000001905.1 Loxafr3.0 Loxodonta africana African savanna elephant (2009 genbank) 9785 printf STDERR "# have %s assemblies from GenArk UCSC_GI.assemblyHubList.txt\n", commify($genArkCount); printf STDERR "# genArk clade profile:\n"; foreach my $gClade (sort keys %genArkCladeProfile) { printf STDERR "# %6d\t%s\n", $genArkCladeProfile{$gClade}, $gClade; } my $rrCount = 0; my %rrGcaGcfList; # key is asmId, value is UCSC db name open (FH, ") { chomp $line; my ($ucscDb, $asmId) = split('\s+', $line); $rrGcaGcfList{$asmId} = $ucscDb; ++$rrCount; } close (FH); printf STDERR "# have %d assemblies with RR equivalent databases\n", $rrCount; printf STDERR "# therefore, should be a total of %s existing browsers\n", commify($rrCount+$genArkCount-$genArkCladeProfile{'viral'}-$genArkCladeProfile{'bacteria'}); printf STDERR "# %s + %s - %s - %s = %s\n", commify($genArkCount), commify($rrCount), commify($genArkCladeProfile{'viral'}), commify($genArkCladeProfile{'bacteria'}), commify($rrCount+$genArkCount-$genArkCladeProfile{'viral'}-$genArkCladeProfile{'bacteria'}); # establish a name equivalence for some NCBI three word scientific names # to IUCN two word scientific names my %ncbiToIucnNames; # key is NCBI name (one), value is IUCN name (many) # for example: # Giraffa camelopardalis antiquorum -> Giraffa camelopardalis # Giraffa camelopardalis rothschildi -> Giraffa camelopardalis printf STDERR "### reading iucnToNcbiEquivalent.txt\n"; open (FH, "<../iucnToNcbiEquivalent.txt") or die "can not read iucnToNcbiEquivalent.txt"; while (my $line = ) { chomp $line; my ($ncbiName, $iucnName) = split('\t', $line); $ncbiToIucnNames{$ncbiName} = $iucnName; } close (FH); my $criticalColor = "#ff0000"; my $endangeredColor = "#dd6600"; my $vulnerableColor = "#663300"; $criticalColor = "#ee3333"; $endangeredColor = "#333388"; $vulnerableColor = "#88aaaa"; my %statusColors = ( "CR" => $criticalColor, "EN" => $endangeredColor, "VU" => $vulnerableColor, ); my %iucnCR; # key is scientific name, value is 1 == Critically Endangered my %iucnEN; # key is scientific name, value is 1 == Endangered my %iucnVU; # key is scientific name, value is 1 == Vulnerable my %iucnSciNames; # key is scientific name, value is IUCN code CR EN VU my $iucnSpeciesCount = 0; my $iucnCount = 0; printf STDERR "### reading iucn/IUCN.CR.txt\n"; open (FH, "<../iucn/IUCN.CR.txt") or die "can not read iucn/IUCN.CR.txt"; while (my $sName = ) { chomp $sName; $iucnCR{$sName} = 1; ++$iucnCount; $iucnSciNames{$sName} = "CR"; } close (FH); printf STDERR "# IUCN Critical %s\n", commify($iucnCount); $iucnSpeciesCount += $iucnCount; $iucnCount = 0; printf STDERR "### reading iucn/IUCN.EN.txt\n"; open (FH, "<../iucn/IUCN.EN.txt") or die "can not read iucn/IUCN.EN.txt"; while (my $sName = ) { chomp $sName; $iucnEN{$sName} = 1; ++$iucnCount; die "ERROR: duplicate sciName in IUCN CR and EN listings" if (defined($iucnCR{$sName})); $iucnSciNames{$sName} = "EN"; } close (FH); printf STDERR "# IUCN Endangered %s\n", commify($iucnCount); $iucnSpeciesCount += $iucnCount; $iucnCount = 0; printf STDERR "### reading iucn/IUCN.VU.txt\n"; open (FH, "<../iucn/IUCN.VU.txt") or die "can not read iucn/IUCN.VU.txt"; while (my $sName = ) { chomp $sName; $iucnVU{$sName} = 1; ++$iucnCount; $iucnSciNames{$sName} = "VU"; die "ERROR: duplicate sciName in IUCN CR and VU listings" if (defined($iucnCR{$sName})); die "ERROR: duplicate sciName in IUCN EN and VU listings" if (defined($iucnEN{$sName})); } close (FH); printf STDERR "# IUCN Vulnerable %s\n", commify($iucnCount); $iucnSpeciesCount += $iucnCount; my %iucnLink; # key is sciName, value is the two id numbers 123/456 # to make a link: https://www.iucnredlist.org/species/123/456 # get the id numbers from iucn to make links to IUCN printf STDERR "### reading iucn/links.csv\n"; open (FH, "<../iucn/links.csv") or die "can not read iucn/links.csv"; while (my $line = ) { chomp $line; my ($n2, $n1, $sName) = split(',', $line); $iucnLink{$sName} = sprintf("%d/%d", $n1, $n2); } close (FH); my $comNameCount = 0; my %ncbiCommonName; # key is asmId, value is common name from NCBI assembly_report my %ncbiDate; # key is asmId, value is assembly submission date from NCBI assembly_report # common names from genArk hubs, better common name when available my $hubComNames = 0; my %sciName; # key is asmId, value is scientific name my %comName; # key is asmId, value is common name printf STDERR "### reading asmId.sciName.commonName\n"; open (FH, ") { chomp $line; my ($id, $sci, $com) = split('\t', $line); $sci =~ s/_/ /g; if (defined($sciName{$id})) { die "ERROR: duplicate asmId sciName: $id '$sci' '$sciName{$id}'"; } $sciName{$id} = $sci; $comName{$id} = $com; ++$hubComNames; } close (FH); printf STDERR "# common names from GenArk hubs: %s\n", commify($hubComNames); my $genArkCatchup = 0; my $genArkDone = 0; ### catch up for any missed genArk hubs # accessionassemblyscientific namecommon nametaxonId foreach my $asmIdKey (sort keys %genArkAsm) { if (defined($sciName{$asmIdKey})) { ++$genArkDone; next; } if (defined($comName{$asmIdKey})) { ++$genArkDone; next; } my @a = split('\t', $genArkAsm{$asmIdKey}); $a[2] =~ s/_/ /g; $sciName{$asmIdKey} = $a[2]; $comName{$asmIdKey} = $a[3]; ++$genArkCatchup; } printf STDERR "# %d genArk hubs from hgdownload list already done\n", $genArkDone; printf STDERR "# catch up %d genArk hubs from hgdownload list\n", $genArkCatchup; my %metaInfo; # key is asmId, value is a tsv string of numbers for: # asmSize asmContigCount n50Size n50Count n50ContigSize # n50ContigCount n50ScaffoldSize n50ScaffoldCount my %newMetaInfo; # accumulate new metaInfo to add to end of this file if ( -s "../accumulateMetaInfo.tsv" ) { printf STDERR "# reading ../accumulateMetaInfo.tsv\n"; open (FH, "<../accumulateMetaInfo.tsv") or die "can not read ../accumulateMetaInfo.tsv"; while (my $line = ) { chomp $line; my ($asmId, $metaData) = split('\t', $line, 2); die "ERROR: duplicate metaInfo{$asmId}" if (defined($metaInfo{$asmId})); $metaInfo{$asmId} = $metaData; } close (FH); } my $noDateFromSize = 0; my $earliestDate = 22001231; my $latestDate = 19000101; printf STDERR "### reading asmId.sciName.comonName.date.tsv\n"; open (FH, "<../asmId.sciName.commonName.date.tsv") or die "can not read ../asmId.sciName.commonName.date.tsv"; while (my $line = ) { chomp $line; my @a = split('\t', $line); next if ($a[0] !~ m/^GC/); $a[0] =~ s/__/_/g; ++$comNameCount; $ncbiCommonName{$a[0]} = $a[2]; if (defined($a[3]) && ($a[3] =~ m/^[12][0-9][0-9][0-9]-[0-9]+-[0-9]+$/) ) { my @b = split('-', $a[3]); $ncbiDate{$a[0]} = sprintf("%04d%02d%02d", $b[0], $b[1], $b[2]); $earliestDate = $ncbiDate{$a[0]} if ($ncbiDate{$a[0]} < $earliestDate); $latestDate = $ncbiDate{$a[0]} if ($latestDate < $ncbiDate{$a[0]}); } else { $ncbiDate{$a[0]} = 19720101; ++$noDateFromSize; if (defined($a[3])) { printf STDERR "# no date '%s' for %s'\n", $a[3], $a[0] if ($noDateFromSize < 5); } else { printf STDERR "# no date 'undef' for %s'\n", $a[0] if ($noDateFromSize < 5); } } } close (FH); printf STDERR "# comNameCount %s from ../../../sizes/asmId.sciName.commonName.date.tsv\n", commify($comNameCount); printf STDERR "# dates from %s to %s, no dates: %d\n", $earliestDate, $latestDate, $noDateFromSize;; my %sciNameCount; # key is scientific name, value is count of assemblies # from NCBI assembly_summary .txt files my %sciNames; # key is asmId, value is scientific name my %sciNameAsmList; # key is scientific name, value is pointer to array # which is a list of assemblies on this sciName # goodToGo will be the master asmId list for output my %goodToGoSpecies; # key is scientific name, value is asmId selected my %goodToGo; # key is asmId, value is iucn status code CR EN VU my %goodToGoDate; # key is sciName, value is date YYYYMMDD for selected # assembly # choices in order when duplicate species: # GCA assembly # newer GCA assembly # GCF assembly # newer GCF assembly my $ncbiTotalAssemblies = 0; my $ncbiUniqueSpecies = 0; my $highestSpeciesCount = 0; my $highestCountName = ""; my $countingGoodToGo = 0; my $goodToGoReplaced = 0; my $warningNoDate = 0; my %skipPartialGenome; # key is asmId, value is sciName for 'partial' status my $genomeReports = "/hive/data/outside/ncbi/genomes/reports"; # obtain scientific name and asmId from assembly_summary files printf STDERR "### reading genbank and refseq assembly_summary\n"; open (FH, "grep -v '^#' $genomeReports/assembly_summary_genbank.txt $genomeReports/assembly_summary_refseq.txt $genomeReports/assembly_summary_genbank_historical.txt $genomeReports/assembly_summary_refseq_historical.txt|cut -f8,14,20|") or die "can not read the assembly_summary files"; while (my $line = ) { chomp $line; # asmId will be derived from the ftpPath my ($sName, $fullPartial, $asmId) = split('\t', $line); # most interesting, skipping these partial assemblies ends up with more # in the final result ? $asmId =~ s#.*/##; next if ($asmId =~ m/^na$/); # something wrong with these two next if ($asmId =~ m/GCA_900609255.1|GCA_900609265.1/); if ($fullPartial =~ m/partial/i) { # skip partial assemblies $skipPartialGenome{$asmId} = $sName; next; } ++$sciNameCount{$sName}; $sciNames{$asmId} = $sName; if (!defined($sciNameAsmList{$sName})) { my @a; $sciNameAsmList{$sName} = \@a; } my $sPtr = $sciNameAsmList{$sName}; push (@$sPtr, $asmId); # maintain list of asmIds for this sciName ++$ncbiTotalAssemblies; ++$ncbiUniqueSpecies if (1 == $sciNameCount{$sName}); if ($sciNameCount{$sName} > $highestSpeciesCount) { $highestSpeciesCount = $sciNameCount{$sName}; $highestCountName = $sName; } # only selecting assemblies if IUCN classification exists my $iucnSciName = $sName; # check name equivalence $iucnSciName = $ncbiToIucnNames{$sName} if (defined($ncbiToIucnNames{$sName})); #### XXX ### let's try everything instead of just those with IUCN class if ( 1 == 1 || defined($iucnSciNames{$iucnSciName})) { if (defined($goodToGoSpecies{$sName})) { # already seen my $prevGC = $goodToGoSpecies{$sName}; # see if it should be override if ($prevGC =~ m/^GCA/) { if ($asmId =~ m/^GCF/) { # GCF overrides GCA $goodToGoSpecies{$sName} = $asmId; delete $goodToGo{$prevGC}; ++$goodToGoReplaced; $goodToGo{$asmId} = $iucnSciNames{$iucnSciName}; if (defined($ncbiDate{$asmId})) { $goodToGoDate{$sName} = $ncbiDate{$asmId}; } else { # printf STDERR "# 0 warning: no ncbiDate for $asmId $sName '$line'\n"; ++$warningNoDate; $goodToGoDate{$sName} = 19720101; } } else { # another GCA, check dates if (defined($ncbiDate{$asmId})) { if ($ncbiDate{$asmId} > $goodToGoDate{$sName}) { # newer delete $goodToGo{$prevGC}; ++$goodToGoReplaced; $goodToGoSpecies{$sName} = $asmId; $goodToGo{$asmId} = $iucnSciNames{$iucnSciName}; $goodToGoDate{$sName} = $ncbiDate{$asmId}; } else { } } else { # no date, no replace # printf STDERR "# 1 warning: no ncbiDate for $asmId $sName '$line'\n"; ++$warningNoDate; } } } elsif ($asmId =~ m/^GCF/) { # a second GCF if (defined($ncbiDate{$asmId})) { if ($ncbiDate{$asmId} > $goodToGoDate{$sName}) { # newer delete $goodToGo{$prevGC}; ++$goodToGoReplaced; $goodToGoSpecies{$sName} = $asmId; $goodToGo{$asmId} = $iucnSciNames{$iucnSciName}; $goodToGoDate{$sName} = $ncbiDate{$asmId}; } else { } } else { # no date, no replace # printf STDERR "# 2 warning: no ncbiDate for $asmId $sName '$line'\n"; ++$warningNoDate; } } } else { # first time, take this assembly ++$countingGoodToGo; $goodToGoSpecies{$sName} = $asmId; $goodToGo{$asmId} = $iucnSciNames{$iucnSciName}; if (defined($ncbiDate{$asmId})) { $goodToGoDate{$sName} = $ncbiDate{$asmId}; } else { # printf STDERR "# 3 warning: no ncbiDate for $asmId $sName '$line'\n"; ++$warningNoDate; $goodToGoDate{$sName} = 19720101; } } } # if (defined($iucnSciNames{$iucnSciName})) } # while (my $line = ) reading the assembly_summary files close (FH); printf STDERR "# countedGoodToGo: %d, replaced: %d, noDate: %d\n", $countingGoodToGo, $goodToGoReplaced, $warningNoDate; printf STDERR "# have %s NCBI assemblies, %s unique NCBI species\n", commify($ncbiTotalAssemblies), commify($ncbiUniqueSpecies); printf STDERR "#\thighest species count %s : '%s'\n", commify($highestSpeciesCount), $highestCountName; my %gcxCountry; # key is asmId, value is collection location my %gcxDate; # key is asmId, value is collection date my %gcxSubmitter; # key is asmId, value is collected by my %asmSuppressed; # key is asmId, value is 1 meaning suppressed printf STDERR "### reading asmId.suppressed.list\n"; open (FH, ") { chomp $id; next if (defined($genArkAsm{$id}) || defined($rrGcaGcfList{$id})); $asmSuppressed{$id} = 1; } close (FH); my %asmDate; # key is asmId, value is date of assembly my %asmSubmitter; # key is asmId, value is name of assembly submitter my %asmTaxId; # key is asmId, value is taxId of species my $asmIdCount = 0; printf STDERR "### reading ../asmId.date.taxId.asmSubmitter.tsv\n"; open (FH, "<../asmId.date.taxId.asmSubmitter.tsv") or die "can not read a../smId.date.taxId.asmSubmitter.tsv"; while (my $line = ) { chomp $line; my ($id, $da, $tid, $asmS) = split('\t', $line); $id =~ s/__/_/g; $da = " " if ($da =~ m#n/a#); $tid = " " if ($tid =~ m#n/a#); $asmS = " " if ($asmS =~ m#n/a#); $asmDate{$id} = $da; $asmTaxId{$id} = $tid; $asmSubmitter{$id} = $asmS; ++$asmIdCount; } close (FH); printf STDERR "# asmId count: %s from asmId.date.taxId.asmSubmitter.tsv\n", commify($asmIdCount); $asmIdCount = 0; printf STDERR "### reading ../asmId.country.date.by.tsv\n"; open (FH, "<../asmId.country.date.by.tsv") or die "can not read ../asmId.country.date.by.tsv"; while (my $line = ) { chomp $line; my ($id, $country, $collectDate, $submitter) = split('\t', $line); $id =~ s/__/_/g; if (defined($gcxCountry{$id})) { die "ERROR: duplicate asmId data $id '$country' '$gcxCountry{$id}'"; } printf STDERR "# undefined collectDate for $id" if (!defined($collectDate)); printf STDERR "# undefined submitter for $id" if (!defined($submitter)); $country =~ s/"//g; $collectDate =~ s/"//g; $submitter =~ s/"//g; $gcxCountry{$id} = $country; $gcxDate{$id} = $collectDate; $gcxSubmitter{$id} = $submitter; ++$asmIdCount; } close (FH); printf STDERR "# asmId count: %s from asmId.country.date.by.tsv\n", commify($asmIdCount); my %asmReportData; # key is asmId, value is tsv string for: # sciName commonName bioSample bioProject taxId asmDate # obtained from scanning all the assembly report files open (FH, "<../asmReport.data.tsv") or die "can not read ../asmReport.data.tsv"; while (my $line = ) { chomp $line; my ($id, $rest) = split('\t', $line, 2); $asmReportData{$id} = $rest; } close (FH); ### This cladeToGo set of lists is the main driver of the table ### An assembly needs to be in this set in order to get into the table my %cladeToGo; # key is clade name, value is array pointer for asmId list my $totalGoodToGo = 0; my %cladeCounts; # key is clade name, value is count of assemblies used my $maxDupAsm = 0; my %ncbiSpeciesRecorded; # key is NCBI sciName, value is count of those my $ncbiSpeciesUsed = 0; # a unique count of NCBI sciName my %iucnSpeciesRecorded; # key is IUCN sciName, value is count of those my $iucnSpeciesUsed = 0; # a unique count of IUCN sciName my $sciNameDisplayLimit = 5; # do not display more than 5 instances my $checkedAsmIds = 0; my $acceptedAsmIds = 0; my $notInGoodToGo = 0; my %alreadyDone; # key is asmId, value = 1 indicates already done my %underSized; # key is clade, value is count of underSized my $NA = "/hive/data/outside/ncbi/genomes/reports/newAsm"; my $examinedAsmId = 0; my $shouldBeGenArk = 0; my $shouldBeUcsc = 0; my %usedGenArk; # key is asmId, value is count seen in *.today files my $genArkTooManySpecies = 0; my $countingUsedGenArk = 0; my $prevUsedGenArk = 0; foreach my $clade (@clades) { my $cladeCount = 0; my $goodToGoCount = 0; my $refSeq = "${NA}/all.refseq.${clade}.today"; my $genBank = "${NA}/all.genbank.${clade}.today"; my $asmHubTsv = "~/kent/src/hg/makeDb/doc/${clade}AsmHub/${clade}.orderList.tsv"; - if ($clade eq "invertebrates") { + if ($clade eq "invertebrate") { $asmHubTsv = "~/kent/src/hg/makeDb/doc/invertebrateAsmHub/invertebrate.orderList.tsv"; } open (FH, "cut -f1 ${refSeq} ${genBank} ${asmHubTsv}|sort -u|") or die "can not read $refSeq or $genBank or $asmHubTsv"; while (my $asmId = ) { chomp $asmId; ++$examinedAsmId; $asmId =~ s/\.]/_/g; $asmId =~ s/\:/_/g; $asmId =~ s/\%/_/g; $asmId =~ s/\+/_/g; $asmId =~ s/\//_/g; $asmId =~ s/\#/_/g; # $asmId =~ s/[.:%+/#]/_/g; $asmId =~ s/[()]//g; $asmId =~ s/__/_/g; ++$shouldBeGenArk if (defined($genArkAsm{$asmId})); ++$shouldBeUcsc if (defined($rrGcaGcfList{$asmId})); if (defined ($skipPartialGenome{$asmId})) { next if (!defined($genArkAsm{$asmId}) && !defined($rrGcaGcfList{$asmId})); } if (defined ($asmSuppressed{$asmId})) { next if (!defined($genArkAsm{$asmId}) && !defined($rrGcaGcfList{$asmId})); } next if (defined ($alreadyDone{$asmId})); # something wrong with these two # GCA_900609255.1_Draft_mitochondrial_genome_of_wild_rice_W1683 # GCA_900609265.1_Draft_mitochondrial_genome_of_wild_rice_W1679 next if ($asmId =~ m/GCA_900609255.1|GCA_900609265.1/); # verify this asmId will pass the asmSize limit if (defined($metaInfo{$asmId})) { my ($asmSize, $asmContigCount, $n50Size, $n50Count, $n50ContigSize, $n50ContigCount, $n50ScaffoldSize, $n50ScaffoldCount) = split('\t', $metaInfo{$asmId}); # if asmSize is below the minimum, don't use it if ($asmSize < $minimalGenomeSize{$clade}) { printf STDERR "# %s underSized 0 %d %s %s < %s\n", $clade, ++$underSized{$clade}, $asmId, commify($asmSize), commify($minimalGenomeSize{$clade}); printf STDERR "# ACK would be genArk assembly %s\n", $asmId if (defined($genArkAsm{$asmId})); printf STDERR "# ACK would be UCSC RR %s\n", $asmId if (defined($rrGcaGcfList{$asmId})); printf STDERR "# ACK metaInfo: %s '%s'\n", $asmId, $metaInfo{$asmId}; next; } } $alreadyDone{$asmId} = 1; if (defined($genArkClade{$asmId})) { printf STDERR "# warning genArkClade{%s} does not = %s\n", $genArkClade{$asmId}, $clade if ($genArkClade{$asmId} ne $clade); } ++$checkedAsmIds; my $iucnSciName = ""; if (defined($sciNames{$asmId})) { ++$ncbiSpeciesRecorded{$sciNames{$asmId}}; if (! defined($genArkAsm{$asmId}) && !defined($rrGcaGcfList{$asmId})) { next if ($ncbiSpeciesRecorded{$sciNames{$asmId}} > $sciNameDisplayLimit); $iucnSciName = $sciNames{$asmId}; $iucnSciName = $ncbiToIucnNames{$sciNames{$asmId}} if (defined($ncbiToIucnNames{$sciNames{$asmId}})); ++$iucnSpeciesRecorded{$iucnSciName}; } else { ++$genArkTooManySpecies if ($ncbiSpeciesRecorded{$sciNames{$asmId}} > $sciNameDisplayLimit); } } else { printf STDERR "# no sciName for: '%s' at %s\n", $asmId, commify($goodToGoCount + 1); } ++$cladeCount; if (! defined($cladeToGo{$clade})) { my @a; $cladeToGo{$clade} = \@a; } my $cPtr = $cladeToGo{$clade}; push (@$cPtr, $asmId); ++$usedGenArk{$asmId} if (defined($genArkAsm{$asmId})); ++$countingUsedGenArk if (defined($genArkAsm{$asmId})); ++$acceptedAsmIds; ++$goodToGoCount; ++$ncbiSpeciesUsed if ( defined($sciNames{$asmId}) && defined($ncbiSpeciesRecorded{$sciNames{$asmId}}) && (1 == $ncbiSpeciesRecorded{$sciNames{$asmId}}) ); ++$iucnSpeciesUsed if ( defined($iucnSpeciesRecorded{$iucnSciName}) && (1 == $iucnSpeciesRecorded{$iucnSciName})); ++$cladeCounts{$clade}; my $assembliesAvailable = 0; if (defined($sciNames{$asmId})) { if (defined($sciNameCount{$sciNames{$asmId}})) { $assembliesAvailable = $sciNameCount{$sciNames{$asmId}}; $maxDupAsm = $assembliesAvailable if ($assembliesAvailable > $maxDupAsm); } if ($assembliesAvailable > 1) { my $bPtr = $sciNameAsmList{$sciNames{$asmId}}; foreach my $aId (@$bPtr) { next if (defined ($alreadyDone{$aId})); $alreadyDone{$aId} = 1; if ($aId ne $asmId) { if (defined($metaInfo{$aId})) { my ($asmSize, $asmContigCount, $n50Size, $n50Count, $n50ContigSize, $n50ContigCount, $n50ScaffoldSize, $n50ScaffoldCount) = split('\t', $metaInfo{$aId}); # if asmSize is below the minimum, don't use it if ($asmSize < $minimalGenomeSize{$clade}) { if (! defined($genArkAsm{$aId}) && !defined($rrGcaGcfList{$aId}) ) { printf STDERR "# %s underSized 1 %d %s %s < %s\n", $clade, ++$underSized{$clade}, $aId, commify($asmSize), commify($minimalGenomeSize{$clade}); printf STDERR "# ACK would be genArk assembly %s\n", $aId if (defined($genArkAsm{$aId})); printf STDERR "# ACK would be UCSC RR %s\n", $aId if (defined($rrGcaGcfList{$aId})); printf STDERR "# ACK metaInfo: %s '%s'\n", $aId, $metaInfo{$aId}; next; } else { printf STDERR "# ACK %s genArk/RR too small at %s\n", $aId, commify($asmSize); } } } ++$ncbiSpeciesRecorded{$sciNames{$aId}}; # the defined($sciName{$aId}) indicates it is a GenArk genome # always accept those even if it goes beyond the limit if ( ($ncbiSpeciesRecorded{$sciNames{$aId}} <= $sciNameDisplayLimit) || defined($genArkAsm{$aId}) || defined($rrGcaGcfList{$aId})) { push (@$cPtr, $aId); $usedGenArk{$aId} += 1 if (defined($genArkAsm{$aId})); ++$countingUsedGenArk if (defined($genArkAsm{$asmId})); ++$acceptedAsmIds; ++$goodToGoCount; ++$cladeCounts{$clade}; } # under limit count or is GenArk assembly } # if ($aId ne $asmId) } # foreach my $aId (@$bPtr) } # if ($assembliesAvailable > 1) } # if (defined($sciNames{$asmId})) } # while (my $asmId = ) close (FH); $totalGoodToGo += $goodToGoCount; printf STDERR "# %s\t%s\tgood to go %s, total good: %d\n", $clade, commify($cladeCount), $goodToGoCount, $totalGoodToGo; printf STDERR "# examined %d asmIds from\n#%s\n# %s\n# %s\n", $examinedAsmId, $refSeq, $genBank, $asmHubTsv; printf STDERR "# usedGenArk: %s total: %s\t%s\n", commify($countingUsedGenArk-$prevUsedGenArk), commify($countingUsedGenArk), $clade; $prevUsedGenArk = $countingUsedGenArk; } # foreach my $clade (@clades) printf STDERR "# would have knocked out %s GenArk assemblies over species count\n", commify($genArkTooManySpecies); printf STDERR "# checkedAsmIds: %d, acceptedAsmIds: %d, notInGoodToGo: %d\n", $checkedAsmIds, $acceptedAsmIds, $notInGoodToGo; printf STDERR "# total good to go %s, maxDupAsmCount: %s\n", commify($totalGoodToGo), commify($maxDupAsm);; printf STDERR "# should be GenArk: %s\n", commify($shouldBeGenArk); printf STDERR "# had originally %s assemblies from GenArk UCSC_GI.assemblyHubList.txt\n", commify($genArkCount); my $usedGenArk = 0; my $missedGenArk = 0; foreach my $genArkAsmId (sort keys %genArkAsm) { my $gClade = "n/a"; $gClade = $genArkClade{$genArkAsmId} if (defined($genArkClade{$genArkAsmId})); if (defined($usedGenArk{$genArkAsmId})) { ++$usedGenArk; printf STDERR "# used genArk %s %s %s\n", $genArkAsmId, $gClade, $genArkAsm{$genArkAsmId} if ($usedGenArk < 5); } else { ++$missedGenArk; printf STDERR "# missed genArk %s %s %s\n", $genArkAsmId, $gClade, $genArkAsm{$genArkAsmId} if ($gClade !~ m/viral|bacteria/); } } printf STDERR "# should be UCSC RR %s, used GenArk: %s, missed GenArk: %s\n", commify($shouldBeUcsc), commify($usedGenArk), commify($missedGenArk); printf "
\n"; if ( 1 == 0 ) { printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n", commify($ncbiTotalAssemblies); printf "\n", commify($ncbiUniqueSpecies); printf "\n", commify($ncbiSpeciesUsed); printf "\n", commify($iucnSpeciesCount); printf "\n", commify($iucnSpeciesUsed); printf "\n", commify($totalGoodToGo); foreach my $clade (@clades) { if (defined($cladeCounts{$clade})) { my $tsvFile = sprintf("%s.tableData.txt", $clade); printf "", commify($cladeCounts{$clade}); printf "", $clade, $clade; printf "", $tsvFile, $tsvFile; printf "", commify($minimalGenomeSize{$clade}); printf "\n"; } } print "
===== summary counts =====
number of assembliescategory of counttable data in tsv
(tab separated value)
file format
assembly minimal size
to filter out projects that
are not whole genomes
%stotal number of NCBI assemblies under consideration  
%snumber of unique species in NCBI assemblies  
%snumber of unique NCBI species matched to IUCN classification  
%snumber of IUCN species with CR/EN/VU classification  
%snumber of such IUCN species matched to NCBI assemblies  
%stotal number of NCBI assemblies classified in these tables  
%s%s%s%s
\n"; printf "
\n"; sub sectionDiv($) { my ($clade) = @_; printf "\n", $clade; printf "\n"; printf "\n", $clade, $clade, $clade; foreach my $c (@clades) { if ($c ne $clade) { printf "\n", $c, $c; } else { printf "\n"; } } printf "\n"; printf "
 %s%spage sectionstop of page
\n"; } } # if ( 1 == 0 ) # count all assemblies in all clades my $totalAssemblies = 0; foreach my $c (@clades) { $totalAssemblies += $cladeCounts{$c}; } printf "\n"; printf "
\n"; printf "
\n"; printf "
\n"; printf " choose clades to view/hide ▼\n"; printf "
\n"; printf "
    \n"; printf "
  • \n"; printf "
  • \n"; printf "
  • \n"; printf "
  • \n"; printf "
  • \n"; printf "
  • \n"; printf "
  • \n"; printf "
  • \n"; printf "
  • \n"; printf "
\n"; printf "
\n"; printf "
\n"; printf "
\n"; printf " select assembly type to display ▼\n"; printf "
\n"; printf "
    \n"; printf "
  • \n"; printf "
  • \n"; printf "
  • \n"; printf "
  • \n"; printf "
  • \n"; printf "
  • \n"; printf "
\n"; printf "
\n"; printf "
\n"; printf "
\n"; printf " show/hide columns ▼\n"; printf "
\n"; printf "
    \n"; printf "
  • \n"; printf "
  • \n"; printf "
  • \n"; printf "
  • \n"; printf "
  • \n"; printf "
  • \n"; printf "
  • \n"; printf "
  • \n"; printf "
  • \n"; printf "
  • \n"; printf "
  • \n"; printf "
  • \n"; printf "
  • \n"; printf "
  • \n"; printf "
\n"; printf "
\n"; printf "
\n\n"; printf "
\n"; printf "
\n\n"; printf "\n"; printf "\n"; printf "

. . . please wait while page loads . . .

\n\n"; printf "
\n"; printf "
\n\n"; printf "\n\n", commify($totalAssemblies); ############################################################################## ## begin single table output, start the table and the header ## ## table starts out as display: none and will be reset to 'table' after ## page load. Saves a lot of time for Chrome browsers, however the page ## is still not usable until much time later. ############################################################################## printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf " \n"; printf " \n"; printf " \n"; printf " \n"; printf " \n"; printf " \n"; printf " \n"; printf " \n"; printf " \n", $statusColors{"CR"}, $statusColors{"EN"}, $statusColors{"VU"}; printf " \n"; printf " \n"; printf " \n"; printf " \n"; printf " \n"; printf " \n"; printf "\n"; printf "\n"; my %equivalentNamesUsed; # key is NCBI sciName, value is IUCN sciName my $pageSectionCount = 0; my %checkDupAsmId; # key is asmId, value is count of times seen my %cladeSciNameCounts; # key is clade, value is number of different # scientific names my %gcfGcaCounts; # key is GCF or GCA, value is count of each my $asmCountInTable = 0; # counting the rows output my %statusCounts; # key is status: CR EN VU, value is count my $totalAssemblySize = 0; # sum of all assembly sizes my $outputGenArkRR = 0; my %outputGenArk; # key is asmId, value is clade my %outputRR; # key is asmId, value is clade foreach my $clade (@clades) { my $cPtr = $cladeToGo{$clade}; my $countThisClade = scalar(@$cPtr); printf STDERR "# working on clade '%s', count: %s\n", $clade, commify($countThisClade); ## sectionDiv($clade); # starting new clade table ++$pageSectionCount; my $totalContigCounts = 0; my $underSized = 0; my $noCommonName = 0; my $suppressedCount = 0; ######################## rows of per clade table output here ################ my $tsvFile = sprintf("%s.tableData.txt", $clade); open (PC, ">$tsvFile") or die "can not write to $tsvFile"; foreach my $asmId (@$cPtr) { if (defined($checkDupAsmId{$asmId})) { printf STDERR "# what: duplicate asmId: %s in clade %s\n", $asmId, $clade; $checkDupAsmId{$asmId} += 1; } else { $checkDupAsmId{$asmId} = 1; } my $assembliesAvailable = 0; if (defined($sciNames{$asmId})) { if (defined($sciNameCount{$sciNames{$asmId}})) { $assembliesAvailable = $sciNameCount{$sciNames{$asmId}}; } } if (defined ($asmSuppressed{$asmId})) { if (!defined($rrGcaGcfList{$asmId})) { if (! defined($genArkAsm{$asmId}) && !defined($rrGcaGcfList{$asmId})) { ++$suppressedCount; printf STDERR "# suppressed $asmId\n" if ($suppressedCount < 5); next; } else { printf STDERR "# genArk/RR %s would have been suppressed\n", $asmId; } } } my $commonName = "n/a"; if (defined($ncbiCommonName{$asmId})) { $commonName = $ncbiCommonName{$asmId}; if ("n/a" eq $commonName) { printf STDERR "# getting n/a commonName from ncbiCommonName{%s}\n", $asmId; } } elsif (defined($comName{$asmId})) { $commonName = $comName{$asmId}; if ("n/a" eq $commonName) { printf STDERR "# getting n/a commonName from comName{%s}\n", $asmId; } } else { if (defined($genArkAsm{$asmId}) || defined($rrGcaGcfList{$asmId})) { printf STDERR "# ACK missed genArk/RR due to no common name for %s\n", $asmId; } ++$noCommonName; printf STDERR "# no commonName for %s in ncbiCommonName or comName\n", $asmId if ($noCommonName < 5); next; } # if (! defined($gcxCountry{$asmId})) { # printf STDERR "# no country for $asmId, date: %s, submitter: %s\n", $gcxDate{$asmId}, $gcxSubmitter{$asmId}; # next; # } # if ("n/a" eq $gcxCountry{$asmId}) { # printf STDERR "# country is n/a $asmId\n"; # next; # } my ($p0, $p1, $p2) = split('_', $asmId, 3); my $accessionId = "${p0}_${p1}"; my $gcX = substr($asmId, 0, 3); my $d0 = substr($asmId, 4, 3); my $d1 = substr($asmId, 7, 3); my $d2 = substr($asmId, 10, 3); my $buildDir = "/hive/data/outside/ncbi/genomes/$gcX/$d0/$d1/$d2/$asmId"; my $asmRpt = "$buildDir/${asmId}_assembly_report.txt"; my $asmFna = "$buildDir/${asmId}_genomic.fna.gz"; my $browserUrl = sprintf("/h/%s", $accessionId); my $arkDownload = sprintf("https://hgdownload.soe.ucsc.edu/hubs/%s/%s/%s/%s/%s/", $gcX, $d0, $d1, $d2, $accessionId); my $destDir = "/hive/data/outside/ncbi/genomes/sizes/$gcX/$d0/$d1/$d2"; my $chromInfo = "/hive/data/outside/ncbi/genomes/sizes/$gcX/$d0/$d1/$d2/${asmId}.chromInfo.txt"; my $n50Txt = "/hive/data/outside/ncbi/genomes/sizes/$gcX/$d0/$d1/$d2/${asmId}.n50.txt"; my $contigN50 = "/hive/data/outside/ncbi/genomes/sizes/$gcX/$d0/$d1/$d2/${asmId}.contigs.n50.txt"; my $scaffoldN50 = "/hive/data/outside/ncbi/genomes/sizes/$gcX/$d0/$d1/$d2/${asmId}.scaffolds.n50.txt"; my $asmSize = 0; my $asmContigCount = 0; my $n50Size = 0; my $n50Count = 0; my $n50ContigSize = 0; my $n50ContigCount = 0; my $n50ScaffoldSize = 0; my $n50ScaffoldCount = 0; my $bioSample = ""; my $bioProject = ""; if (defined($asmReportData{$asmId})) { (undef, undef, $bioSample, $bioProject, undef) = split('\t', $asmReportData{$asmId}, 5); } if (defined($metaInfo{$asmId})) { ($asmSize, $asmContigCount, $n50Size, $n50Count, $n50ContigSize, $n50ContigCount, $n50ScaffoldSize, $n50ScaffoldCount) = split('\t', $metaInfo{$asmId}); } else { my ($dev, $ino, $mode, $nlink, $uid, $gid, $rdev, $size, $atime, $mtime, $ctime, $blksize, $blocks); my $fnaModTime = 0; if (-s "$asmFna") { ($dev, $ino, $mode, $nlink, $uid, $gid, $rdev, $size, $atime, $mtime, $ctime, $blksize, $blocks) = stat($asmFna); $fnaModTime = $mtime; } my $ciModTime = 0; if ( -s "$chromInfo" ) { ($dev, $ino, $mode, $nlink, $uid, $gid, $rdev, $size, $atime, $mtime, $ctime, $blksize, $blocks) = stat($chromInfo); $ciModTime = $mtime; } if ($fnaModTime > $ciModTime) { printf STDERR "# new %s\n", $chromInfo; printf STDERR "# from %s\n", $asmFna; print `mkdir -p $destDir`; print `faSize -detailed $asmFna | sort -k2,2nr > $chromInfo`; print `touch -r $asmFna $chromInfo`; } if ( -s "$chromInfo" ) { ($asmSize, $asmContigCount) = sizeUpChromInfo($chromInfo); if ( ! -s "$n50Txt" ) { print `n50.pl "$chromInfo" > "$n50Txt" 2>&1`; } ($n50Size, $n50Count) = readN50($n50Txt); if ( -s "$contigN50" ) { ($n50ContigSize, $n50ContigCount) = readN50($contigN50); } if ( -s "$scaffoldN50" ) { ($n50ScaffoldSize, $n50ScaffoldCount) = readN50($scaffoldN50); } } else { printf STDERR "# chromInfo missing: %s\n", $asmId; printf STDERR "# %s\n", $chromInfo; } die "ERROR duplicate newMetaInfo{$asmId}" if (defined($newMetaInfo{$asmId})); die "ERROR duplicate metaInfo{$asmId} for newMetaInfo{$asmId}" if (defined($metaInfo{$asmId})); $newMetaInfo{$asmId} = join("\t", ($asmSize, $asmContigCount, $n50Size, $n50Count, $n50ContigSize, $n50ContigCount, $n50ScaffoldSize, $n50ScaffoldCount) ); } # else if (defined($metaInfo{$asmId})) # if asmSize is below the minimum, don't use it if ($asmSize < $minimalGenomeSize{$clade}) { printf STDERR "# %s underSized 2 %d %s %s < %s\n", $clade, ++$underSized, $asmId, commify($asmSize), commify($minimalGenomeSize{$clade}); printf STDERR "# ACK would be genArk assembly %s\n", $asmId if (defined($genArkAsm{$asmId})); printf STDERR "# ACK would be UCSC RR %s\n", $asmId if (defined($rrGcaGcfList{$asmId})); next; } my $iucnStatus = " "; my $iucnLink = ""; if (defined($sciNames{$asmId})) { my $iucnSciName = $sciNames{$asmId}; $iucnSciName = $ncbiToIucnNames{$sciNames{$asmId}} if (defined($ncbiToIucnNames{$sciNames{$asmId}})); $iucnLink = "https://www.iucnredlist.org/species/$iucnLink{$iucnSciName}" if (defined($iucnLink{$iucnSciName})); if ($iucnSciName ne $sciNames{$asmId}) { $equivalentNamesUsed{$sciNames{$asmId}} = $iucnSciName; } if (defined($iucnSciNames{$iucnSciName})) { $iucnStatus = $iucnSciNames{$iucnSciName}; } } ++$asmCountInTable; my $statusColor = ""; if ($iucnStatus ne " ") { $statusColor = $statusColors{$iucnStatus}; ++$statusCounts{$iucnStatus}; } ############# starting a table row ################################# ++$outputGenArkRR if (defined($rrGcaGcfList{$asmId}) || defined($genArkAsm{$asmId})); $outputGenArk{$asmId} = $clade if (defined($genArkAsm{$asmId})); $outputRR{$asmId} = $clade if (defined($rrGcaGcfList{$asmId})); if ($asmId =~ m/^GCF/) { $gcfGcaCounts{'GCF'} += 1; } elsif ($asmId =~ m/GCA/) { $gcfGcaCounts{'GCA'} += 1; } ### If equivalent to UCSC database browser, make reference to RR browser my $ucscDb = ""; $ucscDb = "/" . $rrGcaGcfList{$asmId} if (defined($rrGcaGcfList{$asmId})); if (length($ucscDb)) { $browserUrl = sprintf("/cgi-bin/hgTracks?db=%s", $rrGcaGcfList{$asmId}); } - printf PC "%d", $asmCountInTable; # start a line output to clade.tableData.tsv + printf PC "%s", $browserUrl; # start a line output to clade.tableData.tsv ## count number of different scientific names used in this clade table if (!defined($cladeSciNameCounts{$clade})) { my %h; $cladeSciNameCounts{$clade} = \%h; } my $csnPtr = $cladeSciNameCounts{$clade}; $csnPtr->{$sciNames{$asmId}} += 1 if (defined($sciNames{$asmId})); my $rowClass = ""; my $gcaGcfClass = "gca"; $gcaGcfClass = "gcf" if ($asmId =~ m/^GCF/); $gcaGcfClass .= " hasIucn" if (length($iucnLink)); if (defined($comName{$asmId})) { if (defined($rrGcaGcfList{$asmId})) { $rowClass = " class='ucscDb $gcaGcfClass $clade'"; # present in UCSC db } else { $rowClass = " class='gak $gcaGcfClass $clade'"; # present in GenArk } } else { # can be requested if (defined($rrGcaGcfList{$asmId})) { $rowClass = " class='ucscDb $gcaGcfClass $clade'"; # present in UCSC db } else { $rowClass = " class='gar $gcaGcfClass $clade'"; # available for request } } ### can override CSS settings here ### $rowClass = " class='gar' style='display: none;'"; # experiment with hiding all rows over 500 count to see if that helps # chrom browser initial loading performance # try out the table with out any count, just get the row started if (length($statusColor)) { my $statusClass = sprintf(" style='color:%s;", $statusColor); # let's see what nostatus looks like $statusClass = ""; if ($asmCountInTable > 500) { printf "", $rowClass, $statusClass; } else { printf "", $rowClass, $statusClass; } } else { if ($asmCountInTable > 500) { printf "", $rowClass; } else { printf "", $rowClass; } } ############# trying out a first column that is just the button or link ############# first column, common name link to browser or request button ## if (defined($comName{$asmId})) { printf "", $browserUrl; printf PC "\tview"; # output to clade.tableData.tsv printf "", $commonName; } else { if (length($ucscDb)) { printf "", $browserUrl; printf PC "\tview"; # output to clade.tableData.tsv printf "", $commonName; } else { printf "", $asmId; printf PC "\trequest"; # output to clade.tableData.tsv printf "", $commonName; } } printf PC "\t%s", $commonName; # output to clade.tableData.tsv ############# second column, scientific name and google image search ######### if (defined($sciNames{$asmId})) { my $noSpace = $sciNames{$asmId}; $noSpace =~ s/ /+/g; my $imgSearchUrl="https://images.google.com/images?q=$noSpace&um=1&hl=en&safe=active&nfpr=1&tbs=il:cl"; if ($assembliesAvailable > 1) { printf "", $imgSearchUrl, $sciNames{$asmId}, commify($assembliesAvailable); } else { printf "", $imgSearchUrl, $sciNames{$asmId}; } printf PC "\t%s", $sciNames{$asmId}; # output to clade.tableData.txt } else { printf ""; printf PC "\t%s", "n/a"; # output to clade.tableData.txt } ############# third column, NCBI assembly and link to NCBI ############ my $asmUrl = "https://www.ncbi.nlm.nih.gov/assembly/$accessionId"; printf "", $asmUrl, $asmId, $ucscDb; printf PC "\t%s", $asmId; # output to clade.tableData.txt ############# fourth column, assembly size ################ if ($asmSize > 0) { $totalAssemblySize += $asmSize; printf "", commify($asmSize); printf PC "\t%d", $asmSize; # output to clade.tableData.txt } else { printf ""; printf PC "\t%s", "n/a"; # output to clade.tableData.txt } ############# fifth column, sequence count ################ if ($asmContigCount > 0) { $totalContigCounts += $asmContigCount; printf "", commify($asmContigCount); printf PC "\t%d", $asmContigCount; # output to clade.tableData.txt } else { printf ""; printf PC "\t%s", "n/a"; # output to clade.tableData.txt } ############# sixth, seventh columns, N50 for scaffold, contig ############## if ($n50ScaffoldSize > 0 ) { n50Cell($n50ScaffoldSize, $n50ScaffoldCount, \*PC); } else { # substitute assembly N50 when no scaffold N50 available n50Cell($n50Size, $n50Count, \*PC); } n50Cell($n50ContigSize, $n50ContigCount, \*PC); ############# eighth column, IUCN status and link ################ if (length($iucnLink) > 0) { printf "", $iucnStatus, $iucnLink, $iucnStatus; } else { printf "", $iucnStatus; } printf PC "\t%s", $iucnStatus; # output to clade.tableData.txt ############# ninth column, taxId and link to NCBI ################ if (defined($asmTaxId{$asmId})) { my $taxUrl = "https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=$asmTaxId{$asmId}"; printf "", $taxUrl, $asmTaxId{$asmId}; printf PC "\t%s", $asmTaxId{$asmId}; # output to clade.tableData.txt } else { printf ""; printf PC "\t%s", "n/a"; # output to clade.tableData.txt } ############# tenth column, assembly date ################ if (defined($asmDate{$asmId})) { printf "", $asmDate{$asmId}; printf PC "\t%s", $asmDate{$asmId}; # output to clade.tableData.txt } else { printf ""; printf PC "\t%s", "n/a"; # output to clade.tableData.txt } ############# eleventh column, bioSample ################ if (length($bioSample) && $bioSample !~ m#n/a#) { printf "", $bioSample, $bioSample; printf PC "\t%s", $bioSample; # output to clade.tableData.txt } else { printf ""; printf PC "\t%s", "n/a"; # output to clade.tableData.txt } ############# twelveth column, bioProject ################ if (length($bioProject) && $bioProject !~ m#n/a#) { printf "", $bioProject, $bioProject; printf PC "\t%s", $bioProject; # output to clade.tableData.txt } else { printf ""; printf PC "\t%s", "n/a"; # output to clade.tableData.txt } ############# eleventh column, submitter ################ $asmUrl = "https://www.ncbi.nlm.nih.gov/assembly/$accessionId"; if (defined($asmSubmitter{$asmId})) { my $submitterSortKey = lc($asmSubmitter{$asmId}); $submitterSortKey =~ s/ //g; $submitterSortKey =~ s/[^a-z0-9]//ig; printf "", substr($submitterSortKey,0,20), $asmSubmitter{$asmId}; printf PC "\t%s", $asmSubmitter{$asmId}; # output to clade.tableData.txt } else { printf ""; printf PC "\t%s", "n/a"; # output to clade.tableData.txt } ############# twelveth column, clade ################ printf "\n", $clade; printf PC "\t%s", $clade; printf PC "\n"; # finished a line output to clade.tableData.txt printf "\n"; } # foreach my $asmId (@$cPtr) close (PC); # finished with clade.tableData.txt output printf STDERR "# no commonName %s for clade %s\n", commify($noCommonName), $clade; printf STDERR "# suppressed %s for clade %s\n", commify($suppressedCount), $clade; } # foreach my $clade (@clades) printf STDERR "# output %s genArk or RR assemblies\n", commify($outputGenArkRR); ########################################################################## ## single table is finished, output the end of tbody and the tfoot row ########################################################################## if ($asmCountInTable > 1) { my $crCount = 0; my $enCount = 0; my $vuCount = 0; foreach my $statId (keys %statusCounts) { $crCount = $statusCounts{$statId} if ($statId eq "CR"); $enCount = $statusCounts{$statId} if ($statId eq "EN"); $vuCount = $statusCounts{$statId} if ($statId eq "VU"); } my $sciNameTotal = 0; foreach my $c (@clades) { my $csnPtr = $cladeSciNameCounts{$c}; my $sciNameTotal = 0; foreach my $cladeSciName (keys %$csnPtr) { ++$sciNameTotal; } } printf " \n"; } else { print "\n"; } printf "
\n"; printf "
\n\n"; ############################################################################ if ( 1 == 0) { printf "
\n"; printf "\n"; printf "

=== back to top of page ===

\n"; printf "

Please note, in some cases the IUCN scientific name was translated to the scientific name used in the NCBI assembly to establish an equivalence between the two data sources. Please beware of this translation when interpreting the IUCN status.

\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; foreach my $ncbiName (sort keys(%equivalentNamesUsed)) { printf "\n", $ncbiName, $equivalentNamesUsed{$ncbiName}; } print "
IUCN to NCBI scientific name translation
NCBI assembly scientific nametranslated to IUCN scientific name
%s%s
\n"; } # if ( 1 == 0) ######### output new metaInfo to accumulating metaInfo file my $newMetaInfoCount = 0; foreach my $asmId (sort keys %newMetaInfo) { ++$newMetaInfoCount; } if ($newMetaInfoCount > 0) { printf STDERR "# adding %d new items to ../accumulateMetaInfo.tsv\n", $newMetaInfoCount; if (-s "../accumulateMetaInfo.tsv" ) { open (FH, ">>../accumulateMetaInfo.tsv") or die "can not append to ../accumulateMetaInfo.tsv"; } else { open (FH, ">../accumulateMetaInfo.tsv") or die "can not write to ../accumulateMetaInfo.tsv"; } foreach my $asmId (sort keys %newMetaInfo) { printf FH "%s\t%s\n", $asmId, $newMetaInfo{$asmId}; printf STDERR "# new metaData for %s\n", $asmId; ++$newMetaInfoCount; } close (FH); } else { printf STDERR "# no new metaInfo\n"; } my $beenThere = 0; printf STDERR "# checking missing genArk assemblies in output\n"; foreach my $gAsmId (sort keys %genArkAsm) { ++$beenThere if ($genArkClade{$gAsmId} !~ m/viral|bacteria/); if (!defined($outputGenArk{$gAsmId})) { printf STDERR "# not output genArk %s\t%s\n", $gAsmId, $genArkClade{$gAsmId} if ($genArkClade{$gAsmId} !~ m/viral|bacteria/); } } printf STDERR "# there were %s GenArk potential to output\n", commify($beenThere); $beenThere = 0; printf STDERR "# checking missing RR assemblies in output\n"; foreach my $rrAsmId (sort keys %rrGcaGcfList) { ++$beenThere; if (!defined($outputRR{$rrAsmId})) { printf STDERR "# not output RR %s\t%s\n", $rrAsmId, $rrGcaGcfList{$rrAsmId}; } } printf STDERR "# there were %s RR potential to output\n", commify($beenThere); printf STDERR "# done at exit\n"; exit 0; __END__ my $count = 0; foreach my $asmId (sort keys %gcxCountry) { next if (defined ($asmSuppressed{$asmId})); next if (! defined($gcxCountry{$asmId})); next if (! defined($sciName{$asmId})); next if ("n/a" eq $gcxCountry{$asmId}); ########################################################################## # found discussion of google image search URL at: # https://stackoverflow.com/questions/21530274/format-for-a-url-that-goes-to-google-image-search # did not work: # my $imgSearchUrl="https://www.google.com/search?q=$noSpace&tbm=isch"; # um=1&hl=en&safe=active&nfpr=1 # and # http://devcoma.blogspot.com/2017/04/google-search-parameters-resources.html # as_rights=cc_publicdomain # my $imgSearchUrl="https://images.google.com/images?q=$noSpace&um=1&hl=en&safe=active&nfpr=1"; # safe=active == safe search on # tbs=il:cl - limit result to creative commons license }