afe1592acd7a4dd79f8ca81b390d10113cb6aadc hiram Tue Sep 20 09:40:42 2022 -0700 more complete accounting of all GenArk and UCSC assemblies to get them into the page refs #28930 diff --git src/hg/gar/garTable.pl src/hg/gar/garTable.pl index 9f3b2b6..247d28b 100755 --- src/hg/gar/garTable.pl +++ src/hg/gar/garTable.pl @@ -81,87 +81,100 @@ ############################################################################### # output a table cell for an N50 measurement sub n50Cell($$$) { my ($size, $count, $fh) = @_; if ($size > 0) { printf "%s (%s)", $size, gmk($size), commify($count); printf $fh "\t%d (%d)", $size, $count; # output to clade.tableData.txt } else { printf " "; printf $fh "\tn/a (n/a)"; # output to clade.tableData.txt } } ############################################################################### my @clades = qw( primates mammals birds fish vertebrate invertebrates plants fungi ); -# my @clades = qw( primates mammals birds ); # to help weed out some of the noise # key is clade, value is minimal size to count as a whole genome # these are actually pretty low to allow in some alternate haplotype # assemblies that don't seem to be the whole assembly. # The assemblies are also filtered by NCBI status 'full/partial' to only # allow in the 'full' genomes meaning representation of the whole genome my %minimalGenomeSize = ( primates => 1000000000, mammals => 20000000, birds => 200000000, fish => 100000, vertebrate => 4000000, invertebrates => 10000, plants => 100000, fungi => 50000, ); ######################################################################### ## read in list of current GenArk assemblies my %genArkAsm; # key is asmId, value is string with: +my %genArkClade; # key is asmId, value is clade +my %genArkCladeProfile; # key is clade, value is count +my %genArkSciName; # key is asmId, value is sciName +my %genArkComName; # key is asmId, value is sciName -# accessionassemblyscientific namecommon nametaxonId +# accessionassemblyscientific namecommon nametaxonIdclade my $genArkCount = 0; printf STDERR "# reading UCSC_GI.assemblyHubList.txt\n"; open (FH, ") { next if ($line =~ m/^#/); chomp $line; - my ($accession, $assembly, $sciName, $commonName, $taxonId) = split('\t', $line); - die "ERROR: duplicate accession from UCSC_GI.assemblyHubList.txt" if (defined($genArkAsm{$accession})); + my ($accession, $assembly, $sciName, $commonName, $taxonId, $gClade) = split('\t', $line); my $asmId = sprintf("%s_%s", $accession, $assembly); + die "ERROR: duplicate accession from UCSC_GI.assemblyHubList.txt" if (defined($genArkAsm{$asmId})); $genArkAsm{$asmId} = sprintf("%s\t%s\t%s\t%s\t%s", $accession, $assembly, $sciName, $commonName, $taxonId); + $gClade =~ s/$L$//; # remove the 'legacy' tag + $gClade = "invertebrates" if ($gClade =~ m/invertebrate/); + $genArkClade{$asmId} = $gClade; + ++$genArkCladeProfile{$gClade}; ++$genArkCount; } close (FH); # accession assembly scientific name common name taxonId # GCA_000001905.1 Loxafr3.0 Loxodonta africana African savanna elephant (2009 genbank) 9785 printf STDERR "# have %s assemblies from GenArk UCSC_GI.assemblyHubList.txt\n", commify($genArkCount); +printf STDERR "# genArk clade profile:\n"; +foreach my $gClade (sort keys %genArkCladeProfile) { + printf STDERR "# %6d\t%s\n", $genArkCladeProfile{$gClade}, $gClade; +} my $rrCount = 0; my %rrGcaGcfList; # key is asmId, value is UCSC db name open (FH, ") { chomp $line; my ($ucscDb, $asmId) = split('\s+', $line); $rrGcaGcfList{$asmId} = $ucscDb; ++$rrCount; } close (FH); printf STDERR "# have %d assemblies with RR equivalent databases\n", $rrCount; +printf STDERR "# therefore, should be a total of %s existing browsers\n", commify($rrCount+$genArkCount-$genArkCladeProfile{'viral'}-$genArkCladeProfile{'bacteria'}); +printf STDERR "# %s + %s - %s - %s = %s\n", commify($genArkCount), commify($rrCount), commify($genArkCladeProfile{'viral'}), commify($genArkCladeProfile{'bacteria'}), commify($rrCount+$genArkCount-$genArkCladeProfile{'viral'}-$genArkCladeProfile{'bacteria'}); # establish a name equivalence for some NCBI three word scientific names # to IUCN two word scientific names my %ncbiToIucnNames; # key is NCBI name (one), value is IUCN name (many) # for example: # Giraffa camelopardalis antiquorum -> Giraffa camelopardalis # Giraffa camelopardalis rothschildi -> Giraffa camelopardalis printf STDERR "### reading iucnToNcbiEquivalent.txt\n"; open (FH, "<../iucnToNcbiEquivalent.txt") or die "can not read iucnToNcbiEquivalent.txt"; while (my $line = ) { chomp $line; my ($ncbiName, $iucnName) = split('\t', $line); $ncbiToIucnNames{$ncbiName} = $iucnName; @@ -255,93 +268,99 @@ chomp $line; my ($id, $sci, $com) = split('\t', $line); $sci =~ s/_/ /g; if (defined($sciName{$id})) { die "ERROR: duplicate asmId sciName: $id '$sci' '$sciName{$id}'"; } $sciName{$id} = $sci; $comName{$id} = $com; ++$hubComNames; } close (FH); printf STDERR "# common names from GenArk hubs: %s\n", commify($hubComNames); my $genArkCatchup = 0; +my $genArkDone = 0; ### catch up for any missed genArk hubs # accessionassemblyscientific namecommon nametaxonId foreach my $asmIdKey (sort keys %genArkAsm) { if (defined($sciName{$asmIdKey})) { + ++$genArkDone; next; } if (defined($comName{$asmIdKey})) { + ++$genArkDone; next; } my @a = split('\t', $genArkAsm{$asmIdKey}); $a[2] =~ s/_/ /g; $sciName{$asmIdKey} = $a[2]; $comName{$asmIdKey} = $a[3]; ++$genArkCatchup; } +printf STDERR "# %d genArk hubs from hgdownload list already done\n", $genArkDone; printf STDERR "# catch up %d genArk hubs from hgdownload list\n", $genArkCatchup; my %metaInfo; # key is asmId, value is a tsv string of numbers for: # asmSize asmContigCount n50Size n50Count n50ContigSize # n50ContigCount n50ScaffoldSize n50ScaffoldCount my %newMetaInfo; # accumulate new metaInfo to add to end of this file if ( -s "../accumulateMetaInfo.tsv" ) { printf STDERR "# reading ../accumulateMetaInfo.tsv\n"; open (FH, "<../accumulateMetaInfo.tsv") or die "can not read ../accumulateMetaInfo.tsv"; while (my $line = ) { chomp $line; my ($asmId, $metaData) = split('\t', $line, 2); die "ERROR: duplicate metaInfo{$asmId}" if (defined($metaInfo{$asmId})); $metaInfo{$asmId} = $metaData; } close (FH); } +my $noDateFromSize = 0; my $earliestDate = 22001231; my $latestDate = 19000101; printf STDERR "### reading asmId.sciName.comonName.date.tsv\n"; open (FH, "<../asmId.sciName.commonName.date.tsv") or die "can not read ../asmId.sciName.commonName.date.tsv"; while (my $line = ) { chomp $line; my @a = split('\t', $line); next if ($a[0] !~ m/^GC/); $a[0] =~ s/__/_/g; ++$comNameCount; $ncbiCommonName{$a[0]} = $a[2]; if (defined($a[3]) && ($a[3] =~ m/^[12][0-9][0-9][0-9]-[0-9]+-[0-9]+$/) ) { my @b = split('-', $a[3]); $ncbiDate{$a[0]} = sprintf("%04d%02d%02d", $b[0], $b[1], $b[2]); $earliestDate = $ncbiDate{$a[0]} if ($ncbiDate{$a[0]} < $earliestDate); $latestDate = $ncbiDate{$a[0]} if ($latestDate < $ncbiDate{$a[0]}); } else { $ncbiDate{$a[0]} = 19720101; + ++$noDateFromSize; if (defined($a[3])) { - printf STDERR "# no date '%s' for %s'\n", $a[3], $a[0]; + printf STDERR "# no date '%s' for %s'\n", $a[3], $a[0] if ($noDateFromSize < 5); } else { - printf STDERR "# no date 'undef' for %s'\n", $a[0]; + printf STDERR "# no date 'undef' for %s'\n", $a[0] if ($noDateFromSize < 5); } } } close (FH); printf STDERR "# comNameCount %s from ../../../sizes/asmId.sciName.commonName.date.tsv\n", commify($comNameCount); -printf STDERR "# dates from %s to %s\n", $earliestDate, $latestDate; +printf STDERR "# dates from %s to %s, no dates: %d\n", $earliestDate, $latestDate, $noDateFromSize;; my %sciNameCount; # key is scientific name, value is count of assemblies # from NCBI assembly_summary .txt files my %sciNames; # key is asmId, value is scientific name my %sciNameAsmList; # key is scientific name, value is pointer to array # which is a list of assemblies on this sciName # goodToGo will be the master asmId list for output my %goodToGoSpecies; # key is scientific name, value is asmId selected my %goodToGo; # key is asmId, value is iucn status code CR EN VU my %goodToGoDate; # key is sciName, value is date YYYYMMDD for selected # assembly # choices in order when duplicate species: # GCA assembly # newer GCA assembly @@ -459,30 +478,31 @@ close (FH); printf STDERR "# countedGoodToGo: %d, replaced: %d, noDate: %d\n", $countingGoodToGo, $goodToGoReplaced, $warningNoDate; printf STDERR "# have %s NCBI assemblies, %s unique NCBI species\n", commify($ncbiTotalAssemblies), commify($ncbiUniqueSpecies); printf STDERR "#\thighest species count %s : '%s'\n", commify($highestSpeciesCount), $highestCountName; my %gcxCountry; # key is asmId, value is collection location my %gcxDate; # key is asmId, value is collection date my %gcxSubmitter; # key is asmId, value is collected by my %asmSuppressed; # key is asmId, value is 1 meaning suppressed printf STDERR "### reading asmId.suppressed.list\n"; open (FH, ") { chomp $id; + next if (defined($genArkAsm{$id}) || defined($rrGcaGcfList{$id})); $asmSuppressed{$id} = 1; } close (FH); my %asmDate; # key is asmId, value is date of assembly my %asmSubmitter; # key is asmId, value is name of assembly submitter my %asmTaxId; # key is asmId, value is taxId of species my $asmIdCount = 0; printf STDERR "### reading ../asmId.date.taxId.asmSubmitter.tsv\n"; open (FH, "<../asmId.date.taxId.asmSubmitter.tsv") or die "can not read a../smId.date.taxId.asmSubmitter.tsv"; while (my $line = ) { chomp $line; my ($id, $da, $tid, $asmS) = split('\t', $line); $id =~ s/__/_/g; @@ -540,165 +560,191 @@ my $maxDupAsm = 0; my %ncbiSpeciesRecorded; # key is NCBI sciName, value is count of those my $ncbiSpeciesUsed = 0; # a unique count of NCBI sciName my %iucnSpeciesRecorded; # key is IUCN sciName, value is count of those my $iucnSpeciesUsed = 0; # a unique count of IUCN sciName my $sciNameDisplayLimit = 5; # do not display more than 5 instances my $checkedAsmIds = 0; my $acceptedAsmIds = 0; my $notInGoodToGo = 0; my %alreadyDone; # key is asmId, value = 1 indicates already done my %underSized; # key is clade, value is count of underSized my $NA = "/hive/data/outside/ncbi/genomes/reports/newAsm"; -my %genArkClade; # key is asmId, value is clade +my $examinedAsmId = 0; my $shouldBeGenArk = 0; my $shouldBeUcsc = 0; my %usedGenArk; # key is asmId, value is count seen in *.today files +my $genArkTooManySpecies = 0; +my $countingUsedGenArk = 0; +my $prevUsedGenArk = 0; foreach my $clade (@clades) { my $cladeCount = 0; my $goodToGoCount = 0; my $refSeq = "${NA}/all.refseq.${clade}.today"; my $genBank = "${NA}/all.genbank.${clade}.today"; my $asmHubTsv = "~/kent/src/hg/makeDb/doc/${clade}AsmHub/${clade}.orderList.tsv"; + if ($clade eq "invertebrates") { + $asmHubTsv = "~/kent/src/hg/makeDb/doc/invertebrateAsmHub/invertebrate.orderList.tsv"; + } open (FH, "cut -f1 ${refSeq} ${genBank} ${asmHubTsv}|sort -u|") or die "can not read $refSeq or $genBank or $asmHubTsv"; while (my $asmId = ) { chomp $asmId; + ++$examinedAsmId; $asmId =~ s/\.]/_/g; $asmId =~ s/\:/_/g; $asmId =~ s/\%/_/g; $asmId =~ s/\+/_/g; $asmId =~ s/\//_/g; $asmId =~ s/\#/_/g; # $asmId =~ s/[.:%+/#]/_/g; $asmId =~ s/[()]//g; $asmId =~ s/__/_/g; ++$shouldBeGenArk if (defined($genArkAsm{$asmId})); ++$shouldBeUcsc if (defined($rrGcaGcfList{$asmId})); if (defined ($skipPartialGenome{$asmId})) { - next if (!defined($genArkAsm{$asmId})); + next if (!defined($genArkAsm{$asmId}) && !defined($rrGcaGcfList{$asmId})); } if (defined ($asmSuppressed{$asmId})) { - next if (!defined($genArkAsm{$asmId})); + next if (!defined($genArkAsm{$asmId}) && !defined($rrGcaGcfList{$asmId})); } next if (defined ($alreadyDone{$asmId})); # something wrong with these two # GCA_900609255.1_Draft_mitochondrial_genome_of_wild_rice_W1683 # GCA_900609265.1_Draft_mitochondrial_genome_of_wild_rice_W1679 next if ($asmId =~ m/GCA_900609255.1|GCA_900609265.1/); # verify this asmId will pass the asmSize limit if (defined($metaInfo{$asmId})) { my ($asmSize, $asmContigCount, $n50Size, $n50Count, $n50ContigSize, $n50ContigCount, $n50ScaffoldSize, $n50ScaffoldCount) = split('\t', $metaInfo{$asmId}); # if asmSize is below the minimum, don't use it if ($asmSize < $minimalGenomeSize{$clade}) { printf STDERR "# %s underSized 0 %d %s %s < %s\n", $clade, ++$underSized{$clade}, $asmId, commify($asmSize), commify($minimalGenomeSize{$clade}); printf STDERR "# ACK would be genArk assembly %s\n", $asmId if (defined($genArkAsm{$asmId})); printf STDERR "# ACK would be UCSC RR %s\n", $asmId if (defined($rrGcaGcfList{$asmId})); printf STDERR "# ACK metaInfo: %s '%s'\n", $asmId, $metaInfo{$asmId}; next; } } $alreadyDone{$asmId} = 1; if (defined($genArkClade{$asmId})) { - die "ERROR: duplicate asmId today $asmId '$clade' '$genArkClade{$asmId}'"; + printf STDERR "# warning genArkClade{%s} does not = %s\n", $genArkClade{$asmId}, $clade if ($genArkClade{$asmId} ne $clade); } ++$checkedAsmIds; my $iucnSciName = ""; if (defined($sciNames{$asmId})) { ++$ncbiSpeciesRecorded{$sciNames{$asmId}}; + if (! defined($genArkAsm{$asmId}) && !defined($rrGcaGcfList{$asmId})) { next if ($ncbiSpeciesRecorded{$sciNames{$asmId}} > $sciNameDisplayLimit); $iucnSciName = $sciNames{$asmId}; $iucnSciName = $ncbiToIucnNames{$sciNames{$asmId}} if (defined($ncbiToIucnNames{$sciNames{$asmId}})); ++$iucnSpeciesRecorded{$iucnSciName}; } else { + ++$genArkTooManySpecies if ($ncbiSpeciesRecorded{$sciNames{$asmId}} > $sciNameDisplayLimit); + } + } else { printf STDERR "# no sciName for: '%s' at %s\n", $asmId, commify($goodToGoCount + 1); } - $genArkClade{$asmId} = $clade; ++$cladeCount; if (! defined($cladeToGo{$clade})) { my @a; $cladeToGo{$clade} = \@a; } my $cPtr = $cladeToGo{$clade}; push (@$cPtr, $asmId); - $usedGenArk{$asmId} += 1 if (defined($genArkAsm{$asmId})); + ++$usedGenArk{$asmId} if (defined($genArkAsm{$asmId})); + ++$countingUsedGenArk if (defined($genArkAsm{$asmId})); ++$acceptedAsmIds; ++$goodToGoCount; ++$ncbiSpeciesUsed if ( defined($sciNames{$asmId}) && defined($ncbiSpeciesRecorded{$sciNames{$asmId}}) && (1 == $ncbiSpeciesRecorded{$sciNames{$asmId}}) ); ++$iucnSpeciesUsed if ( defined($iucnSpeciesRecorded{$iucnSciName}) && (1 == $iucnSpeciesRecorded{$iucnSciName})); ++$cladeCounts{$clade}; my $assembliesAvailable = 0; if (defined($sciNames{$asmId})) { if (defined($sciNameCount{$sciNames{$asmId}})) { $assembliesAvailable = $sciNameCount{$sciNames{$asmId}}; $maxDupAsm = $assembliesAvailable if ($assembliesAvailable > $maxDupAsm); } if ($assembliesAvailable > 1) { my $bPtr = $sciNameAsmList{$sciNames{$asmId}}; foreach my $aId (@$bPtr) { next if (defined ($alreadyDone{$aId})); $alreadyDone{$aId} = 1; if ($aId ne $asmId) { if (defined($metaInfo{$aId})) { my ($asmSize, $asmContigCount, $n50Size, $n50Count, $n50ContigSize, $n50ContigCount, $n50ScaffoldSize, $n50ScaffoldCount) = split('\t', $metaInfo{$aId}); # if asmSize is below the minimum, don't use it if ($asmSize < $minimalGenomeSize{$clade}) { + if (! defined($genArkAsm{$aId}) && !defined($rrGcaGcfList{$aId}) ) { printf STDERR "# %s underSized 1 %d %s %s < %s\n", $clade, ++$underSized{$clade}, $aId, commify($asmSize), commify($minimalGenomeSize{$clade}); printf STDERR "# ACK would be genArk assembly %s\n", $aId if (defined($genArkAsm{$aId})); printf STDERR "# ACK would be UCSC RR %s\n", $aId if (defined($rrGcaGcfList{$aId})); printf STDERR "# ACK metaInfo: %s '%s'\n", $aId, $metaInfo{$aId}; next; + } else { + printf STDERR "# ACK %s genArk/RR too small at %s\n", $aId, commify($asmSize); + } } } ++$ncbiSpeciesRecorded{$sciNames{$aId}}; # the defined($sciName{$aId}) indicates it is a GenArk genome # always accept those even if it goes beyond the limit - if ( ($ncbiSpeciesRecorded{$sciNames{$aId}} <= $sciNameDisplayLimit) || defined($sciName{$aId}) ) { + if ( ($ncbiSpeciesRecorded{$sciNames{$aId}} <= $sciNameDisplayLimit) || defined($genArkAsm{$aId}) || defined($rrGcaGcfList{$aId})) { push (@$cPtr, $aId); $usedGenArk{$aId} += 1 if (defined($genArkAsm{$aId})); + ++$countingUsedGenArk if (defined($genArkAsm{$asmId})); ++$acceptedAsmIds; ++$goodToGoCount; ++$cladeCounts{$clade}; } # under limit count or is GenArk assembly } # if ($aId ne $asmId) } # foreach my $aId (@$bPtr) } # if ($assembliesAvailable > 1) } # if (defined($sciNames{$asmId})) } # while (my $asmId = ) close (FH); - printf STDERR "# %s\t%s\tgood to go %s\n", $clade, commify($cladeCount), $goodToGoCount; $totalGoodToGo += $goodToGoCount; + printf STDERR "# %s\t%s\tgood to go %s, total good: %d\n", $clade, commify($cladeCount), $goodToGoCount, $totalGoodToGo; + printf STDERR "# examined %d asmIds from\n#%s\n# %s\n# %s\n", $examinedAsmId, $refSeq, $genBank, $asmHubTsv; + printf STDERR "# usedGenArk: %s total: %s\t%s\n", commify($countingUsedGenArk-$prevUsedGenArk), commify($countingUsedGenArk), $clade; + $prevUsedGenArk = $countingUsedGenArk; } # foreach my $clade (@clades) +printf STDERR "# would have knocked out %s GenArk assemblies over species count\n", commify($genArkTooManySpecies); printf STDERR "# checkedAsmIds: %d, acceptedAsmIds: %d, notInGoodToGo: %d\n", $checkedAsmIds, $acceptedAsmIds, $notInGoodToGo; printf STDERR "# total good to go %s, maxDupAsmCount: %s\n", commify($totalGoodToGo), commify($maxDupAsm);; printf STDERR "# should be GenArk: %s\n", commify($shouldBeGenArk); printf STDERR "# had originally %s assemblies from GenArk UCSC_GI.assemblyHubList.txt\n", commify($genArkCount); +my $usedGenArk = 0; +my $missedGenArk = 0; foreach my $genArkAsmId (sort keys %genArkAsm) { + my $gClade = "n/a"; + $gClade = $genArkClade{$genArkAsmId} if (defined($genArkClade{$genArkAsmId})); if (defined($usedGenArk{$genArkAsmId})) { - printf STDERR "# used genArk %s %s\n", $genArkAsmId, $genArkAsm{$genArkAsmId}; + ++$usedGenArk; + printf STDERR "# used genArk %s %s %s\n", $genArkAsmId, $gClade, $genArkAsm{$genArkAsmId} if ($usedGenArk < 5); } else { - printf STDERR "# missed genArk %s %s\n", $genArkAsmId, $genArkAsm{$genArkAsmId}; + ++$missedGenArk; + printf STDERR "# missed genArk %s %s %s\n", $genArkAsmId, $gClade, $genArkAsm{$genArkAsmId} if ($gClade !~ m/viral|bacteria/); } } -printf STDERR "# should be UCSC RR %s\n", commify($shouldBeUcsc); +printf STDERR "# should be UCSC RR %s, used GenArk: %s, missed GenArk: %s\n", commify($shouldBeUcsc), commify($usedGenArk), commify($missedGenArk); printf "

\n"; if ( 1 == 0 ) { printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n", commify($ncbiTotalAssemblies); @@ -859,77 +905,92 @@ printf "\n"; my %equivalentNamesUsed; # key is NCBI sciName, value is IUCN sciName my $pageSectionCount = 0; my %checkDupAsmId; # key is asmId, value is count of times seen my %cladeSciNameCounts; # key is clade, value is number of different # scientific names my %gcfGcaCounts; # key is GCF or GCA, value is count of each my $asmCountInTable = 0; # counting the rows output my %statusCounts; # key is status: CR EN VU, value is count my $totalAssemblySize = 0; # sum of all assembly sizes +my $outputGenArkRR = 0; +my %outputGenArk; # key is asmId, value is clade +my %outputRR; # key is asmId, value is clade + foreach my $clade (@clades) { my $cPtr = $cladeToGo{$clade}; my $countThisClade = scalar(@$cPtr); printf STDERR "# working on clade '%s', count: %s\n", $clade, commify($countThisClade); ## sectionDiv($clade); # starting new clade table ++$pageSectionCount; my $totalContigCounts = 0; my $underSized = 0; + my $noCommonName = 0; + my $suppressedCount = 0; ######################## rows of per clade table output here ################ my $tsvFile = sprintf("%s.tableData.txt", $clade); open (PC, ">$tsvFile") or die "can not write to $tsvFile"; foreach my $asmId (@$cPtr) { if (defined($checkDupAsmId{$asmId})) { printf STDERR "# what: duplicate asmId: %s in clade %s\n", $asmId, $clade; $checkDupAsmId{$asmId} += 1; } else { $checkDupAsmId{$asmId} = 1; } my $assembliesAvailable = 0; if (defined($sciNames{$asmId})) { if (defined($sciNameCount{$sciNames{$asmId}})) { $assembliesAvailable = $sciNameCount{$sciNames{$asmId}}; } } if (defined ($asmSuppressed{$asmId})) { if (!defined($rrGcaGcfList{$asmId})) { - printf STDERR "# suppressed $asmId\n"; + if (! defined($genArkAsm{$asmId}) && !defined($rrGcaGcfList{$asmId})) { + ++$suppressedCount; + printf STDERR "# suppressed $asmId\n" if ($suppressedCount < 5); next; + } else { + printf STDERR "# genArk/RR %s would have been suppressed\n", $asmId; + } } } my $commonName = "n/a"; if (defined($ncbiCommonName{$asmId})) { $commonName = $ncbiCommonName{$asmId}; if ("n/a" eq $commonName) { printf STDERR "# getting n/a commonName from ncbiCommonName{%s}\n", $asmId; } } elsif (defined($comName{$asmId})) { $commonName = $comName{$asmId}; if ("n/a" eq $commonName) { printf STDERR "# getting n/a commonName from comName{%s}\n", $asmId; } } else { - printf STDERR "# no commonName for %s in ncbiCommonName or comName\n", $asmId; + if (defined($genArkAsm{$asmId}) || defined($rrGcaGcfList{$asmId})) { + printf STDERR "# ACK missed genArk/RR due to no common name for %s\n", $asmId; + } + ++$noCommonName; + printf STDERR "# no commonName for %s in ncbiCommonName or comName\n", $asmId if ($noCommonName < 5); next; } # if (! defined($gcxCountry{$asmId})) { # printf STDERR "# no country for $asmId, date: %s, submitter: %s\n", $gcxDate{$asmId}, $gcxSubmitter{$asmId}; # next; # } # if ("n/a" eq $gcxCountry{$asmId}) { # printf STDERR "# country is n/a $asmId\n"; # next; # } my ($p0, $p1, $p2) = split('_', $asmId, 3); my $accessionId = "${p0}_${p1}"; my $gcX = substr($asmId, 0, 3); my $d0 = substr($asmId, 4, 3); @@ -1015,30 +1076,33 @@ if ($iucnSciName ne $sciNames{$asmId}) { $equivalentNamesUsed{$sciNames{$asmId}} = $iucnSciName; } if (defined($iucnSciNames{$iucnSciName})) { $iucnStatus = $iucnSciNames{$iucnSciName}; } } ++$asmCountInTable; my $statusColor = ""; if ($iucnStatus ne " ") { $statusColor = $statusColors{$iucnStatus}; ++$statusCounts{$iucnStatus}; } ############# starting a table row ################################# + ++$outputGenArkRR if (defined($rrGcaGcfList{$asmId}) || defined($genArkAsm{$asmId})); + $outputGenArk{$asmId} = $clade if (defined($genArkAsm{$asmId})); + $outputRR{$asmId} = $clade if (defined($rrGcaGcfList{$asmId})); if ($asmId =~ m/^GCF/) { $gcfGcaCounts{'GCF'} += 1; } elsif ($asmId =~ m/GCA/) { $gcfGcaCounts{'GCA'} += 1; } ### If equivalent to UCSC database browser, make reference to RR browser my $ucscDb = ""; $ucscDb = "/" . $rrGcaGcfList{$asmId} if (defined($rrGcaGcfList{$asmId})); if (length($ucscDb)) { $browserUrl = sprintf("/cgi-bin/hgTracks?db=%s", $rrGcaGcfList{$asmId}); } printf PC "%d", $asmCountInTable; # start a line output to clade.tableData.tsv ## count number of different scientific names used in this clade table @@ -1211,32 +1275,36 @@ printf "", substr($submitterSortKey,0,20), $asmSubmitter{$asmId}; printf PC "\t%s", $asmSubmitter{$asmId}; # output to clade.tableData.txt } else { printf ""; printf PC "\t%s", "n/a"; # output to clade.tableData.txt } ############# twelveth column, clade ################ printf "\n", $clade; printf PC "\t%s", $clade; printf PC "\n"; # finished a line output to clade.tableData.txt printf "\n"; } # foreach my $asmId (@$cPtr) close (PC); # finished with clade.tableData.txt output + printf STDERR "# no commonName %s for clade %s\n", commify($noCommonName), $clade; + printf STDERR "# suppressed %s for clade %s\n", commify($suppressedCount), $clade; } # foreach my $clade (@clades) +printf STDERR "# output %s genArk or RR assemblies\n", commify($outputGenArkRR); + ########################################################################## ## single table is finished, output the end of tbody and the tfoot row ########################################################################## if ($asmCountInTable > 1) { my $crCount = 0; my $enCount = 0; my $vuCount = 0; foreach my $statId (keys %statusCounts) { $crCount = $statusCounts{$statId} if ($statId eq "CR"); $enCount = $statusCounts{$statId} if ($statId eq "EN"); $vuCount = $statusCounts{$statId} if ($statId eq "VU"); } my $sciNameTotal = 0; foreach my $c (@clades) { my $csnPtr = $cladeSciNameCounts{$c}; @@ -1302,30 +1370,53 @@ if (-s "../accumulateMetaInfo.tsv" ) { open (FH, ">>../accumulateMetaInfo.tsv") or die "can not append to ../accumulateMetaInfo.tsv"; } else { open (FH, ">../accumulateMetaInfo.tsv") or die "can not write to ../accumulateMetaInfo.tsv"; } foreach my $asmId (sort keys %newMetaInfo) { printf FH "%s\t%s\n", $asmId, $newMetaInfo{$asmId}; printf STDERR "# new metaData for %s\n", $asmId; ++$newMetaInfoCount; } close (FH); } else { printf STDERR "# no new metaInfo\n"; } +my $beenThere = 0; +printf STDERR "# checking missing genArk assemblies in output\n"; +foreach my $gAsmId (sort keys %genArkAsm) { + ++$beenThere if ($genArkClade{$gAsmId} !~ m/viral|bacteria/); + if (!defined($outputGenArk{$gAsmId})) { + printf STDERR "# not output genArk %s\t%s\n", $gAsmId, $genArkClade{$gAsmId} + if ($genArkClade{$gAsmId} !~ m/viral|bacteria/); + } +} +printf STDERR "# there were %s GenArk potential to output\n", commify($beenThere); + +$beenThere = 0; + +printf STDERR "# checking missing RR assemblies in output\n"; + +foreach my $rrAsmId (sort keys %rrGcaGcfList) { + ++$beenThere; + if (!defined($outputRR{$rrAsmId})) { + printf STDERR "# not output RR %s\t%s\n", $rrAsmId, $rrGcaGcfList{$rrAsmId}; + } +} +printf STDERR "# there were %s RR potential to output\n", commify($beenThere); + printf STDERR "# done at exit\n"; exit 0; __END__ my $count = 0; foreach my $asmId (sort keys %gcxCountry) { next if (defined ($asmSuppressed{$asmId})); next if (! defined($gcxCountry{$asmId})); next if (! defined($sciName{$asmId})); next if ("n/a" eq $gcxCountry{$asmId}); ########################################################################## # found discussion of google image search URL at: # https://stackoverflow.com/questions/21530274/format-for-a-url-that-goes-to-google-image-search # did not work:

**===== summary counts =====**
number of assemblies	category of count	table data in tsv (tab separated value) file format	assembly minimal size to filter out projects that are not whole genomes
%s	total number of NCBI assemblies under consideration
%s	n/a	%s