afe1592acd7a4dd79f8ca81b390d10113cb6aadc
hiram
Tue Sep 20 09:40:42 2022 -0700
more complete accounting of all GenArk and UCSC assemblies to get them into the page refs #28930
diff --git src/hg/gar/garTable.pl src/hg/gar/garTable.pl
index 9f3b2b6..247d28b 100755
--- src/hg/gar/garTable.pl
+++ src/hg/gar/garTable.pl
@@ -81,87 +81,100 @@
###############################################################################
# output a table cell for an N50 measurement
sub n50Cell($$$) {
my ($size, $count, $fh) = @_;
if ($size > 0) {
printf "
";
printf $fh "\tn/a (n/a)"; # output to clade.tableData.txt
}
}
###############################################################################
my @clades = qw( primates mammals birds fish vertebrate invertebrates plants fungi );
-# my @clades = qw( primates mammals birds );
# to help weed out some of the noise
# key is clade, value is minimal size to count as a whole genome
# these are actually pretty low to allow in some alternate haplotype
# assemblies that don't seem to be the whole assembly.
# The assemblies are also filtered by NCBI status 'full/partial' to only
# allow in the 'full' genomes meaning representation of the whole genome
my %minimalGenomeSize = (
primates => 1000000000,
mammals => 20000000,
birds => 200000000,
fish => 100000,
vertebrate => 4000000,
invertebrates => 10000,
plants => 100000,
fungi => 50000,
);
#########################################################################
## read in list of current GenArk assemblies
my %genArkAsm; # key is asmId, value is string with:
+my %genArkClade; # key is asmId, value is clade
+my %genArkCladeProfile; # key is clade, value is count
+my %genArkSciName; # key is asmId, value is sciName
+my %genArkComName; # key is asmId, value is sciName
-# accessionassemblyscientific namecommon nametaxonId
+# accessionassemblyscientific namecommon nametaxonIdclade
my $genArkCount = 0;
printf STDERR "# reading UCSC_GI.assemblyHubList.txt\n";
open (FH, ") {
next if ($line =~ m/^#/);
chomp $line;
- my ($accession, $assembly, $sciName, $commonName, $taxonId) = split('\t', $line);
- die "ERROR: duplicate accession from UCSC_GI.assemblyHubList.txt" if (defined($genArkAsm{$accession}));
+ my ($accession, $assembly, $sciName, $commonName, $taxonId, $gClade) = split('\t', $line);
my $asmId = sprintf("%s_%s", $accession, $assembly);
+ die "ERROR: duplicate accession from UCSC_GI.assemblyHubList.txt" if (defined($genArkAsm{$asmId}));
$genArkAsm{$asmId} = sprintf("%s\t%s\t%s\t%s\t%s", $accession, $assembly, $sciName, $commonName, $taxonId);
+ $gClade =~ s/\(L\)//; # remove the 'legacy' tag
+ $gClade = "invertebrates" if ($gClade =~ m/invertebrate/);
+ $genArkClade{$asmId} = $gClade;
+ ++$genArkCladeProfile{$gClade};
++$genArkCount;
}
close (FH);
# accession assembly scientific name common name taxonId
# GCA_000001905.1 Loxafr3.0 Loxodonta africana African savanna elephant (2009 genbank) 9785
printf STDERR "# have %s assemblies from GenArk UCSC_GI.assemblyHubList.txt\n", commify($genArkCount);
+printf STDERR "# genArk clade profile:\n";
+foreach my $gClade (sort keys %genArkCladeProfile) {
+ printf STDERR "# %6d\t%s\n", $genArkCladeProfile{$gClade}, $gClade;
+}
my $rrCount = 0;
my %rrGcaGcfList; # key is asmId, value is UCSC db name
open (FH, ") {
chomp $line;
my ($ucscDb, $asmId) = split('\s+', $line);
$rrGcaGcfList{$asmId} = $ucscDb;
++$rrCount;
}
close (FH);
printf STDERR "# have %d assemblies with RR equivalent databases\n", $rrCount;
+printf STDERR "# therefore, should be a total of %s existing browsers\n", commify($rrCount+$genArkCount-$genArkCladeProfile{'viral'}-$genArkCladeProfile{'bacteria'});
+printf STDERR "# %s + %s - %s - %s = %s\n", commify($genArkCount), commify($rrCount), commify($genArkCladeProfile{'viral'}), commify($genArkCladeProfile{'bacteria'}), commify($rrCount+$genArkCount-$genArkCladeProfile{'viral'}-$genArkCladeProfile{'bacteria'});
# establish a name equivalence for some NCBI three word scientific names
# to IUCN two word scientific names
my %ncbiToIucnNames; # key is NCBI name (one), value is IUCN name (many)
# for example:
# Giraffa camelopardalis antiquorum -> Giraffa camelopardalis
# Giraffa camelopardalis rothschildi -> Giraffa camelopardalis
printf STDERR "### reading iucnToNcbiEquivalent.txt\n";
open (FH, "<../iucnToNcbiEquivalent.txt") or die "can not read iucnToNcbiEquivalent.txt";
while (my $line = ) {
chomp $line;
my ($ncbiName, $iucnName) = split('\t', $line);
$ncbiToIucnNames{$ncbiName} = $iucnName;
@@ -255,93 +268,99 @@
chomp $line;
my ($id, $sci, $com) = split('\t', $line);
$sci =~ s/_/ /g;
if (defined($sciName{$id})) {
die "ERROR: duplicate asmId sciName: $id '$sci' '$sciName{$id}'";
}
$sciName{$id} = $sci;
$comName{$id} = $com;
++$hubComNames;
}
close (FH);
printf STDERR "# common names from GenArk hubs: %s\n", commify($hubComNames);
my $genArkCatchup = 0;
+my $genArkDone = 0;
### catch up for any missed genArk hubs
# accessionassemblyscientific namecommon nametaxonId
foreach my $asmIdKey (sort keys %genArkAsm) {
if (defined($sciName{$asmIdKey})) {
+ ++$genArkDone;
next;
}
if (defined($comName{$asmIdKey})) {
+ ++$genArkDone;
next;
}
my @a = split('\t', $genArkAsm{$asmIdKey});
$a[2] =~ s/_/ /g;
$sciName{$asmIdKey} = $a[2];
$comName{$asmIdKey} = $a[3];
++$genArkCatchup;
}
+printf STDERR "# %d genArk hubs from hgdownload list already done\n", $genArkDone;
printf STDERR "# catch up %d genArk hubs from hgdownload list\n", $genArkCatchup;
my %metaInfo; # key is asmId, value is a tsv string of numbers for:
# asmSize asmContigCount n50Size n50Count n50ContigSize
# n50ContigCount n50ScaffoldSize n50ScaffoldCount
my %newMetaInfo; # accumulate new metaInfo to add to end of this file
if ( -s "../accumulateMetaInfo.tsv" ) {
printf STDERR "# reading ../accumulateMetaInfo.tsv\n";
open (FH, "<../accumulateMetaInfo.tsv") or die "can not read ../accumulateMetaInfo.tsv";
while (my $line = ) {
chomp $line;
my ($asmId, $metaData) = split('\t', $line, 2);
die "ERROR: duplicate metaInfo{$asmId}" if (defined($metaInfo{$asmId}));
$metaInfo{$asmId} = $metaData;
}
close (FH);
}
+my $noDateFromSize = 0;
my $earliestDate = 22001231;
my $latestDate = 19000101;
printf STDERR "### reading asmId.sciName.comonName.date.tsv\n";
open (FH, "<../asmId.sciName.commonName.date.tsv") or die "can not read ../asmId.sciName.commonName.date.tsv";
while (my $line = ) {
chomp $line;
my @a = split('\t', $line);
next if ($a[0] !~ m/^GC/);
$a[0] =~ s/__/_/g;
++$comNameCount;
$ncbiCommonName{$a[0]} = $a[2];
if (defined($a[3]) && ($a[3] =~ m/^[12][0-9][0-9][0-9]-[0-9]+-[0-9]+$/) ) {
my @b = split('-', $a[3]);
$ncbiDate{$a[0]} = sprintf("%04d%02d%02d", $b[0], $b[1], $b[2]);
$earliestDate = $ncbiDate{$a[0]} if ($ncbiDate{$a[0]} < $earliestDate);
$latestDate = $ncbiDate{$a[0]} if ($latestDate < $ncbiDate{$a[0]});
} else {
$ncbiDate{$a[0]} = 19720101;
+ ++$noDateFromSize;
if (defined($a[3])) {
- printf STDERR "# no date '%s' for %s'\n", $a[3], $a[0];
+ printf STDERR "# no date '%s' for %s'\n", $a[3], $a[0] if ($noDateFromSize < 5);
} else {
- printf STDERR "# no date 'undef' for %s'\n", $a[0];
+ printf STDERR "# no date 'undef' for %s'\n", $a[0] if ($noDateFromSize < 5);
}
}
}
close (FH);
printf STDERR "# comNameCount %s from ../../../sizes/asmId.sciName.commonName.date.tsv\n", commify($comNameCount);
-printf STDERR "# dates from %s to %s\n", $earliestDate, $latestDate;
+printf STDERR "# dates from %s to %s, no dates: %d\n", $earliestDate, $latestDate, $noDateFromSize;;
my %sciNameCount; # key is scientific name, value is count of assemblies
# from NCBI assembly_summary .txt files
my %sciNames; # key is asmId, value is scientific name
my %sciNameAsmList; # key is scientific name, value is pointer to array
# which is a list of assemblies on this sciName
# goodToGo will be the master asmId list for output
my %goodToGoSpecies; # key is scientific name, value is asmId selected
my %goodToGo; # key is asmId, value is iucn status code CR EN VU
my %goodToGoDate; # key is sciName, value is date YYYYMMDD for selected
# assembly
# choices in order when duplicate species:
# GCA assembly
# newer GCA assembly
@@ -459,30 +478,31 @@
close (FH);
printf STDERR "# countedGoodToGo: %d, replaced: %d, noDate: %d\n", $countingGoodToGo, $goodToGoReplaced, $warningNoDate;
printf STDERR "# have %s NCBI assemblies, %s unique NCBI species\n", commify($ncbiTotalAssemblies), commify($ncbiUniqueSpecies);
printf STDERR "#\thighest species count %s : '%s'\n", commify($highestSpeciesCount), $highestCountName;
my %gcxCountry; # key is asmId, value is collection location
my %gcxDate; # key is asmId, value is collection date
my %gcxSubmitter; # key is asmId, value is collected by
my %asmSuppressed; # key is asmId, value is 1 meaning suppressed
printf STDERR "### reading asmId.suppressed.list\n";
open (FH, ") {
chomp $id;
+ next if (defined($genArkAsm{$id}) || defined($rrGcaGcfList{$id}));
$asmSuppressed{$id} = 1;
}
close (FH);
my %asmDate; # key is asmId, value is date of assembly
my %asmSubmitter; # key is asmId, value is name of assembly submitter
my %asmTaxId; # key is asmId, value is taxId of species
my $asmIdCount = 0;
printf STDERR "### reading ../asmId.date.taxId.asmSubmitter.tsv\n";
open (FH, "<../asmId.date.taxId.asmSubmitter.tsv") or die "can not read a../smId.date.taxId.asmSubmitter.tsv";
while (my $line = ) {
chomp $line;
my ($id, $da, $tid, $asmS) = split('\t', $line);
$id =~ s/__/_/g;
@@ -540,165 +560,191 @@
my $maxDupAsm = 0;
my %ncbiSpeciesRecorded; # key is NCBI sciName, value is count of those
my $ncbiSpeciesUsed = 0; # a unique count of NCBI sciName
my %iucnSpeciesRecorded; # key is IUCN sciName, value is count of those
my $iucnSpeciesUsed = 0; # a unique count of IUCN sciName
my $sciNameDisplayLimit = 5; # do not display more than 5 instances
my $checkedAsmIds = 0;
my $acceptedAsmIds = 0;
my $notInGoodToGo = 0;
my %alreadyDone; # key is asmId, value = 1 indicates already done
my %underSized; # key is clade, value is count of underSized
my $NA = "/hive/data/outside/ncbi/genomes/reports/newAsm";
-my %genArkClade; # key is asmId, value is clade
+my $examinedAsmId = 0;
my $shouldBeGenArk = 0;
my $shouldBeUcsc = 0;
my %usedGenArk; # key is asmId, value is count seen in *.today files
+my $genArkTooManySpecies = 0;
+my $countingUsedGenArk = 0;
+my $prevUsedGenArk = 0;
foreach my $clade (@clades) {
my $cladeCount = 0;
my $goodToGoCount = 0;
my $refSeq = "${NA}/all.refseq.${clade}.today";
my $genBank = "${NA}/all.genbank.${clade}.today";
my $asmHubTsv = "~/kent/src/hg/makeDb/doc/${clade}AsmHub/${clade}.orderList.tsv";
+ if ($clade eq "invertebrates") {
+ $asmHubTsv = "~/kent/src/hg/makeDb/doc/invertebrateAsmHub/invertebrate.orderList.tsv";
+ }
open (FH, "cut -f1 ${refSeq} ${genBank} ${asmHubTsv}|sort -u|") or die "can not read $refSeq or $genBank or $asmHubTsv";
while (my $asmId = ) {
chomp $asmId;
+ ++$examinedAsmId;
$asmId =~ s/\.]/_/g;
$asmId =~ s/\:/_/g;
$asmId =~ s/\%/_/g;
$asmId =~ s/\+/_/g;
$asmId =~ s/\//_/g;
$asmId =~ s/\#/_/g;
# $asmId =~ s/[.:%+/#]/_/g;
$asmId =~ s/[()]//g;
$asmId =~ s/__/_/g;
++$shouldBeGenArk if (defined($genArkAsm{$asmId}));
++$shouldBeUcsc if (defined($rrGcaGcfList{$asmId}));
if (defined ($skipPartialGenome{$asmId})) {
- next if (!defined($genArkAsm{$asmId}));
+ next if (!defined($genArkAsm{$asmId}) && !defined($rrGcaGcfList{$asmId}));
}
if (defined ($asmSuppressed{$asmId})) {
- next if (!defined($genArkAsm{$asmId}));
+ next if (!defined($genArkAsm{$asmId}) && !defined($rrGcaGcfList{$asmId}));
}
next if (defined ($alreadyDone{$asmId}));
# something wrong with these two
# GCA_900609255.1_Draft_mitochondrial_genome_of_wild_rice_W1683
# GCA_900609265.1_Draft_mitochondrial_genome_of_wild_rice_W1679
next if ($asmId =~ m/GCA_900609255.1|GCA_900609265.1/);
# verify this asmId will pass the asmSize limit
if (defined($metaInfo{$asmId})) {
my ($asmSize, $asmContigCount, $n50Size, $n50Count, $n50ContigSize, $n50ContigCount, $n50ScaffoldSize, $n50ScaffoldCount) = split('\t', $metaInfo{$asmId});
# if asmSize is below the minimum, don't use it
if ($asmSize < $minimalGenomeSize{$clade}) {
printf STDERR "# %s underSized 0 %d %s %s < %s\n", $clade, ++$underSized{$clade}, $asmId, commify($asmSize), commify($minimalGenomeSize{$clade});
printf STDERR "# ACK would be genArk assembly %s\n", $asmId if (defined($genArkAsm{$asmId}));
printf STDERR "# ACK would be UCSC RR %s\n", $asmId if (defined($rrGcaGcfList{$asmId}));
printf STDERR "# ACK metaInfo: %s '%s'\n", $asmId, $metaInfo{$asmId};
next;
}
}
$alreadyDone{$asmId} = 1;
if (defined($genArkClade{$asmId})) {
- die "ERROR: duplicate asmId today $asmId '$clade' '$genArkClade{$asmId}'";
+ printf STDERR "# warning genArkClade{%s} does not = %s\n", $genArkClade{$asmId}, $clade if ($genArkClade{$asmId} ne $clade);
}
++$checkedAsmIds;
my $iucnSciName = "";
if (defined($sciNames{$asmId})) {
++$ncbiSpeciesRecorded{$sciNames{$asmId}};
+ if (! defined($genArkAsm{$asmId}) && !defined($rrGcaGcfList{$asmId})) {
next if ($ncbiSpeciesRecorded{$sciNames{$asmId}} > $sciNameDisplayLimit);
$iucnSciName = $sciNames{$asmId};
$iucnSciName = $ncbiToIucnNames{$sciNames{$asmId}} if (defined($ncbiToIucnNames{$sciNames{$asmId}}));
++$iucnSpeciesRecorded{$iucnSciName};
} else {
+ ++$genArkTooManySpecies if ($ncbiSpeciesRecorded{$sciNames{$asmId}} > $sciNameDisplayLimit);
+ }
+ } else {
printf STDERR "# no sciName for: '%s' at %s\n", $asmId, commify($goodToGoCount + 1);
}
- $genArkClade{$asmId} = $clade;
++$cladeCount;
if (! defined($cladeToGo{$clade})) {
my @a;
$cladeToGo{$clade} = \@a;
}
my $cPtr = $cladeToGo{$clade};
push (@$cPtr, $asmId);
- $usedGenArk{$asmId} += 1 if (defined($genArkAsm{$asmId}));
+ ++$usedGenArk{$asmId} if (defined($genArkAsm{$asmId}));
+ ++$countingUsedGenArk if (defined($genArkAsm{$asmId}));
++$acceptedAsmIds;
++$goodToGoCount;
++$ncbiSpeciesUsed if ( defined($sciNames{$asmId}) && defined($ncbiSpeciesRecorded{$sciNames{$asmId}}) && (1 == $ncbiSpeciesRecorded{$sciNames{$asmId}}) );
++$iucnSpeciesUsed if ( defined($iucnSpeciesRecorded{$iucnSciName}) && (1 == $iucnSpeciesRecorded{$iucnSciName}));
++$cladeCounts{$clade};
my $assembliesAvailable = 0;
if (defined($sciNames{$asmId})) {
if (defined($sciNameCount{$sciNames{$asmId}})) {
$assembliesAvailable = $sciNameCount{$sciNames{$asmId}};
$maxDupAsm = $assembliesAvailable if ($assembliesAvailable > $maxDupAsm);
}
if ($assembliesAvailable > 1) {
my $bPtr = $sciNameAsmList{$sciNames{$asmId}};
foreach my $aId (@$bPtr) {
next if (defined ($alreadyDone{$aId}));
$alreadyDone{$aId} = 1;
if ($aId ne $asmId) {
if (defined($metaInfo{$aId})) {
my ($asmSize, $asmContigCount, $n50Size, $n50Count, $n50ContigSize, $n50ContigCount, $n50ScaffoldSize, $n50ScaffoldCount) = split('\t', $metaInfo{$aId});
# if asmSize is below the minimum, don't use it
if ($asmSize < $minimalGenomeSize{$clade}) {
+ if (! defined($genArkAsm{$aId}) && !defined($rrGcaGcfList{$aId}) ) {
printf STDERR "# %s underSized 1 %d %s %s < %s\n", $clade, ++$underSized{$clade}, $aId, commify($asmSize), commify($minimalGenomeSize{$clade});
printf STDERR "# ACK would be genArk assembly %s\n", $aId if (defined($genArkAsm{$aId}));
printf STDERR "# ACK would be UCSC RR %s\n", $aId if (defined($rrGcaGcfList{$aId}));
printf STDERR "# ACK metaInfo: %s '%s'\n", $aId, $metaInfo{$aId};
next;
+ } else {
+ printf STDERR "# ACK %s genArk/RR too small at %s\n", $aId, commify($asmSize);
+ }
}
}
++$ncbiSpeciesRecorded{$sciNames{$aId}};
# the defined($sciName{$aId}) indicates it is a GenArk genome
# always accept those even if it goes beyond the limit
- if ( ($ncbiSpeciesRecorded{$sciNames{$aId}} <= $sciNameDisplayLimit) || defined($sciName{$aId}) ) {
+ if ( ($ncbiSpeciesRecorded{$sciNames{$aId}} <= $sciNameDisplayLimit) || defined($genArkAsm{$aId}) || defined($rrGcaGcfList{$aId})) {
push (@$cPtr, $aId);
$usedGenArk{$aId} += 1 if (defined($genArkAsm{$aId}));
+ ++$countingUsedGenArk if (defined($genArkAsm{$asmId}));
++$acceptedAsmIds;
++$goodToGoCount;
++$cladeCounts{$clade};
} # under limit count or is GenArk assembly
} # if ($aId ne $asmId)
} # foreach my $aId (@$bPtr)
} # if ($assembliesAvailable > 1)
} # if (defined($sciNames{$asmId}))
} # while (my $asmId = )
close (FH);
- printf STDERR "# %s\t%s\tgood to go %s\n", $clade, commify($cladeCount), $goodToGoCount;
$totalGoodToGo += $goodToGoCount;
+ printf STDERR "# %s\t%s\tgood to go %s, total good: %d\n", $clade, commify($cladeCount), $goodToGoCount, $totalGoodToGo;
+ printf STDERR "# examined %d asmIds from\n#%s\n# %s\n# %s\n", $examinedAsmId, $refSeq, $genBank, $asmHubTsv;
+ printf STDERR "# usedGenArk: %s total: %s\t%s\n", commify($countingUsedGenArk-$prevUsedGenArk), commify($countingUsedGenArk), $clade;
+ $prevUsedGenArk = $countingUsedGenArk;
} # foreach my $clade (@clades)
+printf STDERR "# would have knocked out %s GenArk assemblies over species count\n", commify($genArkTooManySpecies);
printf STDERR "# checkedAsmIds: %d, acceptedAsmIds: %d, notInGoodToGo: %d\n", $checkedAsmIds, $acceptedAsmIds, $notInGoodToGo;
printf STDERR "# total good to go %s, maxDupAsmCount: %s\n", commify($totalGoodToGo), commify($maxDupAsm);;
printf STDERR "# should be GenArk: %s\n", commify($shouldBeGenArk);
printf STDERR "# had originally %s assemblies from GenArk UCSC_GI.assemblyHubList.txt\n", commify($genArkCount);
+my $usedGenArk = 0;
+my $missedGenArk = 0;
foreach my $genArkAsmId (sort keys %genArkAsm) {
+ my $gClade = "n/a";
+ $gClade = $genArkClade{$genArkAsmId} if (defined($genArkClade{$genArkAsmId}));
if (defined($usedGenArk{$genArkAsmId})) {
- printf STDERR "# used genArk %s %s\n", $genArkAsmId, $genArkAsm{$genArkAsmId};
+ ++$usedGenArk;
+ printf STDERR "# used genArk %s %s %s\n", $genArkAsmId, $gClade, $genArkAsm{$genArkAsmId} if ($usedGenArk < 5);
} else {
- printf STDERR "# missed genArk %s %s\n", $genArkAsmId, $genArkAsm{$genArkAsmId};
+ ++$missedGenArk;
+ printf STDERR "# missed genArk %s %s %s\n", $genArkAsmId, $gClade, $genArkAsm{$genArkAsmId} if ($gClade !~ m/viral|bacteria/);
}
}
-printf STDERR "# should be UCSC RR %s\n", commify($shouldBeUcsc);
+printf STDERR "# should be UCSC RR %s, used GenArk: %s, missed GenArk: %s\n", commify($shouldBeUcsc), commify($usedGenArk), commify($missedGenArk);
printf "\n";
if ( 1 == 0 ) {
printf "\n";
printf "
\n";
printf "
===== summary counts =====
\n";
printf "\n";
printf "
\n";
printf "
number of assemblies
\n";
printf "
category of count
\n";
printf "
table data in tsv (tab separated value) file format
\n";
printf "
assembly minimal size to filter out projects that are not whole genomes
\n";
printf "
\n";
printf "\n";
printf "
%s
total number of NCBI assemblies under consideration
\n", commify($ncbiTotalAssemblies);
@@ -859,77 +905,92 @@
printf "\n";
my %equivalentNamesUsed; # key is NCBI sciName, value is IUCN sciName
my $pageSectionCount = 0;
my %checkDupAsmId; # key is asmId, value is count of times seen
my %cladeSciNameCounts; # key is clade, value is number of different
# scientific names
my %gcfGcaCounts; # key is GCF or GCA, value is count of each
my $asmCountInTable = 0; # counting the rows output
my %statusCounts; # key is status: CR EN VU, value is count
my $totalAssemblySize = 0; # sum of all assembly sizes
+my $outputGenArkRR = 0;
+my %outputGenArk; # key is asmId, value is clade
+my %outputRR; # key is asmId, value is clade
+
foreach my $clade (@clades) {
my $cPtr = $cladeToGo{$clade};
my $countThisClade = scalar(@$cPtr);
printf STDERR "# working on clade '%s', count: %s\n", $clade, commify($countThisClade);
## sectionDiv($clade); # starting new clade table
++$pageSectionCount;
my $totalContigCounts = 0;
my $underSized = 0;
+ my $noCommonName = 0;
+ my $suppressedCount = 0;
######################## rows of per clade table output here ################
my $tsvFile = sprintf("%s.tableData.txt", $clade);
open (PC, ">$tsvFile") or die "can not write to $tsvFile";
foreach my $asmId (@$cPtr) {
if (defined($checkDupAsmId{$asmId})) {
printf STDERR "# what: duplicate asmId: %s in clade %s\n", $asmId, $clade;
$checkDupAsmId{$asmId} += 1;
} else {
$checkDupAsmId{$asmId} = 1;
}
my $assembliesAvailable = 0;
if (defined($sciNames{$asmId})) {
if (defined($sciNameCount{$sciNames{$asmId}})) {
$assembliesAvailable = $sciNameCount{$sciNames{$asmId}};
}
}
if (defined ($asmSuppressed{$asmId})) {
if (!defined($rrGcaGcfList{$asmId})) {
- printf STDERR "# suppressed $asmId\n";
+ if (! defined($genArkAsm{$asmId}) && !defined($rrGcaGcfList{$asmId})) {
+ ++$suppressedCount;
+ printf STDERR "# suppressed $asmId\n" if ($suppressedCount < 5);
next;
+ } else {
+ printf STDERR "# genArk/RR %s would have been suppressed\n", $asmId;
+ }
}
}
my $commonName = "n/a";
if (defined($ncbiCommonName{$asmId})) {
$commonName = $ncbiCommonName{$asmId};
if ("n/a" eq $commonName) {
printf STDERR "# getting n/a commonName from ncbiCommonName{%s}\n", $asmId;
}
} elsif (defined($comName{$asmId})) {
$commonName = $comName{$asmId};
if ("n/a" eq $commonName) {
printf STDERR "# getting n/a commonName from comName{%s}\n", $asmId;
}
} else {
- printf STDERR "# no commonName for %s in ncbiCommonName or comName\n", $asmId;
+ if (defined($genArkAsm{$asmId}) || defined($rrGcaGcfList{$asmId})) {
+ printf STDERR "# ACK missed genArk/RR due to no common name for %s\n", $asmId;
+ }
+ ++$noCommonName;
+ printf STDERR "# no commonName for %s in ncbiCommonName or comName\n", $asmId if ($noCommonName < 5);
next;
}
# if (! defined($gcxCountry{$asmId})) {
# printf STDERR "# no country for $asmId, date: %s, submitter: %s\n", $gcxDate{$asmId}, $gcxSubmitter{$asmId};
# next;
# }
# if ("n/a" eq $gcxCountry{$asmId}) {
# printf STDERR "# country is n/a $asmId\n";
# next;
# }
my ($p0, $p1, $p2) = split('_', $asmId, 3);
my $accessionId = "${p0}_${p1}";
my $gcX = substr($asmId, 0, 3);
my $d0 = substr($asmId, 4, 3);
@@ -1015,30 +1076,33 @@
if ($iucnSciName ne $sciNames{$asmId}) {
$equivalentNamesUsed{$sciNames{$asmId}} = $iucnSciName;
}
if (defined($iucnSciNames{$iucnSciName})) {
$iucnStatus = $iucnSciNames{$iucnSciName};
}
}
++$asmCountInTable;
my $statusColor = "";
if ($iucnStatus ne " ") {
$statusColor = $statusColors{$iucnStatus};
++$statusCounts{$iucnStatus};
}
############# starting a table row #################################
+ ++$outputGenArkRR if (defined($rrGcaGcfList{$asmId}) || defined($genArkAsm{$asmId}));
+ $outputGenArk{$asmId} = $clade if (defined($genArkAsm{$asmId}));
+ $outputRR{$asmId} = $clade if (defined($rrGcaGcfList{$asmId}));
if ($asmId =~ m/^GCF/) {
$gcfGcaCounts{'GCF'} += 1;
} elsif ($asmId =~ m/GCA/) {
$gcfGcaCounts{'GCA'} += 1;
}
### If equivalent to UCSC database browser, make reference to RR browser
my $ucscDb = "";
$ucscDb = "/" . $rrGcaGcfList{$asmId} if (defined($rrGcaGcfList{$asmId}));
if (length($ucscDb)) {
$browserUrl = sprintf("/cgi-bin/hgTracks?db=%s", $rrGcaGcfList{$asmId});
}
printf PC "%d", $asmCountInTable; # start a line output to clade.tableData.tsv
## count number of different scientific names used in this clade table
@@ -1211,32 +1275,36 @@
printf "
%s
", substr($submitterSortKey,0,20), $asmSubmitter{$asmId};
printf PC "\t%s", $asmSubmitter{$asmId}; # output to clade.tableData.txt
} else {
printf "
n/a
";
printf PC "\t%s", "n/a"; # output to clade.tableData.txt
}
############# twelveth column, clade ################
printf "
%s
\n", $clade;
printf PC "\t%s", $clade;
printf PC "\n"; # finished a line output to clade.tableData.txt
printf "\n";
} # foreach my $asmId (@$cPtr)
close (PC); # finished with clade.tableData.txt output
+ printf STDERR "# no commonName %s for clade %s\n", commify($noCommonName), $clade;
+ printf STDERR "# suppressed %s for clade %s\n", commify($suppressedCount), $clade;
} # foreach my $clade (@clades)
+printf STDERR "# output %s genArk or RR assemblies\n", commify($outputGenArkRR);
+
##########################################################################
## single table is finished, output the end of tbody and the tfoot row
##########################################################################
if ($asmCountInTable > 1) {
my $crCount = 0;
my $enCount = 0;
my $vuCount = 0;
foreach my $statId (keys %statusCounts) {
$crCount = $statusCounts{$statId} if ($statId eq "CR");
$enCount = $statusCounts{$statId} if ($statId eq "EN");
$vuCount = $statusCounts{$statId} if ($statId eq "VU");
}
my $sciNameTotal = 0;
foreach my $c (@clades) {
my $csnPtr = $cladeSciNameCounts{$c};
@@ -1302,30 +1370,53 @@
if (-s "../accumulateMetaInfo.tsv" ) {
open (FH, ">>../accumulateMetaInfo.tsv") or die "can not append to ../accumulateMetaInfo.tsv";
} else {
open (FH, ">../accumulateMetaInfo.tsv") or die "can not write to ../accumulateMetaInfo.tsv";
}
foreach my $asmId (sort keys %newMetaInfo) {
printf FH "%s\t%s\n", $asmId, $newMetaInfo{$asmId};
printf STDERR "# new metaData for %s\n", $asmId;
++$newMetaInfoCount;
}
close (FH);
} else {
printf STDERR "# no new metaInfo\n";
}
+my $beenThere = 0;
+printf STDERR "# checking missing genArk assemblies in output\n";
+foreach my $gAsmId (sort keys %genArkAsm) {
+ ++$beenThere if ($genArkClade{$gAsmId} !~ m/viral|bacteria/);
+ if (!defined($outputGenArk{$gAsmId})) {
+ printf STDERR "# not output genArk %s\t%s\n", $gAsmId, $genArkClade{$gAsmId}
+ if ($genArkClade{$gAsmId} !~ m/viral|bacteria/);
+ }
+}
+printf STDERR "# there were %s GenArk potential to output\n", commify($beenThere);
+
+$beenThere = 0;
+
+printf STDERR "# checking missing RR assemblies in output\n";
+
+foreach my $rrAsmId (sort keys %rrGcaGcfList) {
+ ++$beenThere;
+ if (!defined($outputRR{$rrAsmId})) {
+ printf STDERR "# not output RR %s\t%s\n", $rrAsmId, $rrGcaGcfList{$rrAsmId};
+ }
+}
+printf STDERR "# there were %s RR potential to output\n", commify($beenThere);
+
printf STDERR "# done at exit\n";
exit 0;
__END__
my $count = 0;
foreach my $asmId (sort keys %gcxCountry) {
next if (defined ($asmSuppressed{$asmId}));
next if (! defined($gcxCountry{$asmId}));
next if (! defined($sciName{$asmId}));
next if ("n/a" eq $gcxCountry{$asmId});
##########################################################################
# found discussion of google image search URL at:
# https://stackoverflow.com/questions/21530274/format-for-a-url-that-goes-to-google-image-search
# did not work: