src/hg/gar/garTable.pl afe1592acd7a4dd79f8ca81b390d10113cb6aadc

afe1592acd7a4dd79f8ca81b390d10113cb6aadc
hiram
  Tue Sep 20 09:40:42 2022 -0700
more complete accounting of all GenArk and UCSC assemblies to get them into the page refs #28930

diff --git src/hg/gar/garTable.pl src/hg/gar/garTable.pl
index 9f3b2b6..247d28b 100755
--- src/hg/gar/garTable.pl
+++ src/hg/gar/garTable.pl
@@ -81,87 +81,100 @@
 ###############################################################################
 # output a table cell for an N50 measurement
 sub n50Cell($$$) {
   my ($size, $count, $fh) = @_;
   if ($size > 0) {
     printf "<td style='display:none; text-align:right;' sorttable_customkey='%d'>%s&nbsp;(%s)</td>", $size, gmk($size), commify($count);
     printf $fh "\t%d (%d)", $size, $count;	# output to clade.tableData.txt
   } else {
     printf "<td style='display:none;'>&nbsp;</td>";
     printf $fh "\tn/a (n/a)";	# output to clade.tableData.txt
   }
 }
 ###############################################################################
 
 my @clades = qw( primates mammals birds fish vertebrate invertebrates plants fungi );
-# my @clades = qw( primates mammals birds );
 
 # to help weed out some of the noise
 # key is clade, value is minimal size to count as a whole genome
 # these are actually pretty low to allow in some alternate haplotype
 # assemblies that don't seem to be the whole assembly.
 # The assemblies are also filtered by NCBI status 'full/partial' to only
 # allow in the 'full' genomes meaning representation of the whole genome
 my %minimalGenomeSize = (
   primates => 1000000000,
   mammals => 20000000,
   birds => 200000000,
   fish => 100000,
   vertebrate => 4000000,
   invertebrates => 10000,
   plants => 100000,
   fungi => 50000,
 );
 
 #########################################################################
 ## read in list of current GenArk assemblies
 
 my %genArkAsm;	# key is asmId, value is string with:
+my %genArkClade;	# key is asmId, value is clade
+my %genArkCladeProfile;	# key is clade, value is count
+my %genArkSciName;	# key is asmId, value is sciName
+my %genArkComName;	# key is asmId, value is sciName
 
-# accession<tab>assembly<tab>scientific name<tab>common name<tab>taxonId
+# accession<tab>assembly<tab>scientific name<tab>common name<tab>taxonId<tab>clade
 
 my $genArkCount = 0;
 printf STDERR "# reading UCSC_GI.assemblyHubList.txt\n";
 open (FH, "<UCSC_GI.assemblyHubList.txt") or die "can not read UCSC_GI.assemblyHubList.txt";
 while (my $line = <FH>) {
   next if ($line =~ m/^#/);
   chomp $line;
-  my ($accession, $assembly, $sciName, $commonName, $taxonId) = split('\t', $line);
-  die "ERROR: duplicate accession from UCSC_GI.assemblyHubList.txt" if (defined($genArkAsm{$accession}));
+  my ($accession, $assembly, $sciName, $commonName, $taxonId, $gClade) = split('\t', $line);
   my $asmId = sprintf("%s_%s", $accession, $assembly);
+  die "ERROR: duplicate accession from UCSC_GI.assemblyHubList.txt" if (defined($genArkAsm{$asmId}));
   $genArkAsm{$asmId} = sprintf("%s\t%s\t%s\t%s\t%s", $accession, $assembly, $sciName, $commonName, $taxonId);
+  $gClade =~ s/\(L\)//;	# remove the 'legacy' tag
+  $gClade = "invertebrates" if ($gClade =~ m/invertebrate/);
+  $genArkClade{$asmId} = $gClade;
+  ++$genArkCladeProfile{$gClade};
   ++$genArkCount;
 }
 close (FH);
 
 # accession     assembly        scientific name common name     taxonId
 # GCA_000001905.1 Loxafr3.0       Loxodonta africana      African savanna elephant (2009 genbank) 9785
 
 printf STDERR "# have %s assemblies from GenArk UCSC_GI.assemblyHubList.txt\n", commify($genArkCount);
+printf STDERR "# genArk clade profile:\n";
+foreach my $gClade (sort keys %genArkCladeProfile) {
+  printf STDERR "# %6d\t%s\n", $genArkCladeProfile{$gClade}, $gClade;
+}
 
 my $rrCount = 0;
 my %rrGcaGcfList;	# key is asmId, value is UCSC db name
 open (FH, "<rr.GCF.GCA.list") or die "can not read rr.GCF.GCA.list";
 while (my $line = <FH>) {
   chomp $line;
   my ($ucscDb, $asmId) = split('\s+', $line);
   $rrGcaGcfList{$asmId} = $ucscDb;
   ++$rrCount;
 }
 close (FH);
 
 printf STDERR "# have %d assemblies with RR equivalent databases\n", $rrCount;
+printf STDERR "# therefore, should be a total of %s existing browsers\n", commify($rrCount+$genArkCount-$genArkCladeProfile{'viral'}-$genArkCladeProfile{'bacteria'});
+printf STDERR "# %s + %s - %s - %s = %s\n", commify($genArkCount), commify($rrCount), commify($genArkCladeProfile{'viral'}), commify($genArkCladeProfile{'bacteria'}), commify($rrCount+$genArkCount-$genArkCladeProfile{'viral'}-$genArkCladeProfile{'bacteria'});
 
 # establish a name equivalence for some NCBI three word scientific names
 # to IUCN two word scientific names
 
 my %ncbiToIucnNames;	# key is NCBI name (one), value is IUCN name (many)
 # for example:
 # Giraffa camelopardalis antiquorum -> Giraffa camelopardalis
 # Giraffa camelopardalis rothschildi -> Giraffa camelopardalis
 
 printf STDERR "### reading iucnToNcbiEquivalent.txt\n";
 open (FH, "<../iucnToNcbiEquivalent.txt") or die "can not read iucnToNcbiEquivalent.txt";
 while (my $line = <FH>) {
   chomp $line;
   my ($ncbiName, $iucnName) = split('\t', $line);
   $ncbiToIucnNames{$ncbiName} = $iucnName;
@@ -255,93 +268,99 @@
   chomp $line;
   my ($id, $sci, $com) = split('\t', $line);
   $sci =~ s/_/ /g;
   if (defined($sciName{$id})) {
     die "ERROR: duplicate asmId sciName: $id '$sci' '$sciName{$id}'";
   }
   $sciName{$id} = $sci;
   $comName{$id} = $com;
   ++$hubComNames;
 }
 close (FH);
 
 printf STDERR "# common names from GenArk hubs: %s\n", commify($hubComNames);
 
 my $genArkCatchup = 0;
+my $genArkDone = 0;
 ### catch up for any missed genArk hubs
 # accession<tab>assembly<tab>scientific name<tab>common name<tab>taxonId
 foreach my $asmIdKey (sort keys %genArkAsm) {
   if (defined($sciName{$asmIdKey})) {
+    ++$genArkDone;
     next;
   }
   if (defined($comName{$asmIdKey})) {
+    ++$genArkDone;
     next;
   }
   my @a = split('\t', $genArkAsm{$asmIdKey});
   $a[2] =~ s/_/ /g;
   $sciName{$asmIdKey} = $a[2];
   $comName{$asmIdKey} = $a[3];
   ++$genArkCatchup;
 }
 
+printf STDERR "# %d genArk hubs from hgdownload list already done\n", $genArkDone;
 printf STDERR "# catch up %d genArk hubs from hgdownload list\n", $genArkCatchup;
 
 my %metaInfo;	# key is asmId, value is a tsv string of numbers for:
                 # asmSize asmContigCount n50Size n50Count n50ContigSize
 		# n50ContigCount n50ScaffoldSize n50ScaffoldCount
 my %newMetaInfo;	# accumulate new metaInfo to add to end of this file
 
 if ( -s "../accumulateMetaInfo.tsv" ) {
   printf STDERR "# reading ../accumulateMetaInfo.tsv\n";
   open (FH, "<../accumulateMetaInfo.tsv") or die "can not read ../accumulateMetaInfo.tsv";
   while (my $line = <FH>) {
     chomp $line;
     my ($asmId, $metaData) = split('\t', $line, 2);
     die "ERROR: duplicate metaInfo{$asmId}" if (defined($metaInfo{$asmId}));
     $metaInfo{$asmId} = $metaData;
   }
   close (FH);
 }
 
+my $noDateFromSize = 0;
 my $earliestDate = 22001231;
 my $latestDate = 19000101;
 printf STDERR "### reading asmId.sciName.comonName.date.tsv\n";
 open (FH, "<../asmId.sciName.commonName.date.tsv") or die "can not read ../asmId.sciName.commonName.date.tsv";
 while (my $line = <FH>) {
   chomp $line;
   my @a = split('\t', $line);
   next if ($a[0] !~ m/^GC/);
   $a[0] =~ s/__/_/g;
   ++$comNameCount;
   $ncbiCommonName{$a[0]} = $a[2];
   if (defined($a[3]) && ($a[3] =~ m/^[12][0-9][0-9][0-9]-[0-9]+-[0-9]+$/) ) {
      my @b = split('-', $a[3]);
      $ncbiDate{$a[0]} = sprintf("%04d%02d%02d", $b[0], $b[1], $b[2]);
      $earliestDate = $ncbiDate{$a[0]} if ($ncbiDate{$a[0]} < $earliestDate);
      $latestDate = $ncbiDate{$a[0]} if ($latestDate < $ncbiDate{$a[0]});
   } else {
      $ncbiDate{$a[0]} = 19720101;
+     ++$noDateFromSize;
      if (defined($a[3])) {
-       printf STDERR "# no date '%s' for %s'\n", $a[3], $a[0];
+       printf STDERR "# no date '%s' for %s'\n", $a[3], $a[0] if ($noDateFromSize < 5);
      } else {
-       printf STDERR "# no date 'undef' for %s'\n", $a[0];
+       printf STDERR "# no date 'undef' for %s'\n", $a[0] if ($noDateFromSize < 5);
      }
   }
 }
 close (FH);
 printf STDERR "# comNameCount %s from ../../../sizes/asmId.sciName.commonName.date.tsv\n", commify($comNameCount);
-printf STDERR "# dates from %s to %s\n", $earliestDate, $latestDate;
+printf STDERR "# dates from %s to %s, no dates: %d\n", $earliestDate, $latestDate, $noDateFromSize;;
 
 my %sciNameCount;	# key is scientific name, value is count of assemblies
 			# from NCBI assembly_summary .txt files
 my %sciNames;	# key is asmId, value is scientific name
 my %sciNameAsmList;	# key is scientific name, value is pointer to array
 			# which is a list of assemblies on this sciName
 
 # goodToGo will be the master asmId list for output
 my %goodToGoSpecies;	# key is scientific name, value is asmId selected
 my %goodToGo;	# key is asmId, value is iucn status code CR EN VU
 my %goodToGoDate;	# key is sciName, value is date YYYYMMDD for selected
 			# assembly
 # choices in order when duplicate species:
 # GCA assembly
 # newer GCA assembly
@@ -459,30 +478,31 @@
 close (FH);
 
 printf STDERR "# countedGoodToGo: %d, replaced: %d, noDate: %d\n", $countingGoodToGo, $goodToGoReplaced, $warningNoDate;
 printf STDERR "# have %s NCBI assemblies, %s unique NCBI species\n", commify($ncbiTotalAssemblies), commify($ncbiUniqueSpecies);
 printf STDERR "#\thighest species count %s : '%s'\n", commify($highestSpeciesCount), $highestCountName;
 
 my %gcxCountry;	# key is asmId, value is collection location
 my %gcxDate;	# key is asmId, value is collection date
 my %gcxSubmitter;	# key is asmId, value is collected by
 my %asmSuppressed;	# key is asmId, value is 1 meaning suppressed
 
 printf STDERR "### reading asmId.suppressed.list\n";
 open (FH, "<asmId.suppressed.list") or die "can not read asmId.suppressed.list";
 while (my $id = <FH>) {
   chomp $id;
+  next if (defined($genArkAsm{$id}) || defined($rrGcaGcfList{$id}));
   $asmSuppressed{$id} = 1;
 }
 close (FH);
 
 my %asmDate;	# key is asmId, value is date of assembly
 my %asmSubmitter;	# key is asmId, value is name of assembly submitter
 my %asmTaxId;	# key is asmId, value is taxId of species
 
 my $asmIdCount = 0;
 printf STDERR "### reading ../asmId.date.taxId.asmSubmitter.tsv\n";
 open (FH, "<../asmId.date.taxId.asmSubmitter.tsv") or die "can not read a../smId.date.taxId.asmSubmitter.tsv";
 while (my $line = <FH>) {
   chomp $line;
   my ($id, $da, $tid, $asmS) = split('\t', $line);
   $id =~ s/__/_/g;
@@ -540,165 +560,191 @@
 my $maxDupAsm = 0;
 my %ncbiSpeciesRecorded;	# key is NCBI sciName, value is count of those
 my $ncbiSpeciesUsed = 0;	# a unique count of NCBI sciName
 my %iucnSpeciesRecorded;	# key is IUCN sciName, value is count of those
 my $iucnSpeciesUsed = 0;	# a unique count of IUCN sciName
 
 my $sciNameDisplayLimit = 5;    # do not display more than 5 instances
 
 my $checkedAsmIds = 0;
 my $acceptedAsmIds = 0;
 my $notInGoodToGo = 0;
 my %alreadyDone;	# key is asmId, value = 1 indicates already done
 my %underSized;		# key is clade, value is count of underSized
 
 my $NA = "/hive/data/outside/ncbi/genomes/reports/newAsm";
-my %genArkClade;	# key is asmId, value is clade
 
+my $examinedAsmId = 0;
 my $shouldBeGenArk = 0;
 my $shouldBeUcsc = 0;
 my %usedGenArk;	# key is asmId, value is count seen in *.today files
+my $genArkTooManySpecies = 0;
+my $countingUsedGenArk = 0;
+my $prevUsedGenArk = 0;
 
 foreach my $clade (@clades) {
   my $cladeCount = 0;
   my $goodToGoCount = 0;
   my $refSeq = "${NA}/all.refseq.${clade}.today";
   my $genBank = "${NA}/all.genbank.${clade}.today";
   my $asmHubTsv = "~/kent/src/hg/makeDb/doc/${clade}AsmHub/${clade}.orderList.tsv";
+  if ($clade eq "invertebrates") {
+    $asmHubTsv = "~/kent/src/hg/makeDb/doc/invertebrateAsmHub/invertebrate.orderList.tsv";
+  }
   open (FH, "cut -f1 ${refSeq} ${genBank} ${asmHubTsv}|sort -u|") or die "can not read $refSeq or $genBank or $asmHubTsv";
   while (my $asmId = <FH>) {
      chomp $asmId;
+     ++$examinedAsmId;
      $asmId =~ s/\.]/_/g;
      $asmId =~ s/\:/_/g;
      $asmId =~ s/\%/_/g;
      $asmId =~ s/\+/_/g;
      $asmId =~ s/\//_/g;
      $asmId =~ s/\#/_/g;
 #      $asmId =~ s/[.:%+/#]/_/g;
      $asmId =~ s/[()]//g;
      $asmId =~ s/__/_/g;
      ++$shouldBeGenArk if (defined($genArkAsm{$asmId}));
      ++$shouldBeUcsc if (defined($rrGcaGcfList{$asmId}));
      if (defined ($skipPartialGenome{$asmId})) {
-       next if (!defined($genArkAsm{$asmId}));
+       next if (!defined($genArkAsm{$asmId}) && !defined($rrGcaGcfList{$asmId}));
      }
      if (defined ($asmSuppressed{$asmId})) {
-       next if (!defined($genArkAsm{$asmId}));
+       next if (!defined($genArkAsm{$asmId}) && !defined($rrGcaGcfList{$asmId}));
      }
      next if (defined ($alreadyDone{$asmId}));
      # something wrong with these two
 # GCA_900609255.1_Draft_mitochondrial_genome_of_wild_rice_W1683
 # GCA_900609265.1_Draft_mitochondrial_genome_of_wild_rice_W1679
      next if ($asmId =~ m/GCA_900609255.1|GCA_900609265.1/);
      # verify this asmId will pass the asmSize limit
      if (defined($metaInfo{$asmId})) {
       my ($asmSize, $asmContigCount, $n50Size, $n50Count, $n50ContigSize, $n50ContigCount, $n50ScaffoldSize, $n50ScaffoldCount) = split('\t', $metaInfo{$asmId});
       # if asmSize is below the minimum, don't use it
        if ($asmSize < $minimalGenomeSize{$clade}) {
          printf STDERR "# %s underSized 0 %d %s %s < %s\n", $clade, ++$underSized{$clade}, $asmId, commify($asmSize), commify($minimalGenomeSize{$clade});
      printf STDERR "# ACK would be genArk assembly %s\n", $asmId if (defined($genArkAsm{$asmId}));
      printf STDERR "# ACK would be UCSC RR %s\n", $asmId if (defined($rrGcaGcfList{$asmId}));
 printf STDERR "# ACK metaInfo: %s '%s'\n", $asmId, $metaInfo{$asmId};
          next;
        }
      }
      $alreadyDone{$asmId} = 1;
      if (defined($genArkClade{$asmId})) {
-      die "ERROR: duplicate asmId today $asmId '$clade' '$genArkClade{$asmId}'";
+          printf STDERR "# warning genArkClade{%s} does not = %s\n", $genArkClade{$asmId}, $clade if ($genArkClade{$asmId} ne $clade);
      }
      ++$checkedAsmIds;
      my $iucnSciName = "";
      if (defined($sciNames{$asmId})) {
        ++$ncbiSpeciesRecorded{$sciNames{$asmId}};
+       if (! defined($genArkAsm{$asmId}) && !defined($rrGcaGcfList{$asmId})) {
          next if ($ncbiSpeciesRecorded{$sciNames{$asmId}} > $sciNameDisplayLimit);
          $iucnSciName = $sciNames{$asmId};
    $iucnSciName = $ncbiToIucnNames{$sciNames{$asmId}} if (defined($ncbiToIucnNames{$sciNames{$asmId}}));
          ++$iucnSpeciesRecorded{$iucnSciName};
        } else { 
+         ++$genArkTooManySpecies if ($ncbiSpeciesRecorded{$sciNames{$asmId}} > $sciNameDisplayLimit);
+       }
+     } else {
        printf STDERR "# no sciName for: '%s' at %s\n", $asmId, commify($goodToGoCount + 1);
      }
-     $genArkClade{$asmId} = $clade;
      ++$cladeCount;
      if (! defined($cladeToGo{$clade})) {
        my @a;
        $cladeToGo{$clade} = \@a;
      }
      my $cPtr = $cladeToGo{$clade};
      push (@$cPtr, $asmId);
-     $usedGenArk{$asmId} += 1 if (defined($genArkAsm{$asmId}));
+     ++$usedGenArk{$asmId} if (defined($genArkAsm{$asmId}));
+     ++$countingUsedGenArk if (defined($genArkAsm{$asmId}));
      ++$acceptedAsmIds;
      ++$goodToGoCount;
      ++$ncbiSpeciesUsed if ( defined($sciNames{$asmId}) && defined($ncbiSpeciesRecorded{$sciNames{$asmId}}) && (1 == $ncbiSpeciesRecorded{$sciNames{$asmId}}) );
      ++$iucnSpeciesUsed if ( defined($iucnSpeciesRecorded{$iucnSciName}) && (1 == $iucnSpeciesRecorded{$iucnSciName}));
      ++$cladeCounts{$clade};
      my $assembliesAvailable = 0;
      if (defined($sciNames{$asmId})) {
        if (defined($sciNameCount{$sciNames{$asmId}})) {
          $assembliesAvailable = $sciNameCount{$sciNames{$asmId}};
        $maxDupAsm = $assembliesAvailable if ($assembliesAvailable > $maxDupAsm);
        }
        if ($assembliesAvailable > 1) {
           my $bPtr = $sciNameAsmList{$sciNames{$asmId}};
           foreach my $aId (@$bPtr) {
              next if (defined ($alreadyDone{$aId}));
              $alreadyDone{$aId} = 1;
              if ($aId ne $asmId) {
                if (defined($metaInfo{$aId})) {
                  my ($asmSize, $asmContigCount, $n50Size, $n50Count, $n50ContigSize, $n50ContigCount, $n50ScaffoldSize, $n50ScaffoldCount) = split('\t', $metaInfo{$aId});
                  # if asmSize is below the minimum, don't use it
                  if ($asmSize < $minimalGenomeSize{$clade}) {
+                   if (! defined($genArkAsm{$aId}) && !defined($rrGcaGcfList{$aId}) ) {
                    printf STDERR "# %s underSized 1 %d %s %s < %s\n", $clade, ++$underSized{$clade}, $aId, commify($asmSize), commify($minimalGenomeSize{$clade});
      printf STDERR "# ACK would be genArk assembly %s\n", $aId if (defined($genArkAsm{$aId}));
      printf STDERR "# ACK would be UCSC RR %s\n", $aId if (defined($rrGcaGcfList{$aId}));
 printf STDERR "# ACK metaInfo: %s '%s'\n", $aId, $metaInfo{$aId};
                    next;
+                   } else {
+                     printf STDERR "# ACK %s genArk/RR too small at %s\n", $aId, commify($asmSize);
+                   }
                  }
                }
                ++$ncbiSpeciesRecorded{$sciNames{$aId}};
                # the defined($sciName{$aId}) indicates it is a GenArk genome
                # always accept those even if it goes beyond the limit
-               if ( ($ncbiSpeciesRecorded{$sciNames{$aId}} <= $sciNameDisplayLimit) || defined($sciName{$aId}) ) {
+               if ( ($ncbiSpeciesRecorded{$sciNames{$aId}} <= $sciNameDisplayLimit) || defined($genArkAsm{$aId}) || defined($rrGcaGcfList{$aId})) {
                  push (@$cPtr, $aId);
                  $usedGenArk{$aId} += 1 if (defined($genArkAsm{$aId}));
+                 ++$countingUsedGenArk if (defined($genArkAsm{$asmId}));
                  ++$acceptedAsmIds;
                  ++$goodToGoCount;
                  ++$cladeCounts{$clade};
                }	#	under limit count or is GenArk assembly
              }	#	if ($aId ne $asmId)
           }	#	foreach my $aId (@$bPtr)
         }	#	if ($assembliesAvailable > 1)
      }	#	if (defined($sciNames{$asmId}))
   }	#	while (my $asmId = <FH>)
   close (FH);
-  printf STDERR "# %s\t%s\tgood to go %s\n", $clade, commify($cladeCount), $goodToGoCount;
   $totalGoodToGo += $goodToGoCount;
+  printf STDERR "# %s\t%s\tgood to go %s, total good: %d\n", $clade, commify($cladeCount), $goodToGoCount, $totalGoodToGo;
+  printf STDERR "# examined %d asmIds from\n#%s\n# %s\n# %s\n", $examinedAsmId, $refSeq, $genBank, $asmHubTsv;
+  printf STDERR "# usedGenArk: %s total: %s\t%s\n", commify($countingUsedGenArk-$prevUsedGenArk), commify($countingUsedGenArk), $clade;
+  $prevUsedGenArk = $countingUsedGenArk;
 }	#	foreach my $clade (@clades)
 
+printf STDERR "# would have knocked out %s GenArk assemblies over species count\n", commify($genArkTooManySpecies);
 printf STDERR "# checkedAsmIds: %d, acceptedAsmIds: %d, notInGoodToGo: %d\n", $checkedAsmIds, $acceptedAsmIds, $notInGoodToGo;
 
 printf STDERR "# total good to go %s, maxDupAsmCount: %s\n", commify($totalGoodToGo), commify($maxDupAsm);;
 
 printf STDERR "# should be GenArk: %s\n", commify($shouldBeGenArk);
 printf STDERR "# had originally %s assemblies from GenArk UCSC_GI.assemblyHubList.txt\n", commify($genArkCount);
+my $usedGenArk = 0;
+my $missedGenArk = 0;
 foreach my $genArkAsmId (sort keys %genArkAsm) {
+  my $gClade = "n/a";
+  $gClade = $genArkClade{$genArkAsmId} if (defined($genArkClade{$genArkAsmId}));
   if (defined($usedGenArk{$genArkAsmId})) {
-    printf STDERR "# used genArk %s %s\n", $genArkAsmId, $genArkAsm{$genArkAsmId};
+    ++$usedGenArk;
+    printf STDERR "# used genArk %s %s %s\n", $genArkAsmId, $gClade, $genArkAsm{$genArkAsmId} if ($usedGenArk < 5);
   } else {
-    printf STDERR "# missed genArk %s %s\n", $genArkAsmId, $genArkAsm{$genArkAsmId};
+    ++$missedGenArk;
+    printf STDERR "# missed genArk %s %s %s\n", $genArkAsmId, $gClade, $genArkAsm{$genArkAsmId} if ($gClade !~ m/viral|bacteria/);
   }
 }
-printf STDERR "# should be UCSC RR %s\n", commify($shouldBeUcsc);
+printf STDERR "# should be UCSC RR %s, used GenArk: %s, missed GenArk: %s\n", commify($shouldBeUcsc), commify($usedGenArk), commify($missedGenArk);
 
 printf "<hr>\n";
 if ( 1 == 0 ) {
 printf "<button id='summaryCountsHideShow' onclick='gar.hideTable(\"summaryCounts\")'>[hide]</button>\n";
 printf "<table class='sortable borderOne' id='summaryCounts'>\n";
 printf "<caption style='text-align:center;'><b>===== summary counts =====</b></caption>\n";
 printf "<thead>\n";
 printf "<tr>\n";
 printf "<th>number of assemblies</th>\n";
 printf "<th>category of count</th>\n";
 printf "<th>table data in tsv<br>(tab separated value)<br>file format</th>\n";
 printf "<th>assembly minimal size<br>to filter out projects that<br>are not whole genomes</th>\n";
 printf "</tr>\n";
 printf "</thead><tbody>\n";
 printf "<tr><th style='text-align:right;'>%s</th><th>total number of NCBI assemblies under consideration</th><th>&nbsp;</th><th>&nbsp;</th></tr>\n", commify($ncbiTotalAssemblies);
@@ -859,77 +905,92 @@
 printf "</thead><tbody>\n";
 
 my %equivalentNamesUsed;	# key is NCBI sciName, value is IUCN sciName
 my $pageSectionCount = 0;
 
 my %checkDupAsmId;	# key is asmId, value is count of times seen
 
 my %cladeSciNameCounts;	# key is clade, value is number of different
 			# scientific names
 
 my %gcfGcaCounts;	# key is GCF or GCA, value is count of each
 my $asmCountInTable = 0;	# counting the rows output
 my %statusCounts;	# key is status: CR EN VU, value is count
 my $totalAssemblySize = 0;	# sum of all assembly sizes
 
+my $outputGenArkRR = 0;
+my %outputGenArk;	# key is asmId, value is clade
+my %outputRR;	# key is asmId, value is clade
+
 foreach my $clade (@clades) {
   my $cPtr = $cladeToGo{$clade};
   my $countThisClade = scalar(@$cPtr);
   printf STDERR "# working on clade '%s', count: %s\n", $clade, commify($countThisClade);
 
 ##  sectionDiv($clade);	# starting new clade table
 
   ++$pageSectionCount;
   my $totalContigCounts = 0;
   my $underSized = 0;
+  my $noCommonName = 0;
+  my $suppressedCount = 0;
 
   ######################## rows of per clade table output here ################
   my $tsvFile = sprintf("%s.tableData.txt", $clade);
   open (PC, ">$tsvFile") or die "can not write to $tsvFile";
 
   foreach my $asmId (@$cPtr) {
      if (defined($checkDupAsmId{$asmId})) {
       printf STDERR "# what: duplicate asmId: %s in clade %s\n", $asmId, $clade;
         $checkDupAsmId{$asmId} += 1;
      } else {
         $checkDupAsmId{$asmId} = 1;
      }
     my $assembliesAvailable = 0;
     if (defined($sciNames{$asmId})) {
       if (defined($sciNameCount{$sciNames{$asmId}})) {
         $assembliesAvailable = $sciNameCount{$sciNames{$asmId}};
       }
     }
     if (defined ($asmSuppressed{$asmId})) {
       if (!defined($rrGcaGcfList{$asmId})) {
-        printf STDERR "# suppressed $asmId\n";
+        if (! defined($genArkAsm{$asmId}) && !defined($rrGcaGcfList{$asmId})) {
+        ++$suppressedCount;
+        printf STDERR "# suppressed $asmId\n" if ($suppressedCount < 5);
         next;
+        } else {
+          printf STDERR "# genArk/RR %s would have been suppressed\n", $asmId;
+        }
       }
     }
     my $commonName = "n/a";
     if (defined($ncbiCommonName{$asmId})) {
       $commonName = $ncbiCommonName{$asmId};
       if ("n/a" eq $commonName) {
         printf STDERR "# getting n/a commonName from ncbiCommonName{%s}\n", $asmId;
       }
     } elsif (defined($comName{$asmId})) {
       $commonName = $comName{$asmId};
       if ("n/a" eq $commonName) {
         printf STDERR "# getting n/a commonName from comName{%s}\n", $asmId;
       }
     } else {
-      printf STDERR "# no commonName for %s in ncbiCommonName or comName\n", $asmId;
+      if (defined($genArkAsm{$asmId}) || defined($rrGcaGcfList{$asmId})) {
+        printf STDERR "# ACK missed genArk/RR due to no common name for %s\n", $asmId;
+      }
+      ++$noCommonName;
+      printf STDERR "# no commonName for %s in ncbiCommonName or comName\n", $asmId if ($noCommonName < 5);
       next;
     }
 
 #   if (! defined($gcxCountry{$asmId})) {
 #      printf STDERR "# no country for $asmId, date: %s, submitter: %s\n", $gcxDate{$asmId}, $gcxSubmitter{$asmId};
 #      next;
 #    }
 #    if ("n/a" eq $gcxCountry{$asmId}) {
 #      printf STDERR "# country is n/a $asmId\n";
 #      next;
 #    }
   my ($p0, $p1, $p2) = split('_', $asmId, 3);
   my $accessionId = "${p0}_${p1}";
   my $gcX = substr($asmId, 0, 3);
   my $d0 = substr($asmId, 4, 3);
@@ -1015,30 +1076,33 @@
      if ($iucnSciName ne $sciNames{$asmId}) {
        $equivalentNamesUsed{$sciNames{$asmId}} = $iucnSciName;
      }
      if (defined($iucnSciNames{$iucnSciName})) {
        $iucnStatus = $iucnSciNames{$iucnSciName};
      }
   }
   ++$asmCountInTable;
   my $statusColor = "";
   if ($iucnStatus ne "&nbsp;") {
     $statusColor = $statusColors{$iucnStatus};
     ++$statusCounts{$iucnStatus};
   }
 
   ############# starting a table row  #################################
+  ++$outputGenArkRR if (defined($rrGcaGcfList{$asmId}) || defined($genArkAsm{$asmId}));
+  $outputGenArk{$asmId} = $clade if (defined($genArkAsm{$asmId}));
+  $outputRR{$asmId} = $clade if (defined($rrGcaGcfList{$asmId}));
   if ($asmId =~ m/^GCF/) {
     $gcfGcaCounts{'GCF'} += 1;
   } elsif ($asmId =~ m/GCA/) {
     $gcfGcaCounts{'GCA'} += 1;
   }
   ### If equivalent to UCSC database browser, make reference to RR browser
   my $ucscDb = "";
   $ucscDb = "/" . $rrGcaGcfList{$asmId} if (defined($rrGcaGcfList{$asmId}));
   if (length($ucscDb)) {
      $browserUrl = sprintf("/cgi-bin/hgTracks?db=%s", $rrGcaGcfList{$asmId});
   }
 
   printf PC "%d", $asmCountInTable;	# start a line output to clade.tableData.tsv
 
   ## count number of different scientific names used in this clade table
@@ -1211,32 +1275,36 @@
     printf "<td sorttable_customkey='%s' style='display:none;'>%s</td>", substr($submitterSortKey,0,20), $asmSubmitter{$asmId};
     printf PC "\t%s", $asmSubmitter{$asmId};	# output to clade.tableData.txt
   } else {
     printf "<td sorttable_customkey='n/a' style='display:none;'>n/a</td>";
     printf PC "\t%s", "n/a";	# output to clade.tableData.txt
   }
 
   ############# twelveth column,  clade ################
   printf "<td style='display:none;'>%s</td>\n", $clade;
   printf PC "\t%s", $clade;
 
   printf PC "\n";	# finished a line output to clade.tableData.txt
   printf "</tr>\n";
   }	#	foreach my $asmId (@$cPtr)
   close (PC);	# finished with clade.tableData.txt output
+  printf STDERR "# no commonName %s for clade %s\n", commify($noCommonName), $clade;
+  printf STDERR "# suppressed %s for clade %s\n", commify($suppressedCount), $clade;
 }	#	foreach my $clade (@clades)
 
+printf STDERR "# output %s genArk or RR assemblies\n", commify($outputGenArkRR);
+
 ##########################################################################
 ## single table is finished, output the end of tbody and the tfoot row
 ##########################################################################
 if ($asmCountInTable > 1) {
   my $crCount = 0;
   my $enCount = 0;
   my $vuCount = 0;
   foreach my $statId (keys %statusCounts) {
     $crCount = $statusCounts{$statId} if ($statId eq "CR");
     $enCount = $statusCounts{$statId} if ($statId eq "EN");
     $vuCount = $statusCounts{$statId} if ($statId eq "VU");
   }
   my $sciNameTotal = 0;
   foreach my $c (@clades) {
     my $csnPtr = $cladeSciNameCounts{$c};
@@ -1302,30 +1370,53 @@
 if (-s "../accumulateMetaInfo.tsv" ) {
   open (FH, ">>../accumulateMetaInfo.tsv") or die "can not append to ../accumulateMetaInfo.tsv";
 } else {
   open (FH, ">../accumulateMetaInfo.tsv") or die "can not write to ../accumulateMetaInfo.tsv";
 }
 foreach my $asmId (sort keys %newMetaInfo) {
   printf FH "%s\t%s\n", $asmId, $newMetaInfo{$asmId};
   printf STDERR "# new metaData for %s\n", $asmId;
   ++$newMetaInfoCount;
 }
 close (FH);
 } else {
   printf STDERR "# no new metaInfo\n";
 }
 
+my $beenThere = 0;
+printf STDERR "# checking missing genArk assemblies in output\n";
+foreach my $gAsmId (sort keys %genArkAsm) {
+  ++$beenThere if ($genArkClade{$gAsmId} !~ m/viral|bacteria/);
+  if (!defined($outputGenArk{$gAsmId})) {
+    printf STDERR "# not output genArk %s\t%s\n", $gAsmId, $genArkClade{$gAsmId}
+ if ($genArkClade{$gAsmId} !~ m/viral|bacteria/);
+  }
+}
+printf STDERR "# there were %s GenArk potential to output\n", commify($beenThere);
+
+$beenThere = 0;
+
+printf STDERR "# checking missing RR assemblies in output\n";
+
+foreach my $rrAsmId (sort keys %rrGcaGcfList) {
+  ++$beenThere;
+  if (!defined($outputRR{$rrAsmId})) {
+   printf STDERR "# not output RR %s\t%s\n", $rrAsmId, $rrGcaGcfList{$rrAsmId};
+  }
+}
+printf STDERR "# there were %s RR potential to output\n", commify($beenThere);
+
 printf STDERR "# done at exit\n";
 exit 0;
 
 __END__
 
 my $count = 0;
 foreach my $asmId (sort keys %gcxCountry) {
   next if (defined ($asmSuppressed{$asmId}));
   next if (! defined($gcxCountry{$asmId}));
   next if (! defined($sciName{$asmId}));
   next if ("n/a" eq $gcxCountry{$asmId});
 ##########################################################################
 # found discussion of google image search URL at:
 # https://stackoverflow.com/questions/21530274/format-for-a-url-that-goes-to-google-image-search
 # did not work: