d4fdeadb1c734b4bf4d9885f462fa93c79df55b7
hiram
Mon Feb 21 16:02:14 2022 -0800
add tooltip doc on column headers refs #28930
diff --git src/hg/gar/garTable.pl src/hg/gar/garTable.pl
index 1db498f..c5f7856 100755
--- src/hg/gar/garTable.pl
+++ src/hg/gar/garTable.pl
@@ -71,53 +71,57 @@
return sprintf("%d", $n) if ($n < 1000);
my $m = $n/1000;
return sprintf("%.2fK", $m) if ($m < 1000);
$m = $n/1000000;
return sprintf("%.2fM", $m) if ($m < 1000);
$m = $n/1000000000;
return sprintf("%.3fG", $m);
}
###############################################################################
###############################################################################
# output a table cell for an N50 measurement
#
# Arguments (three scalars, per the ($$$) prototype):
#   $size  - the N50 size value; a value <= 0 is treated as "no measurement"
#   $count - the count that goes with that N50 size
#   $fh    - an open file handle receiving the tab-separated text copy
#            of the cell (clade.tableData.txt)
#
# Each call emits the cell twice: once via printf to the default output
# handle for the page, and once to $fh as a tab-prefixed field.
# No explicit return value is used.
sub n50Cell($$$) {
my ($size, $count, $fh) = @_;
if ($size > 0) {
# page output: raw size, the gmk() abbreviated (K/M/G) form of the size,
# and the count (passed through commify() in the newer line;
# presumably adds thousands separators - confirm against commify)
- printf "
%s (%s) | ", $size, gmk($size), $count;
+ printf "%s (%s) | ", $size, gmk($size), commify($count);
printf $fh "\t%d (%d)", $size, $count; # output to clade.tableData.txt
} else {
# no measurement available: emit an empty cell on the page and an
# explicit n/a placeholder in the text file so the column is still present
printf " | ";
printf $fh "\tn/a (n/a)"; # output to clade.tableData.txt
}
}
###############################################################################
my @clades = qw( primates mammals birds fish vertebrate invertebrates plants fungi );
# my @clades = qw( primates mammals birds );
# to help weed out some of the noise
# key is clade, value is minimal size to count as a whole genome
# (one entry for each name in @clades; the value is compared against the
# asmSize field from metaInfo, and an assembly whose asmSize is below its
# clade's value is reported on STDERR as underSized)
+# these are deliberately pretty low, to allow in some alternate haplotype
+# assemblies that don't seem to be the whole assembly.
+# The assemblies are also filtered by NCBI status 'full/partial' to only
+# allow in the 'full' genomes, meaning representation of the whole genome.
my %minimalGenomeSize = (
primates => 1000000000,
- mammals => 200000000,
+ mammals => 20000000,
birds => 200000000,
- fish => 1000000,
- vertebrate => 400000000,
- invertebrates => 10000000,
- plants => 10000000,
- fungi => 1000000,
+ fish => 100000,
+ vertebrate => 4000000,
+ invertebrates => 10000,
+ plants => 100000,
+ fungi => 50000,
);
#########################################################################
## read in list of current GenArk assemblies
my %genArkAsm; # key is asmId, value is string with:
# accession, assembly, scientific name, common name, taxonId
my $genArkCount = 0;
printf STDERR "# reading UCSC_GI.assemblyHubList.txt\n";
open (FH, ") {
next if ($line =~ m/^#/);
chomp $line;
@@ -548,43 +552,39 @@
$asmId =~ s/\//_/g;
$asmId =~ s/\#/_/g;
# $asmId =~ s/[.:%+/#]/_/g;
$asmId =~ s/[()]//g;
$asmId =~ s/__/_/g;
++$shouldBeGenArk if (defined($genArkAsm{$asmId}));
++$shouldBeUcsc if (defined($rrGcaGcfList{$asmId}));
next if (defined ($skipPartialGenome{$asmId}));
next if (defined ($asmSuppressed{$asmId}));
next if (defined ($alreadyDone{$asmId}));
# something wrong with these two
# GCA_900609255.1_Draft_mitochondrial_genome_of_wild_rice_W1683
# GCA_900609265.1_Draft_mitochondrial_genome_of_wild_rice_W1679
next if ($asmId =~ m/GCA_900609255.1|GCA_900609265.1/);
# verify this asmId will pass the asmSize limit
-### if (defined($metaInfo{$asmId})) {
-### my ($asmSize, $asmContigCount, $n50Size, $n50Count, $n50ContigSize, $n50ContigCount, $n50ScaffoldSize, $n50ScaffoldCount) = split('\t', $metaInfo{$asmId});
-### next if ($asmSize < $minimalGenomeSize{$clade}); # too small
-### }
if (defined($metaInfo{$asmId})) {
my ($asmSize, $asmContigCount, $n50Size, $n50Count, $n50ContigSize, $n50ContigCount, $n50ScaffoldSize, $n50ScaffoldCount) = split('\t', $metaInfo{$asmId});
# if asmSize is below the minimum, don't use it
if ($asmSize < $minimalGenomeSize{$clade}) {
printf STDERR "# %s underSized 0 %d %s %s < %s\n", $clade, ++$underSized{$clade}, $asmId, commify($asmSize), commify($minimalGenomeSize{$clade});
printf STDERR "# ACK would be genArk assembly %s\n", $asmId if (defined($genArkAsm{$asmId}));
printf STDERR "# ACK would be UCSC RR %s\n", $asmId if (defined($rrGcaGcfList{$asmId}));
printf STDERR "# ACK metaInfo: %s '%s'\n", $asmId, $metaInfo{$asmId};
-### XXX next;
+ next;
}
}
$alreadyDone{$asmId} = 1;
if (defined($genArkClade{$asmId})) {
die "ERROR: duplicate asmId today $asmId '$clade' '$genArkClade{$asmId}'";
}
++$checkedAsmIds;
my $iucnSciName = "";
if (defined($sciNames{$asmId})) {
++$ncbiSpeciesRecorded{$sciNames{$asmId}};
next if ($ncbiSpeciesRecorded{$sciNames{$asmId}} > $sciNameDisplayLimit);
$iucnSciName = $sciNames{$asmId};
$iucnSciName = $ncbiToIucnNames{$sciNames{$asmId}} if (defined($ncbiToIucnNames{$sciNames{$asmId}}));
++$iucnSpeciesRecorded{$iucnSciName};
} else {
@@ -611,31 +611,31 @@
}
if ($assembliesAvailable > 1) {
my $bPtr = $sciNameAsmList{$sciNames{$asmId}};
foreach my $aId (@$bPtr) {
next if (defined ($alreadyDone{$aId}));
$alreadyDone{$aId} = 1;
if ($aId ne $asmId) {
if (defined($metaInfo{$aId})) {
my ($asmSize, $asmContigCount, $n50Size, $n50Count, $n50ContigSize, $n50ContigCount, $n50ScaffoldSize, $n50ScaffoldCount) = split('\t', $metaInfo{$aId});
# if asmSize is below the minimum, don't use it
if ($asmSize < $minimalGenomeSize{$clade}) {
printf STDERR "# %s underSized 1 %d %s %s < %s\n", $clade, ++$underSized{$clade}, $aId, commify($asmSize), commify($minimalGenomeSize{$clade});
printf STDERR "# ACK would be genArk assembly %s\n", $aId if (defined($genArkAsm{$aId}));
printf STDERR "# ACK would be UCSC RR %s\n", $aId if (defined($rrGcaGcfList{$aId}));
printf STDERR "# ACK metaInfo: %s '%s'\n", $aId, $metaInfo{$aId};
-### XXX next;
+ next;
}
}
++$ncbiSpeciesRecorded{$sciNames{$aId}};
# the defined($sciName{$aId}) indicates it is a GenArk genome
# always accept those even if it goes beyond the limit
if ( ($ncbiSpeciesRecorded{$sciNames{$aId}} <= $sciNameDisplayLimit) || defined($sciName{$aId}) ) {
push (@$cPtr, $aId);
++$acceptedAsmIds;
++$goodToGoCount;
++$cladeCounts{$clade};
} # under limit count or is GenArk assembly
} # if ($aId ne $asmId)
} # foreach my $aId (@$bPtr)
} # if ($assembliesAvailable > 1)
} # if (defined($sciNames{$asmId}))
@@ -699,49 +699,30 @@
printf "top of page | \n";
printf "\n";
}
} # if ( 1 == 0 )
# count all assemblies in all clades
my $totalAssemblies = 0;
foreach my $c (@clades) {
$totalAssemblies += $cladeCounts{$c};
}
printf "\n";
printf "\n";
printf "
\n";
-
-printf "\n";
-
-
printf "