#!/usr/bin/env perl

# Source: src/hg/makeDb/doc/asmHubs/trackData.pl (index 074214a..326e842)
# as of commit 32bd9b7bea2ca6154383f124a803fdd6ad04c77b
#   hiram Sat Feb 29 16:41:43 2020 -0800
#   add allGaps column and use featureBits result when present and zero when
#   track is empty refs #23891

use strict;
use warnings;
use File::stat;

my $argc = scalar(@ARGV);
if ($argc != 2) {
  printf STDERR "usage: trackData.pl Name asmHubName > trackData.html\n";
  printf STDERR "e.g.: trackData.pl Mammals mammals > trackData.html\n";
  exit 255;
}
my $Name = shift;
my $asmHubName = shift;

my $home = $ENV{'HOME'};
my $toolsDir = "$home/kent/src/hg/makeDb/doc/asmHubs";

my $commonNameList = "$asmHubName.asmId.commonName.tsv";
my $commonNameOrder = "$asmHubName.commonName.asmId.orderList.tsv";
my @orderList;	# asmId of the assemblies in order from the *.list files
# the order to read the different .list files:
my %betterName;	# key is asmId, value is better common name than found in
		# assembly_report

my $assemblyTotal = 0;	# complete list of assemblies in this group
my $asmCount = 0;	# count of assemblies completed and in the table
my $overallNucleotides = 0;
my $overallSeqCount = 0;
my $overallGapSize = 0;
my $overallGapCount = 0;

##############################################################################
# from Perl Cookbook Recipe 2.17, print out large numbers with comma delimiters:
##############################################################################
sub commify($) {
    my $text = reverse $_[0];
    $text =~ s/(\d\d\d)(?=\d)(?!\d*\.)/$1,/g;
    return scalar reverse $text
}

##############################################################################
# $cell = _percentCover($bases, $total);
# Format 100 * bases / total as "NN.NN %" for a table cell.
# Returns "n/a" instead of dying when either value is missing or the
# total is zero (perl raises 'Illegal division by zero' otherwise).
##############################################################################
sub _percentCover($$) {
  my ($bases, $total) = @_;
  return "n/a" if (! defined($bases) || ! defined($total) || 0 == $total);
  return sprintf("%.2f %%", 100.0 * $bases / $total);
}

##############################################################################
# ($itemCount, $percentCover) =
#    oneTrackData($asmId, $trackName, $file, $genomeSize, $trackFb, $runDir);
#
#   $asmId      - assembly identifier, used to name the gapOverlap bed.gz file
#   $trackName  - track name, only "gapOverlap" receives special handling
#   $file       - bigWig (name ends in .bw) or bigBed file for the track
#   $genomeSize - denominator for the percent covered calculation
#   $trackFb    - featureBits result file for this track, may be absent;
#                 expected content looks like:
#                 56992654 bases of 2616369673 (2.178%) in intersection
#   $runDir     - directory where $asmId.gapOverlap.bed.gz is looked for
#
# Returns a two element list of display strings:
#   bigWig: (mean value "%.2f", percent covered)
#   bigBed: (comma delimited item count, percent covered), the percent is
#           taken from the featureBits file when that file is usable
#   ("0", "0 %")     when the gapOverlap bed.gz exists but has no lines
#   ("n/a", "n/a")   when there is no data to report
##############################################################################
sub oneTrackData($$$$$$) {
  my ($asmId, $trackName, $file, $genomeSize, $trackFb, $runDir) = @_;

  if (! -s "${file}") {
    # every path through here returns, nothing may fall through to the
    # bigWigInfo/bigBedInfo commands below with a missing or empty file
    return ("n/a", "n/a") if ($trackName ne "gapOverlap");
    my $bedGz = "${runDir}/$asmId.gapOverlap.bed.gz";
    return ("n/a", "n/a") if (! -s $bedGz);
    # a gzip file with no lines in it still has a non-zero file size,
    # need to count the actual lines to recognize an empty track
    my $lineCount = `zcat "$bedGz" | head | wc -l`;
    chomp $lineCount;
    return ("0", "0 %") if ($lineCount =~ m/^\s*0\s*$/);
    return ("n/a", "n/a");
  }

  if ($file =~ m/\.bw$/) {
    my $bigWigInfo = `bigWigInfo "$file" | egrep "basesCovered:|mean:" | awk '{print \$NF}' | xargs echo | sed -e 's/,//g;'`;
    chomp $bigWigInfo;
    my ($bases, $mean) = split(' ', $bigWigInfo);
    # no output when the bigWigInfo command has failed
    return ("n/a", "n/a") if (! defined($bases) || ! defined($mean));
    return (sprintf("%.2f", $mean), _percentCover($bases, $genomeSize));
  }

  my $bigBedInfo = `bigBedInfo "$file" | egrep "itemCount:|basesCovered:" | awk '{print \$NF}' | xargs echo | sed -e 's/,//g;'`;
  chomp $bigBedInfo;
  my ($items, $bases) = split(' ', $bigBedInfo);
  # no output when the bigBedInfo command has failed
  return ("n/a", "n/a") if (! defined($items) || ! defined($bases));
  my $itemCount = commify($items);
  my $percentCover = _percentCover($bases, $genomeSize);

  # prefer the featureBits measurement when it is available:
  # 56992654 bases of 2616369673 (2.178%) in intersection
  if (defined($trackFb) && -s "${trackFb}") {
    printf STDERR "# %s\n", $trackFb;
    if (open(my $fh, '<', $trackFb)) {
      local $/ = undef;		# read the entire file contents
      my $fbText = <$fh>;
      close($fh);
      my ($itemBases, undef, undef, $noGapSize, undef) =
	split(' ', defined($fbText) ? $fbText : "", 5);
      # only trust the file when both counts are plain integers and the
      # denominator is usable, otherwise keep the bigBedInfo percentage
      if (defined($itemBases) && defined($noGapSize)
	  && $itemBases =~ m/^\d+$/ && $noGapSize =~ m/^\d+$/
	  && $noGapSize > 0) {
        $percentCover = sprintf("%.2f %%", 100.0 * $itemBases / $noGapSize);
      }
    } else {
      printf STDERR "# WARNING: can not read %s: %s\n", $trackFb, $!;
    }
  }
  return ($itemCount, $percentCover);
}

##############################################################################
### start the HTML output
##############################################################################
# NOTE: the text of this here-document continues on the lines that follow
sub startHtml() {

my $timeStamp = `date "+%F"`;
chomp $timeStamp;

my $subSetMessage = "subset of $asmHubName only";
if ($asmHubName eq "vertebrate") {
   $subSetMessage = "subset of other ${asmHubName}s only";
}

print <<"END"
Assemblies from NCBI/Genbank/Refseq sources, $subSetMessage.
count | common name link to genome browser |
gc5 base | -gap | -assembly | +AGP gap |
+ all gaps |
+ assembly sequences |
rmsk | TRF simpleRepeat |
- windowMasker | -gapOverlap | -tandemDups | +window Masker |
+ gap Overlap |
+ tandem Dups |
cpg unmasked |
cpg island |
- ncbiGene | +genes ncbi |
ncbiRefSeq | xenoRefGene | augustus |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
TOTALS: | total assembly count ${assemblyTotal} |
\nOther assembly hubs available:
\n
Primates | \n" if ($asmHubName ne "primates"); printf "Mammals | \n" if ($asmHubName ne "mammals"); printf "Birds | \n" if ($asmHubName ne "birds"); printf "Fish | \n" if ($asmHubName ne "fish"); printf "other vertebrates | \n" if ($asmHubName ne "vertebrate"); printf "
---|