32bd9b7bea2ca6154383f124a803fdd6ad04c77b hiram Sat Feb 29 16:41:43 2020 -0800 add allGaps column and use featureBits result when present and zero when track is empty refs #23891 diff --git src/hg/makeDb/doc/asmHubs/trackData.pl src/hg/makeDb/doc/asmHubs/trackData.pl index 074214a..326e842 100755 --- src/hg/makeDb/doc/asmHubs/trackData.pl +++ src/hg/makeDb/doc/asmHubs/trackData.pl @@ -27,51 +27,71 @@ my $asmCount = 0; # count of assemblies completed and in the table my $overallNucleotides = 0; my $overallSeqCount = 0; my $overallGapSize = 0; my $overallGapCount = 0; ############################################################################## # from Perl Cookbook Recipe 2.17, print out large numbers with comma delimiters: ############################################################################## sub commify($) { my $text = reverse $_[0]; $text =~ s/(\d\d\d)(?=\d)(?!\d*\.)/$1,/g; return scalar reverse $text } -sub oneTrackData($$) { - my ($file, $genomeSize) = @_; +# ($itemCount, $percentCover) = oneTrackData($trackFile, $sizeNoGaps, $trackFb); +# might have a track feature bits file (trackFb), maybe not +sub oneTrackData($$$$$$) { + my ($asmId, $trackName, $file, $genomeSize, $trackFb, $runDir) = @_; # printf STDERR "# %s\n", $file; my $itemCount = 0; my $percentCover = 0; if (! -s "${file}") { + if ($trackName eq "gapOverlap") { + if (-s "${runDir}/$asmId.gapOverlap.bed.gz" ) { + my $lineCount=`zcat "${runDir}/$asmId.gapOverlap.bed.gz" | head | wc -l`; + chomp $lineCount; + if (0 == $lineCount) { + return("0", "0 %"); + } else { return("n/a", "n/a"); } + } + } else { + return("n/a", "n/a"); + } + } if ($file =~ m/.bw$/) { my $bigWigInfo = `bigWigInfo "$file" | egrep "basesCovered:|mean:" | awk '{print \$NF}' | xargs echo | sed -e 's/,//g;'`; chomp $bigWigInfo; my ($bases, $mean) = split('\s+', $bigWigInfo); $percentCover = sprintf("%.2f %%", 100.0 * $bases / $genomeSize); $itemCount = sprintf ("%.2f", $mean); # printf STDERR "# bigWigInfo %s %s %s\n", $itemCount, $percentCover, $file; } else { my $bigBedInfo = `bigBedInfo "$file" | egrep "itemCount:|basesCovered:" | awk '{print \$NF}' | xargs echo | sed -e 's/,//g;'`; chomp $bigBedInfo; my ($items, $bases) = split('\s', $bigBedInfo); $itemCount = commify($items); $percentCover = sprintf("%.2f %%", 100.0 * $bases / $genomeSize); +# 56992654 bases of 2616369673 (2.178%) in intersection + if ( -s "${trackFb}" ) { +printf STDERR "# $trackFb\n"; + my ($itemBases, undef, undef, $noGapSize, undef) = split('\s+', `cat $trackFb`, 5); + $percentCover = sprintf("%.2f %%", 100.0 * $itemBases / $noGapSize); + } # printf STDERR "# bigBedInfo %s %s %s\n", $itemCount, $percentCover, $file; } return ($itemCount, $percentCover); } ############################################################################## ### start the HTML output ############################################################################## sub startHtml() { my $timeStamp = `date "+%F"`; chomp $timeStamp; my $subSetMessage = "subset of $asmHubName only"; if ($asmHubName eq "vertebrate") { @@ -98,40 +118,41 @@ the genome browser.
The numbers are: item count (percent coverage)
Except for the gc5Base column which is: overall GC % average (percent coverage) END } ############################################################################## ### start the table output ############################################################################## sub startTable() { print <<"END" - - + + + - - - + + + - + END } ############################################################################## ### end the table output ############################################################################## sub endTable() { my $commaNuc = commify($overallNucleotides); my $commaSeqCount = commify($overallSeqCount); my $commaGapSize = commify($overallGapSize); @@ -181,96 +202,97 @@ print <<"END" END } sub asmCounts($) { my ($chromSizes) = @_; my ($sequenceCount, $totalSize) = split('\s+', `ave -col=2 $chromSizes | egrep "^count|^total" | awk '{printf "%d\\n", \$NF}' | xargs echo`); return ($sequenceCount, $totalSize); } -# my ($gapSize) = maskStats($faSizeTxt); +# my ($gapSize, $maskPerCent, $sizeNoGaps) = maskStats($faSizeTxt); sub maskStats($) { my ($faSizeFile) = @_; + my $sizeNoGaps = `grep 'sequences in 1 file' $faSizeFile | awk '{print \$4}'`; my $gapSize = `grep 'sequences in 1 file' $faSizeFile | awk '{print \$3}'`; chomp $gapSize; $gapSize =~ s/\(//; my $totalBases = `grep 'sequences in 1 file' $faSizeFile | awk '{print \$1}'`; chomp $totalBases; my $maskedBases = `grep 'sequences in 1 file' $faSizeFile | awk '{print \$9}'`; chomp $maskedBases; my $maskPerCent = 100.0 * $maskedBases / $totalBases; - return ($gapSize, $maskPerCent); + return ($gapSize, $maskPerCent, $sizeNoGaps); } # grep "sequences in 1 file" GCA_900324465.2_fAnaTes1.2.faSize.txt # 555641398 bases (3606496 N's 552034902 real 433510637 upper 118524265 lower) in 50 sequences in 1 files sub gapStats($$) { my ($buildDir, $asmId) = @_; my $gapBed = "$buildDir/trackData/allGaps/$asmId.allGaps.bed.gz"; my $gapCount = 0; if ( -s "$gapBed" ) { $gapCount = `zcat $gapBed | awk '{print \$3-\$2}' | ave stdin | grep '^count' | awk '{print \$2}'`; } chomp $gapCount; return ($gapCount); } ############################################################################## ### tableContents() ############################################################################## sub tableContents() { - my @trackList = qw(gc5Base gap assembly rmsk simpleRepeat windowMasker gapOverlap tandemDups cpgIslandExtUnmasked cpgIslandExt ncbiGene ncbiRefSeq xenoRefGene augustus); + my @trackList = qw(gc5Base gap allGaps assembly rmsk simpleRepeat windowMasker gapOverlap tandemDups cpgIslandExtUnmasked cpgIslandExt ncbiGene ncbiRefSeq xenoRefGene augustus); foreach my $asmId (reverse(@orderList)) { my ($gcPrefix, $asmAcc, $asmName) = split('_', $asmId, 3); my $accessionId = sprintf("%s_%s", $gcPrefix, $asmAcc); my $accessionDir = substr($asmId, 0 ,3); $accessionDir .= "/" . substr($asmId, 4 ,3); $accessionDir .= "/" . substr($asmId, 7 ,3); $accessionDir .= "/" . substr($asmId, 10 ,3); my $buildDir = "/hive/data/genomes/asmHubs/refseqBuild/$accessionDir/$asmId"; # next if ($asmId ne "GCF_000001405.39_GRCh38.p13"); my $asmReport="$buildDir/download/${asmId}_assembly_report.txt"; if (! -s "$asmReport") { printf STDERR "# no assembly report:\n# %s\n", $asmReport; next; } my $chromSizes = "${buildDir}/${asmId}.chrom.sizes"; my $twoBit = "${buildDir}/trackData/addMask/${asmId}.masked.2bit"; if (! -s "$twoBit") { printf STDERR "# no 2bit file:\n# %s\n", $twoBit; printf "\n", ++$asmCount; printf "\n", $accessionId; - printf "\n"; + printf "\n"; printf "\n"; next; } my $faSizeTxt = "${buildDir}/${asmId}.faSize.txt"; if ( ! -s "$faSizeTxt" ) { printf STDERR "twoBitToFa $twoBit stdout | faSize stdin > $faSizeTxt\n"; print `twoBitToFa $twoBit stdout | faSize stdin > $faSizeTxt`; } - my ($gapSize, $maskPerCent) = maskStats($faSizeTxt); + my ($gapSize, $maskPerCent, $sizeNoGaps) = maskStats($faSizeTxt); $overallGapSize += $gapSize; my ($seqCount, $totalSize) = asmCounts($chromSizes); $overallSeqCount += $seqCount; # my $totalSize=`ave -col=2 $chromSizes | grep "^total" | awk '{printf "%d", \$NF}'`; $overallNucleotides += $totalSize; my $gapCount = gapStats($buildDir, $asmId); $overallGapCount += $gapCount; my $sciName = "notFound"; my $commonName = "notFound"; my $asmDate = "notFound"; my $itemsFound = 0; open (FH, "<$asmReport") or die "can not read $asmReport"; while (my $line = ) { last if ($itemsFound > 5); chomp $line; @@ -289,60 +311,63 @@ $sciName = $line; $commonName =~ s/.*$//; $commonName =~ s/$//; $commonName = $betterName{$asmId} if (exists($betterName{$asmId})); $sciName =~ s/.*:\s+//; $sciName =~ s/\s+\(.*//; } } } close (FH); my $hubUrl = "https://hgdownload.soe.ucsc.edu/hubs/$accessionDir/$accessionId"; printf "\n", ++$asmCount; printf "\n", $accessionId, $commonName, $accessionId; foreach my $track (@trackList) { my $trackFile = "$buildDir/bbi/$asmId.$track"; + my $trackFb = "$buildDir/trackData/$track/fb.$asmId.$track.txt"; + my $runDir = "$buildDir/trackData/$track"; my ($itemCount, $percentCover); if ( "$track" eq "gc5Base" ) { $trackFile .= ".bw"; } else { $trackFile .= ".bb"; } if ( "$track" eq "rmsk") { my $rmskStats = "$buildDir/trackData/repeatMasker/$asmId.rmsk.stats"; if (! -s "${rmskStats}") { my $faOut = "$buildDir/trackData/repeatMasker/$asmId.sorted.fa.out.gz"; if ( -s "$faOut") { my $items = `zcat "$faOut" | wc -l`; chomp $items; $itemCount = commify($items); my $masked = `grep masked "$buildDir/trackData/repeatMasker/faSize.rmsk.txt" | awk '{print \$4}' | sed -e 's/%//;'`; chomp $masked; $percentCover = sprintf("%.2f %%", $masked); open (RS, ">$rmskStats") or die "can now write to $rmskStats"; printf RS "%s\t%s\n", $itemCount, $percentCover; close (RS); } else { $itemCount = "n/a"; $percentCover = "n/a"; } } else { ($itemCount, $percentCover) = split('\s+', `cat $rmskStats`); + $percentCover = sprintf("%.2f %%", $percentCover); chomp $percentCover; } } else { - ($itemCount, $percentCover) = oneTrackData($trackFile, $totalSize); + ($itemCount, $percentCover) = oneTrackData($asmId, $track, $trackFile, $totalSize, $trackFb, $runDir); } printf " \n", $itemCount, $percentCover; } printf "\n"; } } ############################################################################## ### main() ############################################################################## open (FH, "<$toolsDir/${commonNameOrder}") or die "can not read ${commonNameOrder}"; while (my $line = ) { next if ($line =~ m/^#/); chomp $line;

count	common name link to genome browser	gc5 base	gap	assembly	AGP gap	all gaps	assembly sequences	rmsk	TRF simpleRepeat	windowMasker	gapOverlap	tandemDups	window Masker	gap Overlap	tandem Dups	cpg unmasked	cpg island	ncbiGene	genes ncbi	ncbiRefSeq	xenoRefGene	augustus
%d	%s	missing masked 2bit file														missing masked 2bit file
%d	%s %s	%s (%s)