32bd9b7bea2ca6154383f124a803fdd6ad04c77b hiram Sat Feb 29 16:41:43 2020 -0800 add allGaps column and use featureBits result when present and zero when track is empty refs #23891 diff --git src/hg/makeDb/doc/asmHubs/trackData.pl src/hg/makeDb/doc/asmHubs/trackData.pl index 074214a..326e842 100755 --- src/hg/makeDb/doc/asmHubs/trackData.pl +++ src/hg/makeDb/doc/asmHubs/trackData.pl @@ -1,360 +1,385 @@ #!/usr/bin/env perl use strict; use warnings; use File::stat; my $argc = scalar(@ARGV); if ($argc != 2) { printf STDERR "usage: trackData.pl Name asmHubName > trackData.html\n"; printf STDERR "e.g.: trackData.pl Mammals mammals > trackData.html\n"; exit 255; } my $Name = shift; my $asmHubName = shift; my $home = $ENV{'HOME'}; my $toolsDir = "$home/kent/src/hg/makeDb/doc/asmHubs"; my $commonNameList = "$asmHubName.asmId.commonName.tsv"; my $commonNameOrder = "$asmHubName.commonName.asmId.orderList.tsv"; my @orderList; # asmId of the assemblies in order from the *.list files # the order to read the different .list files: my %betterName; # key is asmId, value is better common name than found in # assembly_report my $assemblyTotal = 0; # complete list of assemblies in this group my $asmCount = 0; # count of assemblies completed and in the table my $overallNucleotides = 0; my $overallSeqCount = 0; my $overallGapSize = 0; my $overallGapCount = 0; ############################################################################## # from Perl Cookbook Recipe 2.17, print out large numbers with comma delimiters: ############################################################################## sub commify($) { my $text = reverse $_[0]; $text =~ s/(\d\d\d)(?=\d)(?!\d*\.)/$1,/g; return scalar reverse $text } -sub oneTrackData($$) { - my ($file, $genomeSize) = @_; +# ($itemCount, $percentCover) = oneTrackData($trackFile, $sizeNoGaps, $trackFb); +# might have a track feature bits file (trackFb), maybe not +sub oneTrackData($$$$$$) { + my ($asmId, $trackName, $file, $genomeSize, $trackFb, $runDir) = @_; # printf STDERR "# %s\n", $file; my $itemCount = 0; my $percentCover = 0; if (! -s "${file}") { + if ($trackName eq "gapOverlap") { + if (-s "${runDir}/$asmId.gapOverlap.bed.gz" ) { + my $lineCount=`zcat "${runDir}/$asmId.gapOverlap.bed.gz" | head | wc -l`; + chomp $lineCount; + if (0 == $lineCount) { + return("0", "0 %"); + } else { return("n/a", "n/a"); } + } + } else { + return("n/a", "n/a"); + } + } if ($file =~ m/.bw$/) { my $bigWigInfo = `bigWigInfo "$file" | egrep "basesCovered:|mean:" | awk '{print \$NF}' | xargs echo | sed -e 's/,//g;'`; chomp $bigWigInfo; my ($bases, $mean) = split('\s+', $bigWigInfo); $percentCover = sprintf("%.2f %%", 100.0 * $bases / $genomeSize); $itemCount = sprintf ("%.2f", $mean); # printf STDERR "# bigWigInfo %s %s %s\n", $itemCount, $percentCover, $file; } else { my $bigBedInfo = `bigBedInfo "$file" | egrep "itemCount:|basesCovered:" | awk '{print \$NF}' | xargs echo | sed -e 's/,//g;'`; chomp $bigBedInfo; my ($items, $bases) = split('\s', $bigBedInfo); $itemCount = commify($items); $percentCover = sprintf("%.2f %%", 100.0 * $bases / $genomeSize); +# 56992654 bases of 2616369673 (2.178%) in intersection + if ( -s "${trackFb}" ) { +printf STDERR "# $trackFb\n"; + my ($itemBases, undef, undef, $noGapSize, undef) = split('\s+', `cat $trackFb`, 5); + $percentCover = sprintf("%.2f %%", 100.0 * $itemBases / $noGapSize); + } # printf STDERR "# bigBedInfo %s %s %s\n", $itemCount, $percentCover, $file; } return ($itemCount, $percentCover); } ############################################################################## ### start the HTML output ############################################################################## sub startHtml() { my $timeStamp = `date "+%F"`; chomp $timeStamp; my $subSetMessage = "subset of $asmHubName only"; if ($asmHubName eq "vertebrate") { $subSetMessage = "subset of other ${asmHubName}s only"; } print <<"END"

$Name Genomes assembly hubs

Assemblies from NCBI/Genbank/Refseq sources, $subSetMessage.

See also: hub access


Data resource links

NOTE: Click on the column headers to sort the table by that column
The link to genome browser will attach only that single assembly to the genome browser.
The numbers are: item count (percent coverage)
Except for the gc5Base column which is: overall GC % average (percent coverage) END } ############################################################################## ### start the table output ############################################################################## sub startTable() { print <<"END" - - + + + - - - + + + - + END } ############################################################################## ### end the table output ############################################################################## sub endTable() { my $commaNuc = commify($overallNucleotides); my $commaSeqCount = commify($overallSeqCount); my $commaGapSize = commify($overallGapSize); my $commaGapCount = commify($overallGapCount); my $percentDone = 100.0 * $asmCount / $assemblyTotal; if ($assemblyTotal > 1) { print <<"END"
count common name
link to genome browser
gc5 basegapassemblyAGP
gap
all
gaps
assembly
sequences
rmsk TRF
simpleRepeat
windowMaskergapOverlaptandemDupswindow
Masker
gap
Overlap
tandem
Dups
cpg
unmasked
cpg
island
ncbiGenegenes
ncbi
ncbiRefSeq xenoRefGene augustus
TOTALS:total assembly count ${assemblyTotal}
END } else { print <<"END" END } } # sub endTable() ############################################################################## ### end the HTML output ############################################################################## sub endHtml() { if ($asmHubName ne "viral") { printf "

\nOther assembly hubs available:
\n\n"; printf "\n" if ($asmHubName ne "primates"); printf "\n" if ($asmHubName ne "mammals"); printf "\n" if ($asmHubName ne "birds"); printf "\n" if ($asmHubName ne "fish"); printf "\n" if ($asmHubName ne "vertebrate"); printf "\n
PrimatesMammalsBirdsFishother vertebrates
\n

\n"; } print <<"END" END } sub asmCounts($) { my ($chromSizes) = @_; my ($sequenceCount, $totalSize) = split('\s+', `ave -col=2 $chromSizes | egrep "^count|^total" | awk '{printf "%d\\n", \$NF}' | xargs echo`); return ($sequenceCount, $totalSize); } -# my ($gapSize) = maskStats($faSizeTxt); +# my ($gapSize, $maskPerCent, $sizeNoGaps) = maskStats($faSizeTxt); sub maskStats($) { my ($faSizeFile) = @_; + my $sizeNoGaps = `grep 'sequences in 1 file' $faSizeFile | awk '{print \$4}'`; my $gapSize = `grep 'sequences in 1 file' $faSizeFile | awk '{print \$3}'`; chomp $gapSize; $gapSize =~ s/\(//; my $totalBases = `grep 'sequences in 1 file' $faSizeFile | awk '{print \$1}'`; chomp $totalBases; my $maskedBases = `grep 'sequences in 1 file' $faSizeFile | awk '{print \$9}'`; chomp $maskedBases; my $maskPerCent = 100.0 * $maskedBases / $totalBases; - return ($gapSize, $maskPerCent); + return ($gapSize, $maskPerCent, $sizeNoGaps); } # grep "sequences in 1 file" GCA_900324465.2_fAnaTes1.2.faSize.txt # 555641398 bases (3606496 N's 552034902 real 433510637 upper 118524265 lower) in 50 sequences in 1 files sub gapStats($$) { my ($buildDir, $asmId) = @_; my $gapBed = "$buildDir/trackData/allGaps/$asmId.allGaps.bed.gz"; my $gapCount = 0; if ( -s "$gapBed" ) { $gapCount = `zcat $gapBed | awk '{print \$3-\$2}' | ave stdin | grep '^count' | awk '{print \$2}'`; } chomp $gapCount; return ($gapCount); } ############################################################################## ### tableContents() ############################################################################## sub tableContents() { - my @trackList = qw(gc5Base gap assembly rmsk simpleRepeat windowMasker gapOverlap tandemDups cpgIslandExtUnmasked cpgIslandExt ncbiGene ncbiRefSeq xenoRefGene augustus); + my @trackList = qw(gc5Base gap allGaps assembly rmsk simpleRepeat windowMasker gapOverlap tandemDups cpgIslandExtUnmasked cpgIslandExt ncbiGene ncbiRefSeq xenoRefGene augustus); foreach my $asmId (reverse(@orderList)) { my ($gcPrefix, $asmAcc, $asmName) = split('_', $asmId, 3); my $accessionId = sprintf("%s_%s", $gcPrefix, $asmAcc); my $accessionDir = substr($asmId, 0 ,3); $accessionDir .= "/" . substr($asmId, 4 ,3); $accessionDir .= "/" . substr($asmId, 7 ,3); $accessionDir .= "/" . substr($asmId, 10 ,3); my $buildDir = "/hive/data/genomes/asmHubs/refseqBuild/$accessionDir/$asmId"; # next if ($asmId ne "GCF_000001405.39_GRCh38.p13"); my $asmReport="$buildDir/download/${asmId}_assembly_report.txt"; if (! -s "$asmReport") { printf STDERR "# no assembly report:\n# %s\n", $asmReport; next; } my $chromSizes = "${buildDir}/${asmId}.chrom.sizes"; my $twoBit = "${buildDir}/trackData/addMask/${asmId}.masked.2bit"; if (! -s "$twoBit") { printf STDERR "# no 2bit file:\n# %s\n", $twoBit; printf "%d\n", ++$asmCount; printf "%s\n", $accessionId; - printf "missing masked 2bit file\n"; + printf "missing masked 2bit file\n"; printf "\n"; next; } my $faSizeTxt = "${buildDir}/${asmId}.faSize.txt"; if ( ! -s "$faSizeTxt" ) { printf STDERR "twoBitToFa $twoBit stdout | faSize stdin > $faSizeTxt\n"; print `twoBitToFa $twoBit stdout | faSize stdin > $faSizeTxt`; } - my ($gapSize, $maskPerCent) = maskStats($faSizeTxt); + my ($gapSize, $maskPerCent, $sizeNoGaps) = maskStats($faSizeTxt); $overallGapSize += $gapSize; my ($seqCount, $totalSize) = asmCounts($chromSizes); $overallSeqCount += $seqCount; # my $totalSize=`ave -col=2 $chromSizes | grep "^total" | awk '{printf "%d", \$NF}'`; $overallNucleotides += $totalSize; my $gapCount = gapStats($buildDir, $asmId); $overallGapCount += $gapCount; my $sciName = "notFound"; my $commonName = "notFound"; my $asmDate = "notFound"; my $itemsFound = 0; open (FH, "<$asmReport") or die "can not read $asmReport"; while (my $line = ) { last if ($itemsFound > 5); chomp $line; $line =~ s/ //g;; $line =~ s/\s+$//g;; if ($line =~ m/Date:/) { if ($asmDate =~ m/notFound/) { ++$itemsFound; $asmDate = $line; $asmDate =~ s/.*:\s+//; } } elsif ($line =~ m/Organism name:/) { if ($sciName =~ m/notFound/) { ++$itemsFound; $commonName = $line; $sciName = $line; $commonName =~ s/.*\(//; $commonName =~ s/\)//; $commonName = $betterName{$asmId} if (exists($betterName{$asmId})); $sciName =~ s/.*:\s+//; $sciName =~ s/\s+\(.*//; } } } close (FH); my $hubUrl = "https://hgdownload.soe.ucsc.edu/hubs/$accessionDir/$accessionId"; printf "%d\n", ++$asmCount; printf "%s
%s
\n", $accessionId, $commonName, $accessionId; foreach my $track (@trackList) { my $trackFile = "$buildDir/bbi/$asmId.$track"; + my $trackFb = "$buildDir/trackData/$track/fb.$asmId.$track.txt"; + my $runDir = "$buildDir/trackData/$track"; my ($itemCount, $percentCover); if ( "$track" eq "gc5Base" ) { $trackFile .= ".bw"; } else { $trackFile .= ".bb"; } if ( "$track" eq "rmsk") { my $rmskStats = "$buildDir/trackData/repeatMasker/$asmId.rmsk.stats"; if (! -s "${rmskStats}") { my $faOut = "$buildDir/trackData/repeatMasker/$asmId.sorted.fa.out.gz"; if ( -s "$faOut") { my $items = `zcat "$faOut" | wc -l`; chomp $items; $itemCount = commify($items); my $masked = `grep masked "$buildDir/trackData/repeatMasker/faSize.rmsk.txt" | awk '{print \$4}' | sed -e 's/%//;'`; chomp $masked; $percentCover = sprintf("%.2f %%", $masked); open (RS, ">$rmskStats") or die "can now write to $rmskStats"; printf RS "%s\t%s\n", $itemCount, $percentCover; close (RS); } else { $itemCount = "n/a"; $percentCover = "n/a"; } } else { ($itemCount, $percentCover) = split('\s+', `cat $rmskStats`); + $percentCover = sprintf("%.2f %%", $percentCover); chomp $percentCover; } } else { - ($itemCount, $percentCover) = oneTrackData($trackFile, $totalSize); + ($itemCount, $percentCover) = oneTrackData($asmId, $track, $trackFile, $totalSize, $trackFb, $runDir); } printf " %s
(%s)\n", $itemCount, $percentCover; } printf "\n"; } } ############################################################################## ### main() ############################################################################## open (FH, "<$toolsDir/${commonNameOrder}") or die "can not read ${commonNameOrder}"; while (my $line = ) { next if ($line =~ m/^#/); chomp $line; my ($commonName, $asmId) = split('\t', $line); push @orderList, $asmId; $betterName{$asmId} = $commonName; ++$assemblyTotal; } close (FH); startHtml(); startTable(); tableContents(); endTable(); endHtml();