982d69ac4a666c4ff32f7ed167f788b81d960285 hiram Wed Jun 17 12:30:05 2020 -0700 reduce the vertical space consumed by options and explanations refs #25555 diff --git src/hg/makeDb/doc/asmHubs/mkAsmStats.pl src/hg/makeDb/doc/asmHubs/mkAsmStats.pl index 8cfdb7a..9f4ddf6 100755 --- src/hg/makeDb/doc/asmHubs/mkAsmStats.pl +++ src/hg/makeDb/doc/asmHubs/mkAsmStats.pl @@ -1,360 +1,360 @@ #!/usr/bin/env perl use strict; use warnings; use FindBin qw($Bin); use lib "$Bin"; use commonHtml; use File::stat; my $argc = scalar(@ARGV); if ($argc != 3) { printf STDERR "mkAsmStats Name asmHubName [two column name list]\n"; printf STDERR "e.g.: mkAsmStats Mammals mammals mammals.asmId.commonName.tsv\n"; printf STDERR "the name list is found in \$HOME/kent/src/hg/makeDb/doc/asmHubs/\n"; printf STDERR "\nthe two columns are 1: asmId (accessionId_assemblyName)\n"; printf STDERR "column 2: common name for species, columns separated by tab\n"; exit 255; } my $home = $ENV{'HOME'}; my $toolsDir = "$home/kent/src/hg/makeDb/doc/asmHubs"; my $Name = shift; my $asmHubName = shift; my $inputList = shift; my $orderList = $inputList; if ( ! -s "$orderList" ) { $orderList = $toolsDir/$inputList; } my @orderList; # asmId of the assemblies in order from the orderList file my %commonName; # key is asmId, value is a common name, perhaps more appropriate # than found in assembly_report file my $vgpIndex = 0; $vgpIndex = 1 if ($Name =~ m/vgp/i); my $assemblyTotal = 0; # complete list of assemblies in this group my $asmCount = 0; # count of assemblies completed and in the table my $overallNucleotides = 0; my $overallSeqCount = 0; my $overallGapSize = 0; my $overallGapCount = 0; ############################################################################## # from Perl Cookbook Recipe 2.17, print out large numbers with comma delimiters: ############################################################################## sub commify($) { my $text = reverse $_[0]; $text =~ s/(\d\d\d)(?=\d)(?!\d*\.)/$1,/g; return scalar reverse $text } ############################################################################## ### start the HTML output ############################################################################## sub startHtml() { my $timeStamp = `date "+%F"`; chomp $timeStamp; my $subSetMessage = "subset of $asmHubName only"; if ($asmHubName eq "vertebrate") { $subSetMessage = "subset of other ${asmHubName}s only"; } if ($vgpIndex) { my $vgpSubset = "(set of primary assemblies)"; if ($orderList =~ m/vgp.alternate/) { $vgpSubset = "(set of alternate/haplotype assemblies)"; } elsif ($orderList =~ m/vgp.trio/) { $vgpSubset = "(set of trio assemblies, maternal/paternal)"; } elsif ($orderList =~ m/vgp.legacy/) { $vgpSubset = "(set of legacy/superseded assemblies)"; } print <<"END"

VGP - Vertebrate Genomes Project assembly hubs, assembly statistics

This assembly hub contains assemblies released by the Vertebrate Genomes Project. $vgpSubset

END } else { print <<"END"

$Name Genomes assembly hubs, assembly statistics

Assemblies from NCBI/Genbank/Refseq sources, $subSetMessage.

END } print <<"END"

Data resource links

NOTE: Click on the column headers to sort the table by that column
The link to genome browser will attach only that single assembly to the genome browser. END } ############################################################################## ### start the table output ############################################################################## sub startTable() { print <<"END" END } ############################################################################## ### end the table output ############################################################################## sub endTable() { my $commaNuc = commify($overallNucleotides); my $commaSeqCount = commify($overallSeqCount); my $commaGapSize = commify($overallGapSize); my $commaGapCount = commify($overallGapCount); my $percentDone = 100.0 * $asmCount / $assemblyTotal; my $doneMsg = ""; if ($asmCount < $assemblyTotal) { $doneMsg = sprintf(" (%d build completed, %.2f %% finished)", $asmCount, $percentDone); } if ($assemblyTotal > 1) { print " "; # try extra column headers as last row for this very large index page - if ($vgpIndex) { + if ($vgpIndex && ($asmCount > 15)) { print " "; } print "

count	common name link to genome browser	scientific name and data download	NCBI assembly	sequence count	genome size nucleotides	gap count	unknown bases (gap size sum)	masking percent
TOTALS:	total assembly count ${assemblyTotal}${doneMsg}			$commaSeqCount	$commaNuc	$commaGapCount	$commaGapSize
count	common name link to genome browser	scientific name and data download	NCBI assembly	sequence count	genome size nucleotides	gap count	unknown bases (gap size sum)	masking percent

"; } else { print " "; } # $assemblyTotal <= 1 } # sub endTable() ############################################################################## ### end the HTML output ############################################################################## sub endHtml() { &commonHtml::otherHubLinks($vgpIndex, $asmHubName); &commonHtml::htmlFooter($vgpIndex, $asmHubName); } sub asmCounts($) { my ($chromSizes) = @_; my ($sequenceCount, $totalSize) = split('\s+', `ave -col=2 $chromSizes | egrep "^count|^total" | awk '{printf "%d\\n", \$NF}' | xargs echo`); return ($sequenceCount, $totalSize); } # my ($gapSize) = maskStats($faSizeTxt); sub maskStats($) { my ($faSizeFile) = @_; my $gapSize = `grep 'sequences in 1 file' $faSizeFile | awk '{print \$3}'`; chomp $gapSize; $gapSize =~ s/\(//; my $totalBases = `grep 'sequences in 1 file' $faSizeFile | awk '{print \$1}'`; chomp $totalBases; my $maskedBases = `grep 'sequences in 1 file' $faSizeFile | awk '{print \$9}'`; chomp $maskedBases; my $maskPerCent = 100.0 * $maskedBases / $totalBases; return ($gapSize, $maskPerCent); } # grep "sequences in 1 file" GCA_900324465.2_fAnaTes1.2.faSize.txt # 555641398 bases (3606496 N's 552034902 real 433510637 upper 118524265 lower) in 50 sequences in 1 files sub gapStats($$) { my ($buildDir, $asmId) = @_; my $gapBed = "$buildDir/trackData/allGaps/$asmId.allGaps.bed.gz"; my $gapCount = 0; if ( -s "$gapBed" ) { $gapCount = `zcat $gapBed | awk '{print \$3-\$2}' | ave stdin | grep '^count' | awk '{print \$2}'`; } chomp $gapCount; return ($gapCount); } ############################################################################## ### tableContents() ############################################################################## sub tableContents() { foreach my $asmId (@orderList) { printf STDERR "# asmId: '%s'\n", $asmId; my ($gcPrefix, $asmAcc, $asmName) = split('_', $asmId, 3); my $accessionId = sprintf("%s_%s", $gcPrefix, $asmAcc); my $accessionDir = substr($asmId, 0 ,3); $accessionDir .= "/" . substr($asmId, 4 ,3); $accessionDir .= "/" . substr($asmId, 7 ,3); $accessionDir .= "/" . substr($asmId, 10 ,3); my $buildDir = "/hive/data/genomes/asmHubs/refseqBuild/$accessionDir/$asmId"; if ($gcPrefix eq "GCA") { $buildDir = "/hive/data/genomes/asmHubs/genbankBuild/$accessionDir/$asmId"; } my $asmReport="$buildDir/download/${asmId}_assembly_report.txt"; if (! -s "$asmReport") { printf STDERR "# no assembly report:\n# %s\n", $asmReport; next; } my $chromSizes = "${buildDir}/${asmId}.chrom.sizes"; my $twoBit = "${buildDir}/trackData/addMask/${asmId}.masked.2bit"; if (! -s "$twoBit") { printf STDERR "# no 2bit file:\n# %s\n", $twoBit; next; } my $trackDb="$buildDir/${asmId}.trackDb.txt"; next if (! -s "$trackDb"); # assembly build not complete my $faSizeTxt = "${buildDir}/${asmId}.faSize.txt"; if ( ! -s "$faSizeTxt" ) { printf STDERR "twoBitToFa $twoBit stdout | faSize stdin > $faSizeTxt\n"; print `twoBitToFa $twoBit stdout | faSize stdin > $faSizeTxt`; } my ($gapSize, $maskPerCent) = maskStats($faSizeTxt); $overallGapSize += $gapSize; my ($seqCount, $totalSize) = asmCounts($chromSizes); $overallSeqCount += $seqCount; # my $totalSize=`ave -col=2 $chromSizes | grep "^total" | awk '{printf "%d", \$NF}'`; $overallNucleotides += $totalSize; my $gapCount = gapStats($buildDir, $asmId); $overallGapCount += $gapCount; my $sciName = "notFound"; my $commonName = "notFound"; my $bioSample = "notFound"; my $bioProject = "notFound"; my $taxId = "notFound"; my $asmDate = "notFound"; my $itemsFound = 0; open (FH, "<$asmReport") or die "can not read $asmReport"; while (my $line = ) { last if ($itemsFound > 5); chomp $line; $line =~ s/ //g;; $line =~ s/\s+$//g;; if ($line =~ m/Date:/) { if ($asmDate =~ m/notFound/) { ++$itemsFound; $asmDate = $line; $asmDate =~ s/.*:\s+//; } } elsif ($line =~ m/BioSample:/) { if ($bioSample =~ m/notFound/) { ++$itemsFound; $bioSample = $line; $bioSample =~ s/.*:\s+//; } } elsif ($line =~ m/BioProject:/) { if ($bioProject =~ m/notFound/) { ++$itemsFound; $bioProject = $line; $bioProject =~ s/.*:\s+//; } } elsif ($line =~ m/Organism name:/) { if ($sciName =~ m/notFound/) { ++$itemsFound; $commonName = $line; $sciName = $line; $commonName =~ s/.*$//; $commonName =~ s/$//; $commonName = $commonName{$asmId} if (exists($commonName{$asmId})); $sciName =~ s/.*:\s+//; $sciName =~ s/\s+\(.*//; } } elsif ($line =~ m/Taxid:/) { if ($taxId =~ m/notFound/) { ++$itemsFound; $taxId = $line; $taxId =~ s/.*:\s+//; } } } close (FH); my $hubUrl = "https://hgdownload.soe.ucsc.edu/hubs/$accessionDir/$accessionId"; printf "%d\n", ++$asmCount; # printf "%s\n", $hubUrl, $accessionId, $commonName; printf "%s\n", $accessionId, $commonName; printf " %s\n", $hubUrl, $sciName; printf " %s\n", $accessionId, $asmId; printf " %s\n", commify($seqCount); printf " %s\n", commify($totalSize); printf " %s\n", commify($gapCount); printf " %s\n", commify($gapSize); printf " %.2f\n", $maskPerCent; printf "\n"; } } # sub tableContents() ############################################################################## ### main() ############################################################################## open (FH, "<${orderList}") or die "can not read ${orderList}"; while (my $line = ) { next if ($line =~ m/^#/); chomp $line; my ($asmId, $commonName) = split('\t', $line); push @orderList, $asmId; $commonName{$asmId} = $commonName; ++$assemblyTotal; } close (FH); startHtml(); startTable(); tableContents(); endTable(); endHtml();

VGP - Vertebrate Genomes Project assembly hubs, assembly statistics

$Name Genomes assembly hubs, assembly statistics

See also: hub access, track statistics

Data resource links