026fea8dd26483276dbc4d8c860ff0d11e1a640e hiram Thu Jan 23 15:27:43 2020 -0800 now using new minimal makefile with single set of scripts refs #24748 diff --git src/hg/makeDb/doc/vertebratesAsmHub/mkAsmStats.pl src/hg/makeDb/doc/vertebratesAsmHub/mkAsmStats.pl deleted file mode 100755 index 1f255f3..0000000 --- src/hg/makeDb/doc/vertebratesAsmHub/mkAsmStats.pl +++ /dev/null @@ -1,262 +0,0 @@ -#!/usr/bin/env perl - -use strict; -use warnings; -use File::stat; - -my $home = $ENV{'HOME'}; -my $Name = "Vertebrate"; -my $asmHubName = "vertebrate"; -my $srcDocDir = "${asmHubName}sAsmHub"; -my $asmHubDocDir = "$home/kent/src/hg/makeDb/doc/$srcDocDir"; - -my $commonNameList = "$asmHubName.asmId.commonName.tsv"; -my $commonNameOrder = "$asmHubName.commonName.asmId.orderList.tsv"; -my @orderList; # asmId of the assemblies in order from the *.list files -# the order to read the different .list files: - -my $assemblyCount = 0; -my $overallNucleotides = 0; -my $overallSeqCount = 0; -my $overallGapSize = 0; -my $overallGapCount = 0; - -############################################################################## -# from Perl Cookbook Recipe 2.17, print out large numbers with comma delimiters: -############################################################################## -sub commify($) { - my $text = reverse $_[0]; - $text =~ s/(\d\d\d)(?=\d)(?!\d*\.)/$1,/g; - return scalar reverse $text -} - -############################################################################## -### start the HTML output -############################################################################## -sub startHtml() { - -my $timeStamp = `date "+%F"`; -chomp $timeStamp; - -print <<"END" - - - - - - -

$Name Genomes assembly hubs

-

-Assemblies from NCBI/Genbank/Refseq sources, subset of other $asmHubName only. -

- -

See also: hub access


- -

Data resource links

-NOTE: Click on the column headers to sort the table by that column
-The link to genome browser will attach only that single assembly to -the genome browser. -END -} - -############################################################################## -### start the table output -############################################################################## -sub startTable() { -print <<"END" - - - - - - - - -END -} - -############################################################################## -### end the table output -############################################################################## -sub endTable() { - -my $commaNuc = commify($overallNucleotides); -my $commaSeqCount = commify($overallSeqCount); -my $commaGapSize = commify($overallGapSize); -my $commaGapCount = commify($overallGapCount); - -print <<"END" - - - - - - - - - -
countcommon name
link to genome browser
scientific name
and data download
NCBI assemblysequence
count
genome size
nucleotides
gap
count
unknown bases
(gap size sum)
masking
percent
TOTALS:assembly count $assemblyCount$commaSeqCount$commaNuc$commaGapCount$commaGapSize 
-END -} - -############################################################################## -### end the HTML output -############################################################################## -sub endHtml() { -print <<"END" - - - - - -END -} - -sub asmCounts($) { - my ($chromSizes) = @_; - my ($sequenceCount, $totalSize) = split('\s+', `ave -col=2 $chromSizes | egrep "^count|^total" | awk '{printf "%d\\n", \$NF}' | xargs echo`); - return ($sequenceCount, $totalSize); -} - -# my ($gapSize) = maskStats($faSizeTxt); -sub maskStats($) { - my ($faSizeFile) = @_; - my $gapSize = `grep 'sequences in 1 file' $faSizeFile | awk '{print \$3}'`; - chomp $gapSize; - $gapSize =~ s/\(//; - my $totalBases = `grep 'sequences in 1 file' $faSizeFile | awk '{print \$1}'`; - chomp $totalBases; - my $maskedBases = `grep 'sequences in 1 file' $faSizeFile | awk '{print \$9}'`; - chomp $maskedBases; - my $maskPerCent = 100.0 * $maskedBases / $totalBases; - return ($gapSize, $maskPerCent); -} - -# grep "sequences in 1 file" GCA_900324465.2_fAnaTes1.2.faSize.txt -# 555641398 bases (3606496 N's 552034902 real 433510637 upper 118524265 lower) in 50 sequences in 1 files - -sub gapStats($$) { - my ($buildDir, $asmId) = @_; - my $gapBed = "$buildDir/trackData/allGaps/$asmId.allGaps.bed.gz"; - my $gapCount = 0; - if ( -s "$gapBed" ) { - $gapCount = `zcat $gapBed | awk '{print \$3-\$2}' | ave stdin | grep '^count' | awk '{print \$2}'`; - } - chomp $gapCount; - return ($gapCount); -} - -# GCA_901709675.1_fSynAcu1.1/trackData/allGaps] zcat GCA_901709675.1_fSynAcu1.1.allGaps.bed.gz | awk '{print $3-$2,$0}' | ave stdin | grep "^count" | awk '{print $2}' - -############################################################################## -### tableContents() -############################################################################## -sub tableContents() { - - my $asmCount = 0; - foreach my $asmId (reverse(@orderList)) { - my $accessionDir = substr($asmId, 0 ,3); - $accessionDir .= "/" . substr($asmId, 4 ,3); - $accessionDir .= "/" . substr($asmId, 7 ,3); - $accessionDir .= "/" . substr($asmId, 10 ,3); - $accessionDir .= "/" . $asmId; - my $buildDir = "/hive/data/genomes/asmHubs/refseqBuild/$accessionDir"; - my $asmReport="$buildDir/download/${asmId}_assembly_report.txt"; - my ($gcPrefix, $asmAcc, $asmName) = split('_', $asmId, 3); - my $chromSizes = "${buildDir}/${asmId}.chrom.sizes"; - my $twoBit = "${buildDir}/trackData/addMask/${asmId}.masked.2bit"; - my $faSizeTxt = "${buildDir}/${asmId}.faSize.txt"; - if ( ! -s "$faSizeTxt" ) { - printf STDERR "twoBitToFa $twoBit stdout | faSize stdin > $faSizeTxt\n"; - print `twoBitToFa $twoBit stdout | faSize stdin > $faSizeTxt`; - } - my ($gapSize, $maskPerCent) = maskStats($faSizeTxt); - $overallGapSize += $gapSize; - my ($seqCount, $totalSize) = asmCounts($chromSizes); - $overallSeqCount += $seqCount; -# my $totalSize=`ave -col=2 $chromSizes | grep "^total" | awk '{printf "%d", \$NF}'`; - $overallNucleotides += $totalSize; - my $gapCount = gapStats($buildDir, $asmId); - $overallGapCount += $gapCount; - my $sciName = "notFound"; - my $commonName = "notFound"; - my $bioSample = "notFound"; - my $bioProject = "notFound"; - my $taxId = "notFound"; - my $asmDate = "notFound"; - my $itemsFound = 0; - open (FH, "<$asmReport") or die "can not read $asmReport"; - while (my $line = ) { - last if ($itemsFound > 5); - chomp $line; - $line =~ s/ //g;; - $line =~ s/\s+$//g;; - if ($line =~ m/Date:/) { - if ($asmDate =~ m/notFound/) { - ++$itemsFound; - $asmDate = $line; - $asmDate =~ s/.*:\s+//; - } - } elsif ($line =~ m/BioSample:/) { - if ($bioSample =~ m/notFound/) { - ++$itemsFound; - $bioSample = $line; - $bioSample =~ s/.*:\s+//; - } - } elsif ($line =~ m/BioProject:/) { - if ($bioProject =~ m/notFound/) { - ++$itemsFound; - $bioProject = $line; - $bioProject =~ s/.*:\s+//; - } - } elsif ($line =~ m/Organism name:/) { - if ($sciName =~ m/notFound/) { - ++$itemsFound; - $commonName = $line; - $sciName = $line; - $commonName =~ s/.*\(//; - $commonName =~ s/\)//; - $sciName =~ s/.*:\s+//; - $sciName =~ s/\s+\(.*//; - } - } elsif ($line =~ m/Taxid:/) { - if ($taxId =~ m/notFound/) { - ++$itemsFound; - $taxId = $line; - $taxId =~ s/.*:\s+//; - } - } - } - close (FH); - my $hubUrl = "https://hgdownload.soe.ucsc.edu/hubs/$accessionDir"; - printf "%d\n", ++$asmCount; - printf "%s\n", $hubUrl, $asmId, $asmId, $commonName; - printf " %s\n", $asmHubName, $asmId, $sciName; - printf " %s\n", $gcPrefix, $asmAcc, $asmId; - printf " %s\n", commify($seqCount); - printf " %s\n", commify($totalSize); - printf " %s\n", commify($gapCount); - printf " %s\n", commify($gapSize); - printf " %.2f\n", $maskPerCent; - printf "\n"; - } -} - -############################################################################## -### main() -############################################################################## - -open (FH, "<$asmHubDocDir/${commonNameOrder}") or die "can not read ${commonNameOrder}"; -while (my $line = ) { - chomp $line; - my ($commonName, $asmId) = split('\t', $line); - push @orderList, $asmId; - ++$assemblyCount; -} -close (FH); - -startHtml(); -startTable(); -tableContents(); -endTable(); -endHtml();