608fb8b82f76fc54c4189815cad4ce27bf57b5a0 hiram Mon Jan 13 13:16:53 2020 -0800 now running up NCBI RefSeq track on assembly hubs refs #24748 diff --git src/hg/utils/automation/asmHubNcbiRefSeq.pl src/hg/utils/automation/asmHubNcbiRefSeq.pl new file mode 100755 index 0000000..81031aa --- /dev/null +++ src/hg/utils/automation/asmHubNcbiRefSeq.pl @@ -0,0 +1,129 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use FindBin qw($Bin); +use lib "$Bin"; +use AsmHub; +use File::Basename; + +my $argc = scalar(@ARGV); + +if ($argc != 3) { + printf STDERR "usage: asmHubNcbiGene.pl asmId asmId.names.tab .../trackData/\n"; + printf STDERR "where asmId is the assembly identifier,\n"; + printf STDERR "and .../trackData/ is the path to the /trackData/ directory.\n"; + exit 255; +} + +# from Perl Cookbook Recipe 2.17, print out large numbers with comma +# delimiters: +sub commify($) { + my $text = reverse $_[0]; + $text =~ s/(\d\d\d)(?=\d)(?!\d*\.)/$1,/g; + return scalar reverse $text +} + +my $asmId = shift; +my $namesFile = shift; +my $trackDataDir = shift; +my $ncbiRefSeqBbi = "$trackDataDir/ncbiRefSeq/$asmId.ncbiRefSeq.bb"; +my $asmType = "refseq"; + +if ( ! -s $ncbiRefSeqBbi ) { + printf STDERR "ERROR: can not find $asmId.ncbiRefSeq.bb file\n"; + exit 255; +} + +my @partNames = split('_', $asmId); +my $ftpDirPath = sprintf("%s/%s/%s/%s/%s", $partNames[0], + substr($partNames[1],0,3), substr($partNames[1],3,3), + substr($partNames[1],6,3), $asmId); + +$asmType = "genbank" if ($partNames[0] =~ m/GCA/); +my $totalBases = `ave -col=2 $trackDataDir/../${asmId}.chrom.sizes | grep "^total" | awk '{printf "%d", \$2}'`; +chomp $totalBases; +my $geneStats = `cat $trackDataDir/ncbiRefSeq/${asmId}.ncbiRefSeq.stats.txt | awk '{printf "%d\\n", \$2}' | xargs echo`; +chomp $geneStats; +my ($itemCount, $basesCovered) = split('\s+', $geneStats); +my $percentCoverage = sprintf("%.3f", 100.0 * $basesCovered / $totalBases); +$itemCount = commify($itemCount); +$basesCovered = commify($basesCovered); +my $totalBasesCmfy = commify($totalBases); + +my $em = "<em>"; +my $noEm = "</em>"; +my $assemblyDate = `grep -v "^#" $namesFile | cut -f9`; +chomp $assemblyDate; +my $ncbiAssemblyId = `grep -v "^#" $namesFile | cut -f10`; +chomp $ncbiAssemblyId; +my $organism = `grep -v "^#" $namesFile | cut -f5`; +chomp $organism; + +print <<_EOF_ +<h2>Description</h2> +<p> +The NCBI RefSeq Gene annotations for the $assemblyDate $em${organism}$noEm/$asmId +genome assembly is constructed from the gff file <b>${asmId}_genomic.gff.gz</b> +delivered with the NCBI RefSeq genome assemblies at the FTP location:<br> +<a href='ftp://ftp.ncbi.nlm.nih.gov/genomes/all/$ftpDirPath/' target='_blank'>ftp://ftp.ncbi.nlm.nih.gov/genomes/all/$ftpDirPath/</a> +</p> + +<h2>Track statistics summary</h2> +<p> +<b>Total genome size: </b>$totalBasesCmfy</br> +<b>Gene count: </b>$itemCount<br> +<b>Bases in genes: </b>$basesCovered</br> +<b>Percent genome coverage: </b>% $percentCoverage</br> +</p> + +_EOF_ + ; + +if ( -s "$trackDataDir/ncbiRefSeq/${asmId}.ncbiRefSeqCurated.stats.txt" ) { + $geneStats = `cat $trackDataDir/ncbiRefSeq/${asmId}.ncbiRefSeqCurated.stats.txt | awk '{printf "%d\\n", \$2}' | xargs echo`; + chomp $geneStats; + ($itemCount, $basesCovered) = split('\s+', $geneStats); + $percentCoverage = sprintf("%.3f", 100.0 * $basesCovered / $totalBases); + $itemCount = commify($itemCount); + $basesCovered = commify($basesCovered); + printf <<_EOF_ +<p> +<b>Curated gene count: </b>$itemCount<br> +<b>Bases in genes: </b>$basesCovered</br> +<b>Percent genome coverage: </b>% $percentCoverage</br> +</p> +_EOF_ +} + +if ( -s "$trackDataDir/ncbiRefSeq/${asmId}.ncbiRefSeqPredicted.stats.txt" ) { + $geneStats = `cat $trackDataDir/ncbiRefSeq/${asmId}.ncbiRefSeqPredicted.stats.txt | awk '{printf "%d\\n", \$2}' | xargs echo`; + chomp $geneStats; + ($itemCount, $basesCovered) = split('\s+', $geneStats); + $percentCoverage = sprintf("%.3f", 100.0 * $basesCovered / $totalBases); + $itemCount = commify($itemCount); + $basesCovered = commify($basesCovered); + printf <<_EOF_ +<p> +<b>Predicted gene count: </b>$itemCount<br> +<b>Bases in genes: </b>$basesCovered</br> +<b>Percent genome coverage: </b>% $percentCoverage</br> +</p> +_EOF_ +} + +if ( -s "$trackDataDir/ncbiRefSeq/${asmId}.ncbiRefSeqOther.stats.txt" ) { + $geneStats = `cat $trackDataDir/ncbiRefSeq/${asmId}.ncbiRefSeqOther.stats.txt | awk '{printf "%d\\n", \$2}' | xargs echo`; + chomp $geneStats; + ($itemCount, $basesCovered) = split('\s+', $geneStats); + $percentCoverage = sprintf("%.3f", 100.0 * $basesCovered / $totalBases); + $itemCount = commify($itemCount); + $basesCovered = commify($basesCovered); + printf <<_EOF_ +<p> +<b>Other annotation count: </b>$itemCount<br> +<b>Bases in genes: </b>$basesCovered</br> +<b>Percent genome coverage: </b>% $percentCoverage</br> +</p> +_EOF_ +}