a6931e479d3f8415a57fb979b759537cbc6e0ab0 hiram Thu Aug 8 22:16:57 2019 -0700 xenoRefGene ready to go refs #23734 diff --git src/hg/utils/automation/asmHubXenoRefGene.pl src/hg/utils/automation/asmHubXenoRefGene.pl new file mode 100755 index 0000000..ef7d554 --- /dev/null +++ src/hg/utils/automation/asmHubXenoRefGene.pl @@ -0,0 +1,83 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use FindBin qw($Bin); +use lib "$Bin"; +use AsmHub; +use File::Basename; + +my $argc = scalar(@ARGV); + +if ($argc != 3) { + printf STDERR "usage: asmHubXenoRefGene.pl asmId asmId.names.tab .../trackData/\n"; + printf STDERR "where asmId is the assembly identifier,\n"; + printf STDERR "and .../trackData/ is the path to the /trackData/ directory.\n"; + exit 255; +} + +# from Perl Cookbook Recipe 2.17, print out large numbers with comma +# delimiters: +sub commify($) { + my $text = reverse $_[0]; + $text =~ s/(\d\d\d)(?=\d)(?!\d*\.)/$1,/g; + return scalar reverse $text +} + +my $asmId = shift; +my $namesFile = shift; +my $trackDataDir = shift; +my $xenoRefGeneBbi = "$trackDataDir/xenoRefGene/$asmId.xenoRefGene.bb"; +my $asmType = "refseq"; + +if ( ! -s $xenoRefGeneBbi ) { + printf STDERR "ERROR: can not find $asmId.xenoRefGene.bb file\n"; + exit 255; +} + +my @partNames = split('_', $asmId); +my $ftpDirPath = sprintf("%s/%s/%s/%s/%s", $partNames[0], + substr($partNames[1],0,3), substr($partNames[1],3,3), + substr($partNames[1],6,3), $asmId); + +$asmType = "genbank" if ($partNames[0] =~ m/GCA/); +my $totalBases = `ave -col=2 $trackDataDir/../${asmId}.chrom.sizes | grep "^total" | awk '{printf "%d", \$2}'`; +chomp $totalBases; +my $geneStats = `cat $trackDataDir/xenoRefGene/${asmId}.xenoRefGene.stats.txt | awk '{printf "%d\\n", \$2}' | xargs echo`; +chomp $geneStats; +my ($itemCount, $basesCovered) = split('\s+', $geneStats); +my $percentCoverage = sprintf("%.3f", 100.0 * $basesCovered / $totalBases); +$itemCount = commify($itemCount); +$basesCovered = commify($basesCovered); +$totalBases = commify($totalBases); + +my $em = "<em>"; +my $noEm = "</em>"; +my $assemblyDate = `grep -v "^#" $namesFile | cut -f9`; +chomp $assemblyDate; +my $ncbiAssemblyId = `grep -v "^#" $namesFile | cut -f10`; +chomp $ncbiAssemblyId; +my $organism = `grep -v "^#" $namesFile | cut -f5`; +chomp $organism; + +print <<_EOF_ +<h2>Description</h2> +<p> +The Genbank mRNAs gene track for the $assemblyDate $em${organism}$noEm/$asmId +genome assembly is constructed by mapping RefSeq mRNAs to this assembly +using a blat procedure, filtered for reasonable matches. +The mRNAs are obtained from the RefSeq release at +<a href='ftp://ftp.ncbi.nlm.nih.gov/refseq/release/' target=_blank> +ftp.ncbi.nlm.nih.gov/refseq/release</a> +</p> + +<h2>Track statistics summary</h2> +<p> +<b>Total genome size: </b>$totalBases</br> +<b>Gene count: </b>$itemCount<br> +<b>Bases in genes: </b>$basesCovered</br> +<b>Percent genome coverage: </b>% $percentCoverage</br> +</p> + +_EOF_ + ;