4951a3f2ca22c980a415c231e59958197baec741 hiram Wed Oct 5 10:41:05 2022 -0700 correct Gene model labels on the ncbiGene (not really NCBI gene) track no redmine diff --git src/hg/utils/automation/asmHubNcbiGene.pl src/hg/utils/automation/asmHubNcbiGene.pl index fbc0b3b..5eeb224 100755 --- src/hg/utils/automation/asmHubNcbiGene.pl +++ src/hg/utils/automation/asmHubNcbiGene.pl @@ -1,85 +1,111 @@ #!/usr/bin/env perl use strict; use warnings; use FindBin qw($Bin); use lib "$Bin"; use AsmHub; use File::Basename; my $argc = scalar(@ARGV); if ($argc != 4) { printf STDERR "usage: asmHubNcbiGene.pl asmId ncbiAsmId asmId.names.tab .../trackData/\n"; printf STDERR "where asmId is the assembly identifier,\n"; printf STDERR "and .../trackData/ is the path to the /trackData/ directory.\n"; printf STDERR "asmId may be equal to ncbiAsmId if it is a GenArk build\n"; printf STDERR "or asmId might be a default dbName if it is a UCSC style\n"; printf STDERR "browser build.\n"; exit 255; } # from Perl Cookbook Recipe 2.17, print out large numbers with comma # delimiters: sub commify($) { my $text = reverse $_[0]; $text =~ s/(\d\d\d)(?=\d)(?!\d*\.)/$1,/g; return scalar reverse $text } my $asmId = shift; my $ncbiAsmId = shift; my $namesFile = shift; my $trackDataDir = shift; my $ncbiGeneBbi = "$trackDataDir/ncbiGene/$asmId.ncbiGene.bb"; my $asmType = "refseq"; if ( ! -s $ncbiGeneBbi ) { printf STDERR "ERROR: can not find $asmId.ncbiGene.bb file\n"; exit 255; } my @partNames = split('_', $ncbiAsmId); my $ftpDirPath = sprintf("%s/%s/%s/%s/%s", $partNames[0], substr($partNames[1],0,3), substr($partNames[1],3,3), substr($partNames[1],6,3), $ncbiAsmId); $asmType = "genbank" if ($partNames[0] =~ m/GCA/); my $totalBases = `ave -col=2 $trackDataDir/../${asmId}.chrom.sizes | grep "^total" | awk '{printf "%d", \$2}'`; chomp $totalBases; my $geneStats = `cat $trackDataDir/ncbiGene/${asmId}.ncbiGene.stats.txt | awk '{printf "%d\\n", \$2}' | xargs echo`; chomp $geneStats; my ($itemCount, $basesCovered) = split('\s+', $geneStats); my $percentCoverage = sprintf("%.3f", 100.0 * $basesCovered / $totalBases); $itemCount = commify($itemCount); $basesCovered = commify($basesCovered); $totalBases = commify($totalBases); my $em = "<em>"; my $noEm = "</em>"; my $assemblyDate = `grep -v "^#" $namesFile | cut -f9`; chomp $assemblyDate; my $ncbiAssemblyId = `grep -v "^#" $namesFile | cut -f10`; chomp $ncbiAssemblyId; my $organism = `grep -v "^#" $namesFile | cut -f5`; chomp $organism; +if ( "${asmType}" eq "refseq" ) { + print <<_EOF_ <h2>Description</h2> <p> The NCBI Gene track for the $assemblyDate $em${organism}$noEm/$ncbiAsmId genome assembly is constructed from the gff file <b>${ncbiAsmId}_genomic.gff.gz</b> -delivered with the NCBI RefSeq genome assemblies at the FTP location:<br> +supplied with the genome assembly at the FTP location:<br> +<a href='ftp://ftp.ncbi.nlm.nih.gov/genomes/all/$ftpDirPath/' target='_blank'>ftp://ftp.ncbi.nlm.nih.gov/genomes/all/$ftpDirPath/</a> +</p> + +_EOF_ + ; + +} else { + +print <<_EOF_ +<h2>Description</h2> +<p> +The Gene model track for the $assemblyDate $em${organism}$noEm/$ncbiAsmId +genome assembly is constructed from the gff file <b>${ncbiAsmId}_genomic.gff.gz</b> +supplied with the genome assembly at the FTP location:<br> <a href='ftp://ftp.ncbi.nlm.nih.gov/genomes/all/$ftpDirPath/' target='_blank'>ftp://ftp.ncbi.nlm.nih.gov/genomes/all/$ftpDirPath/</a> </p> +<p> +The gene models were constructed by the submitter of the assembly to the +NCBI assembly release system. +</p> + +_EOF_ + ; +} +print <<_EOF_ <h2>Track statistics summary</h2> <p> <b>Total genome size: </b>$totalBases<br> <b>Gene count: </b>$itemCount<br> <b>Bases in genes: </b>$basesCovered<br> <b>Percent genome coverage: </b>% $percentCoverage<br> </p> _EOF_ ; +