src/hg/utils/automation/asmHubNcbiRefSeq.pl 595d44078d99eb1f717f097e2e4c886c450b2fe1

595d44078d99eb1f717f097e2e4c886c450b2fe1
hiram
  Tue Mar 28 12:19:29 2023 -0700
adding download links on the gene track description pages for the GTF file; no redmine

diff --git src/hg/utils/automation/asmHubNcbiRefSeq.pl src/hg/utils/automation/asmHubNcbiRefSeq.pl
index 64a2be6..e79b908 100755
--- src/hg/utils/automation/asmHubNcbiRefSeq.pl
+++ src/hg/utils/automation/asmHubNcbiRefSeq.pl
@@ -4,64 +4,60 @@
 use warnings;
 use FindBin qw($Bin);
 use lib "$Bin";
 use AsmHub;
 use File::Basename;
 
 my $argc = scalar(@ARGV);
 
 if ($argc != 3) {
   printf STDERR "usage: asmHubNcbiGene.pl asmId asmId.names.tab .../trackData/\n";
   printf STDERR "where asmId is the assembly identifier,\n";
   printf STDERR "and .../trackData/ is the path to the /trackData/ directory.\n";
   exit 255;
 }
 
-# from Perl Cookbook Recipe 2.17, print out large numbers with comma
-# delimiters:
-sub commify($) {
-    my $text = reverse $_[0];
-    $text =~ s/(\d\d\d)(?=\d)(?!\d*\.)/$1,/g;
-    return scalar reverse $text
-}
-
 my $asmId = shift;
+my @parts = split('_', $asmId, 3);
+my $accession = "$parts[0]_$parts[1]";
 my $namesFile = shift;
 my $trackDataDir = shift;
 my $ncbiRefSeqBbi = "$trackDataDir/ncbiRefSeq/$asmId.ncbiRefSeq.bb";
 my $asmType = "refseq";
+my $asmIdPath = &AsmHub::asmIdToPath($asmId);
+my $downloadGtf = "https://hgdownload.soe.ucsc.edu/hubs/$asmIdPath/$accession/genes/$asmId.ncbiRefSeq.gtf.gz";
 
 if ( ! -s $ncbiRefSeqBbi ) {
   printf STDERR "ERROR: can not find $asmId.ncbiRefSeq.bb file\n";
   exit 255;
 }
 
 my @partNames = split('_', $asmId);
 my $ftpDirPath = sprintf("%s/%s/%s/%s/%s", $partNames[0],
    substr($partNames[1],0,3), substr($partNames[1],3,3),
    substr($partNames[1],6,3), $asmId);
 
 $asmType = "genbank" if ($partNames[0] =~ m/GCA/);
 my $totalBases = `ave -col=2 $trackDataDir/../${asmId}.chrom.sizes | grep "^total" | awk '{printf "%d", \$2}'`;
 chomp $totalBases;
 my $geneStats = `cat $trackDataDir/ncbiRefSeq/${asmId}.ncbiRefSeq.stats.txt | awk '{printf "%d\\n", \$2}' | xargs echo`;
 chomp $geneStats;
 my ($itemCount, $basesCovered) = split('\s+', $geneStats);
 my $percentCoverage = sprintf("%.3f", 100.0 * $basesCovered / $totalBases);
-$itemCount = commify($itemCount);
-$basesCovered = commify($basesCovered);
-my $totalBasesCmfy = commify($totalBases);
+$itemCount = &AsmHub::commify($itemCount);
+$basesCovered = &AsmHub::commify($basesCovered);
+my $totalBasesCmfy = &AsmHub::commify($totalBases);
 
 my $em = "<em>";
 my $noEm = "</em>";
 my $assemblyDate = `grep -v "^#" $namesFile | cut -f9`;
 chomp $assemblyDate;
 my $ncbiAssemblyId = `grep -v "^#" $namesFile | cut -f10`;
 chomp $ncbiAssemblyId;
 my $organism = `grep -v "^#" $namesFile | cut -f5`;
 chomp $organism;
 
 print <<_EOF_
 <h2>Description</h2>
 <p>
 The NCBI RefSeq Genes composite track shows
 $assemblyDate $em${organism}$noEm/$asmId
@@ -71,30 +67,35 @@
 the different tracks were created.
 </p>
 <p>
 Please visit NCBI's
 <a href="https://www.ncbi.nlm.nih.gov/projects/RefSeq/update.cgi"
 target="_blank"> Feedback for Gene and Reference Sequences (RefSeq)</a>
 page to make suggestions, submit additions and corrections, or ask for
 help concerning RefSeq records.
 </p>
 
 <p>
 For more information on the different gene tracks, see our <a target=_blank 
 href="/FAQ/FAQgenes.html">Genes FAQ</a>.
 </p>
 
+<h2>Data Access</h2>
+<p>
+Download <a href='$downloadGtf' target=_blank> $asmId.ncbiRefSeq.gtf.gz </a> GTF file.
+</p>
+
 <h2>Display Conventions and Configuration</h2>
 <p>
 To show only a selected set of subtracks, uncheck the boxes next to the
 tracks that you wish to hide.
 </p>
 
 The tracks available here can include (not all may be present):
 <dl>
   <dt><em><strong>RefSeq annotations and alignments</strong></em></dt>
   <ul>
     <li><em>RefSeq All</em> &ndash; all curated and predicted annotations
      provided by RefSeq.</li>
     <li><em>RefSeq Curated</em> &ndash; subset of <em>RefSeq All</em> that
      includes only those annotations whose accessions begin with NM, NR,
      NP or YP. <small>(NP and YP are used only for protein-coding genes on
@@ -190,76 +191,76 @@
 <p>
 <b>Total genome size: </b>$totalBasesCmfy <b>bases</b><br><br>
 <b>Curated and Predicted Gene count: </b>$itemCount<br>
 <b>Bases in these genes: </b>$basesCovered<br>
 <b>Percent genome coverage: </b>% $percentCoverage<br>
 </p>
 
 _EOF_
    ;
 
 if ( -s "$trackDataDir/ncbiRefSeq/${asmId}.ncbiRefSeqCurated.stats.txt" ) {
   $geneStats = `cat $trackDataDir/ncbiRefSeq/${asmId}.ncbiRefSeqCurated.stats.txt | awk '{printf "%d\\n", \$2}' | xargs echo`;
   chomp $geneStats;
   ($itemCount, $basesCovered) = split('\s+', $geneStats);
   $percentCoverage = sprintf("%.3f", 100.0 * $basesCovered / $totalBases);
-  $itemCount = commify($itemCount);
-  $basesCovered = commify($basesCovered);
+  $itemCount = &AsmHub::commify($itemCount);
+  $basesCovered = &AsmHub::commify($basesCovered);
   printf <<_EOF_
 <p>
 <b>Curated gene count: </b>$itemCount<br>
 <b>Bases in curated genes: </b>$basesCovered<br>
 <b>Percent genome coverage: </b>%% $percentCoverage<br>
 </p>
 _EOF_
 } else {
   printf <<_EOF_
 <p>
 <b>There are no curated gene annotations.</b>
 </p>
 _EOF_
 }
 
 if ( -s "$trackDataDir/ncbiRefSeq/${asmId}.ncbiRefSeqPredicted.stats.txt" ) {
   $geneStats = `cat $trackDataDir/ncbiRefSeq/${asmId}.ncbiRefSeqPredicted.stats.txt | awk '{printf "%d\\n", \$2}' | xargs echo`;
   chomp $geneStats;
   ($itemCount, $basesCovered) = split('\s+', $geneStats);
   $percentCoverage = sprintf("%.3f", 100.0 * $basesCovered / $totalBases);
-  $itemCount = commify($itemCount);
-  $basesCovered = commify($basesCovered);
+  $itemCount = &AsmHub::commify($itemCount);
+  $basesCovered = &AsmHub::commify($basesCovered);
   printf <<_EOF_
 <p>
 <b>Predicted gene count: </b>$itemCount<br>
 <b>Bases in genes: </b>$basesCovered<br>
 <b>Percent genome coverage: </b>%% $percentCoverage<br>
 </p>
 _EOF_
 } else {
   printf <<_EOF_
 <p>
 <b>there are no predicted gene annotations</b>
 </p>
 _EOF_
 }
 
 if ( -s "$trackDataDir/ncbiRefSeq/${asmId}.ncbiRefSeqOther.stats.txt" ) {
   $geneStats = `cat $trackDataDir/ncbiRefSeq/${asmId}.ncbiRefSeqOther.stats.txt | awk '{printf "%d\\n", \$2}' | xargs echo`;
   chomp $geneStats;
   ($itemCount, $basesCovered) = split('\s+', $geneStats);
   $percentCoverage = sprintf("%.3f", 100.0 * $basesCovered / $totalBases);
-  $itemCount = commify($itemCount);
-  $basesCovered = commify($basesCovered);
+  $itemCount = &AsmHub::commify($itemCount);
+  $basesCovered = &AsmHub::commify($basesCovered);
   printf <<_EOF_
 <p>
 <b>Other annotation count: </b>$itemCount<br>
 <b>Bases in other annotations: </b>$basesCovered<br>
 <b>Percent genome coverage: </b>%% $percentCoverage<br>
 </p>
 _EOF_
 }
 
 printf <<_EOF_
 <h2>Credits</h2>
 <p>
 This track was produced at UCSC from data generated by scientists worldwide
 and curated by the NCBI RefSeq project.
 </p>