src/hg/utils/automation/asmHubGatewayPage.pl 1bca724c3e143196d4d3ceea41fe15c0ac19fb70

1bca724c3e143196d4d3ceea41fe15c0ac19fb70
hiram
  Wed Apr 6 13:04:01 2022 -0700
improved gateway page contents refs #29203

diff --git src/hg/utils/automation/asmHubGatewayPage.pl src/hg/utils/automation/asmHubGatewayPage.pl
index 07b361e..84bda67 100755
--- src/hg/utils/automation/asmHubGatewayPage.pl
+++ src/hg/utils/automation/asmHubGatewayPage.pl
@@ -1,28 +1,29 @@
 #!/usr/bin/env perl
 
 use strict;
 use warnings;
 use FindBin qw($Bin);
 use lib "$Bin";
 use AsmHub;
 use File::Basename;
 
 ### XXX ### temporary hgdownload-test.gi
 ### my $sourceServer = "hgdownload-test.gi.ucsc.edu";
 
 my $sourceServer = "hgdownload.soe.ucsc.edu";
+my $genomeSize = 0;	# total assembly size in bases, assigned when the chrom.sizes are summed; selects the blat section below
 
 my @months = qw( 0 Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec );
 
 sub usage() {
   printf STDERR "usage: asmHubGatewayPage.pl <asmHubName> <pathTo>/*assembly_report.txt <pathTo>/asmId.chrom.sizes <pathTo>/image.jpg <pathTo>/photoCredits.txt\n";
   printf STDERR "output is to stdout, redirect to file: > description.html\n";
   printf STDERR "photoCredits.txt is a two line tag<tab>string file:\n";
   printf STDERR "tags: photoCreditURL and photoCreditName\n";
   printf STDERR "use string 'noPhoto' for image and credits when no photo\n";
   printf STDERR "stderr output is routed to a 'asmId.names.tab' file for use elsewhere\n";
   exit 255;
 }
 
 sub chromSizes($) {
   my ($sizeFile) = @_;
@@ -49,31 +50,31 @@
 	    ++$contigCount;
 	    chomp ($line);
 	    my ($name, $size, $rest) = split('\s+', $line, 3);
 	    my $key = sprintf("%s_X_%d", $name, $ix++);
 	    $sizes{$key} = $size;
 	}
 	close (FH);
     }
 
     my $totalSize = 0;
     foreach my $key (keys %sizes) {
 	$totalSize += $sizes{$key}
     }
     my $n50Size = $totalSize / 2;
 
-    my $genomeSize = $totalSize;
+    $genomeSize = $totalSize;
     printf "<b>Total assembly nucleotides:</b> %s<br>\n", &AsmHub::commify($totalSize);
     printf "<b>Assembly contig count:</b> %s<br>\n", &AsmHub::commify($contigCount);
 
     my $prevContig = "";
     my $prevSize = 0;
 
     $totalSize = 0;
     # work through the sizes until reaching the N50 size
     foreach my $key (sort { $sizes{$b} <=> $sizes{$a} } keys %sizes) {
 	$totalSize += $sizes{$key};
 	if ($totalSize > $n50Size) {
 	    my $prevName = $prevContig;
 	    $prevName =~ s/_X_[0-9]+//;
 	    my $origName = $key;
 	    $origName =~ s/_X_[0-9]+//;
@@ -109,30 +110,33 @@
 if ( ! -s $chromSizes ) {
   printf STDERR "ERROR: can not find '$chromSizes'\n";
   usage;
 }
 if ($jpgImage ne "noPhoto") {
   if ( ! -s $jpgImage ) {
     printf STDERR "ERROR: can not find '$jpgImage'\n";
     usage;
   }
   if ( ! -s $photoCredits ) {
     printf STDERR "ERROR: can not find '$photoCredits'\n";
     usage;
   }
 }
 
+my $buildDir = dirname($chromSizes);
+my $genesDir = "$buildDir/genes";
+
 my $photoCreditURL = "";
 my $photoCreditName = "";
 my $imageSize = "";
 my $imageName = "";
 my $imageWidth = 0;
 my $imageHeight = 0;
 my $imageWidthBorder = 15;
 
 if ($jpgImage ne "noPhoto") {
   printf STDERR "# reading $photoCredits\n";
   open (FH, "<$photoCredits") or die "can not read $photoCredits";
   while (my $line = <FH>) {
     chomp $line;
     next if ($line =~ m/^#/);
     next if (length($line) < 2);
@@ -273,32 +277,30 @@
 }
 $orgName =~ s/\s+$//;
 
 printf STDERR "#taxId\tcommonName\tsubmitter\tasmName\torgName\tbioSample\tasmType\tasmLevel\tasmDate\tasmAccession\n";
 printf STDERR "%s\t", $taxId;
 printf STDERR "%s\t", $commonName;
 printf STDERR "%s\t", $submitter;
 printf STDERR "%s\t", $asmName;
 printf STDERR "%s\t", $orgName;
 printf STDERR "%s\t", $bioSample;
 printf STDERR "%s\t", $descrAsmType;
 printf STDERR "%s\t", $asmLevel;
 printf STDERR "%s\t", $asmDate;
 printf STDERR "%s\n", $asmAccession;
 
-# printf "<script type='text/javascript'>var asmId='%s';</script>\n", $asmId;
-
 if (length($imageName)) {
 printf "<!-- Display image in righthand corner -->
 <table align=right border=0 width=%d height=%d>
   <tr><td align=RIGHT><a href=\"https://www.ncbi.nlm.nih.gov/assembly/%s\"
     target=_blank>
     <img src=\"https://%s/hubs/%s/html/%s\" width=%d height=%d alt=\"%s\"></a>
   </td></tr>
   <tr><td align=right>
     <font size=-1> <em>%s</em><BR>
     </font>
     <font size=-2> (Photo courtesy of
       <a href=\"%s\" target=_blank>%s</a>)
     </font>
   </td></tr>
 </table>
@@ -306,125 +308,177 @@
 }
 
 my $sciNameUnderscore = $orgName;
 $sciNameUnderscore =~ s/ /_/g;
 $sciNameUnderscore = "Strigops_habroptilus" if ($orgName =~ m/Strigops habroptila/);
 
 printf "<p>
 <b>Common name:</b>&nbsp;%s<br>
 <b>Taxonomic name: %s, taxonomy ID:</b> <a href='https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=%s' target='_blank'> %s</a><br>
 <b>Sequencing/Assembly provider ID:</b> %s<br>
 <b>Assembly date:</b> %s<br>
 <b>Assembly type:</b> %s<br>
 <b>Assembly level:</b> %s<br>
 <b>Biosample:</b> <a href=\"https://www.ncbi.nlm.nih.gov/biosample/?term=%s\" target=\"_blank\">%s</a><br>
 <b>Assembly accession ID:</b> <a href=\"https://www.ncbi.nlm.nih.gov/assembly/%s\" target=\"_blank\">%s</a><br>
-<b>Assembly FTP location:</b> <a href=\"ftp://ftp.ncbi.nlm.nih.gov/genomes/all/%s\" target=\"_blank\">%s</a><br>
+<b>Assembly FTP location:</b> <a href='ftp://ftp.ncbi.nlm.nih.gov/genomes/all/%s' target='_blank'>%s</a><br>
 \n", $commonName, $orgName, $taxId, $taxId, $submitter, $asmDate, $descrAsmType,
   $asmLevel, $bioSample, $bioSample, $asmAccession, $asmAccession, $newStyleUrl, $newStyleUrl;
 
 chromSizes($chromSizes);
 
 printf "</p>\n<hr>
+<h4>Data file downloads</h4>
 <p>
-<b>Download files for this assembly hub:</b><br>
-To use the data from this assembly for a local hub instance at your
-institution, download these data as indicated by these instructions.<br>
-<br>
-To download this assembly data, use this <em>rsync</em> command:
+<ul>
+<li><a href='https://$sourceServer/hubs/$localDataUrl/$asmAccession.fa.gz' target=_blank>$asmAccession.fa.gz</a> fasta sequence with original assembly sequence names</li>
+<li><a href='https://$sourceServer/hubs/$localDataUrl/$asmAccession.2bit' target=_blank>$asmAccession.2bit</a> UCSC 2bit sequence file with original assembly sequence names</li>
+<li><a href='https://$sourceServer/hubs/$localDataUrl/$asmAccession.chromAlias.txt' target=_blank>$asmAccession.chromAlias.txt</a> chromAlias file to relate chromosome names</li>
+";
+
+if ( -s "$buildDir/$asmId.chrNames.fa.gz") {
+printf "<li><a href='https://$sourceServer/hubs/$localDataUrl/$asmAccession.chrNames.fa.gz' target=_blank>$asmAccession.chrNames.fa.gz</a> fasta sequence with <b>chrN</b> sequence names</li>\n";
+}
+
+if ( -s "$buildDir/$asmId.chrNames.2bit") {
+printf "<li><a href='https://$sourceServer/hubs/$localDataUrl/$asmAccession.chrNames.2bit' target=_blank>$asmAccession.chrNames.2bit</a> UCSC 2bit sequence file with <b>chrN</b> sequence names</li>\n";
+}
+
+if ( -d "$genesDir" ) {
+ # glob() instead of a piped 'ls': no shell, no filehandle left open;
+ # names are passed via %s so a '%' in one can not act as a conversion
+ foreach my $gtfFile (sort glob("$genesDir/*.gtf.gz")) {
+    my $gtf = basename($gtfFile);
+    printf "<li><a href='https://%s/hubs/%s/genes/%s' target=_blank>%s</a> gene GTF file</li>\n", $sourceServer, $localDataUrl, $gtf, $gtf;
+ }
+}
+
+printf "
+<li>explore the hub directory at: <a href='https://$sourceServer/hubs/$localDataUrl/' target=_blank>$sourceServer/hubs/$localDataUrl/</a></li>
+</ul>
+";
+printf "</p>\n<hr>
+<h4>Copy this entire assembly hub for local use</h4>
+<p>
+This download is only for the purpose of using this assembly hub in
+your institution which may have firewall access restrictions to this
+data.<br>
+
+To download this assembly data, use this <b>rsync</b> command:
 <pre>
   rsync -a -P \\
     rsync://$sourceServer/hubs/$localDataUrl/ \\
       ./$accessionId/
-
-  which creates the local directory: ./$accessionId/
 </pre>
-or this <em>wget</em> command:
+
+  which creates the local directory: <b>./$accessionId/</b><br>
+or this <b>wget</b> command:
 <pre>
   wget --timestamping -m -nH -x --cut-dirs=6 -e robots=off -np -k \\
     --reject \"index.html*\" -P \"$accessionId\" \\
        https://$sourceServer/hubs/$localDataUrl/
-
-  which creates a local directory: ./$accessionId/
 </pre>
+
+  which creates a local directory: <b>./$accessionId/</b>
 <p>
-There is an included <em>hub.txt</em> file in that download
+There is an included <b>hub.txt</b> file in that download
 data directory to use for your local track hub instance.<br>
 Using the genome browser menus: <em><strong>My Data</strong> -&gt; <strong>Track Hubs</strong></em><br>
 select the <em><strong>My Hubs</strong></em> tab to enter a URL
-to this hub.txt file to attach this assembly hub to a genome browser.
+to this <b>hub.txt</b> file to attach this assembly hub to a genome browser.
 </p>
 <p>
-The <em>html/$asmId.description.html</em> page is information for your users to
-describe this assembly.  This WEB page with these instructions
-is an instance of html/$asmId.description.html file.
+The <b>html/$asmId.description.html</b> page is information for your users to
+describe this assembly.<br>This web page with these instructions
+is an instance of the <b>html/$asmId.description.html</b> file.
 </p>
 <p>
 See also: <a href='/goldenPath/help/hgTrackHubHelp.html' target=_blank>track hub help</a> documentation.<br>
 </p>\n";
 
+if ($genomeSize < 4294967297) {	# blat is offered only for genomes up to 4294967296 bases
   printf "<hr>
+<h4>blat service</h4>
 <p>
-To operate a blat server on this assembly, in the directory where you have
-the <em>$asmId.2bit</em> file:
-<pre>
-gfServer -log=$asmId.gfServer.trans.log -ipLog -canStop start \\
-    yourserver.domain.edu 76543 -trans -mask $asmId.2bit &
-gfServer -log=$asmId.gfServer.log -ipLog -canStop start \\
-    yourserver.domain.edu 76542 -stepSize=5 $asmId.2bit &
-</pre>
-Adjust the port numbers <em>76543</em> <em>76542</em> and the
-<em>yourserver.domain.edu</em> for your local circumstances.<br>
-Typically, port numbers in the range <em>49152</em> to <em>65535</em>
-are available for private use as in this case.
-See also: <a href='https://www.iana.org/assignments/service-names-port-numbers/service-names-port-numbers.xhtml' target=_blank>IANA.org</a> port registry.
+There is blat service available for this genome assembly.  When viewing this
+assembly in the genome browser, access the blat service via the
+<em><b>Tools -&gt; Blat</b></em> blue navigation bar menu item.
 </p>
 <p>
-Enter the following specifications in your <em>genomes.txt</em> file:
+For local command line blat service, access
+the blat service via the <b>gfClient</b> command line operation.<br>
+See also: <a href='http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/' target=_blank>
+hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/</a> to download command line
+binaries.<br>
+<br>
+To operate this locally, you will need the <b>$accessionId.2bit</b> file from:
+<pre>
+  https://$sourceServer/hubs/$localDataUrl/
+</pre>
+Which can be obtained with rsync via:
+<pre>
+  rsync -a -P \\
+    rsync://$sourceServer/hubs/$accessionDir/$accessionId.2bit ./
+</pre>
+With that <b>$accessionId.2bit</b> file in your working directory where you run
+this command, for example, a DNA query with your DNA sequence in
+the file: <b>someDna.fa</b>
+with result in the file: <b>$accessionId.someDna.psl</b>
+<pre>
+gfClient -t=dna -q=dna -genome=$accessionId -genomeDataDir=$accessionDir \\
+    dynablat-01.soe.ucsc.edu 4040 ./ someDna.fa $accessionId.someDna.psl
+</pre>
+For a protein fasta query with your protein sequence in the file: <b>someProtein.faa</b>
+with result in the file: <b>$accessionId.someProtein.psl</b>
 <pre>
-transBlat yourserver.domain.edu 76543
-blat yourserver.domain.edu 76542
+gfClient -t=dnax -q=prot  -genome=$accessionId -genomeDataDir=$accessionDir \\
+    dynablat-01.soe.ucsc.edu 4040 ./ someProtein.faa $accessionId.someProtein.psl
 </pre>
-See also: <a href=\"https://genome.ucsc.edu/goldenPath/help/hubQuickStartAssembly.html#blat\"
-target=_blank>Blat for an Assembly Hub</a>
 </p>\n";
 
+} else {	# genome too large: explain why no blat service is listed
+  printf "<hr>
+<p>
+At this time, this genome size: %s, is too large (greater than 4294967296),
+to function with the UCSC blat system.  We hope to have improvements to
+that system in the future to allow blat service for the larger genome sizes.
+</p>\n", &AsmHub::commify($genomeSize);
+}
+
 printf "<hr>
 <p>
 <b>Search the assembly:</b>
 <ul>
 <li>
 <b>By position or search term: </b> Use the &quot;position or search term&quot;
 box to find areas of the genome associated with many different attributes, such
 as a specific chromosomal coordinate range; mRNA, EST, or STS marker names; or
 keywords from the GenBank description of an mRNA.
 <a href=\"http://genome.ucsc.edu/goldenPath/help/query.html\">More information</a>, including sample queries.</li>
 <li>
 <b>By gene name: </b> Type a gene name into the &quot;search term&quot; box,
 choose your gene from the drop-down list, then press &quot;submit&quot; to go
 directly to the assembly location associated with that gene.
-<a href=\"http://genome.ucsc.edu/goldenPath/help/geneSearchBox.html\">More information</a>.</li>
+<a href=\"http://genome.ucsc.edu/goldenPath/help/geneSearchBox.html\">More information</a>.  To avoid case sensitivity issues, always use fully lower case gene names.</li>
 <li>
 <b>By track type: </b> Click the &quot;track search&quot; button
 to find Genome Browser tracks that match specific selection criteria.
 <a href=\"http://genome.ucsc.edu/goldenPath/help/trackSearch.html\">More information</a>.</li>
 </ul>
 </p>
 <hr>\n";
 
-# printf "<script type='text/javascript' src='../js/gatewayPage.js'></script>\n";
-
 __END__
 
 /hive/data/outside/ncbi/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.30_GRCh38.p4/GCF_000001405.30_GRCh38.p4_assembly_report.txt
 
 # Assembly Name:  GRCh38.p4
 # Description:    Genome Reference Consortium Human Build 38 patch release 4 (GRCh38.p4)
 # Organism name:  Homo sapiens (human)
 # Taxid:          9606
 # Submitter:      Genome Reference Consortium
 # Date:           2015-6-25
 # Assembly type:  haploid-with-alt-loci
 # Release type:   patch
 # Assembly level: Chromosome
 # Genome representation: full
 # GenBank Assembly Accession: GCA_000001405.19 (latest)