1bca724c3e143196d4d3ceea41fe15c0ac19fb70 hiram Wed Apr 6 13:04:01 2022 -0700 improved gateway page contents refs #29203 diff --git src/hg/utils/automation/asmHubGatewayPage.pl src/hg/utils/automation/asmHubGatewayPage.pl index 07b361e..84bda67 100755 --- src/hg/utils/automation/asmHubGatewayPage.pl +++ src/hg/utils/automation/asmHubGatewayPage.pl @@ -1,28 +1,29 @@ #!/usr/bin/env perl use strict; use warnings; use FindBin qw($Bin); use lib "$Bin"; use AsmHub; use File::Basename; ### XXX ### temporary hgdownload-test.gi ### my $sourceServer = "hgdownload-test.gi.ucsc.edu"; my $sourceServer = "hgdownload.soe.ucsc.edu"; +my $genomeSize = 0; # will be set below my @months = qw( 0 Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec ); sub usage() { printf STDERR "usage: asmHubGatewayPage.pl <asmHubName> <pathTo>/*assembly_report.txt <pathTo>/asmId.chrom.sizes <pathTo>/image.jpg <pathTo>/photoCredits.txt\n"; printf STDERR "output is to stdout, redirect to file: > description.html\n"; printf STDERR "photoCredits.txt is a two line tag<tab>string file:\n"; printf STDERR "tags: photoCreditURL and photoCreditName\n"; printf STDERR "use string 'noPhoto' for image and credits when no photo\n"; printf STDERR "stderr output is routed to a 'asmId.names.tab' file for use elsewhere\n"; exit 255; } sub chromSizes($) { my ($sizeFile) = @_; @@ -49,31 +50,31 @@ ++$contigCount; chomp ($line); my ($name, $size, $rest) = split('\s+', $line, 3); my $key = sprintf("%s_X_%d", $name, $ix++); $sizes{$key} = $size; } close (FH); } my $totalSize = 0; foreach my $key (keys %sizes) { $totalSize += $sizes{$key} } my $n50Size = $totalSize / 2; - my $genomeSize = $totalSize; + $genomeSize = $totalSize; printf "<b>Total assembly nucleotides:</b> %s<br>\n", &AsmHub::commify($totalSize); printf "<b>Assembly contig count:</b> %s<br>\n", &AsmHub::commify($contigCount); my $prevContig = ""; my $prevSize = 0; $totalSize = 0; # work through the sizes until reaching the N50 size foreach my $key (sort { $sizes{$b} <=> $sizes{$a} } keys %sizes) { $totalSize += $sizes{$key}; if ($totalSize > $n50Size) { my $prevName = $prevContig; $prevName =~ s/_X_[0-9]+//; my $origName = $key; $origName =~ s/_X_[0-9]+//; @@ -109,30 +110,33 @@ if ( ! -s $chromSizes ) { printf STDERR "ERROR: can not find '$chromSizes'\n"; usage; } if ($jpgImage ne "noPhoto") { if ( ! -s $jpgImage ) { printf STDERR "ERROR: can not find '$jpgImage'\n"; usage; } if ( ! -s $photoCredits ) { printf STDERR "ERROR: can not find '$photoCredits'\n"; usage; } } +my $buildDir = dirname($chromSizes); +my $genesDir = "$buildDir/genes"; + my $photoCreditURL = ""; my $photoCreditName = ""; my $imageSize = ""; my $imageName = ""; my $imageWidth = 0; my $imageHeight = 0; my $imageWidthBorder = 15; if ($jpgImage ne "noPhoto") { printf STDERR "# reading $photoCredits\n"; open (FH, "<$photoCredits") or die "can not read $photoCredits"; while (my $line = <FH>) { chomp $line; next if ($line =~ m/^#/); next if (length($line) < 2); @@ -273,32 +277,30 @@ } $orgName =~ s/\s+$//; printf STDERR "#taxId\tcommonName\tsubmitter\tasmName\torgName\tbioSample\tasmType\tasmLevel\tasmDate\tasmAccession\n"; printf STDERR "%s\t", $taxId; printf STDERR "%s\t", $commonName; printf STDERR "%s\t", $submitter; printf STDERR "%s\t", $asmName; printf STDERR "%s\t", $orgName; printf STDERR "%s\t", $bioSample; printf STDERR "%s\t", $descrAsmType; printf STDERR "%s\t", $asmLevel; printf STDERR "%s\t", $asmDate; printf STDERR "%s\n", $asmAccession; -# printf "<script type='text/javascript'>var asmId='%s';</script>\n", $asmId; - if (length($imageName)) { printf "<!-- Display image in righthand corner --> <table align=right border=0 width=%d height=%d> <tr><td align=RIGHT><a href=\"https://www.ncbi.nlm.nih.gov/assembly/%s\" target=_blank> <img src=\"https://%s/hubs/%s/html/%s\" width=%d height=%d alt=\"%s\"></a> </td></tr> <tr><td align=right> <font size=-1> <em>%s</em><BR> </font> <font size=-2> (Photo courtesy of <a href=\"%s\" target=_blank>%s</a>) </font> </td></tr> </table> @@ -306,125 +308,177 @@ } my $sciNameUnderscore = $orgName; $sciNameUnderscore =~ s/ /_/g; $sciNameUnderscore = "Strigops_habroptilus" if ($orgName =~ m/Strigops habroptila/); printf "<p> <b>Common name:</b> %s<br> <b>Taxonomic name: %s, taxonomy ID:</b> <a href='https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=%s' target='_blank'> %s</a><br> <b>Sequencing/Assembly provider ID:</b> %s<br> <b>Assembly date:</b> %s<br> <b>Assembly type:</b> %s<br> <b>Assembly level:</b> %s<br> <b>Biosample:</b> <a href=\"https://www.ncbi.nlm.nih.gov/biosample/?term=%s\" target=\"_blank\">%s</a><br> <b>Assembly accession ID:</b> <a href=\"https://www.ncbi.nlm.nih.gov/assembly/%s\" target=\"_blank\">%s</a><br> -<b>Assembly FTP location:</b> <a href=\"ftp://ftp.ncbi.nlm.nih.gov/genomes/all/%s\" target=\"_blank\">%s</a><br> +<b>Assembly FTP location:</b> <a href='ftp://ftp.ncbi.nlm.nih.gov/genomes/all/%s' target='_blank'>%s</a><br> \n", $commonName, $orgName, $taxId, $taxId, $submitter, $asmDate, $descrAsmType, $asmLevel, $bioSample, $bioSample, $asmAccession, $asmAccession, $newStyleUrl, $newStyleUrl; chromSizes($chromSizes); printf "</p>\n<hr> +<h4>Data file downloads</h4> <p> -<b>Download files for this assembly hub:</b><br> -To use the data from this assembly for a local hub instance at your -institution, download these data as indicated by these instructions.<br> -<br> -To download this assembly data, use this <em>rsync</em> command: +<ul> +<li><a href='https://$sourceServer/hubs/$localDataUrl/$asmAccession.fa.gz' target=_blank>$asmAccession.fa.gz</a> fasta sequence with original assembly sequence names</li> +<li><a href='https://$sourceServer/hubs/$localDataUrl/$asmAccession.2bit' target=_blank>$asmAccession.2bit</a> UCSC 2bit sequence file with original assembly sequence names</li> +<li><a href='https://$sourceServer/hubs/$localDataUrl/$asmAccession.chromAlias.txt' target=_blank>$asmAccession.chromAlias.txt</a> chromAlias file to relate chromosome names</li> +"; + +if ( -s "$buildDir/$asmId.chrNames.fa.gz") { +printf "<li><a href='https://$sourceServer/hubs/$localDataUrl/$asmAccession.chrNames.fa.gz' target=_blank>$asmAccession.chrNames.fa.gz</a> fasta sequence with <b>chrN</b> sequence names</li>\n"; +} + +if ( -s "$buildDir/$asmId.chrNames.2bit") { +printf "<li><a href='https://$sourceServer/hubs/$localDataUrl/$asmAccession.chrNames.2bit' target=_blank>$asmAccession.chrNames.2bit</a> UCSC 2bit sequence file with <b>chrN</b> sequence names</li>\n"; +} + +if ( -d "$genesDir" ) { + open (GD, "ls $genesDir/*.gtf.gz 2> /dev/null|") or die "can not ls $genesDir/*.gtf.gz"; + while (my $gtfFile = <GD>) { + chomp $gtfFile; + my $gtf = basename($gtfFile); + printf "<li><a href='https://$sourceServer/hubs/$localDataUrl/genes/$gtf' target=_blank>$gtf</a> gene GTF file</li>\n"; + } +} + +printf " +<li>explore the hub directory at: <a href='https://$sourceServer/hubs/$localDataUrl/' target=_blank>$sourceServer/hubs/$localDataUrl/</a></li> +</ul> +"; +printf "</p>\n<hr> +<h4>Copy this entire assembly hub for local use</h4> +<p> +This download is only for the purpose of using this assembly hub in +your institution which may have firewall access restrictions to this +data.<br> + +To download this assembly data, use this <b>rsync</b> command: <pre> rsync -a -P \\ rsync://$sourceServer/hubs/$localDataUrl/ \\ ./$accessionId/ - - which creates the local directory: ./$accessionId/ </pre> -or this <em>wget</em> command: + + which creates the local directory: <b>./$accessionId/</b><br> +or this <b>wget</b> command: <pre> wget --timestamping -m -nH -x --cut-dirs=6 -e robots=off -np -k \\ --reject \"index.html*\" -P \"$accessionId\" \\ https://$sourceServer/hubs/$localDataUrl/ - - which creates a local directory: ./$accessionId/ </pre> + + which creates a local directory: <b>./$accessionId/</b> <p> -There is an included <em>hub.txt</em> file in that download +There is an included <b>hub.txt</b> file in that download data directory to use for your local track hub instance.<br> Using the genome browser menus: <em><strong>My Data</strong> -> <strong>Track Hubs</strong></em><br> select the <em><strong>My Hubs</strong></em> tab to enter a URL -to this hub.txt file to attach this assembly hub to a genome browser. +to this <b>hub.txt</b> file to attach this assembly hub to a genome browser. </p> <p> -The <em>html/$asmId.description.html</em> page is information for your users to -describe this assembly. This WEB page with these instructions -is an instance of html/$asmId.description.html file. +The <b>html/$asmId.description.html</b> page is information for your users to +describe this assembly.<br>This web page with these instructions +is an instance of the <b>html/$asmId.description.html</b> file. </p> <p> See also: <a href='/goldenPath/help/hgTrackHubHelp.html' target=_blank>track hub help</a> documentation.<br> </p>\n"; +if ($genomeSize < 4294967297) { printf "<hr> +<h4>blat service</h4> <p> -To operate a blat server on this assembly, in the directory where you have -the <em>$asmId.2bit</em> file: -<pre> -gfServer -log=$asmId.gfServer.trans.log -ipLog -canStop start \\ - yourserver.domain.edu 76543 -trans -mask $asmId.2bit & -gfServer -log=$asmId.gfServer.log -ipLog -canStop start \\ - yourserver.domain.edu 76542 -stepSize=5 $asmId.2bit & -</pre> -Adjust the port numbers <em>76543</em> <em>76542</em> and the -<em>yourserver.domain.edu</em> for your local circumstances.<br> -Typically, port numbers in the range <em>49152</em> to <em>65535</em> -are available for private use as in this case. -See also: <a href='https://www.iana.org/assignments/service-names-port-numbers/service-names-port-numbers.xhtml' target=_blank>IANA.org</a> port registry. +There is blat service available for this genome assembly. When viewing this +assembly in the genome browser, access the blat service via the +<em><b>Tools -> Blat</b></em> blue navigation bar menu item. </p> <p> -Enter the following specifications in your <em>genomes.txt</em> file: +For local command line blat service, access +the blat service via the <b>gfClient</b> command line operation.<br> +See also: <a href='http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/' target=_blank> +hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/</a> to download command line +binaries.<br> +<br> +To operate this locally, you will need the <b>%s.2bit</b> file from: +<pre> + https://$sourceServer/hubs/$localDataUrl/ +</pre> +Which can be obtained with rsync via: +<pre> + rsync -a -P \ + rsync://hgdownload.soe.ucsc.edu/hubs/$accessionDir/$accessionId.2bit ./ +</pre> +With that <b>$accessionId.2bit</b> file in your working directory where you run +this command, for example, a DNA query with your DNA sequence in +the file: <b>someDna.fa</b> +with result in the file: <b>$accessionId.someDna.psl</b> +<pre> +gfClient -t=dna -q=dna -genome=$accessionId -genomeDataDir=$accessionDir \ + dynablat-01.soe.ucsc.edu 4040 ./ someDna.fa $accessionId.someDna.psl +</pre> +For a protein fasta query with your protein sequence in the file: <b>someProtein.faa</b> +with result in the file: <b>$accessionId.someProtein.psl</b> <pre> -transBlat yourserver.domain.edu 76543 -blat yourserver.domain.edu 76542 +gfClient -t=dnax -q=prot -genome=$accessionId -genomeDataDir=$accessionDir \ + dynablat-01.soe.ucsc.edu 4040 ./ someProtein.faa $accessionId.someProtein.psl </pre> -See also: <a href=\"https://genome.ucsc.edu/goldenPath/help/hubQuickStartAssembly.html#blat\" -target=_blank>Blat for an Assembly Hub</a> </p>\n"; +} else { + printf "<hr> +<p> +At this time, this genome size: %s, is too large (greater than 4294967296), +to function with the UCSC blat system. We hope to have improvements to +that system in the future to allow blat service for the larger genome sizes. +</p>\n", commify($genomeSize); +} + printf "<hr> <p> <b>Search the assembly:</b> <ul> <li> <b>By position or search term: </b> Use the "position or search term" box to find areas of the genome associated with many different attributes, such as a specific chromosomal coordinate range; mRNA, EST, or STS marker names; or keywords from the GenBank description of an mRNA. <a href=\"http://genome.ucsc.edu/goldenPath/help/query.html\">More information</a>, including sample queries.</li> <li> <b>By gene name: </b> Type a gene name into the "search term" box, choose your gene from the drop-down list, then press "submit" to go directly to the assembly location associated with that gene. -<a href=\"http://genome.ucsc.edu/goldenPath/help/geneSearchBox.html\">More information</a>.</li> +<a href=\"http://genome.ucsc.edu/goldenPath/help/geneSearchBox.html\">More information</a>. To avoid case sensitivity issues, always use fully lower case gene names.</li> <li> <b>By track type: </b> Click the "track search" button to find Genome Browser tracks that match specific selection criteria. <a href=\"http://genome.ucsc.edu/goldenPath/help/trackSearch.html\">More information</a>.</li> </ul> </p> <hr>\n"; -# printf "<script type='text/javascript' src='../js/gatewayPage.js'></script>\n"; - __END__ /hive/data/outside/ncbi/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.30_GRCh38.p4/GCF_000001405.30_GRCh38.p4_assembly_report.txt # Assembly Name: GRCh38.p4 # Description: Genome Reference Consortium Human Build 38 patch release 4 (GRCh38.p4) # Organism name: Homo sapiens (human) # Taxid: 9606 # Submitter: Genome Reference Consortium # Date: 2015-6-25 # Assembly type: haploid-with-alt-loci # Release type: patch # Assembly level: Chromosome # Genome representation: full # GenBank Assembly Accession: GCA_000001405.19 (latest)