d902053614e1eeeb5e84d4acf1dc4a2abb7120f8
hiram
Fri Jan 3 13:47:38 2020 -0800
get the gateway page a bit more generic with argument to specify refs #20137
diff --git src/hg/utils/automation/asmHubGatewayPage.pl src/hg/utils/automation/asmHubGatewayPage.pl
index 5f0d543..b1e9087 100755
--- src/hg/utils/automation/asmHubGatewayPage.pl
+++ src/hg/utils/automation/asmHubGatewayPage.pl
@@ -1,424 +1,425 @@
#!/usr/bin/env perl
use strict;
use warnings;
use FindBin qw($Bin);
use lib "$Bin";
use AsmHub;
use File::Basename;
my $sourceServer = "hgdownload.soe.ucsc.edu";
my @months = qw( 0 Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec );
sub usage() {
- printf STDERR "usage: asmHubGatewayPage.pl /*assembly_report.txt /asmId.chrom.sizes /image.jpg /photoCredits.txt\n";
+ printf STDERR "usage: asmHubGatewayPage.pl /*assembly_report.txt /asmId.chrom.sizes /image.jpg /photoCredits.txt\n";
printf STDERR "output is to stdout, redirect to file: > description.html\n";
printf STDERR "photoCredits.txt is a two line tagstring file:\n";
printf STDERR "tags: photoCreditURL and photoCreditName\n";
printf STDERR "use string 'noPhoto' for image and credits when no photo\n";
exit 255;
}
sub chromSizes($) {
my ($sizeFile) = @_;
if ( -s $sizeFile ) {
printf STDERR "# reading chrom.sizes file:\n#\t'%s\'\n", $sizeFile;
my $ix = 0;
my $contigCount = 0;
my %sizes; # key is contigName, value is size
if ($sizeFile eq "stdin") {
while (my $line = <>) {
next if ($line =~ m/^\s*#/);
++$contigCount;
chomp ($line);
my ($name, $size, $rest) = split('\s+', $line, 3);
my $key = sprintf("%s_X_%d", $name, $ix++);
$sizes{$key} = $size;
}
} else {
open (FH, "<$sizeFile") or die "can not read $sizeFile";
while (my $line = ) {
next if ($line =~ m/^\s*#/);
++$contigCount;
chomp ($line);
my ($name, $size, $rest) = split('\s+', $line, 3);
my $key = sprintf("%s_X_%d", $name, $ix++);
$sizes{$key} = $size;
}
close (FH);
}
my $totalSize = 0;
foreach my $key (keys %sizes) {
$totalSize += $sizes{$key}
}
my $n50Size = $totalSize / 2;
my $genomeSize = $totalSize;
printf "Total assembly nucleotides: %s
\n", &AsmHub::commify($totalSize);
printf "Assembly contig count: %s
\n", &AsmHub::commify($contigCount);
my $prevContig = "";
my $prevSize = 0;
$totalSize = 0;
# work through the sizes until reaching the N50 size
foreach my $key (sort { $sizes{$b} <=> $sizes{$a} } keys %sizes) {
$totalSize += $sizes{$key};
if ($totalSize > $n50Size) {
my $prevName = $prevContig;
$prevName =~ s/_X_[0-9]+//;
my $origName = $key;
$origName =~ s/_X_[0-9]+//;
printf "N50 size: %s
\n", &AsmHub::commify($sizes{$key});
last;
}
$prevContig = $key;
$prevSize = $sizes{$key};
}
} else {
printf STDERR "# error: can not find chrom.sizes file:\n#\t'%s\'\n",
$sizeFile;
}
}
# typical reference:
# ${inside}/scripts/gatewayPage.pl ${outside}/${asmReport} \
# > "${inside}/${D}/${B}.description.html" \
# 2> "${inside}/${D}/${B}.names.tab"
my $argc = scalar(@ARGV);
-if ($argc != 4) {
+if ($argc != 5) {
usage;
}
-my ($asmReport, $chromSizes, $jpgImage, $photoCredits) = @ARGV;
+my ($asmHubName, $asmReport, $chromSizes, $jpgImage, $photoCredits) = @ARGV;
if ( ! -s $asmReport ) {
printf STDERR "ERROR: can not find '$asmReport'\n";
usage;
}
if ( ! -s $chromSizes ) {
printf STDERR "ERROR: can not find '$chromSizes'\n";
usage;
}
if ($jpgImage ne "noPhoto") {
if ( ! -s $jpgImage ) {
printf STDERR "ERROR: can not find '$jpgImage'\n";
usage;
}
if ( ! -s $photoCredits ) {
printf STDERR "ERROR: can not find '$photoCredits'\n";
usage;
}
}
my $photoCreditURL = "";
my $photoCreditName = "";
my $imageSize = "";
my $imageName = "";
my $imageWidth = 0;
my $imageHeight = 0;
my $imageWidthBorder = 15;
if ($jpgImage ne "noPhoto") {
printf STDERR "# reading $photoCredits\n";
open (FH, "<$photoCredits") or die "can not read $photoCredits";
while (my $line = ) {
chomp $line;
next if ($line =~ m/^#/);
next if (length($line) < 2);
my ($tag, $value) = split('\t', $line);
if ($tag =~ m/photoCreditURL/) {
$photoCreditURL = $value;
} elsif ($tag =~ m/photoCreditName/) {
$photoCreditName = $value;
}
}
close (FH);
if ( -s $jpgImage ) {
$imageSize = `identify $jpgImage | awk '{print \$3}'`;
chomp $imageSize;
($imageWidth, $imageHeight) = split('x', $imageSize);
$imageName = basename($jpgImage);
}
}
# transform this path name into a chrom.sizes reference
my $thisDir = `pwd`;
chomp $thisDir;
printf STDERR "# thisDir $thisDir\n";
my $ftpName = dirname($thisDir);
my $asmId = basename($ftpName);;
my ($gcXPrefix, $accession, $rest) = split('_', $asmId, 3);
my $newStyleUrl = sprintf("%s/%s/%s/%s/%s", $gcXPrefix, substr($accession,0,3),
substr($accession,3,3), substr($accession,6,3), $asmId);
$ftpName =~ s#/hive/data/outside/ncbi/##;
$ftpName =~ s#/hive/data/inside/ncbi/##;
$ftpName =~ s#/hive/data/genomes/asmHubs/##;
printf STDERR "# ftpName $ftpName\n";
# my $urlDirectory = `basename $ftpName`;
# chomp $urlDirectory;
my $speciesSubgroup = $ftpName;
my $asmType = "genbank";
$asmType = "refseq" if ( $speciesSubgroup =~ m#refseq/#);
$speciesSubgroup =~ s#genomes/$asmType/##;;
$speciesSubgroup =~ s#/.*##;;
my %taxIdCommonName; # key is taxId, value is common name
# from NCBI taxonomy database dump
open (FH, "<$ENV{'HOME'}/kent/src/hg/utils/automation/genbank/taxId.comName.tab") or die "can not read taxId.comName.tab";
while (my $line = ) {
chomp $line;
my ($taxId, $comName) = split('\t', $line);
$taxIdCommonName{$taxId} = $comName;
}
close (FH);
my $submitter = "(n/a)";
my $asmName = "(n/a)";
my $orgName = "(n/a)";
my $taxId = "(n/a)";
my $asmDate = "(n/a)";
my $asmAccession = "(n/a)";
my $commonName = "(n/a)";
my $bioSample = "(n/a)";
my $descrAsmType = "(n/a)";
my $asmLevel = "(n/a)";
open (FH, "<$asmReport") or die "can not read $asmReport";
while (my $line = ) {
chomp $line;
$line =~ s/
//g;
if ($line =~ m/date:\s+/i) {
next if ($asmDate !~ m#\(n/a#);
$line =~ s/.*date:\s+//i;
my ($year, $month, $day) = split('-',$line);
$asmDate = sprintf("%02d %s %04d", $day, $months[$month], $year);
}
if ($line =~ m/biosample:\s+/i) {
next if ($bioSample !~ m#\(n/a#);
$line =~ s/.*biosample:\s+//i;
$bioSample = $line;
}
if ($line =~ m/assembly\s+type:\s+/i) {
next if ($descrAsmType !~ m#\(n/a#);
$line =~ s/.*assembly\s+type:\s+//i;
$descrAsmType = $line;
}
if ($line =~ m/assembly\s+level:\s+/i) {
next if ($asmLevel !~ m#\(n/a#);
$line =~ s/.*assembly\s+level:\s+//i;
$asmLevel = $line;
}
if ($line =~ m/assembly\s+name:\s+/i) {
next if ($asmName !~ m#\(n/a#);
$line =~ s/.*assembly\s+name:\s+//i;
$asmName = $line;
}
if ($line =~ m/organism\s+name:\s+/i) {
next if ($orgName !~ m#\(n/a#);
$line =~ s/.*organism\s+name:\s+//i;
$line =~ s/\s+$//;
$orgName = $line;
}
if ($line =~ m/submitter:\s+/i) {
next if ($submitter !~ m#\(n/a#);
$line =~ s/.*submitter:\s+//i;
$submitter = $line;
}
if ($line =~ m/$asmType\s+assembly\s+accession:\s+/i) {
next if ($asmAccession !~ m#\(n/a#);
$line =~ s/.*$asmType\s+assembly\s+accession:\s+//i;
$asmAccession = $line;
$asmAccession =~ s/ .*//;
}
if ($line =~ m/taxid:\s+/i) {
next if ($taxId !~ m#\(n/a#);
$line =~ s/.*taxid:\s+//i;
$taxId = $line;
if (exists($taxIdCommonName{$taxId})) {
$commonName = $taxIdCommonName{$taxId};
}
}
}
close (FH);
$commonName = $orgName if ($commonName =~ m#\(n/a#);
if ($commonName =~ m/\(/) {
$commonName =~ s/.*\(//;
$commonName =~ s/\).*//;
}
if ($orgName =~ m/\(/) {
$orgName =~ s/\(.*//;
}
$orgName =~ s/\s+$//;
printf STDERR "#taxId\tcommonName\tsubmitter\tasmName\torgName\tbioSample\tasmType\tasmLevel\tasmDate\tasmAccession\n";
printf STDERR "%s\t", $taxId;
printf STDERR "%s\t", $commonName;
printf STDERR "%s\t", $submitter;
printf STDERR "%s\t", $asmName;
printf STDERR "%s\t", $orgName;
printf STDERR "%s\t", $bioSample;
printf STDERR "%s\t", $descrAsmType;
printf STDERR "%s\t", $asmLevel;
printf STDERR "%s\t", $asmDate;
printf STDERR "%s\n", $asmAccession;
# printf "\n", $asmId;
if (length($imageName)) {
printf "
-
+
|
%s
(Photo courtesy of
%s)
|
-\n", $imageWidth+$imageWidthBorder, $imageHeight, $asmAccession, $sourceServer, $asmId, $imageName, $imageWidth, $imageHeight, $commonName, $orgName, $photoCreditURL, $photoCreditName;
+\n", $imageWidth+$imageWidthBorder, $imageHeight, $asmAccession, $sourceServer, $asmHubName, $asmId, $imageName, $imageWidth, $imageHeight, $commonName, $orgName, $photoCreditURL, $photoCreditName;
}
my $sciNameUnderscore = $orgName;
$sciNameUnderscore =~ s/ /_/g;
$sciNameUnderscore = "Strigops_habroptilus" if ($orgName =~ m/Strigops habroptila/);
printf "
Common name: %s
Taxonomic name: %s, taxonomy ID: %s
Sequencing/Assembly provider ID: %s
-Vertebrate Genomes Project information: %s
Assembly date: %s
Assembly type: %s
Assembly level: %s
Biosample: %s
Assembly accession ID: %s
Assembly FTP location: %s
-\n", $commonName, $orgName, $taxId, $taxId, $submitter, $sciNameUnderscore, $orgName, $asmDate, $descrAsmType,
+\n", $commonName, $orgName, $taxId, $taxId, $submitter, $asmDate, $descrAsmType,
$asmLevel, $bioSample, $bioSample, $asmAccession, $asmAccession, $newStyleUrl, $newStyleUrl;
chromSizes($chromSizes);
printf "
\n
Download files for this assembly hub:
To use the data from this assembly for a local hub instance at your
institution, download these data as indicated by these instructions.
See also: track hub help documentation.
To download this assembly data, use this rsync command:
- rsync -a -P rsync://$sourceServer/hubs/VGP/genomes/$asmId/ ./$asmId/
+ rsync -a -P \\
+ rsync://$sourceServer/hubs/$asmHubName/genomes/$asmId/ \\
+ ./$asmId/
which creates the local directory: ./$asmId/
or this wget command:
wget --timestamping -m -nH -x --cut-dirs=4 -e robots=off -np -k \\
--reject \"index.html*\" -P \"$asmId\" \\
- https://$sourceServer/hubs/VGP/genomes/$asmId/
+ https://$sourceServer/hubs/$asmHubName/genomes/$asmId/
which creates a local directory: ./$asmId/
There is an included $asmId.genomes.txt file in that download
data to use for your local track hub instance.
You will need to add a hub.txt file to point to this genomes.txt file.
Something like:
hub myLocalHub
shortLabel myLocalHub
-longLabel genomes from Vertebrate Genomes Project assemblies
+longLabel genome assembly $asmId
genomesFile $asmId.genomes.txt
email yourEmail\@yourdomain.edu
descriptionUrl html/$asmId.description.html
The html/$asmId.description.html page is information for your users to
describe this assembly. This WEB page with these instructions
is an instance of html/$asmId.description.html file.
\n";
printf "
To operate a blat server on this assembly, in the directory where you have
the $asmId.2bit file:
gfServer -log=%s.gfServer.trans.log -ipLog -canStop start \\
yourserver.domain.edu 76543 -trans -mask %s.2bit &
gfServer -log=%s.gfServer.log -ipLog -canStop start \\
yourserver.domain.edu 76542 -stepSize=5 %s.2bit &
Adjust the port numbers 76543 76542 and the
yourserver.domain.edu for your local circumstances.
Enter the following specifications in your genomes.txt file:
transBlat yourserver.domain.edu 76543
blat yourserver.domain.edu 76542
See also: Blat for an Assembly Hub
\n", $asmId, $asmId, $asmId, $asmId;
printf "
Search the assembly:
-
By position or search term: Use the "position or search term"
box to find areas of the genome associated with many different attributes, such
as a specific chromosomal coordinate range; mRNA, EST, or STS marker names; or
keywords from the GenBank description of an mRNA.
More information, including sample queries.
-
By gene name: Type a gene name into the "search term" box,
choose your gene from the drop-down list, then press "submit" to go
directly to the assembly location associated with that gene.
More information.
-
By track type: Click the "track search" button
to find Genome Browser tracks that match specific selection criteria.
More information.
\n";
# printf "\n";
__END__
/hive/data/outside/ncbi/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.30_GRCh38.p4/GCF_000001405.30_GRCh38.p4_assembly_report.txt
# Assembly Name: GRCh38.p4
# Description: Genome Reference Consortium Human Build 38 patch release 4 (GRCh38.p4)
# Organism name: Homo sapiens (human)
# Taxid: 9606
# Submitter: Genome Reference Consortium
# Date: 2015-6-25
# Assembly type: haploid-with-alt-loci
# Release type: patch
# Assembly level: Chromosome
# Genome representation: full
# GenBank Assembly Accession: GCA_000001405.19 (latest)
# RefSeq Assembly Accession: GCF_000001405.30 (latest)
# RefSeq Assembly and GenBank Assemblies Identical: yes
#
## Assembly-Units:
## GenBank Unit Accession RefSeq Unit Accession Assembly-Unit name
## GCA_000001305.2 GCF_000001305.14 Primary Assembly
## GCA_000005045.17 GCF_000005045.16 PATCHES
## GCA_000001315.2 GCF_000001315.2 ALT_REF_LOCI_1