d902053614e1eeeb5e84d4acf1dc4a2abb7120f8 hiram Fri Jan 3 13:47:38 2020 -0800 get the gateway page a bit more generic with argument to specify refs #20137 diff --git src/hg/utils/automation/asmHubGatewayPage.pl src/hg/utils/automation/asmHubGatewayPage.pl index 5f0d543..b1e9087 100755 --- src/hg/utils/automation/asmHubGatewayPage.pl +++ src/hg/utils/automation/asmHubGatewayPage.pl @@ -1,424 +1,425 @@ #!/usr/bin/env perl use strict; use warnings; use FindBin qw($Bin); use lib "$Bin"; use AsmHub; use File::Basename; my $sourceServer = "hgdownload.soe.ucsc.edu"; my @months = qw( 0 Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec ); sub usage() { - printf STDERR "usage: asmHubGatewayPage.pl /*assembly_report.txt /asmId.chrom.sizes /image.jpg /photoCredits.txt\n"; + printf STDERR "usage: asmHubGatewayPage.pl <asmHubName> /*assembly_report.txt /asmId.chrom.sizes /image.jpg /photoCredits.txt\n"; printf STDERR "output is to stdout, redirect to file: > description.html\n"; printf STDERR "photoCredits.txt is a two line tagstring file:\n"; printf STDERR "tags: photoCreditURL and photoCreditName\n"; printf STDERR "use string 'noPhoto' for image and credits when no photo\n"; exit 255; } sub chromSizes($) { my ($sizeFile) = @_; if ( -s $sizeFile ) { printf STDERR "# reading chrom.sizes file:\n#\t'%s\'\n", $sizeFile; my $ix = 0; my $contigCount = 0; my %sizes; # key is contigName, value is size if ($sizeFile eq "stdin") { while (my $line = <>) { next if ($line =~ m/^\s*#/); ++$contigCount; chomp ($line); my ($name, $size, $rest) = split('\s+', $line, 3); my $key = sprintf("%s_X_%d", $name, $ix++); $sizes{$key} = $size; } } else { open (FH, "<$sizeFile") or die "can not read $sizeFile"; while (my $line = <FH>) { next if ($line =~ m/^\s*#/); ++$contigCount; chomp ($line); my ($name, $size, $rest) = split('\s+', $line, 3); my $key = sprintf("%s_X_%d", $name, $ix++); $sizes{$key} = $size; } close (FH); } my $totalSize = 0; foreach my $key (keys %sizes) { $totalSize += $sizes{$key} } my $n50Size = 
$totalSize / 2; my $genomeSize = $totalSize; printf "Total assembly nucleotides: %s
\n", &AsmHub::commify($totalSize); printf "Assembly contig count: %s
\n", &AsmHub::commify($contigCount); my $prevContig = ""; my $prevSize = 0; $totalSize = 0; # work through the sizes until reaching the N50 size foreach my $key (sort { $sizes{$b} <=> $sizes{$a} } keys %sizes) { $totalSize += $sizes{$key}; if ($totalSize > $n50Size) { my $prevName = $prevContig; $prevName =~ s/_X_[0-9]+//; my $origName = $key; $origName =~ s/_X_[0-9]+//; printf "N50 size: %s
\n", &AsmHub::commify($sizes{$key}); last; } $prevContig = $key; $prevSize = $sizes{$key}; } } else { printf STDERR "# error: can not find chrom.sizes file:\n#\t'%s\'\n", $sizeFile; } } # typical reference: # ${inside}/scripts/gatewayPage.pl ${outside}/${asmReport} \ # > "${inside}/${D}/${B}.description.html" \ # 2> "${inside}/${D}/${B}.names.tab" my $argc = scalar(@ARGV); -if ($argc != 4) { +if ($argc != 5) { usage; } -my ($asmReport, $chromSizes, $jpgImage, $photoCredits) = @ARGV; +my ($asmHubName, $asmReport, $chromSizes, $jpgImage, $photoCredits) = @ARGV; if ( ! -s $asmReport ) { printf STDERR "ERROR: can not find '$asmReport'\n"; usage; } if ( ! -s $chromSizes ) { printf STDERR "ERROR: can not find '$chromSizes'\n"; usage; } if ($jpgImage ne "noPhoto") { if ( ! -s $jpgImage ) { printf STDERR "ERROR: can not find '$jpgImage'\n"; usage; } if ( ! -s $photoCredits ) { printf STDERR "ERROR: can not find '$photoCredits'\n"; usage; } } my $photoCreditURL = ""; my $photoCreditName = ""; my $imageSize = ""; my $imageName = ""; my $imageWidth = 0; my $imageHeight = 0; my $imageWidthBorder = 15; if ($jpgImage ne "noPhoto") { printf STDERR "# reading $photoCredits\n"; open (FH, "<$photoCredits") or die "can not read $photoCredits"; while (my $line = <FH>) { chomp $line; next if ($line =~ m/^#/); next if (length($line) < 2); my ($tag, $value) = split('\t', $line); if ($tag =~ m/photoCreditURL/) { $photoCreditURL = $value; } elsif ($tag =~ m/photoCreditName/) { $photoCreditName = $value; } } close (FH); if ( -s $jpgImage ) { $imageSize = `identify $jpgImage | awk '{print \$3}'`; chomp $imageSize; ($imageWidth, $imageHeight) = split('x', $imageSize); $imageName = basename($jpgImage); } } # transform this path name into a chrom.sizes reference my $thisDir = `pwd`; chomp $thisDir; printf STDERR "# thisDir $thisDir\n"; my $ftpName = dirname($thisDir); my $asmId = basename($ftpName);; my ($gcXPrefix, $accession, $rest) = split('_', $asmId, 3); my $newStyleUrl = 
sprintf("%s/%s/%s/%s/%s", $gcXPrefix, substr($accession,0,3), substr($accession,3,3), substr($accession,6,3), $asmId); $ftpName =~ s#/hive/data/outside/ncbi/##; $ftpName =~ s#/hive/data/inside/ncbi/##; $ftpName =~ s#/hive/data/genomes/asmHubs/##; printf STDERR "# ftpName $ftpName\n"; # my $urlDirectory = `basename $ftpName`; # chomp $urlDirectory; my $speciesSubgroup = $ftpName; my $asmType = "genbank"; $asmType = "refseq" if ( $speciesSubgroup =~ m#refseq/#); $speciesSubgroup =~ s#genomes/$asmType/##;; $speciesSubgroup =~ s#/.*##;; my %taxIdCommonName; # key is taxId, value is common name # from NCBI taxonomy database dump open (FH, "<$ENV{'HOME'}/kent/src/hg/utils/automation/genbank/taxId.comName.tab") or die "can not read taxId.comName.tab"; while (my $line = <FH>) { chomp $line; my ($taxId, $comName) = split('\t', $line); $taxIdCommonName{$taxId} = $comName; } close (FH); my $submitter = "(n/a)"; my $asmName = "(n/a)"; my $orgName = "(n/a)"; my $taxId = "(n/a)"; my $asmDate = "(n/a)"; my $asmAccession = "(n/a)"; my $commonName = "(n/a)"; my $bioSample = "(n/a)"; my $descrAsmType = "(n/a)"; my $asmLevel = "(n/a)"; open (FH, "<$asmReport") or die "can not read $asmReport"; while (my $line = <FH>) { chomp $line; $line =~ s/ //g; if ($line =~ m/date:\s+/i) { next if ($asmDate !~ m#\(n/a#); $line =~ s/.*date:\s+//i; my ($year, $month, $day) = split('-',$line); $asmDate = sprintf("%02d %s %04d", $day, $months[$month], $year); } if ($line =~ m/biosample:\s+/i) { next if ($bioSample !~ m#\(n/a#); $line =~ s/.*biosample:\s+//i; $bioSample = $line; } if ($line =~ m/assembly\s+type:\s+/i) { next if ($descrAsmType !~ m#\(n/a#); $line =~ s/.*assembly\s+type:\s+//i; $descrAsmType = $line; } if ($line =~ m/assembly\s+level:\s+/i) { next if ($asmLevel !~ m#\(n/a#); $line =~ s/.*assembly\s+level:\s+//i; $asmLevel = $line; } if ($line =~ m/assembly\s+name:\s+/i) { next if ($asmName !~ m#\(n/a#); $line =~ s/.*assembly\s+name:\s+//i; $asmName = $line; } if ($line =~ 
m/organism\s+name:\s+/i) { next if ($orgName !~ m#\(n/a#); $line =~ s/.*organism\s+name:\s+//i; $line =~ s/\s+$//; $orgName = $line; } if ($line =~ m/submitter:\s+/i) { next if ($submitter !~ m#\(n/a#); $line =~ s/.*submitter:\s+//i; $submitter = $line; } if ($line =~ m/$asmType\s+assembly\s+accession:\s+/i) { next if ($asmAccession !~ m#\(n/a#); $line =~ s/.*$asmType\s+assembly\s+accession:\s+//i; $asmAccession = $line; $asmAccession =~ s/ .*//; } if ($line =~ m/taxid:\s+/i) { next if ($taxId !~ m#\(n/a#); $line =~ s/.*taxid:\s+//i; $taxId = $line; if (exists($taxIdCommonName{$taxId})) { $commonName = $taxIdCommonName{$taxId}; } } } close (FH); $commonName = $orgName if ($commonName =~ m#\(n/a#); if ($commonName =~ m/\(/) { $commonName =~ s/.*\(//; $commonName =~ s/\).*//; } if ($orgName =~ m/\(/) { $orgName =~ s/\(.*//; } $orgName =~ s/\s+$//; printf STDERR "#taxId\tcommonName\tsubmitter\tasmName\torgName\tbioSample\tasmType\tasmLevel\tasmDate\tasmAccession\n"; printf STDERR "%s\t", $taxId; printf STDERR "%s\t", $commonName; printf STDERR "%s\t", $submitter; printf STDERR "%s\t", $asmName; printf STDERR "%s\t", $orgName; printf STDERR "%s\t", $bioSample; printf STDERR "%s\t", $descrAsmType; printf STDERR "%s\t", $asmLevel; printf STDERR "%s\t", $asmDate; printf STDERR "%s\n", $asmAccession; # printf "\n", $asmId; if (length($imageName)) { printf "
- \"%s\" + \"%s\"
%s
(Photo courtesy of %s)
-\n", $imageWidth+$imageWidthBorder, $imageHeight, $asmAccession, $sourceServer, $asmId, $imageName, $imageWidth, $imageHeight, $commonName, $orgName, $photoCreditURL, $photoCreditName; +\n", $imageWidth+$imageWidthBorder, $imageHeight, $asmAccession, $sourceServer, $asmHubName, $asmId, $imageName, $imageWidth, $imageHeight, $commonName, $orgName, $photoCreditURL, $photoCreditName; } my $sciNameUnderscore = $orgName; $sciNameUnderscore =~ s/ /_/g; $sciNameUnderscore = "Strigops_habroptilus" if ($orgName =~ m/Strigops habroptila/); printf "

Common name: %s
Taxonomic name: %s, taxonomy ID: %s
Sequencing/Assembly provider ID: %s
-Vertebrate Genomes Project information: %s
Assembly date: %s
Assembly type: %s
Assembly level: %s
Biosample: %s
Assembly accession ID: %s
Assembly FTP location: %s
-\n", $commonName, $orgName, $taxId, $taxId, $submitter, $sciNameUnderscore, $orgName, $asmDate, $descrAsmType, +\n", $commonName, $orgName, $taxId, $taxId, $submitter, $asmDate, $descrAsmType, $asmLevel, $bioSample, $bioSample, $asmAccession, $asmAccession, $newStyleUrl, $newStyleUrl; chromSizes($chromSizes); printf "

\n

Download files for this assembly hub:
To use the data from this assembly for a local hub instance at your institution, download these data as indicated by these instructions.
See also: track hub help documentation.

To download this assembly data, use this rsync command:

-  rsync -a -P rsync://$sourceServer/hubs/VGP/genomes/$asmId/ ./$asmId/
+  rsync -a -P \\
+    rsync://$sourceServer/hubs/$asmHubName/genomes/$asmId/ \\
+      ./$asmId/
 
   which creates the local directory: ./$asmId/
 
or this wget command:
   wget --timestamping -m -nH -x --cut-dirs=4 -e robots=off -np -k \\
     --reject \"index.html*\" -P \"$asmId\" \\
-       https://$sourceServer/hubs/VGP/genomes/$asmId/
+       https://$sourceServer/hubs/$asmHubName/genomes/$asmId/
 
   which creates a local directory: ./$asmId/
 

There is an included $asmId.genomes.txt file in that download data to use for your local track hub instance.
You will need to add a hub.txt file to point to this genomes.txt file.
Something like:
 hub myLocalHub
 shortLabel myLocalHub
-longLabel genomes from Vertebrate Genomes Project assemblies
+longLabel genome assembly $asmId
 genomesFile $asmId.genomes.txt
 email yourEmail\@yourdomain.edu
 descriptionUrl html/$asmId.description.html
 
The html/$asmId.description.html page is information for your users to describe this assembly. This WEB page with these instructions is an instance of html/$asmId.description.html file.

\n"; printf "

To operate a blat server on this assembly, in the directory where you have the $asmId.2bit file:

 gfServer -log=%s.gfServer.trans.log -ipLog -canStop start \\
     yourserver.domain.edu 76543 -trans -mask %s.2bit &
 gfServer -log=%s.gfServer.log -ipLog -canStop start \\
     yourserver.domain.edu 76542 -stepSize=5 %s.2bit &
 
Adjust the port numbers 76543 76542 and the yourserver.domain.edu for your local circumstances.
Enter the following specifications in your genomes.txt file:
 transBlat yourserver.domain.edu 76543
 blat yourserver.domain.edu 76542
 
See also: Blat for an Assembly Hub

\n", $asmId, $asmId, $asmId, $asmId; printf "

Search the assembly:

  • By position or search term: Use the "position or search term" box to find areas of the genome associated with many different attributes, such as a specific chromosomal coordinate range; mRNA, EST, or STS marker names; or keywords from the GenBank description of an mRNA. More information, including sample queries.
  • By gene name: Type a gene name into the "search term" box, choose your gene from the drop-down list, then press "submit" to go directly to the assembly location associated with that gene. More information.
  • By track type: Click the "track search" button to find Genome Browser tracks that match specific selection criteria. More information.


\n"; # printf "\n"; __END__ /hive/data/outside/ncbi/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.30_GRCh38.p4/GCF_000001405.30_GRCh38.p4_assembly_report.txt # Assembly Name: GRCh38.p4 # Description: Genome Reference Consortium Human Build 38 patch release 4 (GRCh38.p4) # Organism name: Homo sapiens (human) # Taxid: 9606 # Submitter: Genome Reference Consortium # Date: 2015-6-25 # Assembly type: haploid-with-alt-loci # Release type: patch # Assembly level: Chromosome # Genome representation: full # GenBank Assembly Accession: GCA_000001405.19 (latest) # RefSeq Assembly Accession: GCF_000001405.30 (latest) # RefSeq Assembly and GenBank Assemblies Identical: yes # ## Assembly-Units: ## GenBank Unit Accession RefSeq Unit Accession Assembly-Unit name ## GCA_000001305.2 GCF_000001305.14 Primary Assembly ## GCA_000005045.17 GCF_000005045.16 PATCHES ## GCA_000001315.2 GCF_000001315.2 ALT_REF_LOCI_1