be4311c07e14feb728abc6425ee606ffaa611a58 markd Fri Jan 22 06:46:58 2021 -0800 merge with master diff --git src/hg/utils/automation/asmHubGatewayPage.pl src/hg/utils/automation/asmHubGatewayPage.pl index 5024ee8..07b361e 100755 --- src/hg/utils/automation/asmHubGatewayPage.pl +++ src/hg/utils/automation/asmHubGatewayPage.pl @@ -1,439 +1,439 @@ #!/usr/bin/env perl use strict; use warnings; use FindBin qw($Bin); use lib "$Bin"; use AsmHub; use File::Basename; ### XXX ### temporary hgdownload-test.gi ### my $sourceServer = "hgdownload-test.gi.ucsc.edu"; my $sourceServer = "hgdownload.soe.ucsc.edu"; my @months = qw( 0 Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec ); sub usage() { printf STDERR "usage: asmHubGatewayPage.pl /*assembly_report.txt /asmId.chrom.sizes /image.jpg /photoCredits.txt\n"; printf STDERR "output is to stdout, redirect to file: > description.html\n"; printf STDERR "photoCredits.txt is a two line tagstring file:\n"; printf STDERR "tags: photoCreditURL and photoCreditName\n"; printf STDERR "use string 'noPhoto' for image and credits when no photo\n"; printf STDERR "stderr output is routed to a 'asmId.names.tab' file for use elsewhere\n"; exit 255; } sub chromSizes($) { my ($sizeFile) = @_; if ( -s $sizeFile ) { printf STDERR "# reading chrom.sizes file:\n#\t'%s\'\n", $sizeFile; my $ix = 0; my $contigCount = 0; my %sizes; # key is contigName, value is size if ($sizeFile eq "stdin") { while (my $line = <>) { next if ($line =~ m/^\s*#/); ++$contigCount; chomp ($line); my ($name, $size, $rest) = split('\s+', $line, 3); my $key = sprintf("%s_X_%d", $name, $ix++); $sizes{$key} = $size; } } else { open (FH, "<$sizeFile") or die "can not read $sizeFile"; while (my $line = ) { next if ($line =~ m/^\s*#/); ++$contigCount; chomp ($line); my ($name, $size, $rest) = split('\s+', $line, 3); my $key = sprintf("%s_X_%d", $name, $ix++); $sizes{$key} = $size; } close (FH); } my $totalSize = 0; foreach my $key (keys %sizes) { $totalSize += $sizes{$key} } my $n50Size = $totalSize / 2; my $genomeSize = $totalSize; printf "Total assembly nucleotides: %s
\n", &AsmHub::commify($totalSize); printf "Assembly contig count: %s
\n", &AsmHub::commify($contigCount); my $prevContig = ""; my $prevSize = 0; $totalSize = 0; # work through the sizes until reaching the N50 size foreach my $key (sort { $sizes{$b} <=> $sizes{$a} } keys %sizes) { $totalSize += $sizes{$key}; if ($totalSize > $n50Size) { my $prevName = $prevContig; $prevName =~ s/_X_[0-9]+//; my $origName = $key; $origName =~ s/_X_[0-9]+//; printf "N50 size: %s
\n", &AsmHub::commify($sizes{$key}); last; } $prevContig = $key; $prevSize = $sizes{$key}; } } else { printf STDERR "# error: can not find chrom.sizes file:\n#\t'%s\'\n", $sizeFile; } } # typical reference: # ${inside}/scripts/gatewayPage.pl ${outside}/${asmReport} \ # > "${inside}/${D}/${B}.description.html" \ # 2> "${inside}/${D}/${B}.names.tab" my $argc = scalar(@ARGV); if ($argc != 5) { usage; } my ($asmHubName, $asmReport, $chromSizes, $jpgImage, $photoCredits) = @ARGV; if ( ! -s $asmReport ) { printf STDERR "ERROR: can not find '$asmReport'\n"; usage; } if ( ! -s $chromSizes ) { printf STDERR "ERROR: can not find '$chromSizes'\n"; usage; } if ($jpgImage ne "noPhoto") { if ( ! -s $jpgImage ) { printf STDERR "ERROR: can not find '$jpgImage'\n"; usage; } if ( ! -s $photoCredits ) { printf STDERR "ERROR: can not find '$photoCredits'\n"; usage; } } my $photoCreditURL = ""; my $photoCreditName = ""; my $imageSize = ""; my $imageName = ""; my $imageWidth = 0; my $imageHeight = 0; my $imageWidthBorder = 15; if ($jpgImage ne "noPhoto") { printf STDERR "# reading $photoCredits\n"; open (FH, "<$photoCredits") or die "can not read $photoCredits"; while (my $line = ) { chomp $line; next if ($line =~ m/^#/); next if (length($line) < 2); my ($tag, $value) = split('\t', $line); if ($tag =~ m/photoCreditURL/) { $photoCreditURL = $value; } elsif ($tag =~ m/photoCreditName/) { $photoCreditName = $value; } } close (FH); if ( -s $jpgImage ) { $imageSize = `identify $jpgImage | awk '{print \$3}'`; chomp $imageSize; ($imageWidth, $imageHeight) = split('x', $imageSize); $imageName = basename($jpgImage); } } # transform this path name into a chrom.sizes reference my $thisDir = `pwd`; chomp $thisDir; my $ftpName = dirname($thisDir); my $asmId = basename($ftpName);; my ($gcXPrefix, $accession, $rest) = split('_', $asmId, 3); my $accessionId = sprintf("%s_%s", $gcXPrefix, $accession); my $accessionDir = substr($asmId, 0 ,3); $accessionDir .= "/" . substr($asmId, 4 ,3); $accessionDir .= "/" . substr($asmId, 7 ,3); $accessionDir .= "/" . substr($asmId, 10 ,3); $accessionDir .= "/" . $accessionId; my $newStyleUrl = sprintf("%s/%s/%s/%s/%s", $gcXPrefix, substr($accession,0,3), substr($accession,3,3), substr($accession,6,3), $asmId); my $localDataUrl = sprintf("%s/%s/%s/%s/%s", $gcXPrefix, substr($accession,0,3), substr($accession,3,3), substr($accession,6,3), $accessionId); $ftpName =~ s#/hive/data/outside/ncbi/##; $ftpName =~ s#/hive/data/inside/ncbi/##; $ftpName =~ s#/hive/data/genomes/asmHubs/##; # my $urlDirectory = `basename $ftpName`; # chomp $urlDirectory; my $speciesSubgroup = $ftpName; my $asmType = "genbank"; -$asmType = "refseq" if ( $speciesSubgroup =~ m#refseq/#); +$asmType = "refseq" if ( $gcXPrefix =~ m#GCF#); $speciesSubgroup =~ s#genomes/$asmType/##;; $speciesSubgroup =~ s#/.*##;; my %taxIdCommonName; # key is taxId, value is common name # from NCBI taxonomy database dump open (FH, "<$ENV{'HOME'}/kent/src/hg/utils/automation/genbank/taxId.comName.tab") or die "can not read taxId.comName.tab"; while (my $line = ) { chomp $line; my ($taxId, $comName) = split('\t', $line); $taxIdCommonName{$taxId} = $comName; } close (FH); my $submitter = "(n/a)"; my $asmName = "(n/a)"; my $orgName = "(n/a)"; my $taxId = "(n/a)"; my $asmDate = "(n/a)"; my $asmAccession = "(n/a)"; my $commonName = "(n/a)"; my $bioSample = "(n/a)"; my $descrAsmType = "(n/a)"; my $asmLevel = "(n/a)"; open (FH, "<$asmReport") or die "can not read $asmReport"; while (my $line = ) { chomp $line; $line =~ s/ //g; if ($line =~ m/date:\s+/i) { next if ($asmDate !~ m#\(n/a#); $line =~ s/.*date:\s+//i; my ($year, $month, $day) = split('-',$line); $asmDate = sprintf("%02d %s %04d", $day, $months[$month], $year); } if ($line =~ m/biosample:\s+/i) { next if ($bioSample !~ m#\(n/a#); $line =~ s/.*biosample:\s+//i; $bioSample = $line; } if ($line =~ m/assembly\s+type:\s+/i) { next if ($descrAsmType !~ m#\(n/a#); $line =~ s/.*assembly\s+type:\s+//i; $descrAsmType = $line; } if ($line =~ m/assembly\s+level:\s+/i) { next if ($asmLevel !~ m#\(n/a#); $line =~ s/.*assembly\s+level:\s+//i; $asmLevel = $line; } if ($line =~ m/assembly\s+name:\s+/i) { next if ($asmName !~ m#\(n/a#); $line =~ s/.*assembly\s+name:\s+//i; $asmName = $line; } if ($line =~ m/organism\s+name:\s+/i) { next if ($orgName !~ m#\(n/a#); $line =~ s/.*organism\s+name:\s+//i; $line =~ s/\s+$//; $orgName = $line; } if ($line =~ m/submitter:\s+/i) { next if ($submitter !~ m#\(n/a#); $line =~ s/.*submitter:\s+//i; $submitter = $line; } if ($line =~ m/$asmType\s+assembly\s+accession:\s+/i) { next if ($asmAccession !~ m#\(n/a#); $line =~ s/.*$asmType\s+assembly\s+accession:\s+//i; $asmAccession = $line; $asmAccession =~ s/ .*//; } if ($line =~ m/taxid:\s+/i) { next if ($taxId !~ m#\(n/a#); $line =~ s/.*taxid:\s+//i; $taxId = $line; if (exists($taxIdCommonName{$taxId})) { $commonName = $taxIdCommonName{$taxId}; } } } close (FH); $commonName = $orgName if ($commonName =~ m#\(n/a#); if ($commonName =~ m/\(/) { $commonName =~ s/.*\(//; $commonName =~ s/\).*//; } if ($orgName =~ m/\(/) { $orgName =~ s/\(.*//; } $orgName =~ s/\s+$//; printf STDERR "#taxId\tcommonName\tsubmitter\tasmName\torgName\tbioSample\tasmType\tasmLevel\tasmDate\tasmAccession\n"; printf STDERR "%s\t", $taxId; printf STDERR "%s\t", $commonName; printf STDERR "%s\t", $submitter; printf STDERR "%s\t", $asmName; printf STDERR "%s\t", $orgName; printf STDERR "%s\t", $bioSample; printf STDERR "%s\t", $descrAsmType; printf STDERR "%s\t", $asmLevel; printf STDERR "%s\t", $asmDate; printf STDERR "%s\n", $asmAccession; # printf "\n", $asmId; if (length($imageName)) { printf "
\"%s\"
%s
(Photo courtesy of %s)
\n", $imageWidth+$imageWidthBorder, $imageHeight, $asmAccession, $sourceServer, $accessionDir, $imageName, $imageWidth, $imageHeight, $commonName, $orgName, $photoCreditURL, $photoCreditName; } my $sciNameUnderscore = $orgName; $sciNameUnderscore =~ s/ /_/g; $sciNameUnderscore = "Strigops_habroptilus" if ($orgName =~ m/Strigops habroptila/); printf "

Common name: %s
Taxonomic name: %s, taxonomy ID: %s
Sequencing/Assembly provider ID: %s
Assembly date: %s
Assembly type: %s
Assembly level: %s
Biosample: %s
Assembly accession ID: %s
Assembly FTP location: %s
\n", $commonName, $orgName, $taxId, $taxId, $submitter, $asmDate, $descrAsmType, $asmLevel, $bioSample, $bioSample, $asmAccession, $asmAccession, $newStyleUrl, $newStyleUrl; chromSizes($chromSizes); printf "

\n

Download files for this assembly hub:
To use the data from this assembly for a local hub instance at your institution, download these data as indicated by these instructions.

To download this assembly data, use this rsync command:

   rsync -a -P \\
     rsync://$sourceServer/hubs/$localDataUrl/ \\
       ./$accessionId/
 
   which creates the local directory: ./$accessionId/
 
or this wget command:
   wget --timestamping -m -nH -x --cut-dirs=6 -e robots=off -np -k \\
     --reject \"index.html*\" -P \"$accessionId\" \\
        https://$sourceServer/hubs/$localDataUrl/
 
   which creates a local directory: ./$accessionId/
 

There is an included hub.txt file in that download data directory to use for your local track hub instance.
Using the genome browser menus: My Data -> Track Hubs
select the My Hubs tab to enter a URL to this hub.txt file to attach this assembly hub to a genome browser.

The html/$asmId.description.html page is information for your users to describe this assembly. This WEB page with these instructions is an instance of html/$asmId.description.html file.

See also: track hub help documentation.

\n"; printf "

To operate a blat server on this assembly, in the directory where you have the $asmId.2bit file:

 gfServer -log=$asmId.gfServer.trans.log -ipLog -canStop start \\
     yourserver.domain.edu 76543 -trans -mask $asmId.2bit &
 gfServer -log=$asmId.gfServer.log -ipLog -canStop start \\
     yourserver.domain.edu 76542 -stepSize=5 $asmId.2bit &
 
Adjust the port numbers 76543 76542 and the yourserver.domain.edu for your local circumstances.
Typically, port numbers in the range 49152 to 65535 are available for private use as in this case. See also: IANA.org port registry.

Enter the following specifications in your genomes.txt file:

 transBlat yourserver.domain.edu 76543
 blat yourserver.domain.edu 76542
 
See also: Blat for an Assembly Hub

\n"; printf "

Search the assembly:

  • By position or search term: Use the "position or search term" box to find areas of the genome associated with many different attributes, such as a specific chromosomal coordinate range; mRNA, EST, or STS marker names; or keywords from the GenBank description of an mRNA. More information, including sample queries.
  • By gene name: Type a gene name into the "search term" box, choose your gene from the drop-down list, then press "submit" to go directly to the assembly location associated with that gene. More information.
  • By track type: Click the "track search" button to find Genome Browser tracks that match specific selection criteria. More information.


\n"; # printf "\n"; __END__ /hive/data/outside/ncbi/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.30_GRCh38.p4/GCF_000001405.30_GRCh38.p4_assembly_report.txt # Assembly Name: GRCh38.p4 # Description: Genome Reference Consortium Human Build 38 patch release 4 (GRCh38.p4) # Organism name: Homo sapiens (human) # Taxid: 9606 # Submitter: Genome Reference Consortium # Date: 2015-6-25 # Assembly type: haploid-with-alt-loci # Release type: patch # Assembly level: Chromosome # Genome representation: full # GenBank Assembly Accession: GCA_000001405.19 (latest) # RefSeq Assembly Accession: GCF_000001405.30 (latest) # RefSeq Assembly and GenBank Assemblies Identical: yes # ## Assembly-Units: ## GenBank Unit Accession RefSeq Unit Accession Assembly-Unit name ## GCA_000001305.2 GCF_000001305.14 Primary Assembly ## GCA_000005045.17 GCF_000005045.16 PATCHES ## GCA_000001315.2 GCF_000001315.2 ALT_REF_LOCI_1