src/hg/utils/automation/makeDownloads.pl 1.22
1.22 2009/06/08 18:38:58 hiram
Generalize the ssh command to avoid questions to the shell for new hosts
Index: src/hg/utils/automation/makeDownloads.pl
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/utils/automation/makeDownloads.pl,v
retrieving revision 1.21
retrieving revision 1.22
diff -b -B -U 1000000 -r1.21 -r1.22
--- src/hg/utils/automation/makeDownloads.pl 24 Apr 2009 00:21:50 -0000 1.21
+++ src/hg/utils/automation/makeDownloads.pl 8 Jun 2009 18:38:58 -0000 1.22
@@ -1,1091 +1,1091 @@
#!/usr/bin/env perl
# DO NOT EDIT the /cluster/bin/scripts copy of this file --
# edit ~/kent/src/hg/utils/automation/makeDownloads.pl instead.
# $Id$
use Getopt::Long;
use warnings;
use strict;
use FindBin qw($Bin);
use lib "$Bin";
use HgAutomate;
use HgRemoteScript;
use HgStepManager;
# Option variable names:
use vars @HgAutomate::commonOptionVars;
use vars @HgStepManager::optionVars;
use vars qw/
$opt_allowMissedTrfs
$opt_noChromRoot
$opt_ignoreRepeatMasker
/;
# Specify the steps supported with -continue / -stop:
my $stepper = new HgStepManager(
[ { name => 'compress', func => \&doCompress },
{ name => 'install', func => \&doInstall },
]
);
# Option defaults:
my $defaultBigClusterHub = 'most available';
my $defaultSmallClusterHub = 'n/a';
my $defaultWorkhorse = 'least loaded';
my $dbHost = 'hgwdev';
my $base = $0;
$base =~ s/^(.*\/)?//;
sub usage {
# Usage / help / self-documentation:
my ($status, $detailed) = @_;
# Basic help (for incorrect usage):
print STDERR "
usage: $base db
options:
";
print STDERR $stepper->getOptionHelp();
print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost,
'workhorse' => $defaultWorkhorse);
print STDERR <<_EOF_
-allowMissedTrfs tolerate missing trfMaskChrom/*.bed files
-noChromRoot find RM .out files for chr*_hap in actual hap chrom name
-ignoreRepeatMasker do not look for RM .out files
Automates generation of assembly download files for genome database \$db:
compress: Create compressed download files, md5sum.txt and README.txt in
$HgAutomate::clusterData/\$db/goldenPath/*/
install: Create links to those files from
$dbHost:$HgAutomate::goldenPath/\$db/*/
This will blow away any existing README.txt files and any files that are
already in bigZips etc. So if you have added files specially for this
release (include README.txt sections), and then need to run this again,
be sure to back them up in a different directory first.
_EOF_
;
# Detailed help (-help):
print STDERR "
Assumptions:
1. $HgAutomate::clusterData/\$db/{\$db.2bit,chrom.sizes} are in place.
2. AGP, RepeatMasker .out and trfBig .bed files are in their usual places under
$HgAutomate::clusterData/\$db/ . (Will complain if not able to find.)
" if ($detailed);
print "\n";
exit $status;
}
# Globals:
# Command line args: db
my ($db);
# Other:
my ($topDir, $scriptDir, $trfRunDir, $trfRunDirRel);
my ($chromBased, @chroms, %chromRoots, $chromGz, $geneTable);
sub checkOptions {
# Make sure command line options are valid/supported.
my $ok = GetOptions(@HgStepManager::optionSpec,
'allowMissedTrfs',
'noChromRoot',
'ignoreRepeatMasker',
@HgAutomate::commonOptionSpec,
);
&usage(1) if (!$ok);
&usage(0, 1) if ($opt_help);
&HgAutomate::processCommonOptions();
my $err = $stepper->processOptions();
usage(1) if ($err);
$dbHost = $opt_dbHost if ($opt_dbHost);
}
#*** libify?
sub dbHasTable {
my ($dbHost, $db, $table) = @_;
my $rows = `echo show tables like "'$table'" |
- ssh -x $dbHost hgsql -N $db | wc -l`;
+ $HgAutomate::runSSH $dbHost hgsql -N $db | wc -l`;
return ($rows > 0);
} # dbHasTable
sub getGeneTable {
# If there is a suitable table for generating upstream genes, return it.
foreach my $table ('refGene', 'mgcGenes') {
if (&dbHasTable($dbHost, $db, $table)) {
return $table;
}
}
return undef;
} # getGeneTable
#########################################################################
# * step: template [workhorse]
sub compressChromFiles {
# To be called only when assembly is chrom-based.
# Expect to find per-chromosome .agp and RepeatMasker .out files in
# directories with names distilled from chrom names.
# Expect to find filtered TRF .bed files in
# $topDir/bed/simpleRepeat/trfMaskChrom/ .
# Get masked sequence directly from .2bit.
# Add commands to $bossScript that will create .tar.gz compressed archive
# files with per-chrom files from each of those categories.
my ($runDir, $bossScript) = @_;
my @chromAgpFiles = ();
my @chromOutFiles = ();
my @chromTrfFiles = ();
my $problems = 0;
my ($agpFudge, $rmFudge, $trfFudge) = (0, 0, 0);
foreach my $chrRoot (sort keys %chromRoots) {
foreach my $chr (@{$chromRoots{$chrRoot}}) {
my $agpFile = "$chrRoot/$chr.agp";
my $outFile = "$chrRoot/$chr.fa.out";
my $trfFile = "trfMaskChrom/$chr.bed";
if (-e "$topDir/$agpFile") {
push @chromAgpFiles, $agpFile;
} elsif ($chr eq 'chrM') {
# It is OK to lack AGP for chrM, which we sometimes add to assemblies.
$agpFudge++;
} else {
warn "Missing AGP $agpFile\n";
$problems++;
}
if (-e "$topDir/$outFile") {
push @chromOutFiles, $outFile;
} elsif ($chr eq 'chrM') {
# It is OK to lack RepeatMasker output for chrM too.
$rmFudge++;
} else {
if (!$opt_ignoreRepeatMasker) {
warn "Missing RepeatMasker $outFile\n";
$problems++;
} else {
$rmFudge++;
}
}
if (-e "$trfRunDir/$trfFile") {
push @chromTrfFiles, $trfFile;
} elsif ($trfFile =~ /chrM\.bed$/) {
$trfFudge++;
} else {
if ($opt_allowMissedTrfs) {
$trfFudge++;
} else {
warn "Missing TRF $trfFile\n";
$problems++;
}
}
}
if ($problems > 15) {
warn "A bunch of missing files... stopping here.\n";
last;
}
}
if (((scalar(@chromAgpFiles) + $agpFudge) != scalar(@chroms)) ||
((scalar(@chromOutFiles) + $rmFudge) != scalar(@chroms)) ||
((scalar(@chromTrfFiles) + $trfFudge) != scalar(@chroms))) {
die "Sorry, can't find the expected set of per-chromosome files.";
}
$bossScript->add(<<_EOF_
# For the time being, use $chromGz/ to temporarily store uncompressed
# 2bit-derived .fa and .fa.masked files:
rm -rf $chromGz
mkdir $chromGz
foreach chr ( @chroms )
twoBitToFa $topDir/$db.2bit -seq=\$chr $chromGz/\$chr.fa
maskOutFa $chromGz/\$chr.fa hard $chromGz/\$chr.fa.masked
end
# Make compressed archive files of per-chrom .agp, .out, TRF .bed,
# soft- and hard-masked .fa:
cd $topDir
tar cvzf $runDir/bigZips/chromAgp.tar.gz @chromAgpFiles
_EOF_
);
if (! $opt_ignoreRepeatMasker) {
$bossScript->add(<<_EOF_
tar cvzf $runDir/bigZips/chromOut.tar.gz @chromOutFiles
_EOF_
);
}
$bossScript->add(<<_EOF_
cd $runDir/$chromGz
tar cvzf $runDir/bigZips/chromFa.tar.gz *.fa
tar cvzf $runDir/bigZips/chromFaMasked.tar.gz *.fa.masked
cd $trfRunDir
tar cvzf $runDir/bigZips/chromTrf.tar.gz @chromTrfFiles
# Now fix $chromGz/ up proper:
cd $runDir/$chromGz
rm *.fa.masked
gzip *.fa
_EOF_
);
} # compressChromFiles
sub mustFindOne {
# Return the first existing file under $topDir/ in the given list of
# candidate files, or die if none exist.
my @candidates = @_;
my $firstFound;
foreach my $f (@candidates) {
if (-e "$topDir/$f") {
$firstFound = "$topDir/$f";
last;
}
}
if (! defined $firstFound) {
die "Sorry, can't find any of these: {" .
join(", ", @candidates) . "}";
}
return $firstFound;
} # mustFindOne
sub compressScaffoldFiles {
# To be called only when assembly is scaffold-based.
# Expect to find monolithic files containing AGP, RepeatMasker .out, and
# filtered TRF .bed.
# Get masked sequence directly from .2bit.
# Add commands to $bossScript that will create .gz compressed files
# from each of those categories.
my ($runDir, $bossScript) = @_;
my $hgFakeAgpDir = "$HgAutomate::trackBuild/hgFakeAgp";
my $agpFile = &mustFindOne("$db.agp", 'scaffolds.agp',
"$hgFakeAgpDir/$db.agp",
"$hgFakeAgpDir/scaffolds.agp");
my $outFile = &mustFindOne("$db.fa.out", 'scaffolds.out');
my $trfFile = &mustFindOne("$trfRunDirRel/trfMask.bed",
"$trfRunDirRel/scaffolds.bed");
$bossScript->add(<<_EOF_
# Make compressed files of .agp, .out, TRF .bed, soft- and hard-masked .fa:
cd $runDir/bigZips
gzip -c $agpFile > $db.agp.gz
gzip -c $outFile > $db.fa.out.gz
gzip -c $trfFile > $db.trf.bed.gz
twoBitToFa $topDir/$db.2bit stdout \\
| gzip -c > $db.fa.gz
twoBitToFa $topDir/$db.2bit stdout \\
| maskOutFa stdin hard stdout \\
| gzip -c > $db.fa.masked.gz
_EOF_
);
} # compressScaffoldFiles
sub isBaylor {
# Return true if it looks like this assembly is from Baylor.
my ($assemblyLabel) = @_;
return ($assemblyLabel =~ /Baylor/);
}
sub isWustl {
# Return true if it looks like this assembly is from WUSTL.
my ($assemblyLabel) = @_;
return ($assemblyLabel =~ /(WUSTL|WashU|Washington Univ|Chicken)/);
}
sub printAssemblyUsage {
# Print out conditions of use for this assembly.
my ($fh, $Organism, $assemblyLabel) = @_;
if (&isBaylor($assemblyLabel)) {
print $fh <<_EOF_
For conditions of use regarding the $Organism genome sequence data, see
http://www.hgsc.bcm.tmc.edu/projects/conditions_for_use.html .
_EOF_
;
} elsif (&isWustl($assemblyLabel)) {
print $fh <<_EOF_
The $Organism sequence is made freely available to the community by the
Genome Sequencing Center, Washington University School of Medicine, with
the following understanding:
1. The data may be freely downloaded, used in analyses, and repackaged in
databases.
2. Users are free to use the data in scientific papers analyzing these data
if the providers of these data are properly acknowledged. See
http://genome.ucsc.edu/goldenPath/credits.html for credit information.
*** IF GENOME HAS BEEN PUBLISHED -- ADD CITATION ***
*** IF GENOME HAS NOT YET BEEN PUBLISHED: ***
3. The centers producing the data reserve the right to publish the initial
large-scale analyses of the data set, including large-scale identification
of regions of evolutionary conservation and large-scale genomic assembly.
Large-scale refers to regions with size on the order of a chromosome (that
is, 30 Mb or more).
4. Any redistribution of the data should carry this notice.
_EOF_
;
} elsif ($assemblyLabel =~ /JGI/) {
print $fh <<_EOF_
1. The data may be freely downloaded, used in analyses, and repackaged
in databases.
2. Users are free to use the data in scientific papers analyzing
particular genes and regions if the provider of these data
(DOE Joint Genome Institute) is properly acknowledged. See
http://genome.ucsc.edu/goldenPath/credits.html for credit information.
3. *** PLEASE ADD PUBLICATION PLANS, IF ANY ***
4. Any redistribution of the data should carry this notice.
_EOF_
;
} elsif ($assemblyLabel =~ /Broad/) {
print $fh <<_EOF_
*** PLEASE CONFIRM THESE CONDITIONS AND/OR ALTER TO INCLUDE ANY PUBLICATION ***
The $Organism sequence is made freely available before scientific publication
with the following understanding:
1. The data may be freely downloaded, used in analyses, and repackaged in
databases.
2. Users are free to use the data in scientific papers analyzing particular
genes and regions if the provider of these data (The Broad Institute) is
properly acknowledged.
3. The center producing the data reserves the right to publish the initial
large-scale analyses of the data set, including large-scale identification
of regions of evolutionary conservation and large-scale genomic assembly.
Large-scale refers to regions with size on the order of a chromosome (that
is, 30 Mb or more).
4. Any redistribution of the data should carry this notice. 1. The data may
be freely downloaded, used in analyses, and repackaged in databases.
_EOF_
;
} else {
print $fh <<_EOF_
*** PLEASE PASTE IN CONDITIONS OF USE FOR THIS ASSEMBLY IF THERE ARE ANY ***
_EOF_
;
}
} # printAssemblyUsage
sub printSomeHaveConditions {
# Print out a warning that some tables have conditions for use.
my ($fh) = @_;
print $fh <<_EOF_
All the files and tables in this directory are freely usable for any
purpose except for the following:
_EOF_
;
}
sub printAllAreFree {
# State that all tables are freely available.
my ($fh) = @_;
print $fh <<_EOF_
All the files and tables in this directory are freely usable for any purpose.
_EOF_
;
}
sub printTableSpecificUsage {
# If tables exist that have specific conditions for use, print out the
# conditions.
my ($fh) = @_;
my $gotConditions = 0;
if (&dbHasTable($dbHost, $db, 'softBerryGene')) {
&printSomeHaveConditions() if (! $gotConditions);
$gotConditions = 1;
print $fh <<_EOF_
softberryGene.txt and softberryPep.txt - Free for academic
and nonprofit use. Commercial users should contact
Softberry, Inc. at http://www.softberry.com.
_EOF_
;
}
if (&dbHasTable($dbHost, $db, 'knownGene')) {
&printSomeHaveConditions() if (! $gotConditions);
$gotConditions = 1;
print $fh <<_EOF_
Swiss-Prot/UniProt data in knownGene.txt -
UniProt copyright (c) 2002 - 2004 UniProt consortium
For non-commercial use all databases and documents in the UniProt FTP
directory may be copied and redistributed freely, without advance
permission, provided that this copyright statement is reproduced with
each copy.
For commercial use all databases and documents in the UniProt FTP
directory, except the files
ftp://ftp.uniprot.org/pub/databases/uniprot/knowledgebase/uniprot_sprot.dat.gz
and
ftp://ftp.uniprot.org/pub/databases/uniprot/knowledgebase/uniprot_sprot.xml.gz
may be copied and redistributed freely, without advance permission,
provided that this copyright statement is reproduced with each copy.
More information for commercial users can be found in:
http://www.expasy.org/announce/sp_98.html
From January 1, 2005, all databases and documents in the UniProt FTP
directory may be copied and redistributed freely by all entities,
without advance permission, provided that this copyright statement is
reproduced with each copy.
_EOF_
;
}
&printAllAreFree($fh) if (! $gotConditions);
} # printTableSpecificUsage
sub guessSequencingCenter {
my ($assemblyLabel) = @_;
my $sequencingCenter;
my $unknown = "***PLEASE FILL IN SEQUENCING CENTER***";
my $multiple = "***PLEASE FILL IN MULTIPLE SEQUENCING CENTERS***";
if ($assemblyLabel =~ /Zv\d+/) {
return 'a collaboration between the
Wellcome Trust Sanger Institute in Cambridge, UK, the Max Planck Institute
for Developmental Biology in Tuebingen, Germany, the Netherlands Institute
for Developmental Biology (Hubrecht Laboratory), Utrecht, The Netherlands
and Yi Zhou and Leonard Zon from the Children\'s Hospital in Boston,
Massachusetts.';
}
if (&isBaylor($assemblyLabel)) {
$sequencingCenter =
'the Baylor College of Medicine Human Genome Sequencing Center';
}
if (&isWustl($assemblyLabel)) {
return $multiple if ($sequencingCenter);
$sequencingCenter =
'the Genome Sequencing Center at the Washington University School of Medicine in St. Louis';
}
if ($assemblyLabel =~ /Broad/) {
return $multiple if ($sequencingCenter);
$sequencingCenter =
'the Broad Institute at MIT and Harvard';
}
if ($assemblyLabel =~ /Sanger/) {
return $multiple if ($sequencingCenter);
$sequencingCenter =
'the Wellcome Trust Sanger Institute';
}
if ($assemblyLabel =~ /NCBI/) {
return $multiple if ($sequencingCenter);
$sequencingCenter =
'the National Center for Biotechnology Information (NCBI)';
}
if ($assemblyLabel =~ /JGI/) {
return $multiple if ($sequencingCenter);
$sequencingCenter =
'the US DOE Joint Genome Institute (JGI)';
}
$sequencingCenter = $unknown if (! $sequencingCenter);
return $sequencingCenter;
} # guessSequencingCenter
sub guessConsortium {
my ($Organism) = @_;
my $consortium = "";
if (scalar(grep /^$Organism/i, qw( Mouse Rat Chimp Chicken ))) {
$consortium = "\nfrom the $Organism Genome Sequencing Consortium";
} elsif ($Organism eq 'X. tropicalis') {
$consortium = "\nfrom the $Organism Genome Consortium";
}
} # guessConsortium
sub getDescriptives {
# Return a slew of variables used to describe the assembly.
my ($Organism, $assemblyDate, $assemblyLabel) =
&HgAutomate::getAssemblyInfo($dbHost, $db);
my $organism = $Organism;
if ($organism !~ /^[A-Z]\. [a-z]+/) {
$organism = lc($Organism);
}
my $consortium = &guessConsortium($Organism);
my $sequencingCenter = &guessSequencingCenter($assemblyLabel);
my $projectUrl = "***PLEASE INSERT PROJECT URL OR REMOVE THIS STATEMENT***";
# WUSTL project page example: http://genome.wustl.edu/genome.cgi?GENOME=Gallus%20gallus
# Baylor project page example: http://www.hgsc.bcm.tmc.edu/projects/honeybee/
# Broad Institute project page example: http://www.broad.mit.edu/mammals/horse/
return ($Organism, $assemblyDate, $assemblyLabel,
$organism, $consortium, $sequencingCenter, $projectUrl);
}
sub makeDatabaseReadme {
# Dump out a README.txt for the database/ dir, where autodumped .sql
# and .txt for each table will be generated on the RR (not hgwdev).
my ($runDir) = @_;
my ($Organism, $assemblyDate, $assemblyLabel,
$organism, $consortium, $sequencingCenter, $projectUrl) =
&getDescriptives();
my $fh = &HgAutomate::mustOpen(">$runDir/README.database.txt");
print $fh <<_EOF_
This directory contains a dump of the UCSC genome annotation database for
the $assemblyDate assembly of the $organism genome ($db, $assemblyLabel)$consortium.
The annotations were generated by UCSC and collaborators worldwide.
This assembly was produced by $sequencingCenter.
For more information on the $organism genome, see the project website:
$projectUrl
Files included in this directory (updated nightly):
- *.sql files: the MySQL commands used to create the tables
- *.txt.gz files: the database tables in a tab-delimited format
compressed with gzip.
To see descriptions of the tables underlying Genome Browser annotation
tracks, select the table in the Table Browser:
http://genome.ucsc.edu/cgi-bin/hgTables?db=$db
and click the "describe table schema" button. There is also a "view
table schema" link on the configuration page for each track.
---------------------------------------------------------------
If you plan to download a large file or multiple files from this
directory, we recommend you use ftp rather than downloading the files
via our website. To do so, ftp to hgdownload.cse.ucsc.edu, then go to
the directory goldenPath/$db/database/. To download multiple
files, use the "mget" command:
mget <filename1> <filename2> ...
- or -
mget -a (to download all the files in the directory)
Alternate methods to ftp access.
Using an rsync command to download the entire directory:
rsync -avzP rsync://hgdownload.cse.ucsc.edu/goldenPath/$db/database/ .
For a single file, e.g. gc5Base.txt.gz
rsync -avzP \
rsync://hgdownload.cse.ucsc.edu/goldenPath/$db/database/gc5Base.txt.gz .
Or with wget, all files:
wget --timestamping \
'ftp://hgdownload.cse.ucsc.edu/goldenPath/$db/database/*'
With wget, a single file:
wget --timestamping \
'ftp://hgdownload.cse.ucsc.edu/goldenPath/$db/database/gc5Base.txt.gz' \
-O gc5Base.txt.gz
To uncompress the *.txt.gz files:
gunzip <table>.txt.gz
The tables can be loaded directly from the .txt.gz compressed file.
It is not necessary to uncompress them to load into a database,
as shown in the example below.
To load one of the tables directly into your local mirror database,
for example the table chromInfo:
## create table from the sql definition
\$ hgsql $db < chromInfo.sql
## load data from the txt.gz file
\$ zcat chromInfo.txt.gz | hgsql $db --local-infile=1 \
-e 'LOAD DATA LOCAL INFILE "/dev/stdin" INTO TABLE chromInfo;'
_EOF_
;
&printAssemblyUsage($fh, $Organism, $assemblyLabel);
&printTableSpecificUsage($fh);
close($fh);
} # makeDatabaseReadme
sub makeBigZipsReadme {
# Dump out a README.txt for bigZips/ .
my ($runDir) = @_;
my ($Organism, $assemblyDate, $assemblyLabel,
$organism, $consortium, $sequencingCenter, $projectUrl) =
&getDescriptives();
my $fh = &HgAutomate::mustOpen(">$runDir/README.bigZips.txt");
print $fh <<_EOF_
This directory contains the $assemblyDate assembly of the $organism genome
($db, $assemblyLabel), as well as repeat annotations and GenBank sequences.
This assembly was produced by $sequencingCenter.
For more information on the $organism genome, see the project website:
$projectUrl
Files included in this directory:
_EOF_
;
if ($chromBased) {
print $fh <<_EOF_
chromAgp.tar.gz - Description of how the assembly was generated from
fragments, unpacking to one file per chromosome.
chromFa.tar.gz - The assembly sequence in one file per chromosome.
Repeats from RepeatMasker and Tandem Repeats Finder (with period
of 12 or less) are shown in lower case; non-repeating sequence is
shown in upper case.
chromFaMasked.tar.gz - The assembly sequence in one file per chromosome.
Repeats are masked by capital Ns; non-repeating sequence is shown in
upper case.
chromOut.tar.gz - RepeatMasker .out files (one file per chromosome).
RepeatMasker was run with the -s (sensitive) setting.
*** PLEASE ADD REPEATMASKER VERSION AND LIB VERSION FROM THE DATE THAT REPEATMASKER WAS RUN (MAY BE IN MAKE.DOC)
chromTrf.tar.gz - Tandem Repeats Finder locations, filtered to keep repeats
with period less than or equal to 12, and translated into UCSC's BED 5+
format (one file per chromosome).
_EOF_
;
} else {
print $fh <<_EOF_
$db.agp.gz - Description of how the assembly was generated from
fragments.
$db.fa.gz - "Soft-masked" assembly sequence in one file.
Repeats from RepeatMasker and Tandem Repeats Finder (with period
of 12 or less) are shown in lower case; non-repeating sequence is
shown in upper case.
$db.fa.masked.gz - "Hard-masked" assembly sequence in one file.
Repeats are masked by capital Ns; non-repeating sequence is shown in
upper case.
$db.fa.out.gz - RepeatMasker .out file. RepeatMasker was run with the
-s (sensitive) setting. *** PLEASE ADD REPEATMASKER VERSION AND LIB VERSION FROM THE DATE THAT REPEATMASKER WAS RUN (MAY BE IN MAKE.DOC)
$db.trf.bed.gz - Tandem Repeats Finder locations, filtered to keep repeats
with period less than or equal to 12, and translated into UCSC's BED
format.
_EOF_
;
}
if (&dbHasTable($dbHost, $db, 'all_est')) {
print $fh <<_EOF_
est.fa.gz - $Organism ESTs in GenBank. This sequence data is updated once a
week via automatic GenBank updates.
_EOF_
;
}
print $fh <<_EOF_
md5sum.txt - checksums of files in this directory
_EOF_
;
if (&dbHasTable($dbHost, $db, 'all_mrna')) {
print $fh <<_EOF_
mrna.fa.gz - $Organism mRNA from GenBank. This sequence data is updated
once a week via automatic GenBank updates.
_EOF_
;
}
if (&dbHasTable($dbHost, $db, 'refGene')) {
print $fh <<_EOF_
refMrna.fa.gz - RefSeq mRNA from the same species as the genome.
This sequence data is updated once a week via automatic GenBank
updates.
_EOF_
;
}
my $dunno = '*** ??? ***';
if ($geneTable) {
my $geneDesc;
if ($geneTable eq 'refGene') {
$geneDesc = 'RefSeq';
} elsif ($geneTable eq 'mgcGenes') {
$geneDesc = 'MGC';
} elsif ($geneTable eq 'xenoRefGene') {
$geneDesc = 'non-$Organism RefSeq';
} else {
$geneDesc = $dunno;
}
print $fh <<_EOF_
upstream1000.fa.gz - Sequences 1000 bases upstream of annotated
transcription starts of $geneDesc genes with annotated 5' UTRs.
_EOF_
;
if ($geneDesc ne $dunno) {
print $fh <<_EOF_
This file is updated weekly so it might be slightly out of sync with
the $geneDesc data which is updated daily for most assemblies.
_EOF_
;
} else {
print $fh <<_EOF_
Note that upstream files are generated only when an assembly is
released. Therefore, the data may be slightly out of synch with
the RefSeq data in assemblies that are incrementally updated
nightly.
_EOF_
;
}
print $fh <<_EOF_
upstream2000.fa.gz - Same as upstream1000, but 2000 bases.
upstream5000.fa.gz - Same as upstream1000, but 5000 bases.
_EOF_
;
}
if (&dbHasTable($dbHost, $db, 'xenoMrna')) {
print $fh <<_EOF_
xenoMrna.fa.gz - GenBank mRNAs from species other than that of
the genome. This sequence data is updated once a week via automatic
GenBank updates.
_EOF_
;
}
print $fh <<_EOF_
------------------------------------------------------------------
If you plan to download a large file or multiple files from this
directory, we recommend that you use ftp rather than downloading the
files via our website. To do so, ftp to hgdownload.cse.ucsc.edu
[username: anonymous, password: your email address], then cd to the
directory goldenPath/$db/bigZips. To download multiple files, use
the "mget" command:
mget <filename1> <filename2> ...
- or -
mget -a (to download all the files in the directory)
Alternate methods to ftp access.
Using an rsync command to download the entire directory:
rsync -avzP rsync://hgdownload.cse.ucsc.edu/goldenPath/$db/bigZips/ .
For a single file, e.g. chromFa.tar.gz
rsync -avzP \
rsync://hgdownload.cse.ucsc.edu/goldenPath/$db/bigZips/chromFa.tar.gz .
Or with wget, all files:
wget --timestamping \
'ftp://hgdownload.cse.ucsc.edu/goldenPath/$db/bigZips/*'
With wget, a single file:
wget --timestamping \
'ftp://hgdownload.cse.ucsc.edu/goldenPath/$db/bigZips/chromFa.tar.gz' \
-O chromFa.tar.gz
To unpack the *.tar.gz files:
tar xvzf <file>.tar.gz
To uncompress the fa.gz files:
gunzip <file>.fa.gz
_EOF_
;
&printAssemblyUsage($fh, $Organism, $assemblyLabel);
close($fh);
} # makeBigZipsReadme
sub makeChromosomesReadme {
# Dump out a README.txt for chromsomes/ .
my ($runDir) = @_;
my ($Organism, $assemblyDate, $assemblyLabel,
$organism, $consortium, $sequencingCenter, $projectUrl) =
&getDescriptives();
my $fh = &HgAutomate::mustOpen(">$runDir/README.chromosomes.txt");
print $fh <<_EOF_
This directory contains the $assemblyDate assembly of the $organism genome
($db, $assemblyLabel) in one gzip-compressed FASTA file per chromosome.
This assembly was produced by $sequencingCenter.
For more information on the $organism genome, see the project website:
$projectUrl
Files included in this directory:
- chr*.fa.gz: compressed FASTA sequence of each chromosome.
_EOF_
;
print $fh <<_EOF_
------------------------------------------------------------------
If you plan to download a large file or multiple files from this
directory, we recommend that you use ftp rather than downloading the
files via our website. To do so, ftp to hgdownload.cse.ucsc.edu, then
go to the directory goldenPath/$db/chromosomes. To download multiple
files, use the "mget" command:
mget <filename1> <filename2> ...
- or -
mget -a (to download all the files in the directory)
Alternate methods to ftp access.
Using an rsync command to download the entire directory:
rsync -avzP rsync://hgdownload.cse.ucsc.edu/goldenPath/$db/chromosomes/ .
For a single file, e.g. chrM.fa.gz
rsync -avzP \
rsync://hgdownload.cse.ucsc.edu/goldenPath/$db/chromosomes/chrM.fa.gz .
Or with wget, all files:
wget --timestamping \
'ftp://hgdownload.cse.ucsc.edu/goldenPath/$db/chromosomes/*'
With wget, a single file:
wget --timestamping \
'ftp://hgdownload.cse.ucsc.edu/goldenPath/$db/chromosomes/chrM.fa.gz' \
-O chrM.fa.gz
To uncompress the fa.gz files:
gunzip <file>.fa.gz
_EOF_
;
&printAssemblyUsage($fh, $Organism, $assemblyLabel);
close($fh);
} # makeChromosomesReadme
sub makeLiftOverReadme {
# Dump out a README.txt for the liftOver/ dir, where doBlastzChainNet.pl
# runs will deposit the .over.chain.gz files.
my ($runDir) = @_;
my $fh = &HgAutomate::mustOpen(">$runDir/README.liftOver.txt");
print $fh <<_EOF_
This directory contains the data files required as input to the
liftOver utility. This tool -- which requires a Linux platform --
allows the mass conversion of coordinates from one assembly to
another. The executable file for the utility can be downloaded from
http://hgdownload.cse.ucsc.edu/admin/exe/liftOver.gz .
The file names reflect the assembly conversion data contained within
in the format <db1>To<Db2>.over.chain.gz. For example, a file named
hg15ToHg16.over.chain.gz file contains the liftOver data needed to
convert hg15 (Human Build 33) coordinates to hg16 (Human Build 34).
If no file is available for the assembly in which you're interested,
please send a request to the genome mailing list
(genome\@soe.ucsc.edu) and we will attempt to provide you with one.
To download a large file or multiple files from this directory,
we recommend that you use ftp rather than downloading the files via our
website. To do so, ftp to hgdownload.cse.ucsc.edu (user: anonymous),
then cd to goldenPath/$db/liftOver. To download multiple files,
use the "mget" command:
mget <filename1> <filename2> ...
- or -
mget -a (to download all the files in the directory)
-------------------------------------------------------
Please refer to the credits page
(http://genome.ucsc.edu/goldenPath/credits.html) for guidelines and
restrictions regarding data use for these assemblies.
-------------------------------------------------------
Alternate methods to ftp access.
Using an rsync command to download the entire directory:
rsync -avzP rsync://hgdownload.cse.ucsc.edu/goldenPath/$db/liftOver/ .
For a single file, e.g. ${db}ToHg18.over.chain.gz
rsync -avzP \
rsync://hgdownload.cse.ucsc.edu/goldenPath/$db/liftOver/${db}ToHg18.over.chain.gz .
(Hg18 is merely an example here, not necessarily existing.)
Or with wget, all files:
wget --timestamping \
'ftp://hgdownload.cse.ucsc.edu/goldenPath/$db/liftOver/*'
With wget, a single file:
wget --timestamping \
'ftp://hgdownload.cse.ucsc.edu/goldenPath/$db/liftOver/${db}ToHg18.over.chain.gz' \
-O ${db}ToHg18.over.chain.gz
To uncompress the *.chain.gz files:
gunzip <file>.chain.gz
The liftOver utility can read the files in their .gz format,
it is not necessary to uncompress them to use with the liftOver command.
_EOF_
;
close($fh);
} # makeLiftOverReadme
sub doCompress {
# step: compress [workhorse]
my $runDir = "$topDir/goldenPath";
&HgAutomate::mustMkdir($runDir);
my $whatItDoes =
"It creates compressed sequence and repeat-annotation files for download.";
my $workhorse = &HgAutomate::chooseWorkhorse();
my $bossScript = new HgRemoteScript("$scriptDir/doCompress.csh", $workhorse,
$runDir, $whatItDoes);
$bossScript->add(<<_EOF_
rm -rf bigZips database
mkdir bigZips database
mkdir -p liftOver
_EOF_
);
if ($chromBased) {
&compressChromFiles($runDir, $bossScript);
} else {
&compressScaffoldFiles($runDir, $bossScript);
}
$bossScript->add(<<_EOF_
# Add md5sum.txt and README.txt to each dir:
foreach d (bigZips $chromGz database liftOver)
cd $runDir/\$d
if (\$d != "database" && \$d != "liftOver") then
md5sum *.gz > md5sum.txt
endif
mv $runDir/README.\$d.txt README.txt
end
_EOF_
);
# Create README.*.txt files which will be moved into subdirs by the script.
&makeDatabaseReadme($runDir);
&makeBigZipsReadme($runDir);
&makeChromosomesReadme($runDir) if ($chromBased);
&makeLiftOverReadme($runDir);
$bossScript->execute();
} # doCompress
#########################################################################
# * step: install [dbHost]
sub doInstall {
my $runDir = "$topDir/goldenPath";
my $whatItDoes =
"It creates links from the web server's goldenPath download area to the
actual compressed files.";
my $bossScript = new HgRemoteScript("$scriptDir/doInstall.csh", $dbHost,
$runDir, $whatItDoes);
my $gp = "$HgAutomate::goldenPath/$db";
$bossScript->add(<<_EOF_
mkdir -p $gp
foreach d (bigZips $chromGz database)
rm -rf $gp/\$d
mkdir $gp/\$d
ln -s $runDir/\$d/*.{gz,txt} $gp/\$d/
end
# Don't blow away all of liftOver, just the README -- there may be
# pre-existing links that are not regenerated above.
mkdir -p $gp/liftOver
rm -f $gp/liftOver/README.txt
ln -s $runDir/liftOver/README.txt $gp/liftOver/README.txt
_EOF_
);
if ($geneTable) {
$bossScript->add(<<_EOF_
cd $runDir/bigZips
foreach size (1000 2000 5000)
echo \$size
featureBits $db $geneTable:upstream:\$size -fa=stdout \\
| gzip -c > upstream\$size.fa.gz
end
md5sum up*.gz >> md5sum.txt
ln -s $runDir/bigZips/up*.gz $gp/bigZips/
_EOF_
);
}
$bossScript->execute();
} # doInstall
#########################################################################
# main
# Prevent "Suspended (tty input)" hanging:
&HgAutomate::closeStdin();
# Make sure we have valid options and exactly 1 argument:
&checkOptions();
&usage(1) if (scalar(@ARGV) != 1);
($db) = @ARGV;
$topDir = "$HgAutomate::clusterData/$db";
$scriptDir = "$topDir/jkStuff";
$trfRunDirRel = "$HgAutomate::trackBuild/simpleRepeat";
$trfRunDir = "$topDir/$trfRunDirRel";
$geneTable = &getGeneTable();
if (! -e "$topDir/$db.2bit") {
die "Sorry, this script requires $topDir/$db.2bit.\n";
}
if (! -e "$topDir/chrom.sizes") {
die "Sorry, this script requires $topDir/chrom.sizes.\n";
}
@chroms = split("\n", `awk '{print \$1;}' $topDir/chrom.sizes`);
$chromBased = (scalar(@chroms) <= $HgAutomate::splitThreshold);
if ($chromBased) {
foreach my $chr (@chroms) {
my $chrRoot = $chr;
$chrRoot =~ s/^chr//;
$chrRoot =~ s/_random$//;
if (! $opt_noChromRoot) {
$chrRoot =~ s/_\w+_hap\d+//;
}
push @{$chromRoots{$chrRoot}}, $chr;
}
$chromGz = "chromosomes";
} else {
$chromGz = "";
}
# Do everything.
$stepper->execute();
# Tell the user anything they should know.
my $stopStep = $stepper->getStopStep();
my $upThrough = ($stopStep eq 'install') ? "" :
" (through the '$stopStep' step)";
&HgAutomate::verbose(1, <<_EOF_
*** All done!$upThrough
_EOF_
);
if ($stopStep eq 'install') {
&HgAutomate::verbose(1, <<_EOF_
*** Please take a look at the downloads for $db using a web browser.
*** The downloads url is: http://hgwdev.cse.ucsc.edu/goldenPath/$db.
*** Edit each README.txt to resolve any notes marked with "***":
$topDir/goldenPath/database/README.txt
$topDir/goldenPath/bigZips/README.txt
_EOF_
);
if ($chromBased) {
&HgAutomate::verbose(1, <<_EOF_
$topDir/goldenPath/$chromGz/README.txt
_EOF_
);
}
# liftOver/README.txt doesn't require any editing.
&HgAutomate::verbose(1, <<_EOF_
(The htdocs/goldenPath/$db/*/README.txt "files" are just links to those.)
*** If you have to make any edits that would always apply to future
assemblies from the same sequencing center, please edit them into
~/kent/src/hg/utils/automation/$base (or ask Angie for help).
_EOF_
);
}