src/hg/utils/automation/doEnsGeneUpdate.pl 1.23
1.23 2010/02/11 23:48:34 hiram
now adding dateReference to the trackVersion table entry
Index: src/hg/utils/automation/doEnsGeneUpdate.pl
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/utils/automation/doEnsGeneUpdate.pl,v
retrieving revision 1.22
retrieving revision 1.23
diff -b -B -U 1000000 -r1.22 -r1.23
--- src/hg/utils/automation/doEnsGeneUpdate.pl 15 Jan 2010 23:47:36 -0000 1.22
+++ src/hg/utils/automation/doEnsGeneUpdate.pl 11 Feb 2010 23:48:34 -0000 1.23
@@ -1,767 +1,770 @@
#!/usr/bin/env perl
# DO NOT EDIT the /cluster/bin/scripts copy of this file --
# edit ~/kent/src/hg/utils/automation/doEnsGeneUpdate.pl instead.
# $Id$
use Getopt::Long;
use warnings;
use strict;
use FindBin qw($Bin);
use lib "$Bin";
use EnsGeneAutomate;
use HgAutomate;
use HgRemoteScript;
use HgStepManager;
use File::Basename;
# Option variable names, both common and peculiar to this script:
use vars @HgAutomate::commonOptionVars;
use vars @HgStepManager::optionVars;
use vars qw/
$opt_ensVersion
$opt_vegaGene
$opt_buildDir
/;
# Specify the steps supported with -continue / -stop:
my $stepper = new HgStepManager(
[ { name => 'download', func => \&doDownload },
{ name => 'process', func => \&doProcess },
{ name => 'load', func => \&doLoad },
{ name => 'cleanup', func => \&doCleanup },
{ name => 'makeDoc', func => \&doMakeDoc },
]
);
# Option defaults:
my $dbHost = 'hgwdev';
my $vegaSpecies = "human";
my $vegaPep = "Homo_sapiens.VEGA";
my $base = $0;
$base =~ s/^(.*\/)?//;
my (@versionList) = &EnsGeneAutomate::ensVersionList();
my $versionListString = join(", ", @versionList);
sub usage {
# Usage / help / self-documentation:
my ($status, $detailed) = @_;
# Basic help (for incorrect usage):
print STDERR "
usage: $base -ensVersion=NN <db>.ensGene.ra
required options:
-ensVersion=NN - must specify desired Ensembl version
- possible values: $versionListString
<db>.ensGene.ra - configuration file with database and other options
other options:
";
print STDERR $stepper->getOptionHelp();
print STDERR <<_EOF_
-vegaGene Special processing for vegaGene instead of Ensembl
works only on Human, Mouse, Zebrafish
-buildDir dir Use dir instead of default
$HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/ensGene.<ensVersion #>
(necessary if experimenting with builds).
_EOF_
;
print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost,
'workhorse' => '',
'bigClusterHub' => '',
'smallClusterHub' => '');
print STDERR "
Automates UCSC's Ensembl gene updates. Steps:
download: fetch GFF and protein data files from Ensembl FTP site
process: parse the GFF file, create coding and non-coding gff files
lift random contigs to UCSC random coordinates
load: load the coding and non-coding tracks, and the proteins
cleanup: Removes or compresses intermediate files.
makeDoc: Displays the make doc text entry to stdout for the procedure
which would be done for this build.
All operations are performed in the build directory which is
$HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/ensGene.<ensVersion #> unless -buildDir is given.
<db>.ensGene.ra describes the species, assembly and downloaded files.
To see detailed information about what should appear in config.ra,
run \"$base -help\".
";
# Detailed help (-help):
if ($detailed) {
print STDERR <<_EOF_
-----------------------------------------------------------------------------
Required <db>.ensGene.ra settings:
db xxxYyyN
- The UCSC database name and assembly identifier. xxx is the first three
letters of the genus and Yyy is the first three letters of the species.
N is the sequence number of this species' build at UCSC. For some
species that we have been processing since before this convention, the
pattern is xyN.
Optional <db>.ensGene.ra settings:
liftRandoms yes
- Need to lift Ensembl contig coordinates to UCSC _random coordinates.
- Will use UCSC ctgPos information about contigs to perform the lift.
nameTranslation "<sed translation pattern>"
- sed translation statement to change Ensembl chrom numbers and MT to
UCSC chrN and chrM chrom names. Example:
nameTranslation "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/"
geneScaffolds yes
- need to translate Ensembl GeneScaffold coordinates to UCSC scaffolds
Will fetch and use the Ensembl MySQL tables seq_region and assembly
to perform the translation
skipInvalid yes
- during the loading of the gene pred, skip all invalid genes
haplotypeLift <path/to/file.lift>
- a lift-down file for Ensembl full chromosome haplotype coordinates
- to UCSC simple haplotype coordinates
liftUp <path/to/ensGene.lft>
- after everything else is done, lift the final gene pred via this
- lift file. Comes in handy if Ensembl chrom names are simply
- different than UCSC chrom names. For example in bushbaby, otoGar1
Assumptions:
1. $HgAutomate::clusterData/\$db/\$db.2bit contains RepeatMasked sequence for
database/assembly \$db.
2. $HgAutomate::clusterData/\$db/chrom.sizes contains all sequence names and sizes from
\$db.2bit.
3. The \$db.2bit files have already been distributed to cluster-scratch
(/scratch/hg or /iscratch, /san etc).
4. anything else here ?
_EOF_
}
print "\n";
exit $status;
}
# Globals:
my ($species, $ensGtfUrl, $ensGtfFile, $ensPepUrl, $ensPepFile,
$ensMySqlUrl, $ensVersionDateReference, $previousEnsVersion, $buildDir,
$previousBuildDir, $vegaGene);
# Command line argument:
my $CONFIG;
# Required command line arguments:
my ($ensVersion);
# Required config parameters:
my ($db);
# Conditionally required config parameters:
my ($liftRandoms, $nameTranslation, $geneScaffolds, $knownToEnsembl,
$skipInvalid, $haplotypeLift, $liftUp);
# Other globals:
my ($topDir, $chromBased);
my ($bedDir, $scriptDir, $endNotes);
sub checkOptions {
# Make sure command line options are valid/supported.
my $ok = GetOptions(@HgStepManager::optionSpec,
'ensVersion=s',
'vegaGene',
'buildDir=s',
@HgAutomate::commonOptionSpec,
);
&usage(1) if (!$ok);
&usage(0, 1) if ($opt_help);
&HgAutomate::processCommonOptions();
my $err = $stepper->processOptions();
usage(1) if ($err);
$dbHost = $opt_dbHost if ($opt_dbHost);
}
#########################################################################
# * step: load [dbHost]
sub doLoad {
my $runDir = "$buildDir";
if (! -d "$buildDir/process") {
die "ERROR: load: directory: '$buildDir/process' does not exist.\n" .
"The process step appears to have not been done.\n" .
"Run with -continue=process before this step.\n";
}
my $whatItDoes = "load processed ensGene data into local database.";
my $bossScript = new HgRemoteScript("$runDir/doLoad.csh", $dbHost,
$runDir, $whatItDoes);
my $thisGenePred = "$buildDir" . "/process/$db.allGenes.gp.gz";
my $prevGenePred = "$previousBuildDir" . "/process/$db.allGenes.gp.gz";
my $identicalToPrevious = 0;
if ( -f $prevGenePred ) {
my $thisGenePredSum = `zcat $thisGenePred | sum`;
chomp $thisGenePredSum;
my $prevGenePredSum = `zcat $prevGenePred | sum`;
chomp $prevGenePredSum;
print STDERR "prev: $prevGenePredSum, this: $thisGenePredSum\n";
if ($prevGenePredSum eq $thisGenePredSum) {
print STDERR "previous genes same as new genes";
$identicalToPrevious = 1;
}
}
if ($identicalToPrevious ) {
$bossScript->add(<<_EOF_
hgsql -e 'INSERT INTO trackVersion \\
- (db, name, who, version, updateTime, comment, source) \\
+ (db, name, who, version, updateTime, comment, source, dateReference) \\
VALUES("$db", "ensGene", "$ENV{'USER'}", "$ensVersion", now(), \\
"identical to previous version $previousEnsVersion", \\
- "identical to previous version $previousEnsVersion" );' hgFixed
+ "identical to previous version $previousEnsVersion", \\
+ $ensVersionDateReference );' hgFixed
_EOF_
);
} else {
my $skipInv = "";
$skipInv = "-skipInvalid" if (defined $skipInvalid);
if ($opt_vegaGene) {
$bossScript->add(<<_EOF_
hgLoadGenePred $skipInv -genePredExt $db \\
vegaGene process/not.vegaPseudo.gp.gz >& load.not.pseudo.errors.txt
hgLoadGenePred $skipInv -genePredExt $db \\
vegaPseudoGene process/vegaPseudo.gp.gz >& load.pseudo.errors.txt
hgsql -N -e "select name from vegaPseudoGene;" $db > pseudo.name
hgsql -N -e "select name from vegaGene;" $db > not.pseudo.name
sort -u pseudo.name not.pseudo.name > vegaGene.name
sed -e "s/20/40/; s/19/39/" $ENV{'HOME'}/kent/src/hg/lib/ensGtp.sql \\
> vegaGtp.sql
hgLoadSqlTab $db vegaGtp vegaGtp.sql process/ensGtp.tab
zcat download/$ensPepFile \\
| sed -e 's/^>.* Transcript:/>/;' | gzip > vegaPep.txt.gz
zcat vegaPep.txt.gz \\
| ~/kent/src/utils/faToTab/faToTab.pl /dev/null /dev/stdin \\
| sed -e '/^\$/d; s/\*\$//' | sort > vegaPepAll.$db.fa.tab
join vegaPepAll.$db.fa.tab vegaGene.name | sed -e "s/ /\\t/" \\
> vegaPep.$db.fa.tab
hgPepPred $db tab vegaPep vegaPep.$db.fa.tab
# verify names in vegaGene is a superset of names in vegaPep
hgsql -N -e "select name from vegaPep;" $db | sort > vegaPep.name
set geneCount = `cat vegaGene.name | wc -l`
set pepCount = `cat vegaPep.name | wc -l`
set commonCount = `comm -12 vegaPep.name vegaGene.name | wc -l`
set percentId = \\
`echo \$commonCount \$pepCount | awk '{printf "%d", 100.0*\$1/\$2}'`
echo "gene count: \$geneCount, peptide count: \$pepCount, common name count: \$commonCount"
echo "percentId: \$percentId"
if (! (\$percentId > 95)) then
echo "ERROR: percent coverage of peptides to genes: \$percentId"
echo "ERROR: should be greater than 95"
exit 255
endif
_EOF_
);
} elsif (defined $geneScaffolds) {
$bossScript->add(<<_EOF_
hgLoadGenePred $skipInv -genePredExt $db ensGene process/$db.allGenes.gp.gz \\
>& loadGenePred.errors.txt
zcat process/ensemblGeneScaffolds.$db.bed.gz \\
| sed -e "s/GeneScaffold/GS/" | hgLoadBed $db ensemblGeneScaffold stdin
_EOF_
);
} else {
$bossScript->add(<<_EOF_
hgLoadGenePred $skipInv -genePredExt $db \\
ensGene process/$db.allGenes.gp.gz >& loadGenePred.errors.txt
_EOF_
);
}
if (! $opt_vegaGene) {
$bossScript->add(<<_EOF_
zcat download/$ensPepFile \\
| sed -e 's/^>.* transcript:/>/; s/ CCDS.*\$//;' | gzip > ensPep.txt.gz
zcat ensPep.txt.gz \\
| ~/kent/src/utils/faToTab/faToTab.pl /dev/null /dev/stdin \\
| sed -e '/^\$/d; s/\*\$//' | sort > ensPep.$db.fa.tab
hgPepPred $db tab ensPep ensPep.$db.fa.tab
hgLoadSqlTab $db ensGtp ~/kent/src/hg/lib/ensGtp.sql process/ensGtp.tab
# verify names in ensGene is a superset of names in ensPep
hgsql -N -e "select name from ensPep;" $db | sort > ensPep.name
hgsql -N -e "select name from ensGene;" $db | sort > ensGene.name
set geneCount = `cat ensGene.name | wc -l`
set pepCount = `cat ensPep.name | wc -l`
set commonCount = `comm -12 ensPep.name ensGene.name | wc -l`
set percentId = \\
`echo \$commonCount \$pepCount | awk '{printf "%d", 100.0*\$1/\$2}'`
echo "gene count: \$geneCount, peptide count: \$pepCount, common name count: \$commonCount"
echo "percentId: \$percentId"
if (! (\$percentId > 95)) then
echo "ERROR: percent coverage of peptides to genes: \$percentId"
echo "ERROR: should be greater than 95"
exit 255
endif
_EOF_
);
}
if (! $opt_vegaGene && defined $knownToEnsembl) {
$bossScript->add(<<_EOF_
hgMapToGene $db ensGene knownGene knownToEnsembl
_EOF_
);
}
if ($opt_vegaGene) {
$bossScript->add(<<_EOF_
hgsql -e 'INSERT INTO trackVersion \\
- (db, name, who, version, updateTime, comment, source) \\
+ (db, name, who, version, updateTime, comment, source, dateReference) \\
VALUES("$db", "vegaGene", "$ENV{'USER'}", "$ensVersion", now(), \\
"with peptides $ensPepFile", \\
- "$ensGtfUrl" );' hgFixed
+ "$ensGtfUrl", \\
+ $ensVersionDateReference );' hgFixed
_EOF_
);
} else {
$bossScript->add(<<_EOF_
hgsql -e 'INSERT INTO trackVersion \\
- (db, name, who, version, updateTime, comment, source) \\
+ (db, name, who, version, updateTime, comment, source, dateReference) \\
VALUES("$db", "ensGene", "$ENV{'USER'}", "$ensVersion", now(), \\
"with peptides $ensPepFile", \\
- "$ensGtfUrl" );' hgFixed
+ "$ensGtfUrl", \\
+ $ensVersionDateReference );' hgFixed
_EOF_
);
}
}
$bossScript->execute();
} # doLoad
#########################################################################
# * step: process [dbHost]
sub doProcess {
my $runDir = "$buildDir/process";
# First, make sure we're starting clean.
if (-d "$runDir") {
die "ERROR: process: looks like this was run successfully already\n" .
"($runDir exists)\nEither run with -continue=load or some later\n" .
"stage, or move aside/remove\n$runDir\nand run again.\n";
}
&HgAutomate::mustMkdir($runDir);
my $whatItDoes = "process downloaded data from Ensembl FTP site into locally usable data.";
my $lifting = 0;
my $bossScript;
if (defined $liftRandoms) {
$lifting = 1;
}
$bossScript = new HgRemoteScript("$runDir/doProcess.csh", $dbHost,
$runDir, $whatItDoes);
# if lifting, create the lift file
if ($lifting) {
$bossScript->add(<<_EOF_
rm -f randoms.$db.lift
foreach C (`cut -f1 /hive/data/genomes/$db/chrom.sizes | grep random`)
set size = `grep \$C /hive/data/genomes/$db/chrom.sizes | cut -f2`
hgsql -N -e \\
"select chromStart,contig,size,chrom,\$size from ctgPos where chrom='\$C';" \\
$db | awk '{gsub("\\\\.[0-9]+", "", \$2); print }' >> randoms.$db.lift
end
_EOF_
);
}
# somg
$bossScript->add(<<_EOF_
zcat ../download/$ensGtfFile \\
_EOF_
);
# translate Ensemble chrom names to UCSC chrom name
if (defined $nameTranslation) {
$bossScript->add(<<_EOF_
| sed -e $nameTranslation \\
_EOF_
);
}
# lift down haplotypes if necessary
if (defined $haplotypeLift) {
$bossScript->add(<<_EOF_
| liftUp -type=.gtf stdout $haplotypeLift carry stdin \\
_EOF_
);
}
# lift randoms if necessary
if ($lifting) {
$bossScript->add(<<_EOF_
| liftUp -type=.gtf stdout randoms.$db.lift carry stdin \\
_EOF_
);
}
# result is protein coding set
$bossScript->add(<<_EOF_
| gzip > allGenes.gtf.gz
_EOF_
);
$bossScript->add(<<_EOF_
gtfToGenePred -infoOut=infoOut.txt -genePredExt allGenes.gtf.gz stdout \\
| gzip > $db.allGenes.gp.gz
$Bin/extractGtf.pl infoOut.txt > ensGtp.tab
_EOF_
);
if ($opt_vegaGene) {
$bossScript->add(<<_EOF_
zcat allGenes.gtf.gz | grep -i pseudo \\
| sed -e 's/gene_id/other_gene_id/; s/gene_name/gene_id/' > vegaPseudo.gtf
zcat allGenes.gtf.gz | grep -v -i pseudo \\
| sed -e 's/gene_id/other_gene_id/; s/gene_name/gene_id/' > not.vegaPseudo.gtf
gtfToGenePred -genePredExt vegaPseudo.gtf vegaPseudo.gp
gtfToGenePred -genePredExt not.vegaPseudo.gtf not.vegaPseudo.gp
gzip vegaPseudo.gp not.vegaPseudo.gp
_EOF_
);
}
if (defined $geneScaffolds) {
$bossScript->add(<<_EOF_
mv $db.allGenes.gp.gz $db.allGenes.beforeLift.gp.gz
$Bin/ensGeneScaffolds.pl ../download/seq_region.txt.gz \\
../download/assembly.txt.gz | gzip > $db.ensGene.lft.gz
liftAcross -warn -bedOut=ensemblGeneScaffolds.$db.bed $db.ensGene.lft.gz \\
$db.allGenes.beforeLift.gp.gz $db.allGenes.gp >& liftAcross.err.out
gzip ensemblGeneScaffolds.$db.bed $db.allGenes.gp liftAcross.err.out
_EOF_
);
}
if (defined $liftUp) {
$bossScript->add(<<_EOF_
mv $db.allGenes.gp.gz $db.allGenes.beforeLiftUp.gp.gz
liftUp -extGenePred -type=.gp $db.allGenes.gp \\
$liftUp carry \\
$db.allGenes.beforeLiftUp.gp.gz
gzip $db.allGenes.gp
_EOF_
);
if (defined $geneScaffolds) {
$bossScript->add(<<_EOF_
mv ensemblGeneScaffolds.$db.bed.gz ensemblGeneScaffolds.$db.beforeLiftUp.bed.gz
liftUp -type=.bed ensemblGeneScaffolds.$db.bed \\
$liftUp carry \\
ensemblGeneScaffolds.$db.beforeLiftUp.bed.gz
gzip ensemblGeneScaffolds.$db.bed
_EOF_
);
}
}
# if all of these are supposed to be valid, they should be able to
# pass genePredCheck right now
if (! defined $skipInvalid) {
if ($opt_vegaGene) {
$bossScript->add(<<_EOF_
genePredCheck -db=$db vegaPseudo.gp.gz
genePredCheck -db=$db not.vegaPseudo.gp.gz
_EOF_
);
}
$bossScript->add(<<_EOF_
genePredCheck -db=$db $db.allGenes.gp.gz
_EOF_
);
}
$bossScript->execute();
} # doProcess
#########################################################################
# * step: download [dbHost]
sub doDownload {
my $runDir = "$buildDir/download";
# First, make sure we're starting clean.
if (-d "$runDir") {
die "ERROR: download: looks like this was run successfully already\n" .
"($runDir exists)\nEither run with -continue=process or some later\n" .
"stage, or move aside/remove\n$runDir\nand run again.\n";
}
&HgAutomate::mustMkdir($runDir);
my $whatItDoes = "download data from Ensembl FTP site.";
my $bossScript = new HgRemoteScript("$runDir/doDownload.csh", $dbHost,
$runDir, $whatItDoes);
$bossScript->add(<<_EOF_
wget --tries=2 --timestamping --user=anonymous --password=ucscGenomeBrowser\@ucsc.edu \\
$ensGtfUrl \\
-O $ensGtfFile
wget --tries=2 --timestamping --user=anonymous --password=ucscGenomeBrowser\@ucsc.edu \\
$ensPepUrl \\
-O $ensPepFile
_EOF_
);
if (defined $geneScaffolds) {
$bossScript->add(<<_EOF_
wget --tries=2 --timestamping --user=anonymous --password=ucscGenomeBrowser\@ucsc.edu \\
$ensMySqlUrl/seq_region.txt.gz \\
-O seq_region.txt.gz
wget --tries=2 --timestamping --user=anonymous --password=ucscGenomeBrowser\@ucsc.edu \\
$ensMySqlUrl/assembly.txt.gz \\
-O assembly.txt.gz
_EOF_
);
}
$bossScript->execute();
} # doDownload
#########################################################################
# * step: cleanup [dbHost]
sub doCleanup {
my $runDir = "$buildDir";
my $whatItDoes = "It cleans up or compresses intermediate files.";
my $bossScript = new HgRemoteScript("$runDir/doCleanup.csh", $dbHost,
$runDir, $whatItDoes);
if ($opt_vegaGene) {
$bossScript->add(<<_EOF_
rm -f pseudo.name not.pseudo.name vegaGene.name vegaPepAll.$db.fa.tab vegaPep.name
gzip vegaPep.$db.fa.tab
_EOF_
);
} else {
$bossScript->add(<<_EOF_
rm -f bed.tab ensPep.txt.gz ensPep.$db.fa.tab ensPep.name ensGene.name
_EOF_
);
}
$bossScript->execute();
} # doCleanup
#########################################################################
# * step: makeDoc [dbHost]
sub doMakeDoc {
my $runDir = "$buildDir";
my $whatItDoes = "Display the make doc text to stdout.";
my $updateTime = `hgsql -N -e 'select updateTime from trackVersion where db = "$db" order by updateTime DESC limit 1;' hgFixed`;
chomp $updateTime;
$updateTime =~ s/ .*//; # removes time
my $organism = `hgsql -N -e 'select organism from dbDb where name = "$db";' hgcentraltest`;
chomp $organism;
my $vegaOpt = "";
my $trackName = "Ensembl";
my $tableName = "ensGene";
my $workDir = "ensGene";
$tableName = "vegaGene" if ($opt_vegaGene);
$trackName = "Vega" if ($opt_vegaGene);
$vegaOpt = "-vegaGene" if ($opt_vegaGene);
$workDir = "vega" if ($opt_vegaGene);
if ($opt_vegaGene) {
print <<_EOF_
############################################################################
# $db - $organism - $trackName Genes version $ensVersion (DONE - $updateTime - $ENV{'USER'})
ssh $dbHost
cd /hive/data/genomes/$db
_EOF_
;
} else {
print <<_EOF_
############################################################################
# $db - $organism - $trackName Genes version $ensVersion (DONE - $updateTime - $ENV{'USER'})
ssh $dbHost
cd /hive/data/genomes/$db
cat << '_EOF_' > $db.$tableName.ra
_EOF_
;
print `cat $db.ensGene.ra`;
print "'_EOF_'\n";
print "# << happy emacs\n\n";
}
print " doEnsGeneUpdate.pl ${vegaOpt} -ensVersion=$ensVersion $db.ensGene.ra\n";
print " ssh hgwdev\n";
print " cd /hive/data/genomes/$db/bed/$workDir.$ensVersion\n";
print " featureBits $db $tableName\n";
print " # ";
print `featureBits $db $tableName`;
if ($opt_vegaGene) {
print " featureBits $db vegaPseudoGene\n";
print " # ";
print `featureBits $db vegaPseudoGene`;
}
print "############################################################################\n";
} # doMakeDoc
sub requireVar {
# Ensure that var is in %config and return its value.
# Remove it from %config so we can check for unrecognized contents.
my ($var, $config) = @_;
my $val = $config->{$var}
|| die "ERROR: requireVar: $CONFIG is missing required variable \"$var\".\n" .
"For a detailed list of required variables, run \"$base -help\".\n";
delete $config->{$var};
return $val;
} # requireVar
sub optionalVar {
# If var has a value in %config, return it.
# Remove it from %config so we can check for unrecognized contents.
my ($var, $config) = @_;
my $val = $config->{$var};
delete $config->{$var} if ($val);
return $val;
} # optionalVar
sub parseConfig {
# Parse config.ra file, make sure it contains the required variables.
my ($configFile) = @_;
my %config = ();
my $fh = &HgAutomate::mustOpen($configFile);
while (<$fh>) {
next if (/^\s*#/ || /^\s*$/);
if (/^\s*(\w+)\s*(.*)$/) {
my ($var, $val) = ($1, $2);
if (! exists $config{$var}) {
$config{$var} = $val;
} else {
die "ERROR: parseConfig: Duplicate definition for $var line $. of config file $configFile.\n";
}
} else {
die "ERROR: parseConfig: Can't parse line $. of config file $configFile:\n$_\n";
}
}
close($fh);
# Required variables.
$db = &requireVar('db', \%config);
$species = &HgAutomate::getSpecies($dbHost, $db);
&HgAutomate::verbose(1,
"\n db: $db, species: '$species'\n");
# Optional variables.
$liftRandoms = &optionalVar('liftRandoms', \%config);
$nameTranslation = &optionalVar('nameTranslation', \%config);
$geneScaffolds = &optionalVar('geneScaffolds', \%config);
$knownToEnsembl = &optionalVar('knownToEnsembl', \%config);
$skipInvalid = &optionalVar('skipInvalid', \%config);
$haplotypeLift = &optionalVar('haplotypeLift', \%config);
$liftUp = &optionalVar('liftUp', \%config);
# verify they actually do say yes
if (defined $liftRandoms && $liftRandoms !~ m/^yes$/i) {
undef $liftRandoms;
}
if (defined $geneScaffolds && $geneScaffolds !~ m/^yes$/i) {
undef $geneScaffolds;
}
if (defined $knownToEnsembl && $knownToEnsembl !~ m/^yes$/i) {
undef $knownToEnsembl;
}
if (defined $skipInvalid && $skipInvalid !~ m/^yes$/i) {
undef $skipInvalid;
}
if (defined $haplotypeLift) {
if (! -s $haplotypeLift) {
print STDERR "ERROR: config file specifies a haplotypeLift file:\n";
die "$haplotypeLift can not be found.\n";
}
}
if (defined $liftUp) {
if (! -s $liftUp) {
print STDERR "ERROR: config file specifies a liftUp file:\n";
die "$liftUp can not be found.\n";
}
}
# Make sure no unrecognized variables were given.
my @stragglers = sort keys %config;
if (scalar(@stragglers) > 0) {
die "ERROR: parseConfig: config file $CONFIG has unrecognized variables:\n"
. " " . join(", ", @stragglers) . "\n" .
"For a detailed list of supported variables, run \"$base -help\".\n";
}
$topDir = "/hive/data/genomes/$db";
} # parseConfig
#########################################################################
# main
# Prevent "Suspended (tty input)" hanging:
&HgAutomate::closeStdin();
# Make sure we have valid options and exactly 1 argument:
&checkOptions();
&usage(1) if (scalar(@ARGV) != 1);
if (!defined $ENV{'USER'}) {
print STDERR "ERROR: your shell environment does not define a USER";
print STDERR "The USER identity is required for history";
exit 255;
}
if (!defined $opt_ensVersion) {
print STDERR "ERROR: must specify -ensVersion=NN on command line\n";
&usage(1);
}
$ensVersion = $opt_ensVersion;
$previousEnsVersion = $ensVersion - 1;
if (! $opt_vegaGene) {
if ($versionListString !~ m/$ensVersion/) {
print STDERR "ERROR: do not recognize given -ensVersion=$ensVersion\n";
die "must be one of: $versionListString\n";
}
}
($CONFIG) = @ARGV;
&parseConfig($CONFIG);
$bedDir = "$topDir/$HgAutomate::trackBuild";
# Force debug and verbose until this is looking pretty solid:
# $opt_debug = 1;
# $opt_verbose = 3 if ($opt_verbose < 3);
# Establish what directory we will work in, tack on the ensembl version ID.
if ($opt_vegaGene) {
if ($db !~ m/^mm|^hg|^danRer/) {
printf STDERR "ERROR: this is database: $db\n";
die "vegaGene build only good on human, mouse or zebrafish\n";
}
$buildDir = $opt_buildDir ? $opt_buildDir :
"$HgAutomate::clusterData/$db/$HgAutomate::trackBuild/vega.$ensVersion";
$previousBuildDir =
"$HgAutomate::clusterData/$db/$HgAutomate::trackBuild/vega.$previousEnsVersion";
} else {
$buildDir = $opt_buildDir ? $opt_buildDir :
"$HgAutomate::clusterData/$db/$HgAutomate::trackBuild/ensGene.$ensVersion";
$previousBuildDir =
"$HgAutomate::clusterData/$db/$HgAutomate::trackBuild/ensGene.$previousEnsVersion";
}
if ($opt_vegaGene) {
$vegaSpecies = "mouse" if ($db =~ m/^mm/);
$vegaSpecies = "human" if ($db =~ m/^hg/);
$vegaSpecies = "danio" if ($db =~ m/^danRer/);
$vegaPep = "Mus_musculus.VEGA$ensVersion" if ($db =~ m/^mm/);
$vegaPep = "Homo_sapiens.VEGA" if ($db =~ m/^hg/);
$vegaPep = "Danio_rerio.VEGA$ensVersion" if ($db =~ m/^danRer/);
}
if ($opt_vegaGene) {
$ensGtfUrl = "ftp://ftp.sanger.ac.uk/pub/vega/$vegaSpecies/gtf_file.gz";
$ensPepUrl = "ftp://ftp.sanger.ac.uk/pub/vega/$vegaSpecies/pep/$vegaPep.$ensVersion.pep.all.fa.gz";
$ensMySqlUrl = "";
$ensVersionDateReference = `date "+%Y-%m-%d"`;
chomp $ensVersionDateReference;
} else {
($ensGtfUrl, $ensPepUrl, $ensMySqlUrl, $ensVersionDateReference) =
&EnsGeneAutomate::ensGeneVersioning($db, $ensVersion );
}
die "ERROR: download: can not find Ensembl version $ensVersion FTP URL for UCSC database $db"
if (!defined $ensGtfUrl);
$ensGtfFile = basename($ensGtfUrl);
$ensPepFile = basename($ensPepUrl);
&HgAutomate::verbose(2,"Ensembl GTF URL: $ensGtfUrl\n");
&HgAutomate::verbose(2,"Ensembl GTF File: $ensGtfFile\n");
&HgAutomate::verbose(2,"Ensembl PEP URL: $ensPepUrl\n");
&HgAutomate::verbose(2,"Ensembl PEP File: $ensPepFile\n");
&HgAutomate::verbose(2,"Ensembl MySql URL: $ensMySqlUrl\n");
&HgAutomate::verbose(2,"Ensembl Date Reference: $ensVersionDateReference\n");
# Do everything.
$stepper->execute();
# Tell the user anything they should know.
my $stopStep = $stepper->getStopStep();
my $upThrough = ($stopStep eq 'cleanup') ? "" :
" (through the '$stopStep' step)";
&HgAutomate::verbose(1,
"\n *** All done!$upThrough\n");
&HgAutomate::verbose(1,
" *** Steps were performed in $buildDir\n");
&HgAutomate::verbose(1, "\n");