src/hg/utils/automation/doSimpleRepeat.pl 1.5
1.5 2009/06/08 18:38:58 hiram
Generalize the ssh command to avoid questions to the shell for new hosts
Index: src/hg/utils/automation/doSimpleRepeat.pl
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/utils/automation/doSimpleRepeat.pl,v
retrieving revision 1.4
retrieving revision 1.5
diff -b -B -U 1000000 -r1.4 -r1.5
--- src/hg/utils/automation/doSimpleRepeat.pl 17 Oct 2008 18:29:12 -0000 1.4
+++ src/hg/utils/automation/doSimpleRepeat.pl 8 Jun 2009 18:38:58 -0000 1.5
@@ -1,448 +1,449 @@
#!/usr/bin/env perl
# DO NOT EDIT the /cluster/bin/scripts copy of this file --
# edit ~/kent/src/hg/utils/automation/doSimpleRepeat.pl instead.
# $Id$
use Getopt::Long;
use warnings;
use strict;
use Carp;
use FindBin qw($Bin);
use lib "$Bin";
use HgAutomate;
use HgRemoteScript;
use HgStepManager;
# Hardcoded (for now):
my $chunkSize = 50000000;
my $singleRunSize = 200000000;
my $clusterBin = qw(/cluster/bin/$MACHTYPE);
# Option variable names, both common and peculiar to this script:
use vars @HgAutomate::commonOptionVars;
use vars @HgStepManager::optionVars;
use vars qw/
$opt_buildDir
$opt_unmaskedSeq
/;
# Specify the steps supported with -continue / -stop:
my $stepper = new HgStepManager(
[ { name => 'trf', func => \&doTrf },
{ name => 'filter', func => \&doFilter },
{ name => 'load', func => \&doLoad },
{ name => 'cleanup', func => \&doCleanup },
]
);
# Option defaults:
my $defaultSmallClusterHub = 'most available';
my $defaultWorkhorse = 'least loaded';
my $dbHost = 'hgwdev';
my $unmaskedSeq = "\$db.unmasked.2bit";
my $base = $0;
$base =~ s/^(.*\/)?//;
sub usage {
# Usage / help / self-documentation:
my ($status, $detailed) = @_;
# Basic help (for incorrect usage):
print STDERR "
usage: $base db
options:
";
print STDERR $stepper->getOptionHelp();
print STDERR <<_EOF_
-buildDir dir Use dir instead of default
$HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/simpleRepeat.\$date
(necessary when continuing at a later date).
-unmaskedSeq seq.2bit Use seq.2bit as the unmasked input sequence instead
of default ($unmaskedSeq).
_EOF_
;
print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost,
'workhorse' => '',
'smallClusterHub' => '');
my ($sizeM, $chunkM) = ($singleRunSize, $chunkSize);
$sizeM =~ s/000000$/Mb/; $chunkM =~ s/000000$/Mb/;
print STDERR "
Automates UCSC's simpleRepeat (TRF) process for genome database \$db. Steps:
trf: If total genome size is <= $sizeM, run trfBig on a workhorse;
otherwise do a cluster run of trfBig on $chunkM sequence chunks.
filter: If a cluster run was performed, concatenate the results into
simpleRepeat.bed. Filter simpleRepeat.bed (period <= 12) to
trfMask.bed. If \$db is chrom-based, split trfMaskBed into
trfMaskChrom/chr*.bed for downloads.
load: Load simpleRepeat.bed into the simpleRepeat table in \$db.
cleanup: Removes or compresses intermediate files.
All operations are performed in the build directory which is
$HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/simpleRepeat.\$date unless -buildDir is given.
Run -help to see what files are required for this script.
";
# Detailed help (-help):
print STDERR "
Assumptions:
1. $HgAutomate::clusterData/\$db/\$db.unmasked.2bit contains sequence for
database/assembly \$db. (This can be overridden with -unmaskedSeq.)
2. $clusterBin/trfBig and $clusterBin/trf exist
on workhorse, fileServer and cluster nodes.
" if ($detailed);
print "\n";
exit $status;
}
# Globals:
# Command line args: db
my ($db);
# Other:
my ($buildDir, $chromBased, $useCluster);
sub checkOptions {
# Make sure command line options are valid/supported.
my $ok = GetOptions(@HgStepManager::optionSpec,
'buildDir=s',
'unmaskedSeq=s',
@HgAutomate::commonOptionSpec,
);
&usage(1) if (!$ok);
&usage(0, 1) if ($opt_help);
&HgAutomate::processCommonOptions();
my $err = $stepper->processOptions();
usage(1) if ($err);
$dbHost = $opt_dbHost if ($opt_dbHost);
}
#########################################################################
# * step: trf [smallClusterHub or workhorse depending on genome size]
sub doCluster {
my $runDir = "$buildDir/run.cluster";
&HgAutomate::mustMkdir($runDir);
my $paraHub = $opt_smallClusterHub ? $opt_smallClusterHub :
&HgAutomate::chooseSmallClusterByBandwidth();
my @okIn = grep !/scratch/,
&HgAutomate::chooseFilesystemsForCluster($paraHub, "in");
my @okOut =
&HgAutomate::chooseFilesystemsForCluster($paraHub, "out");
if (scalar(@okOut) > 1) {
@okOut = grep !/$okIn[0]/, @okOut;
}
my $inHive = 0;
$inHive = 1 if ($okIn[0] =~ m#/hive/data/genomes#);
my $clusterSeqDir = "$okIn[0]/$db";
my $clusterSeq = "$clusterSeqDir/$db.doSimp.2bit";
if ($inHive) {
$clusterSeq = "$clusterSeqDir/$db.unmasked.2bit";
}
my $partDir .= "$okOut[0]/$db/TrfPart";
# Cluster job script:
my $fh = &HgAutomate::mustOpen(">$runDir/TrfRun.csh");
print $fh <<_EOF_
#!/bin/csh -ef
set finalOut = \$1
set inLst = \$finalOut:r
set inLft = \$inLst:r.lft
$HgAutomate::setMachtype
# Use local disk for output, and move the final result to \$finalOut
# when done, to minimize I/O.
set tmpDir = `mktemp -d -p /scratch/tmp doSimpleRepeat.cluster.XXXXXX`
pushd \$tmpDir
foreach spec (`cat \$inLst`)
# Remove path and .2bit filename to get just the seq:start-end spec:
set base = `echo \$spec | sed -r -e 's/^[^:]+://'`
# If \$spec is the whole sequence, twoBitToFa removes the :start-end part,
# which causes liftUp to barf later. So tweak the header back to
# seq:start-end for liftUp's sake:
twoBitToFa \$spec stdout \\
| sed -e "s/^>.*/>\$base/" \\
| $clusterBin/trfBig -trf=$clusterBin/trf \\
stdin /dev/null -bedAt=\$base.bed -tempDir=/scratch/tmp
end
# Due to the large chunk size, .lft files can have thousands of lines.
# Even though the liftUp code does &lineFileClose, somehow we still
# run out of filehandles. So limit the size of liftSpecs:
split -l 500 \$inLft SplitLft.
# Lift up:
foreach splitLft (SplitLft.*)
set bedFiles = `awk '{print \$2 ".bed"};' \$splitLft`
endsInLf -zeroOk \$bedFiles
cat \$bedFiles \\
| liftUp -type=.bed tmpOut.\$splitLft \$splitLft error stdin
end
cat tmpOut.* > tmpOut__bed
# endsInLf is much faster than using para to {check out line}:
endsInLf -zeroOk tmpOut*
# Move final result into place:
mv tmpOut__bed \$finalOut
popd
rm -rf \$tmpDir
_EOF_
;
close($fh);
&HgAutomate::makeGsub($runDir,
"./TrfRun.csh $partDir/\$(path1).bed");
+ `touch "$runDir/para_hub_$paraHub"`;
my $chunkM = $chunkSize;
$chunkM =~ s/000000$/Mb/;
my $whatItDoes =
"It computes a logical partition of unmasked 2bit into $chunkM chunks
and runs it on the cluster with the most available bandwidth.";
my $bossScript = new HgRemoteScript("$runDir/doTrf.csh", $paraHub,
$runDir, $whatItDoes);
if (! $inHive) {
$bossScript->add(<<_EOF_
mkdir -p $clusterSeqDir
rsync -av $unmaskedSeq $clusterSeq
_EOF_
);
}
$bossScript->add(<<_EOF_
chmod a+x TrfRun.csh
rm -rf $partDir
$Bin/simplePartition.pl $clusterSeq $chunkSize $partDir
rm -f $buildDir/TrfPart
ln -s $partDir $buildDir/TrfPart
$HgAutomate::gensub2 $partDir/partitions.lst single gsub jobList
$HgAutomate::paraRun
_EOF_
);
if (! $inHive) {
$bossScript->add(<<_EOF_
rm -f $clusterSeq
_EOF_
);
}
$bossScript->execute();
} # doCluster
sub doSingle {
my $runDir = "$buildDir";
&HgAutomate::mustMkdir($runDir);
my $workhorse = &HgAutomate::chooseWorkhorse();
my $whatItDoes = "It runs trfBig on the entire (small) genome in one pass.";
my $bossScript = new HgRemoteScript("$runDir/doTrf.csh", $workhorse,
$runDir, $whatItDoes);
$bossScript->add(<<_EOF_
$HgAutomate::setMachtype
twoBitToFa $unmaskedSeq stdout \\
| $clusterBin/trfBig -trf=$clusterBin/trf \\
stdin /dev/null -bedAt=simpleRepeat.bed -tempDir=/scratch/tmp
_EOF_
);
$bossScript->execute();
} # doSingle
sub doTrf {
if ($useCluster) {
&doCluster();
} else {
&doSingle();
}
} # doTrf
#########################################################################
# * step: filter [fileServer]
sub doFilter {
my $runDir = "$buildDir";
if ($useCluster) {
&HgAutomate::checkExistsUnlessDebug('trf', 'filter',
"$buildDir/run.cluster/run.time");
}
my $whatItDoes = "It filters simpleRepeat.bed > trfMask.bed.\n";
if ($useCluster) {
$whatItDoes =
"It concatenates .bed files from cluster run into simpleRepeat.bed.\n"
. $whatItDoes;
}
if ($chromBased) {
$whatItDoes .=
"It splits trfMask.bed into per-chrom files for bigZips download generation.";
}
my $fileServer = &HgAutomate::chooseFileServer($runDir);
my $bossScript = new HgRemoteScript("$runDir/doFilter.csh", $fileServer,
$runDir, $whatItDoes);
# Use symbolic link created in cluster step:
my $partDir = "$buildDir/TrfPart";
if ($useCluster) {
$bossScript->add(<<_EOF_
cat $partDir/???/*.bed > simpleRepeat.bed
endsInLf simpleRepeat.bed
if (\$status) then
echo Uh-oh -- simpleRepeat.bed fails endsInLf. Look at $partDir/ bed files.
exit 1
endif
_EOF_
);
}
$bossScript->add(<<_EOF_
awk '{if (\$5 <= 12) print;}' simpleRepeat.bed > trfMask.bed
_EOF_
);
if ($chromBased) {
$bossScript->add(<<_EOF_
splitFileByColumn trfMask.bed trfMaskChrom/
_EOF_
);
}
$bossScript->execute();
} # doFilter
#########################################################################
# * step: load [dbHost]
sub doLoad {
my $runDir = "$buildDir";
&HgAutomate::checkExistsUnlessDebug('filter', 'load',
"$buildDir/simpleRepeat.bed");
my $whatItDoes = "It loads simpleRepeat.bed into the simpleRepeat table.";
my $bossScript = new HgRemoteScript("$runDir/doLoad.csh", $dbHost,
$runDir, $whatItDoes);
$bossScript->add(<<_EOF_
hgLoadBed $db simpleRepeat simpleRepeat.bed \\
-sqlTable=\$HOME/kent/src/hg/lib/simpleRepeat.sql
featureBits $db simpleRepeat >& fb.simpleRepeat
cat fb.simpleRepeat
_EOF_
);
$bossScript->execute();
} # doLoad
#########################################################################
# * step: cleanup [fileServer]
sub doCleanup {
my $runDir = "$buildDir";
my $whatItDoes = "It cleans up or compresses intermediate files.";
my $fileServer = &HgAutomate::chooseFileServer($runDir);
my $bossScript = new HgRemoteScript("$runDir/doCleanup.csh", $fileServer,
$runDir, $whatItDoes);
$bossScript->add(<<_EOF_
if (-e TrfPart/000) then
rm -rf TrfPart/???
endif
_EOF_
);
$bossScript->execute();
} # doCleanup
#########################################################################
# main
# Prevent "Suspended (tty input)" hanging:
&HgAutomate::closeStdin();
# Make sure we have valid options and exactly 1 argument:
&checkOptions();
&usage(1) if (scalar(@ARGV) != 1);
($db) = @ARGV;
# Now that we know the $db, figure out our paths:
my $date = `date +%Y-%m-%d`;
chomp $date;
$buildDir = $opt_buildDir ? $opt_buildDir :
"$HgAutomate::clusterData/$db/$HgAutomate::trackBuild/simpleRepeat.$date";
$unmaskedSeq = $opt_unmaskedSeq ? $opt_unmaskedSeq :
"$HgAutomate::clusterData/$db/$db.unmasked.2bit";
if (! -e $unmaskedSeq) {
die $opt_unmaskedSeq ? "Error: -unmaskedSeq $unmaskedSeq does not exist.\n" :
"Error: required file $unmaskedSeq does not exist. " .
"(use -unmaskedSeq <file> ?)\n";
}
die "Error: -unmaskedSeq filename must end in .2bit (got $unmaskedSeq).\n"
if ($unmaskedSeq !~ /\.2bit$/);
if ($unmaskedSeq !~ /^\//) {
my $pwd = `pwd`;
chomp $pwd;
$unmaskedSeq = "$pwd/$unmaskedSeq";
}
my $pipe = &HgAutomate::mustOpen("twoBitInfo $unmaskedSeq stdout |");
my $seqCount = 0;
my $genomeSize = 0;
while (<$pipe>) {
chomp;
my (undef, $size) = split;
$genomeSize += $size;
$seqCount++;
if ($seqCount > $HgAutomate::splitThreshold &&
$genomeSize > $singleRunSize) {
# No need to keep counting -- we know our boolean answers.
last;
}
}
close($pipe);
die "Could not open pipe from twoBitInfo $unmaskedSeq"
unless ($genomeSize > 0 && $seqCount > 0);
$chromBased = ($seqCount <= $HgAutomate::splitThreshold);
$useCluster = ($genomeSize > $singleRunSize);
&HgAutomate::verbose(2, "\n$db is " .
($chromBased ? "chrom-based" : "scaffold-based") . ".\n" .
"Total genome size is " .
($useCluster ? "> $singleRunSize; cluster & cat." :
"<= $singleRunSize; single run.") .
"\n\n");
# Do everything.
$stepper->execute();
# Tell the user anything they should know.
my $stopStep = $stepper->getStopStep();
my $upThrough = ($stopStep eq 'cleanup') ? "" :
" (through the '$stopStep' step)";
&HgAutomate::verbose(1, <<_EOF_
*** All done!$upThrough
*** Steps were performed in $buildDir
_EOF_
);
if ($stepper->stepPrecedes('trf', $stopStep)) {
my $buildDirRel = $buildDir;
$buildDirRel =~ s/^$HgAutomate::clusterData\/$db\///;
&HgAutomate::verbose(1, <<_EOF_
*** Please check the log file to see if trf had warnings that we
should pass on to Gary Benson (ideally with a small test case).
*** IMPORTANT: Developer, it is up to you to make use of trfMask.bed!
Here is a typical command sequence, to be run after RepeatMasker
(or WindowMasker etc.) has completed:
cd $HgAutomate::clusterData/$db
twoBitMask $db.rmsk.2bit -add $buildDirRel/trfMask.bed $db.2bit
*** Again, it is up to you to incorporate trfMask.bed into the final 2bit!
_EOF_
);
}
&HgAutomate::verbose(1, "\n");