src/hg/utils/automation/doCpgIslands.pl 1.3
1.3 2010/02/11 23:49:36 hiram
check source input for enough valid bases before running cpglh.exe
Index: src/hg/utils/automation/doCpgIslands.pl
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/utils/automation/doCpgIslands.pl,v
retrieving revision 1.2
retrieving revision 1.3
diff -b -B -U 1000000 -r1.2 -r1.3
--- src/hg/utils/automation/doCpgIslands.pl 13 Mar 2009 22:27:12 -0000 1.2
+++ src/hg/utils/automation/doCpgIslands.pl 11 Feb 2010 23:49:36 -0000 1.3
@@ -1,266 +1,271 @@
#!/usr/bin/env perl
# DO NOT EDIT the /cluster/bin/scripts copy of this file --
# edit ~/kent/src/hg/utils/automation/doCpgIslands.pl instead.
use Getopt::Long;
use warnings;
use strict;
use FindBin qw($Bin);
use lib "$Bin";
use HgAutomate;
use HgRemoteScript;
use HgStepManager;
# Option variable names, both common and peculiar to this script:
use vars @HgAutomate::commonOptionVars;
use vars @HgStepManager::optionVars;
use vars qw/
$opt_buildDir
$opt_maskedSeq
/;
# Specify the steps supported with -continue / -stop:
my $stepper = new HgStepManager(
[ { name => 'compile', func => \&doCompile },
{ name => 'hardMask', func => \&doHardMask },
{ name => 'cpg', func => \&doCpg },
{ name => 'makeBed', func => \&doMakeBed },
{ name => 'load', func => \&doLoadCpg },
{ name => 'cleanup', func => \&doCleanup },
]
);
# Option defaults:
my $dbHost = 'hgwdev';
my $defaultWorkhorse = 'kolossus';
my $maskedSeq = "$HgAutomate::clusterData/\$db/\$db.2bit";
my $base = $0;
$base =~ s/^(.*\/)?//;
sub usage {
# Usage / help / self-documentation:
my ($status, $detailed) = @_;
# Basic help (for incorrect usage):
print STDERR "
usage: $base db
options:
";
print STDERR $stepper->getOptionHelp();
print STDERR <<_EOF_
-buildDir dir Use dir instead of default
$HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/cpgIslands
(necessary when continuing at a later date).
-maskedSeq seq.2bit Use seq.2bit as the masked input sequence instead
of default ($maskedSeq).
_EOF_
;
print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost,
'workhorse' => $defaultWorkhorse);
print STDERR "
Automates UCSC's CpG Island finder for genome database \$db. Steps:
compile: Check-out and build the CpG Island program from Asif Chinwalla.
hardMask: Creates hard-masked fastas needed for the CpG Island program.
cpg: Run cpglh.exe on the hard-masked fastas
makeBed: Transform output from cpglh.exe into cpgIsland.bed
load: Load cpgIsland.bed into \$db.
cleanup: Removes hard-masked fastas and output from cpglh.exe.
All operations are performed in the build directory which is
$HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/cpgIslands unless -buildDir is given.
";
# Detailed help (-help):
print STDERR "
Assumptions:
1. $HgAutomate::clusterData/\$db/\$db.2bit contains RepeatMasked sequence for
database/assembly \$db.
" if ($detailed);
print "\n";
exit $status;
}
# Globals:
# Command line args: db
my ($db);
# Other:
my ($buildDir);
sub checkOptions {
# Make sure command line options are valid/supported.
my $ok = GetOptions(@HgStepManager::optionSpec,
'buildDir=s',
'maskedSeq=s',
@HgAutomate::commonOptionSpec,
);
&usage(1) if (!$ok);
&usage(0, 1) if ($opt_help);
&HgAutomate::processCommonOptions();
my $err = $stepper->processOptions();
usage(1) if ($err);
$dbHost = $opt_dbHost if ($opt_dbHost);
}
#########################################################################
# * step: check out [dbHost]
sub doCompile {
my $runDir = $buildDir;
&HgAutomate::mustMkdir($runDir);
my $whatItDoes = "Checks out the cpglh source and compiles the executable.";
my $bossScript = new HgRemoteScript("$runDir/doCompile.csh", $dbHost,
$runDir, $whatItDoes);
$bossScript->add(<<_EOF_
cvs -d /projects/compbio/cvsroot checkout -P hg3rdParty/cpgIslands
cd hg3rdParty/cpgIslands
# comment out the following two lines if it compiles cleanly
# some day
sed 's/\\(extern char\\* malloc\\)/\\/\\/ \\1/' cpg_lh.c > tmp.c
mv tmp.c cpg_lh.c
make
cd ../../
ln -s hg3rdParty/cpgIslands/cpglh.exe
_EOF_
);
$bossScript->execute();
} # doCompile
#########################################################################
# * step: hard mask [workhorse]
sub doHardMask {
my $runDir = $buildDir;
&HgAutomate::mustMkdir($runDir);
my $whatItDoes = "Make hard-masked fastas for each chrom.";
my $workhorse = &HgAutomate::chooseWorkhorse();
my $bossScript = new HgRemoteScript("$runDir/doHardMask.csh", $workhorse,
$runDir, $whatItDoes);
$bossScript->add(<<_EOF_
mkdir -p hardMaskedFa
foreach chrom ( `twoBitInfo $maskedSeq stdout | cut -f1` )
twoBitToFa ${maskedSeq}:\$chrom stdout \\
| maskOutFa stdin hard \\
- hardMaskedFa\/\$\{chrom\}.fa
+ hardMaskedFa/\$chrom.fa
end
_EOF_
);
$bossScript->execute();
} # doHardMask
#########################################################################
# * step: cpg [workhorse] (will change to a cluster at some point)
sub doCpg {
my $runDir = $buildDir;
&HgAutomate::mustMkdir($runDir);
my $whatItDoes = "Run cpglh.exe on masked sequence.";
my $workhorse = &HgAutomate::chooseWorkhorse();
my $bossScript = new HgRemoteScript("$runDir/doCpg.csh", $workhorse,
$runDir, $whatItDoes);
$bossScript->add(<<_EOF_
mkdir -p results
foreach chrom ( `twoBitInfo $maskedSeq stdout | cut -f1` )
- ./cpglh.exe hardMaskedFa\/\$\{chrom\}.fa > results\/\$\{chrom\}.cpg
+ set C = `faCount hardMaskedFa/\$chrom.fa | egrep -v "^#seq|^total" | awk '{print \$2 - \$7 }'`
+ if ( \$C > 200 ) then
+ ./cpglh.exe hardMaskedFa/\$chrom.fa > results/\$chrom.cpg
+ else
+ touch results/\$chrom.cpg
+ endif
end
_EOF_
);
$bossScript->execute();
} # doCpg
#########################################################################
# * step: make bed [workhorse]
sub doMakeBed {
my $runDir = $buildDir;
&HgAutomate::mustMkdir($runDir);
my $whatItDoes = "Makes bed from cpglh.exe output.";
my $workhorse = &HgAutomate::chooseWorkhorse();
my $bossScript = new HgRemoteScript("$runDir/doMakeBed.csh", $workhorse,
$runDir, $whatItDoes);
$bossScript->add(<<_EOF_
catDir results \\
| awk \'\{\$2 = \$2 - 1; width = \$3 - \$2; printf\(\"\%s\\t\%d\\t\%s\\t\%s \%s\\t\%s\\t\%s\\t\%0.0f\\t\%0.1f\\t\%s\\t\%s\\n\", \$1, \$2, \$3, \$5, \$6, width, \$6, width\*\$7\*0.01, 100.0\*2\*\$6\/width, \$7, \$9\);}\' \\
| sort -k1,1 -k2,2n > cpgIsland.bed
_EOF_
);
$bossScript->execute();
} # doMakeBed
#########################################################################
# * step: load [dbHost]
sub doLoadCpg {
my $runDir = $buildDir;
&HgAutomate::mustMkdir($runDir);
my $whatItDoes = "Loads cpgIsland.bed.";
my $bossScript = new HgRemoteScript("$runDir/doLoadCpg.csh", $dbHost,
$runDir, $whatItDoes);
$bossScript->add(<<_EOF_
cvs -d /projects/compbio/cvsroot checkout -P kent/src/hg/lib
ln -s kent/src/hg/lib/cpgIslandExt.sql
hgLoadBed -sqlTable=cpgIslandExt.sql -tab $db cpgIslandExt cpgIsland.bed
_EOF_
);
$bossScript->execute();
} # doLoad
#########################################################################
# * step: cleanup [fileServer]
sub doCleanup {
my $runDir = $buildDir;
my $whatItDoes = "It cleans up or compresses intermediate files.";
my $fileServer = &HgAutomate::chooseFileServer($runDir);
my $bossScript = new HgRemoteScript("$runDir/doCleanup.csh", $fileServer,
$runDir, $whatItDoes);
$bossScript->add(<<_EOF_
rm -rf hg3rdParty/ kent/
rm -rf hardMaskedFa/
rm -rf results/
rm -f bed.tab cpgIslandExt.sql cpglh.exe
gzip cpgIsland.bed
_EOF_
);
$bossScript->execute();
} # doCleanup
#########################################################################
# main
# Prevent "Suspended (tty input)" hanging:
&HgAutomate::closeStdin();
# Make sure we have valid options and exactly 1 argument:
&checkOptions();
&usage(1) if (scalar(@ARGV) != 1);
($db) = @ARGV;
# Force debug and verbose until this is looking pretty solid:
#$opt_debug = 1;
#$opt_verbose = 3 if ($opt_verbose < 3);
# Establish what directory we will work in.
$buildDir = $opt_buildDir ? $opt_buildDir :
"$HgAutomate::clusterData/$db/$HgAutomate::trackBuild/cpgIslands";
$maskedSeq = $opt_maskedSeq ? $opt_maskedSeq :
"$HgAutomate::clusterData/$db/$db.2bit";
# Do everything.
$stepper->execute();
# Tell the user anything they should know.
my $stopStep = $stepper->getStopStep();
my $upThrough = ($stopStep eq 'cleanup') ? "" :
" (through the '$stopStep' step)";
&HgAutomate::verbose(1,
"\n *** All done!$upThrough\n");
&HgAutomate::verbose(1,
" *** Steps were performed in $buildDir\n");
&HgAutomate::verbose(1, "\n");