src/hg/utils/automation/doBlastzChainNet.pl 1.31
1.31 2010/02/11 23:47:51 hiram
not loading split table is now the default
Index: src/hg/utils/automation/doBlastzChainNet.pl
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/utils/automation/doBlastzChainNet.pl,v
retrieving revision 1.30
retrieving revision 1.31
diff -b -B -U 1000000 -r1.30 -r1.31
--- src/hg/utils/automation/doBlastzChainNet.pl 26 Oct 2009 20:54:43 -0000 1.30
+++ src/hg/utils/automation/doBlastzChainNet.pl 11 Feb 2010 23:47:51 -0000 1.31
@@ -1,1579 +1,1585 @@
#!/usr/bin/env perl
# DO NOT EDIT the /cluster/bin/scripts copy of this file --
# edit ~/kent/src/hg/utils/automation/doBlastzChainNet.pl instead.
# $Id$
# to-do items:
# - lots of testing
# - better logging: right now it just passes stdout and stderr,
# leaving redirection to a logfile up to the user
# - -swapBlastz, -loadBlastz
# - -tDb, -qDb
# - -tUnmasked, -qUnmasked
# - -axtBlastz
# - another Gill wish list item: save a lav header (involves run-blastz-ucsc)
# - 2bit / multi-sequence support when abridging?
# - reciprocal best?
# - hgLoadSeq of query instead of assuming there's a $qDb database?
use Getopt::Long;
use warnings;
use strict;
use FindBin qw($Bin);
use lib "$Bin";
use HgAutomate;
use HgRemoteScript;
use HgStepManager;
# Hardcoded paths/command sequences:
# Helper command that enforceClusterNoNo() runs to learn which fileserver
# hosts a given directory.
my $getFileServer = '/cluster/bin/scripts/fileServer';
# Helper scripts expected to live next to this script ($Bin is from FindBin).
my $blastzRunUcsc = "$Bin/blastz-run-ucsc";
my $partition = "$Bin/partitionSequence.pl";
# Storage locations that the help text recommends for SEQ*_DIR / SEQ*_SMSK.
my $clusterLocal = '/scratch/hg';
my $clusterSortaLocal = '/iscratch/i';
# Locations that -blastzOutRoot is expected to be under; checkOptions() only
# warns (does not die) when it is somewhere else.
my @clusterNAS = ('/cluster/bluearc', '/san/sanvol1');
# Human-readable form of @clusterNAS for help text and warnings.
my $clusterNAS = join('/... or ', @clusterNAS) . '/...';
# Directories that must never receive cluster output; enforceClusterNoNo()
# dies when a directory falls under one of them.
my @clusterNoNo = ('/cluster/home', '/projects');
# Fileservers that must never host cluster output; also checked by
# enforceClusterNoNo().
my @fileServerNoNo = ('kkhome', 'kks00');
# NOTE(review): not referenced in the portion of the file reviewed here;
# presumably hosts that cannot be logged into -- confirm against later code.
my @fileServerNoLogin = ('kkusr01', '10.1.1.3', '10.1.10.11',
'sanhead1', 'sanhead2', 'sanhead3', 'sanhead4',
'sanhead5', 'sanhead6', 'sanhead7', 'sanhead8');
# Option variable names, both common and peculiar to doBlastz:
use vars @HgAutomate::commonOptionVars;
use vars @HgStepManager::optionVars;
use vars qw/
$opt_blastzOutRoot
$opt_swap
$opt_chainMinScore
$opt_chainLinearGap
$opt_tRepeats
$opt_qRepeats
$opt_readmeOnly
$opt_ignoreSelf
$opt_syntenicNet
$opt_noDbNameCheck
$opt_inclHap
$opt_noLoadChainSplit
+ $opt_loadChainSplit
/;
# Specify the steps supported with -continue / -stop:
# Each entry maps a step name (as given to -continue / -stop) to the sub that
# implements it.  checkOptions() relies on the list order via
# $stepper->stepPrecedes(); presumably this is also the execution order --
# confirm in HgStepManager.
my $stepper = new HgStepManager(
[ { name => 'partition', func => \&doPartition },
{ name => 'blastz', func => \&doBlastzClusterRun },
{ name => 'cat', func => \&doCatRun },
{ name => 'chainRun', func => \&doChainRun },
{ name => 'chainMerge', func => \&doChainMerge },
{ name => 'net', func => \&netChains },
{ name => 'load', func => \&loadUp },
{ name => 'download', func => \&doDownloads },
{ name => 'cleanup', func => \&cleanup },
{ name => 'syntenicNet',func => \&doSyntenicNet }
]
);
# Option defaults:
# Hosts used by the pipeline steps.  $bigClusterHub, $smallClusterHub and
# $workhorse are overridden in checkOptions() when the matching -option is
# given; all four are shown as defaults in the common option help.
my $bigClusterHub = 'kk';
my $smallClusterHub = 'memk';
my $dbHost = 'hgwdev';
my $workhorse = 'kolossus';
# Used by doChainRun() when -chainLinearGap is not given.
my $defaultChainLinearGap = "loose";
my $defaultChainMinScore = "1000"; # from axtChain itself
my $defaultTRepeats = ""; # for netClass option tRepeats
my $defaultQRepeats = ""; # for netClass option qRepeats
# Fallbacks used by doPartition() when DEF does not set SEQ1_LIMIT/SEQ2_LIMIT.
my $defaultSeq1Limit = 30;
my $defaultSeq2Limit = 100;
sub usage {
  # Usage / help / self-documentation.
  # Prints the option summary and pipeline overview to STDERR, then exits.
  #   $status   -- exit code passed to exit().
  #   $detailed -- when true, the long "Assumptions" section is printed too.
  # Help-text fixes relative to the previous wording: the BASE override is
  # item #11 (not #10), "effects" -> "affects", and the -chainLinearGap
  # default is taken from $defaultChainLinearGap instead of being hard-coded.
  my ($status, $detailed) = @_;
  my $base = $0;
  $base =~ s/^(.*\/)?//;
  # Basic help (for incorrect usage):
  print STDERR "
usage: $base DEF
options:
";
  print STDERR $stepper->getOptionHelp();
  print STDERR <<_EOF_
-blastzOutRoot dir Directory path where outputs of the blastz cluster
run will be stored. By default, they will be
stored in the $HgAutomate::clusterData build directory , but
this option can specify something more cluster-
friendly: $clusterNAS .
If dir does not already exist it will be created.
Blastz outputs are removed in the cleanup step.
-swap DEF has already been used to create chains; swap
those chains (target for query), then net etc. in
a new directory:
$HgAutomate::clusterData/\$qDb/$HgAutomate::trackBuild/blastz.\$tDb.swap/
-chainMinScore n Add -minScore=n (default: $defaultChainMinScore) to the
axtChain command.
-chainLinearGap type Add -linearGap=<loose|medium|filename> to the
axtChain command. (default: $defaultChainLinearGap)
-tRepeats table Add -tRepeats=table to netClass (default: rmsk)
-qRepeats table Add -qRepeats=table to netClass (default: rmsk)
-ignoreSelf Do not assume self alignments even if tDb == qDb
-syntenicNet Perform optional syntenicNet step
-noDbNameCheck ignore Db name format
-inclHap include haplotypes *_hap* in chain/net, default not
- -noLoadChainSplit do not load split chain tables even if chrom based
+ -loadChainSplit load split chain tables, default is not split tables
_EOF_
  ;
  print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost,
                                                'workhorse' => $workhorse,
                                                'fileServer' => '',
                                                'bigClusterHub' => $bigClusterHub,
                                                'smallClusterHub' => $smallClusterHub);
  print STDERR "
Automates UCSC's blastz/chain/net pipeline:
1. Big cluster run of blastz.
2. Small cluster consolidation of blastz result files.
3. Small cluster chaining run.
4. Sorting and netting of chains on the fileserver
(no nets for self-alignments).
5. Generation of liftOver-suitable chains from nets+chains on fileserver
(not done for self-alignments).
6. Generation of axtNet and mafNet files on the fileserver (not for self).
7. Addition of gap/repeat info to nets on hgwdev (not for self).
8. Loading of chain and net tables on hgwdev (no nets for self).
9. Setup of download directory on hgwdev.
10.Optional (-syntenicNet flag): Generation of syntenic mafNet files.
DEF is a Scott Schwartz-style bash script containing blastz parameters.
This script makes a lot of assumptions about conventional placements of
certain files, and what will be in the DEF vars. Stick to the conventions
described in the -help output, pray to the cluster gods, and all will go
well. :)
";
  # Detailed help (-help):
  print STDERR "
Assumptions:
1. $HgAutomate::clusterData/\$db/ is the main directory for database/assembly \$db.
$HgAutomate::clusterData/\$tDb/$HgAutomate::trackBuild/blastz.\$qDb.\$date/ will be the directory
created for this run, where \$tDb is the target/reference db and
\$qDb is the query. (Can be overridden, see #11 below.)
$dbHost:$HgAutomate::goldenPath/\$tDb/vs\$QDb/ (or vsSelf)
is the directory where downloadable files need to go.
LiftOver chains (not applicable for self-alignments) go in this file:
$HgAutomate::clusterData/\$tDb/$HgAutomate::trackBuild/liftOver/\$tDbTo\$QDb.over.chain.gz
a copy is kept here (in case the liftOver/ copy is overwritten):
$HgAutomate::clusterData/\$tDb/$HgAutomate::trackBuild/blastz.\$qDb.\$date/\$tDb.\$qDb.over.chain.gz
and symbolic links to the liftOver/ file are put here:
$dbHost:$HgAutomate::goldenPath/\$tDb/liftOver/\$tDbTo\$QDb.over.chain.gz
$dbHost:$HgAutomate::gbdb/\$tDb/liftOver/\$tDbTo\$QDb.over.chain.gz
2. DEF's SEQ1* variables describe the target/reference assembly.
DEF's SEQ2* variables describe the query assembly.
If those are the same assembly, then we're doing self-alignments and
will drop aligned blocks that cross the diagonal.
3. DEF's SEQ1_DIR is either a directory containing one nib file per
target sequence (usually chromosome), OR a complete path to a
single .2bit file containing all target sequences. This directory
should be in $clusterLocal or $clusterSortaLocal .
SEQ2_DIR: ditto for query.
4. DEF's SEQ1_LEN is a tab-separated dump of the target database table
chromInfo -- or at least a file that contains all sequence names
in the first column, and corresponding sizes in the second column.
Normally this will be $HgAutomate::clusterData/\$tDb/chrom.sizes, but for a
scaffold-based assembly, it is a good idea to put it in $clusterSortaLocal
or $clusterNAS
because it will be a large file and it is read by blastz-run-ucsc
(big cluster script).
SEQ2_LEN: ditto for query.
5. DEF's SEQ1_CHUNK and SEQ1_LAP determine the step size and overlap size
of chunks into which large target sequences are to be split before
alignment. SEQ2_CHUNK and SEQ2_LAP: ditto for query.
6. DEF's SEQ1_LIMIT and SEQ2_LIMIT decide what the maximum number of
sequences should be for any partitioned file (the files created in the
tParts and qParts directories). This limit only affects SEQ1 or SEQ2
when they are 2bit files. Some 2bit files have too many contigs. This
reduces the number of blastz hippos (jobs taking forever compared to
the other jobs). SEQ1_LIMIT defaults to $defaultSeq1Limit and SEQ2_LIMIT defaults to $defaultSeq2Limit.
7. DEF's BLASTZ_ABRIDGE_REPEATS should be set to something nonzero if
abridging of lineage-specific repeats is to be performed. If so, the
following additional constraints apply:
a. Both target and query assemblies must be structured as one nib file
per sequence in SEQ*_DIR (sorry, this rules out scaffold-based
assemblies).
b. SEQ1_SMSK must be set to a directory containing one file per target
sequence, with the name pattern \$seq.out.spec. This file must be
a RepeatMasker .out file (usually filtered by DateRepeats). The
directory should be under $clusterLocal or $clusterSortaLocal .
SEQ2_SMSK: ditto for query.
8. DEF's BLASTZ_[A-Z] variables will be translated into blastz command line
options (e.g. BLASTZ_H=foo --> H=foo, BLASTZ_Q=foo --> Q=foo).
For human-mouse evolutionary distance/sensitivity, none of these are
necessary (blastz-run-ucsc defaults will be used). Here's what we have
used for human-fugu and other very-distant pairs:
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=$HgAutomate::clusterData/blastz/HoxD55.q
Blastz parameter tuning is somewhat of an art and is beyond the scope
here. Webb Miller and Jim can provide guidance on how to set these for
a new pair of organisms.
9. DEF's PATH variable, if set, must specify a path that contains programs
necessary for blastz to run: blastz, and if BLASTZ_ABRIDGE_REPEATS is set,
then also fasta_subseq, strip_rpts, restore_rpts, and revcomp.
If DEF does not contain a PATH, blastz-run-ucsc will use its own default.
10. DEF's BLASTZ variable can specify an alternate path for blastz.
11. DEF's BASE variable can specify the blastz/chain/net build directory
(defaults to $HgAutomate::clusterData/\$tDb/$HgAutomate::trackBuild/blastz.\$qDb.\$date/).
12. SEQ?_CTGDIR specifies sequence source with the contents of full chrom
sequences and the contig randoms and chrUn. This keeps the contigs
separate during the blastz and chaining so that chains won't go through
across multiple contigs on the randoms.
13. SEQ?_CTGLEN specifies a length file to be used in conjunction with the
special SEQ?_CTGDIR file specified above which contains the random contigs.
14. SEQ?_LIFT specifies a lift file to lift sequences in the SEQ?_CTGDIR
to their random and chrUn positions. This is useful for a 2bit file that
has both full chrom sequences and the contigs for the randoms.
15. SEQ2_SELF=1 specifies the SEQ2 is already specially split for self
alignments and to use SEQ2 sequence for self alignment, not just a
copy of SEQ1
16. TMPDIR - specifies directory on cluster node to keep temporary files
Typically TMPDIR=/scratch/tmp
17. All other variables in DEF will be ignored!
" if ($detailed);
  exit $status;
}
# Globals:
# Variables read from the DEF file by loadDef(), keyed by variable name.
my %defVars = ();
# $tDb / $qDb are the target and query databases, $QDb is 'Self' or
# ucfirst($qDb), and $isSelf / $selfSplit are self-alignment flags; all of
# these are set in checkDef().  $DEF, $buildDir and $fileServer are assigned
# outside the code reviewed here.
my ($DEF, $tDb, $qDb, $QDb, $isSelf, $selfSplit, $buildDir, $fileServer);
# Also assigned outside the code reviewed here.  $swapDir is the -swap build
# directory used by swapChains()/swapGlobals(); $splitRef gates the chainSplit
# calls; $inclHap is interpolated into the chainPreNet/chainNet command lines.
my ($swapDir, $splitRef, $inclHap);
sub isInDirList {
  # Return TRUE (1) if $dir equals, or lies beneath, one of the directories
  # in @dirList; return FALSE (0) otherwise.
  #   $dir     -- path to test.
  #   @dirList -- directory prefixes, taken literally (not as regexes).
  my ($dir, @dirList) = @_;
  # With no candidates the alternation would be empty and the pattern
  # '^()(/.*)?$' would accept every absolute path.
  return 0 if (! @dirList);
  # quotemeta keeps characters such as '.' or '+' in a directory name from
  # being interpreted as regex operators.
  my $pat = '^(?:' . join('|', map { quotemeta($_) } @dirList) . ')(?:/.*)?$';
  return ($dir =~ m{$pat}) ? 1 : 0;
}
sub enforceClusterNoNo {
  # Die right away if user is trying to put cluster output somewhere
  # off-limits.
  #   $dir  -- directory proposed for cluster output.
  #   $desc -- name of the option/variable that supplied $dir (for messages).
  # Two checks: $dir must not be under any @clusterNoNo directory, and the
  # fileserver reported by $getFileServer must not be in @fileServerNoNo.
  my ($dir, $desc) = @_;
  if (&isInDirList($dir, @clusterNoNo)) {
    die "\ncluster outputs are forbidden to go to " .
      join (' or ', @clusterNoNo) . " so please choose a different " .
      "$desc instead of $dir .\n\n";
  }
  my $testFileServer = `$getFileServer $dir/`;
  # Backtick output keeps any trailing newline the helper prints; left in
  # place it would stop the host name from ever matching the forbidden list
  # (and would garble the error message).
  $testFileServer = '' if (! defined $testFileServer);
  chomp $testFileServer;
  # Compare as plain strings so that characters in the host name are never
  # treated as regex syntax.
  if (scalar(grep { $_ eq $testFileServer } @fileServerNoNo)) {
    die "\ncluster outputs are forbidden to go to fileservers " .
      join (' or ', @fileServerNoNo) . " so please choose a different " .
      "$desc instead of $dir (which is hosted on $testFileServer).\n\n";
  }
}
sub checkOptions {
# Make sure command line options are valid/supported.
# Parses the command line with GetOptions (no destinations are passed, so
# values land in the $opt_* package variables declared with "use vars"),
# applies the -swap constraints on -continue/-stop, validates -blastzOutRoot
# and copies host overrides into $workhorse/$bigClusterHub/$smallClusterHub.
# Calls usage(), which exits, on any invalid combination.
my $ok = GetOptions(@HgStepManager::optionSpec,
@HgAutomate::commonOptionSpec,
"blastzOutRoot=s",
"swap",
"chainMinScore=i",
"chainLinearGap=s",
"tRepeats=s",
"qRepeats=s",
"readmeOnly",
"ignoreSelf",
"syntenicNet",
"noDbNameCheck",
"inclHap",
- "noLoadChainSplit"
+ "noLoadChainSplit",
+ "loadChainSplit"
);
&usage(1) if (!$ok);
# -help: print the detailed help and exit with status 0.
&usage(0, 1) if ($opt_help);
&HgAutomate::processCommonOptions();
my $err = $stepper->processOptions();
usage(1) if ($err);
if ($opt_swap) {
# -swap reuses chains from an earlier run, so the steps before chainMerge
# are not applicable.
# NOTE(review): an explicit -continue must be 'net' or later, yet the step
# forced below when -continue is absent is 'chainMerge' -- confirm that
# rejecting an explicit "-continue chainMerge" is intended.
if ($opt_continue) {
if ($stepper->stepPrecedes($opt_continue, 'net')) {
warn "\nIf -swap is specified, then -continue must specify a step ".
"of \"net\" or later.\n";
&usage(1);
}
} else {
# If -swap is given but -continue is not, force -continue and tell
# $stepper to reevaluate options:
$opt_continue = 'chainMerge';
$err = $stepper->processOptions();
usage(1) if ($err);
}
if ($opt_stop) {
if ($stepper->stepPrecedes($opt_stop, 'chainMerge')) {
warn "\nIf -swap is specified, then -stop must specify a step ".
"of \"chainMerge\" or later.\n";
&usage(1);
}
}
}
if ($opt_blastzOutRoot) {
# Require an absolute path with at least two components.
if ($opt_blastzOutRoot !~ m@^/\S+/\S+@) {
warn "\n-blastzOutRoot must specify a full path.\n";
&usage(1);
}
# Dies if the directory is in a forbidden location.
&enforceClusterNoNo($opt_blastzOutRoot, '-blastzOutRoot');
# Being outside @clusterNAS is allowed, but worth a warning.
if (! &isInDirList($opt_blastzOutRoot, @clusterNAS)) {
warn "\n-blastzOutRoot is intended to specify something on " .
"$clusterNAS, but I'll trust your judgment " .
"and use $opt_blastzOutRoot\n\n";
}
}
# Command-line host choices override the built-in defaults.
$workhorse = $opt_workhorse if ($opt_workhorse);
$bigClusterHub = $opt_bigClusterHub if ($opt_bigClusterHub);
$smallClusterHub = $opt_smallClusterHub if ($opt_smallClusterHub);
}
#########################################################################
# The following routines were taken almost verbatim from blastz-run-ucsc,
# so may be good candidates for libification! unless that would slow down
# blastz-run-ucsc...
# nfsNoodge() was removed from loadDef() and loadSeqSizes() -- since this
# script will not be run on the cluster, we should fully expect files to
# be immediately visible.
sub loadDef {
  # Read parameters from a bash script with Scott's param variable names.
  # Every "VAR=value" line of $def (an optional leading "export" is dropped,
  # comment and blank lines are skipped) is stored in %defVars.  References
  # of the form $NAME inside a value are replaced, one at a time, by the
  # value already recorded for NAME; an unknown NAME is fatal.
  my ($def) = @_;
  my $fh = &HgAutomate::mustOpen("$def");
 LINE:
  while (my $line = <$fh>) {
    $line =~ s/^\s*export\s+//;
    next LINE if ($line =~ /^\s*#/);
    next LINE if ($line =~ /^\s*$/);
    my ($var, $val) = ($line =~ /(\w+)\s*=\s*(.*)/);
    next LINE if (! defined $var);
    # Keep expanding until no $NAME reference is left in the value.
    while ($val =~ /\$(\w+)/) {
      my $refName = $1;
      my $subst = $defVars{$refName};
      if (! defined $subst) {
        die "Can't find value to substitute for \$$refName in $DEF var $var.\n";
      }
      $val =~ s/\$$refName/$subst/;
    }
    $defVars{$var} = $val;
  }
  close($fh);
}
sub loadSeqSizes {
  # Load up sequence -> size mapping from $sizeFile into $hashRef.
  # Each line is split on whitespace; the first field is the sequence name
  # and the second its size.  Any further fields are ignored.
  my ($sizeFile, $hashRef) = @_;
  my $fh = &HgAutomate::mustOpen("$sizeFile");
  while (my $line = <$fh>) {
    chomp $line;
    my ($seq, $size) = split(' ', $line);
    $hashRef->{$seq} = $size;
  }
  close($fh);
}
# end shared stuff from blastz-run-ucsc
#########################################################################
sub requireVar {
  # Die unless DEF variable $var has a defined value in %defVars.
  my ($var) = @_;
  return if (defined $defVars{$var});
  die "Error: $DEF is missing variable $var\n";
}
sub requirePath {
  # Die unless DEF variable $var holds a complete path that is either a
  # directory containing at least one regular file (counted with find at
  # depth 1) or a non-empty file.
  my ($var) = @_;
  my $val = $defVars{$var};
  if ($val !~ m@^/\S+/\S+@) {
    die "Error: $DEF $var=$val must specify a complete path\n";
  }
  if (! -d $val) {
    # Not a directory, so it has to be a file with some content.
    if (! -s $val) {
      die "Error: $DEF variable: $var=$val is not a file or directory.\n";
    }
    return;
  }
  my $fileCount = `find $val -maxdepth 1 -type f | wc -l`;
  chomp $fileCount;
  die "Error: $DEF variable: $var=$val specifies an empty directory.\n"
    if ($fileCount < 1);
}
sub requireNum {
  # Die unless DEF variable $var consists of decimal digits only.
  my ($var) = @_;
  my $val = $defVars{$var};
  unless ($val =~ /^\d+$/) {
    die "Error: $DEF variable $var=$val must specify a number.\n";
  }
}
# Regex fragments for recognized database names: two lowercase letters plus
# optional digits (old style), or three lowercase letters, an uppercase
# letter, a lowercase letter, a lowercase letter or digit, plus optional
# digits (new style).
my $oldDbFormat = '[a-z][a-z](\d+)?';
my $newDbFormat = '[a-z][a-z][a-z][A-Z][a-z][a-z0-9](\d+)?';
sub getDbFromPath {
  # Require that $val is a full path that contains a recognizable db as
  # one of its elements (possibly the last one).
  #   $var -- name of the DEF variable whose value is the path.
  # Returns the database name.  With -noDbNameCheck the name-format check is
  # skipped and the name is taken from a /hive/data/genomes/<db>/ or
  # /scratch/data/<db>/ prefix instead; undef is returned if neither prefix
  # applies.  Dies when the format check is active and fails.
  my ($var) = @_;
  my $val = $defVars{$var};
  my $db;
  if ($opt_noDbNameCheck) {
    # Deliberately leave $db undefined here.  Reading $1 without running the
    # match (as a short-circuited "||" would) picks up whatever an unrelated
    # earlier match happened to capture.
  } elsif ($val =~ m@^/\S+/($oldDbFormat|$newDbFormat)((\.2bit)|(/(\S+)?))?$@) {
    $db = $1;
  } else {
    die "Error: $DEF variable $var=$val must be a full path with " .
      "a recognizable database as one of its elements.\n"
  }
  if (! defined($db)) {
    # Fall back on the directory element that follows a well-known prefix.
    if ($val =~ m#^/hive/data/genomes/#) {
      $val =~ s#^/hive/data/genomes/##;
      $val =~ s#/.*##;
      $db = $val;
      warn "Warning: assuming database $db from /hive/data/genomes/<db>/ path\n";
    } elsif ($val =~ m#^/scratch/data/#) {
      $val =~ s#^/scratch/data/##;
      $val =~ s#/.*##;
      $db = $val;
      warn "Warning: assuming database $db from /scratch/data/<db>/ path\n";
    }
  }
  return $db;
}
sub checkDef {
  # Make sure %defVars contains what we need and looks consistent with
  # our assumptions.  Also derives the globals $tDb, $qDb, $QDb, $isSelf
  # and $selfSplit from the DEF contents.
  foreach my $prefix ('SEQ1_', 'SEQ2_') {
    my @required = map { $prefix . $_ } ('DIR', 'LEN', 'CHUNK', 'LAP');
    foreach my $name (@required) {
      &requireVar($name);
    }
    &requirePath("${prefix}DIR");
    &requirePath("${prefix}LEN");
    &requireNum("${prefix}CHUNK");
    &requireNum("${prefix}LAP");
  }
  $tDb = &getDbFromPath('SEQ1_DIR');
  $qDb = &getDbFromPath('SEQ2_DIR');
  # Same db on both sides means self-alignment unless -ignoreSelf says no.
  if ($opt_ignoreSelf) {
    $isSelf = 0;
  } else {
    $isSelf = ($tDb eq $qDb);
  }
  # special split on SEQ2 for Self alignments
  $selfSplit = $defVars{'SEQ2_SELF'} || 0;
  $QDb = $isSelf ? 'Self' : ucfirst($qDb);
  die "-swap is not supported for self-alignments\n" .
    "($DEF has $tDb as both target and query).\n"
    if ($isSelf && $opt_swap);
  HgAutomate::verbose(1, "$DEF looks OK!\n" .
                      "\ttDb=$tDb\n\tqDb=$qDb\n\ts1d=$defVars{SEQ1_DIR}\n" .
                      "\tisSelf=$isSelf\n");
  # Any one of the abridging variables being set makes all of them mandatory.
  my $abridging = ($defVars{'SEQ1_SMSK'} || $defVars{'SEQ2_SMSK'} ||
                   $defVars{'BLASTZ_ABRIDGE_REPEATS'});
  if ($abridging) {
    &requireVar('BLASTZ_ABRIDGE_REPEATS');
    foreach my $smsk ('SEQ1_SMSK', 'SEQ2_SMSK') {
      &requireVar($smsk);
      &requirePath($smsk);
    }
    HgAutomate::verbose(1, "Abridging repeats!\n");
  }
}
sub doPartition {
  # Partition the sequence up before blastz.
  # Builds partitionSequence.pl command lines for target and query, runs them
  # through a remote script on the big cluster hub (which also reports the
  # resulting job count), then runs the generated xdir.sh to create the
  # output directories.
  my $paraHub = $bigClusterHub;
  my $runDir = "$buildDir/run.blastz";
  my $targetList = "$tDb.lst";
  my $queryList = $isSelf ? $targetList :
    ($opt_ignoreSelf ? "$qDb.ignoreSelf.lst" : "$qDb.lst");
  if ($selfSplit) {
    $queryList = "$qDb.selfSplit.lst"
  }
  my $tPartDir = '-lstDir tParts';
  my $qPartDir = '-lstDir qParts';
  my $outRoot = $opt_blastzOutRoot ? "$opt_blastzOutRoot/psl" : '../psl';
  # Contig-level directories/lengths take precedence when DEF provides them.
  my $seq1Dir = $defVars{'SEQ1_CTGDIR'} || $defVars{'SEQ1_DIR'};
  my $seq2Dir = $defVars{'SEQ2_CTGDIR'} || $defVars{'SEQ2_DIR'};
  my $seq1Len = $defVars{'SEQ1_CTGLEN'} || $defVars{'SEQ1_LEN'};
  my $seq2Len = $defVars{'SEQ2_CTGLEN'} || $defVars{'SEQ2_LEN'};
  my $seq1Limit = (defined $defVars{'SEQ1_LIMIT'}) ? $defVars{'SEQ1_LIMIT'} :
    $defaultSeq1Limit;
  my $seq2Limit = (defined $defVars{'SEQ2_LIMIT'}) ? $defVars{'SEQ2_LIMIT'} :
    $defaultSeq2Limit;
  my $partitionTargetCmd =
    ("$partition $defVars{SEQ1_CHUNK} $defVars{SEQ1_LAP} " .
     "$seq1Dir $seq1Len -xdir xdir.sh -rawDir $outRoot $seq1Limit " .
     "$tPartDir > $targetList");
  # For a plain self-alignment the target list doubles as the query list, so
  # only a comment is emitted in place of the query partition command.
  my $partitionQueryCmd =
    (($isSelf && (! $selfSplit)) ?
     '# Self-alignment ==> use target partition for both.' :
     "$partition $defVars{SEQ2_CHUNK} $defVars{SEQ2_LAP} " .
     "$seq2Dir $seq2Len $seq2Limit " .
     "$qPartDir > $queryList");
  &HgAutomate::mustMkdir($runDir);
  my $whatItDoes =
"It computes partitions of target and query sequences into chunks of the
specified size for the blastz cluster run. The actual splitting of
sequence is not performed here, but later on by blastz cluster jobs.";
  my $bossScript = new HgRemoteScript("$runDir/doPartition.csh", $paraHub,
                                      $runDir, $whatItDoes, $DEF);
  # The job count is (target list lines) * (query list lines); the echo names
  # both factors, L1 and L2.
  $bossScript->add(<<_EOF_
$partitionTargetCmd
+set L1 = `wc -l < $targetList`
$partitionQueryCmd
+set L2 = `wc -l < $queryList`
+set L = `echo \$L1 \$L2 | awk '{print \$1*\$2}'`
+echo "cluster batch jobList size: \$L = \$L1 * \$L2"
_EOF_
  );
  $bossScript->execute();
  # A user-supplied output root is created from the cluster hub; otherwise
  # xdir.sh runs on the fileserver.
  my $mkOutRootHost = $opt_blastzOutRoot ? $paraHub : $fileServer;
  my $mkOutRoot = $opt_blastzOutRoot ? "mkdir -p $opt_blastzOutRoot;" : "";
  &HgAutomate::run("$HgAutomate::runSSH $mkOutRootHost " .
                   "'(cd $runDir; $mkOutRoot csh -ef xdir.sh)'");
}
sub doBlastzClusterRun {
  # Set up and perform the big-cluster blastz run.
  # Refuses to run if the run directory shows signs of an earlier (complete
  # or partial) run, or if the list files written by the partition step are
  # missing; -debug relaxes all but the run.time check.
  my $paraHub = $bigClusterHub;
  my $runDir = "$buildDir/run.blastz";
  my $targetList = "$tDb.lst";
  my $outRoot = $opt_blastzOutRoot ? "$opt_blastzOutRoot/psl" : '../psl';
  my $queryList = $isSelf ? $targetList :
    ($opt_ignoreSelf ? "$qDb.ignoreSelf.lst" : "$qDb.lst");
  if ($selfSplit) {
    $queryList = "$qDb.selfSplit.lst"
  }
  # First, make sure we're starting clean.
  if (-e "$runDir/run.time") {
    die "doBlastzClusterRun: looks like this was run successfully already " .
      "(run.time exists). Either run with -continue cat or some later " .
      "stage, or move aside/remove $runDir/ and run again.\n";
  } elsif ((-e "$runDir/gsub" || -e "$runDir/jobList") && ! $opt_debug) {
    die "doBlastzClusterRun: looks like we are not starting with a clean " .
      "slate. Please move aside or remove $runDir/ and run again.\n";
  }
  # Second, make sure we got through the partitioning already
  # (the messages name this step as it is registered with the stepper:
  # "blastz").
  if (! -e "$runDir/$targetList" && ! $opt_debug) {
    die "doBlastzClusterRun: there's no target list file " .
      "so start over without the -continue blastz.\n";
  }
  if (! -e "$runDir/$queryList" && ! $opt_debug) {
    die "doBlastzClusterRun: there's no query list file " .
      "so start over without the -continue blastz.\n";
  }
  # One job per (target part, query part) pair; -dropSelf is added for
  # self-alignments.
  my $templateCmd = ("$blastzRunUcsc -outFormat psl " .
                     ($isSelf ? '-dropSelf ' : '') .
                     '$(path1) $(path2) ../DEF ' .
                     '{check out exists ' .
                     $outRoot . '/$(file1)/$(file1)_$(file2).psl }');
  &HgAutomate::makeGsub($runDir, $templateCmd);
  # Leave a marker file recording which hub the batch was set up for.
  `touch "$runDir/para_hub_$paraHub"`;
  my $whatItDoes = "It sets up and performs the big cluster blastz run.";
  my $bossScript = new HgRemoteScript("$runDir/doClusterRun.csh", $paraHub,
                                      $runDir, $whatItDoes, $DEF);
  $bossScript->add(<<_EOF_
$HgAutomate::gensub2 $targetList $queryList gsub jobList
$HgAutomate::paraRun
_EOF_
  );
  $bossScript->execute();
} # sub doBlastzClusterRun {}
sub doCatRun {
  # Small cluster run that concatenates the lowest level of chunk result
  # files produced by the big cluster blastz run.  The outcome is one file
  # per target chunk; those may still need to be concatenated into
  # per-target-sequence input by the following step, chaining.
  my $paraHub = $smallClusterHub;
  my $runDir = "$buildDir/run.cat";
  my $outRoot;
  if ($opt_blastzOutRoot) {
    $outRoot = "$opt_blastzOutRoot/psl";
  } else {
    $outRoot = '../psl';
  }
  # Refuse to run over the remains of an earlier attempt.
  if (-e "$runDir/run.time") {
    die "doCatRun: looks like this was run successfully already " .
      "(run.time exists). Either run with -continue chainRun or some later " .
      "stage, or move aside/remove $runDir/ and run again.\n";
  }
  if (! $opt_debug && (-e "$runDir/gsub" || -e "$runDir/jobList")) {
    die "doCatRun: looks like we are not starting with a clean " .
      "slate. Please move aside or remove $runDir/ and run again.\n";
  }
  # The blastz step must have left its completion marker behind.
  my $successFile = "$buildDir/run.blastz/run.time";
  unless (-e $successFile || $opt_debug) {
    die "doCatRun: looks like previous stage was not successful (can't find " .
      "$successFile).\n";
  }
  HgAutomate::mustMkdir($runDir);
  HgAutomate::makeGsub($runDir,
      "./cat.csh \$(path1) {check out exists ../pslParts/\$(file1).psl.gz}");
  `touch "$runDir/para_hub_$paraHub"`;
  # Per-job script: gather every .psl under one target part into a .gz file.
  my $catScript = HgAutomate::mustOpen(">$runDir/cat.csh");
  print $catScript <<_EOF_
#!/bin/csh -ef
find $outRoot/\$1/ -name "*.psl" | xargs cat | gzip -c > \$2
_EOF_
  ;
  close($catScript);
  my $whatItDoes =
"It sets up and performs a small cluster run to concatenate all files in
each subdirectory of $outRoot into a per-target-chunk file.";
  my $bossScript = HgRemoteScript->new("$runDir/doCatRun.csh", $paraHub,
                                       $runDir, $whatItDoes, $DEF);
  $bossScript->add(<<_EOF_
(cd $outRoot; find . -maxdepth 1 -type d | grep '^./') \\
| sed -e 's#/\$##; s#^./##' > tParts.lst
chmod a+x cat.csh
$HgAutomate::gensub2 tParts.lst single gsub jobList
mkdir ../pslParts
$HgAutomate::paraRun
_EOF_
  );
  $bossScript->execute();
} # sub doCatRun {}
sub makePslPartsLst {
  # Write axtChain/run/pslParts.lst from the entries of pslParts/.  Entries
  # that are subranges of one sequence (name:seq:start-end) are collapsed to
  # a single "name:seq:" line, giving one chaining job over all of that
  # sequence's alignments.  Every other entry (partitions of small target
  # seqs glommed together by partitionSequences) gets a line of its own.
  # Does nothing under -debug; dies if no entries are found.
  return if ($opt_debug);
  opendir(my $dirHandle, "$buildDir/pslParts")
    || die "Couldn't open directory $buildDir/pslParts for reading: $!\n";
  my @entries = readdir($dirHandle);
  closedir($dirHandle);
  my @wholeParts = ();
  my %subrangeSeqs = ();
  foreach my $entry (@entries) {
    my $part = $entry;
    $part =~ s@^/.*/@@;
    $part =~ s@/$@@;
    $part =~ s/\.psl\.gz//;
    next if ($part eq '.' || $part eq '..');
    if ($part =~ m@^(\S+:\S+):\d+-\d+$@) {
      # A subrange: remember only the sequence it belongs to.
      $subrangeSeqs{$1} = 1;
    } else {
      push @wholeParts, $part;
    }
  }
  my @lstLines = (@wholeParts, map { "$_:" } keys %subrangeSeqs);
  my $partsLst = "$buildDir/axtChain/run/pslParts.lst";
  my $fh = &HgAutomate::mustOpen(">$partsLst");
  foreach my $lstLine (@lstLines) {
    print $fh "$lstLine\n";
  }
  close($fh);
  if (scalar(@lstLines) < 1) {
    die "makePslPartsLst: didn't find any pslParts/ items.";
  }
}
sub doChainRun {
  # Do a small cluster run to chain alignments to each target sequence.
  # Writes the per-job chain.csh (axtChain piped into chainAntiRepeat, plus
  # optional liftUp passes when DEF sets SEQ1_LIFT / SEQ2_LIFT), builds
  # pslParts.lst, and runs the batch through a remote script.
  my $paraHub = $smallClusterHub;
  my $runDir = "$buildDir/axtChain/run";
  # First, make sure we're starting clean.
  if (-e "$runDir/run.time") {
    die "doChainRun: looks like this was run successfully already " .
      "(run.time exists). Either run with -continue chainMerge or some " .
      "later stage, or move aside/remove $runDir/ and run again.\n";
  } elsif ((-e "$runDir/gsub" || -e "$runDir/jobList") && ! $opt_debug) {
    die "doChainRun: looks like we are not starting with a clean " .
      "slate. Please move aside or remove $runDir/ and run again.\n";
  }
  # Make sure previous stage was successful.
  my $successFile = "$buildDir/run.cat/run.time";
  if (! -e $successFile && ! $opt_debug) {
    die "doChainRun: looks like previous stage was not successful (can't " .
      "find $successFile).\n";
  }
  &HgAutomate::mustMkdir($runDir);
  &HgAutomate::makeGsub($runDir,
      "chain.csh \$(file1) {check out line+ chain/\$(file1).chain}");
  `touch "$runDir/para_hub_$paraHub"`;
  my $seq1Dir = $defVars{'SEQ1_CTGDIR'} || $defVars{'SEQ1_DIR'};
  my $seq2Dir = $defVars{'SEQ2_CTGDIR'} || $defVars{'SEQ2_DIR'};
  my $matrix = $defVars{'BLASTZ_Q'} ? "-scoreScheme=$defVars{BLASTZ_Q} " : "";
  # Test definedness, not truth: an explicit "-chainMinScore 0" must reach
  # axtChain rather than silently falling back to axtChain's own default.
  my $minScore = (defined $opt_chainMinScore) ?
    "-minScore=$opt_chainMinScore" : "";
  my $linearGap = $opt_chainLinearGap ? "-linearGap=$opt_chainLinearGap" :
    "-linearGap=$defaultChainLinearGap";
  my $fh = &HgAutomate::mustOpen(">$runDir/chain.csh");
  print $fh <<_EOF_
#!/bin/csh -ef
zcat ../../pslParts/\$1*.psl.gz \\
| axtChain -psl -verbose=0 $matrix $minScore $linearGap stdin \\
$seq1Dir \\
$seq2Dir \\
stdout \\
| chainAntiRepeat $seq1Dir \\
$seq2Dir \\
stdin \$2
_EOF_
  ;
  # Optional lift of target coordinates; the lifted file replaces the
  # original chain file.
  if (exists($defVars{'SEQ1_LIFT'})) {
    print $fh <<_EOF_
set c=\$2:t:r
echo "lifting \$2 to \${c}.lifted.chain"
liftUp liftedChain/\${c}.lifted.chain \\
$defVars{'SEQ1_LIFT'} carry \$2
rm \$2
mv liftedChain/\${c}.lifted.chain \$2
_EOF_
    ;
  }
  # Same for query coordinates (liftUp -chainQ).
  if (exists($defVars{'SEQ2_LIFT'})) {
    print $fh <<_EOF_
set c=\$2:t:r
echo "lifting \$2 to \${c}.lifted.chain"
liftUp -chainQ liftedChain/\${c}.lifted.chain \\
$defVars{'SEQ2_LIFT'} carry \$2
rm \$2
mv liftedChain/\${c}.lifted.chain \$2
_EOF_
    ;
  }
  close($fh);
  &makePslPartsLst();
  my $whatItDoes =
"It sets up and performs a small cluster run to chain all alignments
to each target sequence.";
  my $bossScript = new HgRemoteScript("$runDir/doChainRun.csh", $paraHub,
                                      $runDir, $whatItDoes, $DEF);
  $bossScript->add(<<_EOF_
chmod a+x chain.csh
$HgAutomate::gensub2 pslParts.lst single gsub jobList
mkdir chain liftedChain
$HgAutomate::paraRun
rmdir liftedChain
_EOF_
  );
  $bossScript->execute();
} # sub doChainRun {}
sub postProcessChains {
  # Merge the per-job chain files from the chainRun step into one sorted,
  # gzipped $tDb.$qDb.all.chain.gz (chainMergeSort on $workhorse) and, when
  # $splitRef is set, split it per target sequence with chainSplit.
  my $runDir = "$buildDir/axtChain";
  my $chain = "$tDb.$qDb.all.chain.gz";
  # Refuse to run over the output of an earlier attempt.
  if (-e "$runDir/$chain") {
    die "postProcessChains: looks like this was run successfully already " .
      "($chain exists). Either run with -continue net or some later " .
      "stage, or move aside/remove $runDir/$chain and run again.\n";
  }
  if (-e "$runDir/all.chain" || -e "$runDir/all.chain.gz") {
    die "postProcessChains: looks like this was run successfully already " .
      "(all.chain[.gz] exists). Either run with -continue net or some later " .
      "stage, or move aside/remove $runDir/all.chain[.gz] and run again.\n";
  }
  if (-e "$runDir/chain" && ! $opt_debug) {
    die "postProcessChains: looks like we are not starting with a clean " .
      "slate. Please move aside or remove $runDir/chain and run again.\n";
  }
  # The chainRun step must have left its completion marker behind.
  my $successFile = "$buildDir/axtChain/run/run.time";
  unless (-e $successFile || $opt_debug) {
    die "postProcessChains: looks like previous stage was not successful " .
      "(can't find $successFile).\n";
  }
  my $mergeCmd = join('',
                      "$HgAutomate::runSSH $workhorse nice ",
                      "'find $runDir/run/chain -name \"*.chain\" ",
                      "| chainMergeSort -inputList=stdin ",
                      "| nice gzip -c > $runDir/$chain'");
  HgAutomate::run($mergeCmd);
  if ($splitRef) {
    HgAutomate::run("$HgAutomate::runSSH $fileServer nice " .
                    "chainSplit $runDir/chain $runDir/$chain");
  }
  &HgAutomate::nfsNoodge("$runDir/$chain");
} # sub postProcessChains {}
sub getAllChain {
  # Find the most likely candidate for all.chain from a previous run/step.
  # Returns the file name (relative to $runDir) of the first candidate that
  # exists, trying the db-qualified names before the plain ones and .gz
  # before uncompressed.  If nothing exists, returns the db-qualified .gz
  # name under -debug and undef otherwise.
  my ($runDir) = @_;
  my @candidates = ("$tDb.$qDb.all.chain.gz",
                    "$tDb.$qDb.all.chain",
                    "all.chain.gz",
                    "all.chain");
  my $found;
  foreach my $candidate (@candidates) {
    next if (! -e "$runDir/$candidate");
    $found = $candidate;
    last;
  }
  if (! defined $found && $opt_debug) {
    $found = "$tDb.$qDb.all.chain.gz";
  }
  return $found;
}
sub swapChains {
  # chainMerge step for -swap.  Takes the all.chain file of the original
  # build directory, runs it through chainSwap | chainSort | gzip on
  # $workhorse and writes $qDb.$tDb.all.chain.gz under $swapDir/axtChain;
  # when $splitRef is set the result is also split with chainSplit.
  my $runDir = "$swapDir/axtChain";
  my $swappedChain = "$qDb.$tDb.all.chain.gz";
  my $inChain = &getAllChain("$buildDir/axtChain");
  # Refuse to run over the output of an earlier attempt.
  if (-e "$runDir/$swappedChain") {
    die "swapChains: looks like this was run successfully already " .
      "($runDir/$swappedChain exists). Either run with -continue net or some " .
      "later stage, or move aside/remove $runDir/$swappedChain and run again.\n";
  }
  if (-e "$runDir/all.chain" || -e "$runDir/all.chain.gz") {
    die "swapChains: looks like this was run successfully already " .
      "($runDir/all.chain[.gz] exists). Either run with -continue net or some " .
      "later stage, or move aside/remove $runDir/all.chain[.gz] and run again.\n";
  }
  # Main routine already made sure that $buildDir/axtChain/all.chain is there.
  my $swapCmd = join('',
                     "$HgAutomate::runSSH $workhorse nice ",
                     "'chainSwap $buildDir/axtChain/$inChain stdout ",
                     "| nice chainSort stdin stdout ",
                     "| nice gzip -c > $runDir/$swappedChain'");
  HgAutomate::run($swapCmd);
  HgAutomate::nfsNoodge("$runDir/$swappedChain");
  if ($splitRef) {
    &HgAutomate::run("$HgAutomate::runSSH $fileServer nice " .
                     "chainSplit $runDir/chain $runDir/$swappedChain");
  }
} # sub swapChains {}
sub swapGlobals {
  # Exchange the roles of target and query in the global state so that the
  # steps that follow need no special handling for -swap: $buildDir becomes
  # $swapDir, $tDb and $qDb trade places, $QDb is recomputed, the
  # DIR/LEN/CHUNK/LAP/SMSK entries of SEQ1 and SEQ2 in %defVars are
  # exchanged, and BASE is pointed at $swapDir.
  $buildDir = $swapDir;
  ($tDb, $qDb) = ($qDb, $tDb);
  $QDb = $isSelf ? 'Self' : ucfirst($qDb);
  foreach my $suffix ('DIR', 'LEN', 'CHUNK', 'LAP', 'SMSK') {
    my ($seq1Key, $seq2Key) = ("SEQ1_$suffix", "SEQ2_$suffix");
    # Hash slice assignment swaps the two entries in one statement.
    @defVars{$seq1Key, $seq2Key} = @defVars{$seq2Key, $seq1Key};
  }
  $defVars{'BASE'} = $swapDir;
}
sub doChainMerge {
  # chainMerge step.  Without -swap, merge the chainRun results.  With
  # -swap, swap the chains of the other organism's run and then flip the
  # global target/query state for the steps that follow.
  return &postProcessChains() unless ($opt_swap);
  &swapChains();
  return &swapGlobals();
}
sub netChains {
# Turn chains into nets (,axt,maf,.over.chain).
# Don't do this for self alignments.
#
# Builds $runDir/netChains.csh with HgRemoteScript (run host: $workhorse)
# and executes it.  Dies if mafNet/ or noClass.net are already present, or
# if getAllChain() returns undef and $opt_debug is not set.
return if ($isSelf);
my $runDir = "$buildDir/axtChain";
# First, make sure we're starting clean.
if (-d "$buildDir/mafNet") {
die "netChains: looks like this was run successfully already " .
"(mafNet exists). Either run with -continue load or some later " .
"stage, or move aside/remove $buildDir/mafNet " .
"and $runDir/noClass.net and run again.\n";
} elsif (-e "$runDir/noClass.net") {
die "netChains: looks like we are not starting with a " .
"clean slate. Please move aside or remove $runDir/noClass.net " .
"and run again.\n";
}
# Make sure previous stage was successful.
my $chain = &getAllChain($runDir);
if (! defined $chain && ! $opt_debug) {
die "netChains: looks like previous stage was not successful " .
"(can't find [$tDb.$qDb.]all.chain[.gz]).\n";
}
my $whatItDoes =
"It generates nets (without repeat/gap stats -- those are added later on
$dbHost) from chains, and generates axt, maf and .over.chain from the nets.";
my $bossScript = new HgRemoteScript("$runDir/netChains.csh", $workhorse,
$runDir, $whatItDoes, $DEF);
# Commands common to both layouts: build noClass.net and the liftOver
# (.over.chain.gz) file.  In the here-documents below, "\$" leaves the
# variable for csh to expand; unescaped "$" is interpolated by Perl now.
$bossScript->add(<<_EOF_
# Make nets ("noClass", i.e. without rmsk/class stats which are added later):
chainPreNet $inclHap $chain $defVars{SEQ1_LEN} $defVars{SEQ2_LEN} stdout \\
| chainNet $inclHap stdin -minSpace=1 $defVars{SEQ1_LEN} $defVars{SEQ2_LEN} stdout /dev/null \\
| netSyntenic stdin noClass.net
# Make liftOver chains:
netChainSubset -verbose=0 noClass.net $chain stdout \\
| chainStitchId stdin stdout | gzip -c > $tDb.$qDb.over.chain.gz
_EOF_
);
my $seq1Dir = $defVars{'SEQ1_DIR'};
my $seq2Dir = $defVars{'SEQ2_DIR'};
# Split reference: one axt and one maf per target sequence, using the
# per-sequence files in axtChain/net and axtChain/chain.  Otherwise a single
# axt and a single maf for the whole target.
if ($splitRef) {
$bossScript->add(<<_EOF_
# Make axtNet for download: one .axt per $tDb seq.
netSplit noClass.net net
cd ..
mkdir axtNet
foreach f (axtChain/net/*.net)
netToAxt \$f axtChain/chain/\$f:t:r.chain \\
$seq1Dir $seq2Dir stdout \\
| axtSort stdin stdout \\
| gzip -c > axtNet/\$f:t:r.$tDb.$qDb.net.axt.gz
end
# Make mafNet for multiz: one .maf per $tDb seq.
mkdir mafNet
foreach f (axtNet/*.$tDb.$qDb.net.axt.gz)
axtToMaf -tPrefix=$tDb. -qPrefix=$qDb. \$f \\
$defVars{SEQ1_LEN} $defVars{SEQ2_LEN} \\
stdout \\
| gzip -c > mafNet/\$f:t:r:r:r:r:r.maf.gz
end
_EOF_
);
} else {
$bossScript->add(<<_EOF_
# Make axtNet for download: one .axt for all of $tDb.
mkdir ../axtNet
netToAxt -verbose=0 noClass.net $chain \\
$seq1Dir $seq2Dir stdout \\
| axtSort stdin stdout \\
| gzip -c > ../axtNet/$tDb.$qDb.net.axt.gz
# Make mafNet for multiz: one .maf for all of $tDb.
mkdir ../mafNet
axtToMaf -tPrefix=$tDb. -qPrefix=$qDb. ../axtNet/$tDb.$qDb.net.axt.gz \\
$defVars{SEQ1_LEN} $defVars{SEQ2_LEN} \\
stdout \\
| gzip -c > ../mafNet/$tDb.$qDb.net.maf.gz
_EOF_
);
}
$bossScript->execute();
} # sub netChains {}
sub loadUp {
# Load chains; add repeat/gap stats to net; load nets.
#
# Builds $runDir/loadUp.csh with HgRemoteScript (run host: $dbHost) and
# executes it.  Dies if $tDb.$qDb.net[.gz] already exists in $runDir, or if
# the previous step's output is missing and $opt_debug is not set.
my $runDir = "$buildDir/axtChain";
my $QDbLink = "chain$QDb" . "Link";
# First, make sure we're starting clean.
if (-e "$runDir/$tDb.$qDb.net" || -e "$runDir/$tDb.$qDb.net.gz") {
die "loadUp: looks like this was run successfully already " .
"($tDb.$qDb.net[.gz] exists). Either run with -continue download, " .
"or move aside/remove $runDir/$tDb.$qDb.net[.gz] and run again.\n";
}
# Make sure previous stage was successful.
# Self alignments skip the net step, so look for the merged chain file
# instead of the mafNet directory.
my $successDir = $isSelf ? "$runDir/$tDb.$qDb.all.chain.gz" :
"$buildDir/mafNet/";
if (! -e $successDir && ! $opt_debug) {
die "loadUp: looks like previous stage was not successful " .
"(can't find $successDir).\n";
}
my $whatItDoes =
"It loads the chain tables into $tDb, adds gap/repeat stats to the .net file,
and loads the net table.";
my $bossScript = new HgRemoteScript("$runDir/loadUp.csh", $dbHost,
$runDir, $whatItDoes, $DEF);
$bossScript->add(<<_EOF_
# Load chains:
_EOF_
);
# NOTE(review): the next two lines are diff residue from r1.30 ("-") and
# r1.31 ("+").  Per the r1.31 log message, not loading split tables is now
# the default: per-sequence chain tables are loaded only when
# $opt_loadChainSplit is set and the reference is split; otherwise a single
# chain$QDb table is loaded from the merged chain file.
- if ((! $opt_noLoadChainSplit) && $splitRef) {
+ if ($opt_loadChainSplit && $splitRef) {
$bossScript->add(<<_EOF_
cd $runDir/chain
foreach c (`awk '{print \$1;}' $defVars{SEQ1_LEN}`)
set f = \$c.chain
if (! -e \$f) then
echo no chains for \$c
set f = /dev/null
endif
hgLoadChain $tDb \${c}_chain$QDb \$f
end
_EOF_
);
} else {
$bossScript->add(<<_EOF_
cd $runDir
hgLoadChain -tIndex $tDb chain$QDb $tDb.$qDb.all.chain.gz
_EOF_
);
}
# Net commands are added only for non-self alignments.
if (! $isSelf) {
my $tRepeats = $opt_tRepeats ? "-tRepeats=$opt_tRepeats" : $defaultTRepeats;
my $qRepeats = $opt_qRepeats ? "-qRepeats=$opt_qRepeats" : $defaultQRepeats;
# With -swap the target and query have traded places, so each side takes
# the other side's repeat option (or default).
if ($opt_swap) {
$tRepeats = $opt_qRepeats ? "-tRepeats=$opt_qRepeats" : $defaultQRepeats;
$qRepeats = $opt_tRepeats ? "-qRepeats=$opt_tRepeats" : $defaultTRepeats;
}
$bossScript->add(<<_EOF_
# Add gap/repeat stats to the net file using database tables:
cd $runDir
netClass -verbose=0 $tRepeats $qRepeats -noAr noClass.net $tDb $qDb $tDb.$qDb.net
# Load nets:
netFilter -minGap=10 $tDb.$qDb.net \\
| hgLoadNet -verbose=0 $tDb net$QDb stdin
cd $buildDir
featureBits $tDb $QDbLink >&fb.$tDb.$QDbLink.txt
cat fb.$tDb.$QDbLink.txt
_EOF_
);
}
$bossScript->execute();
# maybe also peek in trackDb and see if entries need to be added for chain/net
} # sub loadUp {}
sub makeDownloads {
  # Prepare the downloadable files under axtChain/: gzip the netClassed .net
  # if an uncompressed copy is still there (the other files are expected to
  # be compressed already), then run a remote script that writes md5sum.txt
  # and, for non-self alignments, installs the liftOver chain file.
  my $runDir = "$buildDir/axtChain";
  my $plainNet = "$runDir/$tDb.$qDb.net";
  &HgAutomate::run("$HgAutomate::runSSH $fileServer nice gzip $plainNet")
    if (-e $plainNet);

  # Self alignments have no net file to checksum.
  my $net = "";
  $net = "$tDb.$qDb.net.gz" unless ($isSelf);
  my $whatItDoes =
"It makes an md5sum.txt file for downloadable files, with relative paths
matching what the user will see on the download server, and installs the
over.chain file in the liftOver dir.";
  my $bossScript = HgRemoteScript->new("$runDir/makeMd5sum.csh", $workhorse,
                                       $runDir, $whatItDoes, $DEF);
  my $liftOverDir =
    "$HgAutomate::clusterData/$tDb/$HgAutomate::trackBuild/liftOver";
  my $md5Commands = <<_EOF_;
mkdir -p $liftOverDir
md5sum $tDb.$qDb.all.chain.gz $net > md5sum.txt
_EOF_
  $bossScript->add($md5Commands);

  unless ($isSelf) {
    # Name under which the liftOver file is installed vs. the name it was
    # built with in axtChain/.
    my $installedOver = "${tDb}To$QDb.over.chain.gz";
    my $builtOver = "$tDb.$qDb.over.chain.gz";
    $bossScript->add(<<_EOF_
rm -f $liftOverDir/$installedOver
cp -p $builtOver $liftOverDir/$installedOver
cd ..
md5sum axtNet/*.gz >> axtChain/md5sum.txt
_EOF_
    );
  }
  $bossScript->execute();
}
sub getBlastzParams {
  # Gather the blastz settings quoted in README.txt.
  # Returns ($matrix, $o, $e, $k, $l, $h, $blastzOther):
  #   $matrix       the scoring matrix text: the built-in one below, or the
  #                 contents of the BLASTZ_Q file (header plus four rows,
  #                 each row labelled with its base);
  #   $o .. $h      BLASTZ_O/E/K/L/H from %defVars, or the defaults below
  #                 when the setting is missing or false;
  #   $blastzOther  a note listing any other single-letter BLASTZ_* settings
  #                 (everything except K, L, H and Q), or '' if none.
  # Dies if the BLASTZ_Q file cannot be parsed.
  my $matrix =
" A C G T
A 91 -114 -31 -123
C -114 100 -125 -31
G -31 -125 100 -114
T -123 -31 -114 91";
  my $qFile = $defVars{'BLASTZ_Q'};
  if ($qFile) {
    my $fh = &HgAutomate::mustOpen($qFile);
    my $header = <$fh>;
    die "Can't parse first line of $qFile"
      if ($header !~ /^\s*A\s+C\s+G\s+T\s*$/);
    my @rows = (' ' . $header);
    for my $base (qw(A C G T)) {
      my $row = <$fh>;
      die "Too few lines of $qFile" unless ($row);
      die "Can't parse this line of $qFile:\n$row"
        unless ($row =~ /^\s*-?\d+\s+-?\d+\s+-?\d+\s+-?\d+\s*$/);
      push @rows, " $base " . $row;
    }
    $matrix = join('', @rows);
    chomp $matrix;
    # Anything non-blank after the fifth line is only worth a warning.
    my $extra = <$fh>;
    warn "\nWarning: BLASTZ_Q matrix file $qFile has " .
         "additional contents after the matrix -- those are ignored " .
         "by blastz.\n\n"
      if ($extra && $extra =~ /\S/);
    close($fh);
  }

  my %default = (O => 400, E => 30, K => 3000, L => 2200, H => 2000);
  my ($o, $e, $k, $l, $h) =
    map { $defVars{"BLASTZ_$_"} || $default{$_} } qw(O E K L H);

  my @others;
  for my $var (sort keys %defVars) {
    next unless ($var =~ /^BLASTZ_(\w)$/);
    my $p = $1;
    next if ($p eq 'K' || $p eq 'L' || $p eq 'H' || $p eq 'Q');
    push @others, "\n $p=$defVars{$var}";
  }
  my $blastzOther = '';
  if (@others) {
    $blastzOther = join('', 'Other blastz
parameters specifically set for this species pair:', @others);
  }
  return ($matrix, $o, $e, $k, $l, $h, $blastzOther);
}
sub commafy {
  # Return $num with a comma inserted between each group of three digits in
  # the run of digits that ends the string (e.g. 1234567890 ->
  # 1,234,567,890).  Strings that do not end in at least four digits are
  # returned unchanged.
  #
  # A single lookahead substitution is used instead of the former two-pass
  # approach, which mis-placed commas once the number reached ten digits
  # (1234567890 came out as 1234,567,890).
  my ($num) = @_;
  # Put a comma after any digit that is followed by a whole number of
  # three-digit groups up to the end of the string.
  $num =~ s/(\d)(?=(?:\d{3})+$)/$1,/g;
  return($num);
}
sub describeOverlapping {
  # Return README text describing how large sequences were split into
  # overlapping chunks for alignment.
  #
  # Reads SEQ1_CHUNK/SEQ1_LAP and SEQ2_CHUNK/SEQ2_LAP from %defVars.  One
  # sentence covering both genomes is used only when target and query have
  # the same chunk+lap size AND the same overlap; otherwise $tDb and $qDb
  # are each described with their own numbers.  The blastz-normalizeLav
  # note is always appended.
  my $lap1 = $defVars{'SEQ1_LAP'};
  my $lap2 = $defVars{'SEQ2_LAP'};
  my $chunkPlusLap1 = $defVars{'SEQ1_CHUNK'} + $lap1;
  my $chunkPlusLap2 = $defVars{'SEQ2_CHUNK'} + $lap2;
  my $lap = '';
  # Equal sums alone are not enough: different chunk/lap pairs can add up
  # to the same total, and the shared sentence would then quote the target's
  # overlap for the query as well.
  if ($chunkPlusLap1 == $chunkPlusLap2 && $lap1 == $lap2) {
    $lap .= "Any sequences larger\n" .
      "than " . &commafy($chunkPlusLap1) . " bases were split into chunks of " .
      &commafy($chunkPlusLap1) . " bases\n" .
      "overlapping by " . &commafy($lap1) . " bases for alignment.";
  } else {
    # The final sentence used to end in ".)" with no opening parenthesis.
    $lap .= "Any $tDb sequences larger\n" .
      "than " . &commafy($chunkPlusLap1) . " bases were split into chunks of " .
      &commafy($chunkPlusLap1) . " bases overlapping\n" .
      "by " . &commafy($lap1) . " bases for alignment. " .
      "A similar process was followed for $qDb,\n" .
      "with chunks of " . &commafy($chunkPlusLap2) . " overlapping by " .
      &commafy($lap2) . ".";
  }
  $lap .= " Following alignment, the\n" .
    "coordinates of the chunk alignments were corrected by the\n" .
    "blastz-normalizeLav script written by Scott Schwartz of Penn State.";
  return $lap;
}
sub dumpDownloadReadme {
# Write a file (README.txt) describing the download files.
#
# $file is the path to write.  Assembly names, dates and sources for $tDb
# and $qDb come from HgAutomate::getAssemblyInfo($dbHost, ...); blastz
# parameters come from getBlastzParams(); the chunking description comes
# from describeOverlapping().  The text varies with $isSelf, $splitRef,
# $opt_swap and the BLASTZ_ABRIDGE_REPEATS setting.
my ($file) = @_;
my $fh = &HgAutomate::mustOpen(">$file");
my ($tGenome, $tDate, $tSource) = &HgAutomate::getAssemblyInfo($dbHost, $tDb);
my ($qGenome, $qDate, $qSource) = &HgAutomate::getAssemblyInfo($dbHost, $qDb);
# With a split reference the axt files are listed as axtNet/*.<name>.
my $dir = $splitRef ? 'axtNet/*.' : '';
my ($matrix, $o, $e, $k, $l, $h, $blastzOther) = &getBlastzParams();
my $defaultMatrix = $defVars{'BLASTZ_Q'} ? '' : ' the default matrix';
my $lap = &describeOverlapping();
# Optional paragraph about repeat abridging; wording differs for self
# alignments.
my $abridging = "";
if ($defVars{'BLASTZ_ABRIDGE_REPEATS'}) {
if ($isSelf) {
$abridging = "
All repetitive sequences identified by RepeatMasker were removed from the
assembly before alignment using the fasta-subseq and strip_rpts programs
from Penn State. The abbreviated genome was aligned with blastz, and the
transposons were then added back in (i.e. the alignment coordinates were
adjusted) using the restore_rpts program from Penn State.";
} else {
$abridging = "
Transposons that have been inserted since the $qGenome/$tGenome split were
removed from the assemblies before alignment using the fasta-subseq and
strip_rpts programs from Penn State. The abbreviated genomes were aligned
with blastz, and the transposons were then added back in (i.e. the
alignment coordinates were adjusted) using the restore_rpts program from
Penn State.";
}
}
my $desc = $isSelf ?
"This directory contains alignments of $tGenome ($tDb, $tDate,
$tSource) to itself." :
"This directory contains alignments of the following assemblies:
- target/reference: $tGenome ($tDb, $tDate, $tSource)
- query: $qGenome ($qDb, $qDate, $qSource)";
# File listing: chains always; net and axt entries only for non-self runs.
print $fh "$desc
Files included in this directory:
- md5sum.txt: md5sum checksums for the files in this directory
- $tDb.$qDb.all.chain.gz: chained blastz alignments. The chain format is
described in http://genome.ucsc.edu/goldenPath/help/chain.html .
";
if (! $isSelf) {
print $fh
" - $tDb.$qDb.net.gz: \"net\" file that describes rearrangements between
the species and the best $qGenome match to any part of the
$tGenome genome. The net format is described in
http://genome.ucsc.edu/goldenPath/help/net.html .
- $dir$tDb.$qDb.net.axt.gz: chained and netted alignments,
i.e. the best chains in the $tGenome genome, with gaps in the best
chains filled in by next-best chains where possible. The axt format is
described in http://genome.ucsc.edu/goldenPath/help/axt.html .
";
}
# Methods section.  A swapped run only refers the reader to the README of
# the original direction; otherwise the blastz and chaining parameters are
# spelled out here.
if ($opt_swap) {
my $TDb = ucfirst($tDb);
print $fh
"The chainSwap program was used to translate $qDb-referenced chained blastz
alignments to $tDb into $tDb-referenced chains aligned to $qDb. See
the download directory goldenPath/$qDb/vs$TDb/README.txt for more
information about the $qDb-referenced blastz and chaining process.
";
} else {
print $fh ($isSelf ?
"The $tDb assembly was aligned to itself" :
"The $tDb and $qDb assemblies were aligned");
my $chainMinScore = $opt_chainMinScore ? "$opt_chainMinScore" :
$defaultChainMinScore;
my $chainLinearGap = $opt_chainLinearGap ? "$opt_chainLinearGap" :
$defaultChainLinearGap;
print $fh " by the blastz alignment
program, which is available from Webb Miller's lab at Penn State
University (http://www.bx.psu.edu/miller_lab/). $lap $abridging
The blastz scoring matrix (Q parameter) used was$defaultMatrix:
$matrix
with a gap open penalty of O=$o and a gap extension penalty of E=$e.
The minimum score for an alignment to be kept was K=$k for the first pass
and L=$l for the second pass, which restricted the search space to the
regions between two alignments found in the first pass. The minimum
score for alignments to be interpolated between was H=$h. $blastzOther
The .lav format blastz output was translated to the .psl format with
lavToPsl, then chained by the axtChain program.\n
Chain minimum score: $chainMinScore, and linearGap matrix of ";
# A linearGap value containing "loose" or "medium" gets the matching
# table printed below; any other value is passed to cat as a file name and
# that file's contents are included.
if ($chainLinearGap =~ m/loose/) {
print $fh "(loose):
tablesize 11
smallSize 111
position 1 2 3 11 111 2111 12111 32111 72111 152111 252111
qGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
tGap 325 360 400 450 600 1100 3600 7600 15600 31600 56600
bothGap 625 660 700 750 900 1400 4000 8000 16000 32000 57000
";
} elsif ($chainLinearGap =~ m/medium/) {
print $fh "(medium):
tableSize 11
smallSize 111
position 1 2 3 11 111 2111 12111 32111 72111 152111 252111
qGap 350 425 450 600 900 2900 22900 57900 117900 217900 317900
tGap 350 425 450 600 900 2900 22900 57900 117900 217900 317900
bothGap 750 825 850 1000 1300 3300 23300 58300 118300 218300 318300
";
} else {
print $fh "(specified):\n", `cat $chainLinearGap`, "\n";
}
}
if (! $isSelf) {
print $fh "
Chained alignments were processed into nets by the chainNet, netSyntenic,
and netClass programs.
Best-chain alignments in axt format were extracted by the netToAxt program.";
}
# Closing boilerplate: credits, download instructions and references.
print $fh "
All programs run after blastz were written by Jim Kent at UCSC.
----------------------------------------------------------------
If you plan to download a large file or multiple files from this directory,
we recommend you use ftp rather than downloading the files via our website.
To do so, ftp to hgdownload.cse.ucsc.edu, then go to the directory
goldenPath/$tDb/vs$QDb/. To download multiple files, use the \"mget\"
command:
mget <filename1> <filename2> ...
- or -
mget -a (to download all files in the current directory)
All files in this directory are freely available for public use.
--------------------------------------------------------------------
References
Chiaromonte F, Yap VB, Miller W. Scoring pairwise genomic sequence
alignments. Pac Symp Biocomput. 2002;:115-26.
Kent WJ, Baertsch R, Hinrichs A, Miller W, Haussler D.
Evolution's cauldron: Duplication, deletion, and rearrangement in the
mouse and human genomes. Proc Natl Acad Sci U S A. 2003 Sep
30;100(20):11484-9.
Schwartz S, Kent WJ, Smit A, Zhang Z, Baertsch R, Hardison RC,
Haussler D, Miller W. Human-Mouse Alignments with BLASTZ. Genome
Res. 2003 Jan;13(1):103-7.
";
close($fh);
}
sub installDownloads {
# Write README.txt, then build and run $runDir/installDownloads.csh on
# $dbHost.  The script recreates $HgAutomate::goldenPath/$tDb/vs$QDb with
# links to the chain file and md5sum.txt plus a copy of README.txt.  For
# non-self alignments it also links the net and axtNet files, links the
# liftOver chain into the goldenPath and gbdb liftOver directories, runs
# hgAddLiftOverChain, and updates liftOver/md5sum.txt.
my $runDir = "$buildDir/axtChain";
# Make sure previous stage was successful.
my $successFile = $isSelf ? "$runDir/$tDb.$qDb.all.chain.gz" :
"$runDir/$tDb.$qDb.net.gz";
if (! -e $successFile && ! $opt_debug) {
die "installDownloads: looks like previous stage was not successful " .
"(can't find $successFile).\n";
}
&dumpDownloadReadme("$runDir/README.txt");
my $over = $tDb . "To$QDb.over.chain.gz";
my $liftOverDir = "$HgAutomate::clusterData/$tDb/$HgAutomate::trackBuild/liftOver";
my $gpLiftOverDir = "$HgAutomate::goldenPath/$tDb/liftOver";
my $gbdbLiftOverDir = "$HgAutomate::gbdb/$tDb/liftOver";
# Tail of the script description; the embedded "# " continues the
# description onto a second line.
my $andNets = $isSelf ? "." :
", nets and axtNet,\n" .
"# and copies the liftOver chains to the liftOver download dir.";
my $whatItDoes = "It creates the download directory for chains$andNets";
my $bossScript = new HgRemoteScript("$runDir/installDownloads.csh", $dbHost,
$runDir, $whatItDoes, $DEF);
# Note: the vs$QDb download directory is removed and rebuilt from scratch.
$bossScript->add(<<_EOF_
mkdir -p $HgAutomate::goldenPath/$tDb
rm -rf $HgAutomate::goldenPath/$tDb/vs$QDb
mkdir $HgAutomate::goldenPath/$tDb/vs$QDb
cd $HgAutomate::goldenPath/$tDb/vs$QDb
ln -s $runDir/$tDb.$qDb.all.chain.gz .
cp -p $runDir/README.txt .
ln -s $runDir/md5sum.txt .
_EOF_
);
if (! $isSelf) {
# Split reference: link every per-sequence axt into an axtNet/
# subdirectory; otherwise link the single axt file directly.
my $axt = ($splitRef ?
"mkdir axtNet\n" . "ln -s $buildDir/axtNet/*.axt.gz axtNet/" :
"ln -s $buildDir/axtNet/$tDb.$qDb.net.axt.gz .");
$bossScript->add(<<_EOF_
ln -s $runDir/$tDb.$qDb.net.gz .
$axt
mkdir -p $gpLiftOverDir
rm -f $gpLiftOverDir/$over
ln -s $liftOverDir/$over $gpLiftOverDir/$over
mkdir -p $gbdbLiftOverDir
rm -f $gbdbLiftOverDir/$over
ln -s $liftOverDir/$over $gbdbLiftOverDir/$over
hgAddLiftOverChain -minMatch=0.1 -multiple -path=$gbdbLiftOverDir/$over \\
$tDb $qDb
# Update (or create) liftOver/md5sum.txt with the new .over.chain.gz.
if (-e $gpLiftOverDir/md5sum.txt) then
set tmpFile = `mktemp -t tmpMd5.XXXXXX`
grep -v $over $gpLiftOverDir/md5sum.txt > \$tmpFile
md5sum $gpLiftOverDir/$over \\
| sed -e 's\@$gpLiftOverDir/\@\@' >> \$tmpFile
sort \$tmpFile > $gpLiftOverDir/md5sum.txt
rm \$tmpFile
else
md5sum $gpLiftOverDir/$over | sed -e 's\@$gpLiftOverDir/\@\@' \\
> $gpLiftOverDir/md5sum.txt
endif
_EOF_
);
}
$bossScript->execute();
# maybe also peek in trackDb and see if entries need to be added for chain/net
}
sub doDownloads {
  # download step: first compress and checksum the files to be offered for
  # download, then link them into the test server's goldenPath/ area.
  makeDownloads();
  return installDownloads();
}
sub cleanup {
  # Build and run $buildDir/cleanUp.csh on $fileServer to delete
  # intermediate files: the psl output (and the -blastzOutRoot directory
  # itself if it ends up empty), batch error directories and backups,
  # noClass.net, and -- for a split reference -- the per-sequence net and
  # chain directories.
  my ($pslDir, $rmOutRoot) = ("$buildDir/psl", '');
  if ($opt_blastzOutRoot) {
    $pslDir = "$opt_blastzOutRoot/psl";
    $rmOutRoot = "rmdir --ignore-fail-on-non-empty $opt_blastzOutRoot";
  }
  my $whatItDoes =
"It cleans up files after a successful blastz/chain/net/install series.
It uses rm -f so failures should be ignored (e.g. if a partial cleanup has
already been performed).";
  # The script runs with $buildDir as its run directory.
  my $bossScript = HgRemoteScript->new("$buildDir/cleanUp.csh", $fileServer,
                                       $buildDir, $whatItDoes, $DEF);
  $bossScript->add(<<_EOF_
rm -fr $pslDir/
$rmOutRoot
rm -fr $buildDir/axtChain/run/chain/
rm -fr $buildDir/axtChain/run/err/
rm -fr $buildDir/run.blastz/err/
rm -fr $buildDir/run.cat/err/
rm -f $buildDir/axtChain/noClass.net
rm -f $buildDir/run.blastz/batch.bak
rm -f $buildDir/run.cat/batch.bak
rm -f $buildDir/axtChain/run/batch.bak
_EOF_
  );
  if ($splitRef) {
    my $splitDirCommands = <<_EOF_;
rm -fr $buildDir/axtChain/net/
rm -fr $buildDir/axtChain/chain/
_EOF_
    $bossScript->add($splitDirCommands);
  }
  $bossScript->execute();
}
sub doSyntenicNet {
  # syntenicNet step: filter the net for synteny and produce syntenic net
  # MAF output for multiz.  Does nothing unless -syntenicNet was given.
  # Dies if $buildDir/mafSynNet already exists, or if the gzipped net from
  # the earlier steps is missing and $opt_debug is not set.
  return unless ($opt_syntenicNet);

  my $runDir = "$buildDir/axtChain";
  # First, make sure we're starting clean.
  my $mafSynNetDir = "$buildDir/mafSynNet";
  die "doSyntenicNet: looks like this was run successfully already " .
      "($mafSynNetDir). To re-run, " .
      "move aside/remove $mafSynNetDir and run again.\n"
    if (-e $mafSynNetDir);
  # Make sure previous stage was successful.
  my $netGz = "$runDir/$tDb.$qDb.net.gz";
  unless ($opt_debug || -e $netGz) {
    die "doSyntenicNet: looks like previous stage was not successful " .
        "(can't find $netGz).\n";
  }

  my $whatItDoes =
"It filters the net for synteny and creates syntenic net MAF files for
multiz. Use this option when the query genome is high-coverage and not
too distant from the reference. Suppressed unless -syntenicNet is included.";
  my $bossScript = HgRemoteScript->new("$runDir/netSynteny.csh", $workhorse,
                                       $runDir, $whatItDoes, $DEF);
  my $commands;
  if ($splitRef) {
    # One MAF per target sequence under mafSynNet/; the temporary synNet/
    # and chain/ split directories are removed afterwards.
    $commands = <<_EOF_;
# filter net for synteny and create syntenic net mafs
netFilter -syn $tDb.$qDb.net.gz \\
| netSplit stdin synNet
chainSplit chain $tDb.$qDb.all.chain.gz
cd ..
mkdir $mafSynNetDir
foreach f (axtChain/synNet/*.net)
netToAxt \$f axtChain/chain/\$f:t:r.chain \\
$defVars{'SEQ1_DIR'} $defVars{'SEQ2_DIR'} stdout \\
| axtSort stdin stdout \\
| axtToMaf -tPrefix=$tDb. -qPrefix=$qDb. stdin \\
$defVars{SEQ1_LEN} $defVars{SEQ2_LEN} \\
stdout \\
| gzip -c > mafSynNet/\$f:t:r:r:r:r:r.maf.gz
end
rm -fr $runDir/synNet
rm -fr $runDir/chain
_EOF_
  } else {
    # scaffold-based assembly: a single syntenic net and a single MAF,
    # both written in the run directory.
    $commands = <<_EOF_;
netFilter -syn $tDb.$qDb.net.gz | gzip -c > $tDb.$qDb.syn.net.gz
netToAxt $tDb.$qDb.syn.net.gz $tDb.$qDb.all.chain.gz \\
$defVars{'SEQ1_DIR'} $defVars{'SEQ2_DIR'} stdout \\
| axtSort stdin stdout \\
| axtToMaf -tPrefix=$tDb. -qPrefix=$qDb. stdin \\
$defVars{SEQ1_LEN} $defVars{SEQ2_LEN} \\
stdout \\
| gzip -c > $tDb.$qDb.synNet.maf.gz
_EOF_
  }
  $bossScript->add($commands);
  $bossScript->execute();
}
#########################################################################
#
# -- main --
# Overall flow: check options and the DEF file, decide whether each genome
# counts as "split", choose the build (or swap) directory, copy DEF into it,
# pick a file server, then hand control to the step manager.
# Prevent "Suspended (tty input)" hanging:
&HgAutomate::closeStdin();
#$opt_debug = 1;
&checkOptions();
# Exactly one positional argument is expected: the DEF file.
&usage(1) if (scalar(@ARGV) != 1);
($DEF) = @ARGV;
$inclHap = "";
$inclHap = "-inclHap" if ($opt_inclHap);
&loadDef($DEF);
&checkDef();
# A genome is treated as split when its SEQ*_LEN file has no more than
# $HgAutomate::splitThreshold lines.
my $seq1IsSplit = (`wc -l < $defVars{SEQ1_LEN}` <=
$HgAutomate::splitThreshold);
my $seq2IsSplit = (`wc -l < $defVars{SEQ2_LEN}` <=
$HgAutomate::splitThreshold);
# Undocumented option for quickly generating a README from DEF:
if ($opt_readmeOnly) {
$splitRef = $opt_swap ? $seq2IsSplit : $seq1IsSplit;
&swapGlobals() if $opt_swap;
&dumpDownloadReadme("/tmp/README.txt");
exit 0;
}
my $date = `date +%Y-%m-%d`;
chomp $date;
# Build in BASE from DEF if it is set; otherwise in a dated blastz.$qDb
# directory under the target's track build area.
$buildDir = $defVars{'BASE'} ||
"$HgAutomate::clusterData/$tDb/$HgAutomate::trackBuild/blastz.$qDb.$date";
if ($opt_swap) {
# -swap needs the chains already built in $buildDir/axtChain; the swapped
# results go under the query's blastz.$tDb.swap directory, and the query
# decides whether the reference is split.
my $inChain = &getAllChain("$buildDir/axtChain");
if (! defined $inChain) {
die "-swap: Can't find $buildDir/axtChain/[$tDb.$qDb.]all.chain[.gz]\n" .
"which is required for -swap.\n";
}
$swapDir = "$HgAutomate::clusterData/$qDb/$HgAutomate::trackBuild/blastz.$tDb.swap";
&HgAutomate::mustMkdir("$swapDir/axtChain");
$splitRef = $seq2IsSplit;
&HgAutomate::verbose(1, "Swapping from $buildDir/axtChain/$inChain\n" .
"to $swapDir/axtChain/$qDb.$tDb.all.chain.gz .\n");
} else {
if (! -d $buildDir) {
&HgAutomate::mustMkdir($buildDir);
}
# The build-directory check applies only when no -blastzOutRoot was given
# and the run starts at a step before chainRun.
if (! $opt_blastzOutRoot &&
$stepper->stepPrecedes($stepper->getStartStep(), 'chainRun')) {
&enforceClusterNoNo($buildDir,
'blastz/chain/net build directory (or use -blastzOutRoot)');
}
$splitRef = $seq1IsSplit;
&HgAutomate::verbose(1, "Building in $buildDir\n");
}
# Keep a copy of DEF in the build directory unless one is already there.
if (! -e "$buildDir/DEF") {
&HgAutomate::run("cp $DEF $buildDir/DEF");
}
$fileServer = &HgAutomate::chooseFileServer($opt_swap ? $swapDir : $buildDir);
# When running -swap, swapGlobals() happens at the end of the chainMerge step.
# However, if we also use -continue with some step later than chainMerge, we
# need to call swapGlobals before executing the remaining steps.
if ($opt_swap &&
$stepper->stepPrecedes('chainMerge', $stepper->getStartStep())) {
&swapGlobals();
}
$stepper->execute();
# Closing reminders; the README and trackDb notes are printed only when the
# stop step comes after 'load' and 'net' respectively.
HgAutomate::verbose(1,
"\n *** All done!\n");
HgAutomate::verbose(1,
" *** Make sure that goldenPath/$tDb/vs$QDb/README.txt is accurate.\n")
if ($stepper->stepPrecedes('load', $stepper->getStopStep()));
HgAutomate::verbose(1,
" *** Add {chain,net}$QDb tracks to trackDb.ra if necessary.\n")
if ($stepper->stepPrecedes('net', $stepper->getStopStep()));
HgAutomate::verbose(1,
"\n\n");