src/hg/utils/automation/blastz-run-ucsc 1.10
1.10 2009/05/13 00:04:56 hiram
Tolerate empty results when running lineage specific repeats and tolerate previously existing results
Index: src/hg/utils/automation/blastz-run-ucsc
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/utils/automation/blastz-run-ucsc,v
retrieving revision 1.9
retrieving revision 1.10
diff -b -B -U 1000000 -r1.9 -r1.10
--- src/hg/utils/automation/blastz-run-ucsc 19 Dec 2008 21:25:40 -0000 1.9
+++ src/hg/utils/automation/blastz-run-ucsc 13 May 2009 00:04:56 -0000 1.10
@@ -1,601 +1,614 @@
#!/bin/env perl
# DO NOT EDIT the /cluster/bin/scripts copy of this file --
# edit ~/kent/src/hg/utils/automation/blastz-run-ucsc instead.
# Based on Scott Schwartz's bash script blastz-run;
# rewritten in perl and extended to handle .2bit inputs and
# multi-record fasta for target as well as query by Angie Hinrichs.
# $Id$
use Getopt::Long;
use strict;
use warnings;
use vars qw/
$opt_outFormat
$opt_gz
$opt_dropSelf
$opt_help
/;
sub usage {
my ($status) = @_;
my $base = $0;
$base =~ s/^(.*\/)?//;
print STDERR "
usage: $base target query DEF out
options:
-outFormat (psl|axt) Convert blastz output from lav to psl or axt.
-gz Compress output with gzip.
-dropSelf When converting to psl or axt, drop alignment
blocks that cross the diagonal (trivial self-
alignments).
Unpacks sequence (if necessary), runs blastz and lifts/\"normalizes\"
output (if necessary).
Target and query can be list files containing one sequence specifier
per line, or sequence specifiers. A sequence specifier is a .2bit,
.nib or .fa(.gz) files, possibly with a sequence and range specifier
appended.
DEF is a Scott Schwartz-style bash script containing blastz parameters.
out is the (lifted/\"normalized\") output of blastz (optionally converted
to axt or psl, and/or gzipped). This script does not pay attention to
out's file suffixes -- it is up to you to make sure that they are
consistent with output format and compression options.
\n";
exit $status;
}
sub checkOptions {
# Make sure command line options are valid/supported.
my $ok = GetOptions("outFormat=s",
"gz",
"dropSelf",
"help");
&usage(1) if (!$ok);
&usage(0) if ($opt_help);
if ($opt_outFormat) {
$opt_outFormat =~ tr/A-Z/a-z/;
if ($opt_outFormat eq 'lav') {
$opt_outFormat = undef;
} elsif ($opt_outFormat !~ /^(psl|axt)$/) {
print STDERR "\nUnsupported -outFormat $opt_outFormat.\n";
&usage(1);
}
}
if ($opt_dropSelf && !($opt_outFormat && $opt_outFormat =~ /^(psl|axt)$/)) {
print STDERR "\n-dropSelf is only supported for -outFormat axt or psl.\n";
&usage(1);
}
}
# my $machtype = $ENV{'MACHTYPE'};
my $machtype = "x86_64";
# This hash will contain the params defined in the Scott-style DEF script:
my %defVars = ();
# Temporary local directory for intermediate files:
my $TMP;
# For lifting, we'll need to know the sequence sizes:
my %tSizes = ();
my %qSizes = ();
my $machType = `uname -m`;
chomp $machType;
$machType = "i386" if ($machType eq "i686");
my $defaultPath = ("/cluster/bin/penn/$machType:/cluster/bin/penn:" .
"/cluster/bin/scripts:/cluster/bin/$machType:" .
'/bin:/usr/bin');
sub cleanDie {
# Clean up $TMP (if it has been created) and then die.
my ($msg) = @_;
if ($TMP && -d $TMP) {
system("/bin/sh", "-c", "/bin/rm -rf $TMP");
}
die $msg;
}
sub run {
# Run a command in sh, with PATH specified in %defVars.
my ($cmd) = @_;
my $setPath = "export PATH=";
$setPath .= "$defVars{PATH}:" if (defined $defVars{'PATH'});
$setPath .= "$defaultPath; ";
system("/bin/sh", "-c", $setPath . $cmd) == 0 ||
&cleanDie("Command failed:\n$setPath\n$cmd\n");
}
sub nfsNoodge {
# Try to access the given file/dir; if it fails, don't die, just
# wait a second, try again.
# Scott's blastz-run included a similar trick.
my ($file) = @_;
my $ret = system("/bin/sh", "-c", "ls -d $file > /dev/null");
if ($ret != 0) {
sleep 1;
$ret = system("/bin/sh", "-c", "ls -d $file > /dev/null");
sleep 1 if ($ret != 0);
}
}
sub loadDef {
# Read parameters from a bash script with Scott's param variable names:
my ($def) = @_;
&nfsNoodge($def);
open(DEF, "$def") || &cleanDie("Can't open def file $def: $!\n");
while (<DEF>) {
s/^\s*export\s+//;
next if (/^\s*#/ || /^\s*$/);
if (/(\w+)\s*=\s*(.*)/) {
my ($var, $val) = ($1, $2);
while ($val =~ /\$(\w+)/) {
my $subst = $defVars{$1};
$val =~ s/\$$1/$subst/ if (defined $subst);
}
$defVars{$var} = $val;
}
}
close(DEF);
}
sub loadSeqSizes {
# Load up sequence -> size mapping from $sizeFile into $hashRef.
my ($sizeFile, $hashRef) = @_;
&nfsNoodge($sizeFile);
open(SIZE, "$sizeFile") || &cleanDie("Can't open size file $sizeFile: $!\n");
while (<SIZE>) {
chomp;
my ($seq, $size) = split;
$hashRef->{$seq} = $size;
}
close(SIZE);
}
sub fileBase {
# Get rid of leading path and .suffix (and .gz too if it's there).
my ($path) = @_;
$path =~ s/^(.*\/)?(\S+)\.\w+(\.gz)?/$2/;
return $path;
}
sub parseFileSpec {
# Take a file spec with possible seq name, start and end specifiers and
# split into components.
my ($info, $dir) = @_;
my ($fName, $seq, $start, $end);
if ($info =~ /^(\S+):(\S+):(\d+)-(\d+)$/) {
($fName, $seq, $start, $end) = ($1, $2, $3, $4);
} elsif ($info =~ /^(\S+):(\d+)-(\d+)$/) {
($fName, $seq, $start, $end) = ($1, &fileBase($info), $2, $3);
} else {
($fName, $seq, $start, $end) = ($info, &fileBase($info), 0, 0);
}
$fName = "$dir/$fName" if ($fName !~ /^[\/.]/);
return ($fName, $seq, $start, $end);
}
sub enumerateFileSpecs {
# Given a sequence file name, sequence file name + offset spec, or
# a name of a file containing a list of such file specs, return a
# list of file specs.
my ($listOrSpec, $seqDir) = @_;
my @specs = ();
my ($fName, $seq, $start, $end) = &parseFileSpec($listOrSpec, $seqDir);
if ($fName =~ /\.(fa(\.gz)?|nib|2bit)$/) {
# It appears to be a single spec not a list.
push @specs, $listOrSpec;
} else {
# It doesn't have a recognized file ending -- it had better be a list.
# Assume each line contains a single spec.
&nfsNoodge($listOrSpec);
open(LST, "$listOrSpec")
|| &cleanDie("Couldn't open list file $listOrSpec: $!\n");
while (<LST>) {
chomp;
push @specs, $_;
}
close(LST);
}
return @specs;
}
sub collapseFileSpecs {
# If we get a big old list of query sequences that are not subranged,
# then cat them all into one big .fa file so we can invoke blastz
# only once, and don't have to lift afterwards.
my @fileSpecs = @_;
my $collapsed = 0;
if (scalar(@fileSpecs) == 1) {
# Only one spec in the list ==> no collapsing to do.
return ($collapsed, @fileSpecs);
}
# Now see if there are any specs that look like subranges:
foreach my $spec (@fileSpecs) {
my ($fname, $seq, $start, $end) = &parseFileSpec($spec, ".");
if ($start > 0) {
# Found a subrange; no collapsing.
return ($collapsed, @fileSpecs);
}
}
# OK, we're in the clear to concatenate all of these complete sequences
# onto one local file.
my $big = "$TMP/bigCollapse.fa";
# avoiding exec of shell here instead of going through &run
# and making an empty new file $big
&run("/bin/cp /dev/null $big");
foreach my $spec (@fileSpecs) {
my ($seq, $local) = unpackForBlastz($spec, '.', 'tmp');
$local =~ s/\[\S+\]$//;
if ($local =~ /\.nib$/) {
my $nib = $local;
$local =~ s/\.nib/.fa/;
$local =~ s/(.*\/)/tmp./;
my $size = `nibSize $nib | awk '{print \$3;}'`;
chomp $size;
&run("nibFrag -masked $nib 0 $size + $local");
}
&run("/bin/cat $local >> $big");
&run("/bin/rm -f $local") if ($local =~ /^$TMP/);
}
$collapsed = 1;
@fileSpecs = ($big);
return ($collapsed, @fileSpecs);
}
sub getBlastzParams {
# Look for BLASTZ_? param specs in %defVars; turn into blastz command line
# option string. (blastz command line option names are all 1-character,
# and all 1-char BLASTZ_? DEF params are blastz command line options.)
my $params = " ";
foreach my $p (keys %defVars) {
if ($p =~ /^BLASTZ_(\w)$/) {
$params .= "$1=$defVars{$p} ";
}
}
return $params;
}
sub blastzRangeSpec {
# Return a blastz-style range spec unless a null range is passed in.
my ($start, $end) = @_;
if ($end != 0) {
$start += 1;
return "[$start,$end]";
} else {
return "";
}
}
sub unpackForBlastz {
# If the sequence file is .fa.gz or .2bit, uncompress/extract fa into
# a local file; if it's something that blastz already supports like
# .fa or .nib, translate the offset spec (if any) into blastz's format.
my ($spec, $seqDir, $prefix) = @_;
my ($fname, $seq, $start, $end) = &parseFileSpec($spec, $seqDir);
my $local = "$TMP/$prefix.$seq.$start.$end.fa";
if ($fname =~ /\.2bit$/) {
# Don't use twoBitToFa's range extraction capabilities -- that
# confuses blastz/normalizeLav. Just extract the whole thing as
# fa and tell blastz what range we're using:
&run("/cluster/bin/$machtype/twoBitToFa $fname:$seq $local");
$local .= &blastzRangeSpec($start, $end);
} elsif ($fname =~ /\.fa(\.gz)?$/) {
if ($1) {
&run("/bin/gunzip -c $fname > $local");
} else {
$local = $fname;
}
$local .= &blastzRangeSpec($start, $end);
} elsif ($fname =~ /\.nib$/) {
$local = $fname;
$local .= &blastzRangeSpec($start, $end);
} else {
&cleanDie("Don't know what type of sequence file this is: $fname");
}
return ($seq, $local);
}
sub parseBlastzSpec {
# Extract info back out of blastz spec. Leave start 1-based.
my ($spec) = @_;
my ($fname, $seq, $start, $end);
if ($spec =~ /^(\S*\/)?(\S+)\.(fa|nib)(\[(\d+),(\d+)\])?$/) {
$fname = "$1$2.$3";
$seq = $2;
if ($4) {
$start = $5;
$end = $6;
} else {
$start = 0;
$end = 0;
}
} else {
&cleanDie("Can't parse this blastz spec: $spec");
}
return ($fname, $seq, $start, $end);
}
sub emptyLav {
# satisify later processing steps with an empty lav result
my ($raw) = @_;
open (FH,">$raw") || &cleanDie("emptyLav: can not write to $raw $!\n");
print FH <<_EOF_
#:lav
d {
"blastz target query H=2000
A C G T
91 -114 -31 -123
-114 100 -125 -31
-31 -125 100 -114
-123 -31 -114 91
O = 400, E = 30"
}
m {
n 0
}
#:eof
_EOF_
;
close (FH);
}
sub checkSubRange {
# given an fa or nib file name: file.fa|nib[one_based_start,end]
# verify that bit of sequence has usable sequence for blastz
my ($faNibFile) = @_;
my ($file, $seq, $one_start, $end) = &parseBlastzSpec($faNibFile);
my $start = $one_start - 1; # 1 based becomes 0 based coord
my $answer = 0; # default answer is usable sequence exists
return($answer) if ((0 == $end) || (($end - $start) > 10000));
if ($file =~ m/.nib$/) {
$answer =
`nibFrag -masked $file $start $end "+" stdout | faSize stdin | grep " 0 upper " | wc -l`;
} else {
$answer =
`faFrag -mixed $file $start $end stdout | faSize stdin | grep " 0 upper " | wc -l`;
}
return($answer);
}
sub plainBlastz {
# just run blastz -- target and query can have "[$start,$end]" specifiers
# where $start is 1-based. Put output in file name $raw.
my ($target, $query, $options, $raw) = @_;
+ my $answer = 1; # default answer is that blastz was run OK
my $blastz = $defVars{'BLASTZ'} || "lastz";
# these variables will be 1 if there is zero usable sequence
# this could be a more extensive check than just for zero.
my $targetNoSequence = 0;
if ($target =~ m/\[/) {
$targetNoSequence = &checkSubRange($target);
} else {
$targetNoSequence =
`cat $target | faSize stdin | grep " 0 upper " | wc -l`;
}
chomp $targetNoSequence;
my $queryNoSequence = 0;
if ($query =~ m/\[/) {
$queryNoSequence = &checkSubRange($query);
} else {
$queryNoSequence =
`cat $query | faSize stdin | grep " 0 upper " | wc -l`;
}
chomp $queryNoSequence;
if ($targetNoSequence || $queryNoSequence) {
&emptyLav($raw);
+ $answer = 0; # not really running blastz
} else {
&run("$blastz $target $query $options > $raw");
}
+ return($answer);
}
sub selectRpts {
# This used to be an awk, piped to select_rpts, piped to sort -n.
# select_rpts is a little perl script, and getting the DEF PATH into the
# pipe command seems more trouble than it's worth -- so inline not only
# the awk but also the select_rpts here. Still pipe to sort -n just in case.
my ($allRepFile, $start, $end, $selRepFile) = @_;
&nfsNoodge($allRepFile);
open(ALL, "<$allRepFile") || &cleanDie("Can't open $allRepFile: $!\n");
open(SEL, "| sort -n > $selRepFile")
|| &cleanDie("Can't open pipe to sort -n > $selRepFile: $!\n");
print SEL "\%:repeats\n";
while (<ALL>) {
if (/^\s*\d+/) {
my @words = split;
my ($repStart, $repEnd) = ($words[5], $words[6]);
next if ((! defined $repStart) || (! defined $repEnd));
$repStart += 1;
next if ($repStart > $repEnd);
next if (($repEnd < $start) || ($repStart > $end));
$repStart = $start if ($repStart < $start);
$repEnd = $end if ($repEnd > $end);
$repStart -= ($start - 1);
$repEnd -= ($start - 1);
print SEL "$repStart $repEnd rpt\n";
}
}
close(SEL);
close(ALL);
}
sub abridgeLinSpecReps {
# Before running blastz, snip out lineage-specific repeats from each
# sequence. Then run blastz only on that strand. After running blastz,
# fix up the alignment coords to reflect the original sequence including
# the lineage-specific repeats.
# Currently this only works on per-chromosome sequence and repeat files.
my ($tSpec, $qSpec, $blastzOptions, $raw) = @_;
my ($tFname, $tSeq, $tStart, $tEnd) = &parseBlastzSpec($tSpec);
my ($qFname, $qSeq, $qStart, $qEnd) = &parseBlastzSpec($qSpec);
if ($tEnd == 0 || $qEnd == 0) {
&cleanDie("When abridging repeats, sequence filenames must be followed " .
"by range specs even if the whole sequence is aligned.\n");
}
my $S1X = "$defVars{SEQ1_SMSK}/$tSeq.out.spec";
my $S2X = "$defVars{SEQ2_SMSK}/$qSeq.out.spec";
&cleanDie("$S1X not found") if (! -f $S1X);
&cleanDie("$S2X not found") if (! -f $S2X);
my $TS1X = "$TMP/s1.rpts";
my $TS2X = "$TMP/s2.rpts";
my $TS1 = "$TMP/s1.fa";
my $TS2 = "$TMP/s2.fa";
my $TS1S = "$TMP/s1.strip.fa";
my $TS2S = "$TMP/s2.strip.fa";
my $TS2R = "$TMP/s2.rev.fa";
my $TS2SR = "$TMP/s2.strip.rev.fa";
my $TZF = "$TMP/fwd.lav";
my $TZR = "$TMP/rev.lav";
my $TZS = "$TMP/std.lav";
# Get repeats in range:
&selectRpts($S1X, $tStart, $tEnd, $TS1X);
&selectRpts($S2X, $qStart, $qEnd, $TS2X);
# Get sequence in range:
&run("fasta-subseq $tFname $tStart $tEnd > $TS1");
&run("fasta-subseq $qFname $qStart $qEnd > $TS2");
# Snip out the repeats:
&run("strip_rpts $TS1 $TS1X > $TS1S");
&run("strip_rpts $TS2 $TS2X > $TS2S");
# Run blastz on query forward strand and restore repeats:
- &plainBlastz($TS1S, $TS2S, "B=0" . $blastzOptions, $TZF);
+ my $blastzOK = &plainBlastz($TS1S, $TS2S, "B=0" . $blastzOptions, $TZF);
+ if ($blastzOK ) {
&run("restore_rpts $TZF " .
"$TS1 \"\\\"$tFname\\\" $tStart $tEnd\" " .
"$TS2 \"\\\"$qFname\\\" $qStart $qEnd\" " .
"$TS1X $TS2X > $raw");
-
+ } else {
+ &run("/bin/cp -p $TZF $raw");
+ }
# Run blastz on query reverse strand and restore repeats:
&run("revcomp $TS2S > $TS2SR");
- &plainBlastz($TS1S, $TS2SR, "B=0" . $blastzOptions, $TZR);
+ $blastzOK = &plainBlastz($TS1S, $TS2SR, "B=0" . $blastzOptions, $TZR);
+ if ($blastzOK ) {
&run("restore_rpts $TZR " .
"$TS1 \"\\\"$tFname\\\" $tStart $tEnd\" " .
"$TS2 \"\\\"$qFname-\\\" $qStart $qEnd\" " .
"$TS1X $TS2X reverse >> $raw");
+ } else {
+ &run("/bin/cat $TZF >> $raw");
+ }
}
sub liftLav {
# Run blastz-normalizeLav to lift up chunk coords to sequence level.
my ($raw, $out, $tSeq, $qSeq) = @_;
my $tLen = $tSizes{$tSeq};
my $qLen = $qSeq ? $qSizes{$qSeq} : "0";
&run("blastz-normalizeLav $tLen $qLen < $raw > $out");
}
sub convertOutput {
# Convert lav file to psl or axt file, optionally dropping trivial self al's.
my ($lav, $out) = @_;
my $tSeq = $defVars{'SEQ1_CTGDIR'} || $defVars{'SEQ1_DIR'};
my $qSeq = $defVars{'SEQ2_CTGDIR'} || $defVars{'SEQ2_DIR'};
my $tSizes = $defVars{'SEQ1_CTGLEN'} || $defVars{'SEQ1_LEN'};
my $qSizes = $defVars{'SEQ2_CTGLEN'} || $defVars{'SEQ2_LEN'};
if ($opt_outFormat eq 'axt') {
my $dropSelf = ($opt_dropSelf ? "-dropSelf " : "");
&run("lavToAxt $dropSelf $lav $tSeq $qSeq $out");
} elsif ($opt_dropSelf) {
# lavToPsl doesn't have lavToAxt's -dropSelf functionality, so pipe
# lavToAxt -dropSelf to axtToPsl.
&run("lavToAxt -dropSelf $lav $tSeq $qSeq stdout " .
"| axtToPsl stdin $tSizes $qSizes $out");
} else {
&run("lavToPsl $lav $out");
}
}
#########################################################################
#
# -- main --
&checkOptions();
&usage(1) if (scalar(@ARGV) != 4);
my ($target, $query, $DEF, $out) = @ARGV;
-&cleanDie("$out already exists\n") if (-f "$out");
+# It is OK to have a previous result existing
+if ( -f "$out") {
+ exit 0;
+}
&loadDef($DEF);
my $seq1Dir = $defVars{'SEQ1_CTGDIR'} || $defVars{'SEQ1_DIR'};
my $seq2Dir = $defVars{'SEQ2_CTGDIR'} || $defVars{'SEQ2_DIR'};
my $seq1Len = $defVars{'SEQ1_CTGLEN'} || $defVars{'SEQ1_LEN'};
my $seq2Len = $defVars{'SEQ2_CTGLEN'} || $defVars{'SEQ2_LEN'};
&loadSeqSizes($seq1Len, \%tSizes);
&loadSeqSizes($seq2Len, \%qSizes);
# Make a temporary directory for intermediate files:
if (exists($defVars{'TMPDIR'})) {
$TMP = `mktemp -d $defVars{'TMPDIR'}/blastz.XXXXXX` || &cleanDie("Can't mktemp");
} else {
$TMP = `mktemp -d /tmp/blastz.XXXXXX` || &cleanDie("Can't mktemp");
}
chomp $TMP;
print STDERR "temp files in $TMP -- will attempt cleanup before die-ing.\n";
# Some internal defaults for blastz params, carried over from
# Scott's blastz-run:
$defVars{'BLASTZ_H'} = 2000 if (! defined $defVars{'BLASTZ_H'});
my $blastzOptions = &getBlastzParams(%defVars);
my @tFileSpecs = &enumerateFileSpecs($target, $seq1Dir);
my @qFileSpecs = &enumerateFileSpecs($query, $seq2Dir);
if ($defVars{'BLASTZ_ABRIDGE_REPEATS'} && scalar(@qFileSpecs) > 1) {
&cleanDie("Sorry, DEF can't have BLASTZ_ABRIDGE_REPEATS set if a list of " .
"query sequences is passed in -- not supported.\n");
}
my $collapsed;
($collapsed, @qFileSpecs) = &collapseFileSpecs(@qFileSpecs);
# Local file to hold concatenated results of all {target, query} runs:
my $localOut = "$TMP/local.lav";
&run("/bin/cp /dev/null $localOut");
# Local temporary files that are overwritten by each {target, query} run:
my $littleRaw = "$TMP/little.raw";
my $littleOut = "$TMP/little.lav";
my $littleConv = "$TMP/little.conv";
# Run blastz on each {target, query} pair; lift and collect results.
foreach my $tSpec (@tFileSpecs) {
my ($tSeq, $tLocal) = &unpackForBlastz($tSpec, $seq1Dir, 't');
foreach my $qSpec (@qFileSpecs) {
my ($qSeq, $qLocal) = &unpackForBlastz($qSpec, $seq2Dir, 'q');
if ($defVars{'BLASTZ_ABRIDGE_REPEATS'}) {
&abridgeLinSpecReps($tLocal, $qLocal, $blastzOptions, $littleRaw);
} else {
&plainBlastz($tLocal, $qLocal, $blastzOptions, $littleRaw);
}
if ($collapsed) {
# Lift target side only:
&liftLav($littleRaw, $littleOut, $tSeq, undef);
} else {
&liftLav($littleRaw, $littleOut, $tSeq, $qSeq);
if ($qLocal =~ /^$TMP/) {
$qLocal =~ s/\[.*\]$//;
&run("/bin/rm $qLocal");
}
}
if ($opt_outFormat) {
&convertOutput($littleOut, $littleConv);
&run("/bin/cat $littleConv >> $localOut");
} else {
&run("/bin/cat $littleOut >> $localOut");
}
}
if ($tLocal =~ /^$TMP/) {
$tLocal =~ s/\[.*\]$//;
&run("/bin/rm $tLocal");
}
}
if ($opt_gz) {
&run("/bin/gzip $localOut");
$localOut .= ".gz";
}
# Move collected results to final location.
&run("/bin/mv $localOut $out");
# Clean up temporary directory:
&run("/bin/rm -rf $TMP");