a76cda6a69557a8c71016489147e4afd31266295 angie Mon Feb 3 15:19:13 2020 -0800 blastz-run-ucsc: catch pipe errors; use pslDropOverlap for -dropSelf, for both lav and axt output. refs #24694, #24695 In MLQ #24587 a user reported some trivial self-alignments that should have been filtered - but when lastz produced axt output (for 2bit without suffix), we were not doing anything to filter those out. Now we use pslDropOverlap there. Worse, lavToAxt was failing when lastz output included double-sided gaps, and the failure was not propagated through the piped system() command, so undetected errors caused lots of alignments to be lost. So instead of using lavToAxt because of its -dropSelf option, use lavToPsl | pslDropOverlap. diff --git src/hg/utils/automation/blastz-run-ucsc src/hg/utils/automation/blastz-run-ucsc index 5e57dea..0e6528d 100755 --- src/hg/utils/automation/blastz-run-ucsc +++ src/hg/utils/automation/blastz-run-ucsc @@ -1,726 +1,728 @@ #!/usr/bin/env perl # DO NOT EDIT the /cluster/bin/scripts copy of this file -- # edit ~/kent/src/hg/utils/automation/blastz-run-ucsc instead. # Based on Scott Schwartz's bash script blastz-run; # rewritten in perl and extended to handle .2bit inputs and # multi-record fasta for target as well as query by Angie Hinrichs. # $Id: blastz-run-ucsc,v 1.11 2009/08/13 23:15:57 hiram Exp $ use Getopt::Long; use strict; use warnings; use vars qw/ $opt_outFormat $opt_gz $opt_dropSelf $opt_help /; sub usage { my ($status) = @_; my $base = $0; $base =~ s/^(.*\/)?//; print STDERR " usage: $base target query DEF out options: -outFormat (psl|axt) Convert blastz output from lav to psl or axt. -gz Compress output with gzip. -dropSelf When converting to psl or axt, drop alignment blocks that cross the diagonal (trivial self- alignments). Unpacks sequence (if necessary), runs blastz and lifts/\"normalizes\" output (if necessary). Target and query can be list files containing one sequence specifier per line, or sequence specifiers. A sequence specifier is a .2bit, .nib or .fa(.gz) files, possibly with a sequence and range specifier appended. DEF is a Scott Schwartz-style bash script containing blastz parameters. out is the (lifted/\"normalized\") output of blastz (optionally converted to axt or psl, and/or gzipped). This script does not pay attention to out's file suffixes -- it is up to you to make sure that they are consistent with output format and compression options. \n"; exit $status; } sub checkOptions { # Make sure command line options are valid/supported. my $ok = GetOptions("outFormat=s", "gz", "dropSelf", "help"); &usage(1) if (!$ok); &usage(0) if ($opt_help); if ($opt_outFormat) { $opt_outFormat =~ tr/A-Z/a-z/; if ($opt_outFormat eq 'lav') { $opt_outFormat = undef; } elsif ($opt_outFormat !~ /^(psl|axt)$/) { print STDERR "\nUnsupported -outFormat $opt_outFormat.\n"; &usage(1); } } if ($opt_dropSelf && !($opt_outFormat && $opt_outFormat =~ /^(psl|axt)$/)) { print STDERR "\n-dropSelf is only supported for -outFormat axt or psl.\n"; &usage(1); } } # my $machtype = $ENV{'MACHTYPE'}; my $machtype = "x86_64"; # This hash will contain the params defined in the Scott-style DEF script: my %defVars = (); # Temporary local directory for intermediate files: my $TMP; # For lifting, we'll need to know the sequence sizes: my %tSizes = (); my %qSizes = (); my $machType = `uname -m`; chomp $machType; $machType = "i386" if ($machType eq "i686"); my @potentialPath = ("/cluster/bin/penn/$machType", "/cluster/bin/penn", "/cluster/bin/scripts", "/cluster/bin/$machType", "/bin", "/usr/bin"); # this defaultPath will be cleaned up at start of program my $defaultPath = ("/cluster/bin/penn/$machType:/cluster/bin/penn:" . "/cluster/bin/scripts:/cluster/bin/$machType:" . '/bin:/usr/bin'); # these commands will be filled in with full path names to # avoid constant shell searches for commands through PATH # more efficient when commands are often repeated in a cluster job my $rmCmd = "rm"; my $cpCmd = "cp"; my $mvCmd = "mv"; my $shCmd = "sh"; my $lsCmd = "ls"; my $catCmd = "cat"; my $gzipCmd = "gzip"; my $gunzipCmd = "gunzip"; my $twoBitToFaCmd = "twoBitToFa"; sub cleanDie { # Clean up $TMP (if it has been created) and then die. my ($msg) = @_; if ($TMP && -d $TMP) { system("$shCmd", "-c", "$rmCmd -rf $TMP"); } die $msg; } sub run { # Run a command in sh, with PATH specified in %defVars. my ($cmd) = @_; my $setPath = "export PATH="; $setPath .= "$defVars{PATH}:" if (defined $defVars{'PATH'}); $setPath .= "$defaultPath; "; - system("$shCmd", "-c", $setPath . $cmd) == 0 || + my $failPath = "set -xbeEu -o pipefail; "; + system("$shCmd", "-c", $setPath . $failPath . $cmd) == 0 || &cleanDie("Command failed:\n$setPath\n$cmd\n"); } sub findCommand { my ($cmd) = @_; my $setPath = "PATH="; $setPath .= "$defVars{PATH}:" if (defined $defVars{'PATH'}); $setPath .= "$defaultPath; "; my $which = qx{$setPath type -ap $cmd 2> /dev/null | head -1}; chomp $which; $which = $cmd if ($which eq ""); return $which; } sub nfsNoodge { # Try to access the given file/dir; if it fails, don't die, just # wait a second, try again. # Scott's blastz-run included a similar trick. my ($file) = @_; my $ret = system("$shCmd", "-c", "$lsCmd -d $file > /dev/null"); if ($ret != 0) { sleep 1; $ret = system("$shCmd", "-c", "$lsCmd -d $file > /dev/null"); sleep 1 if ($ret != 0); } } sub loadDef { # Read parameters from a bash script with Scott's param variable names: my ($def) = @_; &nfsNoodge($def); open(DEF, "$def") || &cleanDie("Can't open def file $def: $!\n"); while () { s/^\s*export\s+//; next if (/^\s*#/ || /^\s*$/); if (/(\w+)\s*=\s*(.*)/) { my ($var, $val) = ($1, $2); while ($val =~ /\$(\w+)/) { my $subst = $defVars{$1}; $val =~ s/\$$1/$subst/ if (defined $subst); } $defVars{$var} = $val; } } close(DEF); } sub loadSeqSizes { # Load up sequence -> size mapping from $sizeFile into $hashRef. my ($sizeFile, $hashRef) = @_; &nfsNoodge($sizeFile); open(SIZE, "$sizeFile") || &cleanDie("Can't open size file $sizeFile: $!\n"); while () { chomp; my ($seq, $size) = split; $hashRef->{$seq} = $size; } close(SIZE); } sub fileBase { # Get rid of leading path and .suffix (and .gz too if it's there). my ($path) = @_; $path =~ s/^(.*\/)?(\S+)\.\w+(\.gz)?/$2/; return $path; } sub parseFileSpec { # Take a file spec with possible seq name, start and end specifiers and # split into components. my ($info, $dir) = @_; my ($fName, $seq, $start, $end); if ($info =~ /^(\S+):(\S+):(\d+)-(\d+)$/) { ($fName, $seq, $start, $end) = ($1, $2, $3, $4); } elsif ($info =~ /^(\S+):(\d+)-(\d+)$/) { ($fName, $seq, $start, $end) = ($1, &fileBase($info), $2, $3); } else { ($fName, $seq, $start, $end) = ($info, &fileBase($info), 0, 0); } $fName = "$dir/$fName" if ($fName !~ /^[\/.]/); return ($fName, $seq, $start, $end); } sub enumerateFileSpecs { # Given a sequence file name, sequence file name + offset spec, or # a name of a file containing a list of such file specs, return a # list of file specs. my ($listOrSpec, $seqDir) = @_; my @specs = (); my ($fName, $seq, $start, $end) = &parseFileSpec($listOrSpec, $seqDir); if ($fName =~ /\.(fa(\.gz)?|nib|2bit)$/) { # It appears to be a single spec not a list. push @specs, $listOrSpec; } else { # It doesn't have a recognized file ending -- it had better be a list. # Assume each line contains a single spec. &nfsNoodge($listOrSpec); open(LST, "$listOrSpec") || &cleanDie("Couldn't open list file $listOrSpec: $!\n"); while () { chomp; push @specs, $_; } close(LST); } return @specs; } sub collapseFileSpecs { # If we get a big old list of query sequences that are not subranged, # then cat them all into one big .fa file so we can invoke blastz # only once, and don't have to lift afterwards. my @fileSpecs = @_; my $collapsed = 0; if (scalar(@fileSpecs) == 1) { # Only one spec in the list ==> no collapsing to do. return ($collapsed, @fileSpecs); } # Now see if there are any specs that look like subranges: foreach my $spec (@fileSpecs) { my ($fname, $seq, $start, $end) = &parseFileSpec($spec, "."); if ($start > 0) { # Found a subrange; no collapsing. return ($collapsed, @fileSpecs); } } # OK, we're in the clear to concatenate all of these complete sequences # onto one local file. my $big = "$TMP/bigCollapse.fa"; # avoiding exec of shell here instead of going through &run # and making an empty new file $big &run("$cpCmd /dev/null $big"); foreach my $spec (@fileSpecs) { my ($seq, $local) = unpackForBlastz($spec, '.', 'tmp'); $local =~ s/\[\S+\]$//; if ($local =~ /\.nib$/) { my $nib = $local; $local =~ s/\.nib/.fa/; $local =~ s/(.*\/)/tmp./; my $size = `nibSize $nib | awk '{print \$3;}'`; chomp $size; &run("nibFrag -masked $nib 0 $size + $local"); } &run("$catCmd $local >> $big"); &run("$rmCmd -f $local") if ($local =~ /^$TMP/); } $collapsed = 1; @fileSpecs = ($big); return ($collapsed, @fileSpecs); } sub getBlastzParams { # Look for BLASTZ_? param specs in %defVars; turn into blastz command line # option string. (blastz command line option names are all 1-character, # and all 1-char BLASTZ_? DEF params are blastz command line options.) my $params = " "; foreach my $p (keys %defVars) { if ($p =~ /^BLASTZ_(\w)$/) { $params .= "$1=$defVars{$p} "; } } return $params; } sub blastzRangeSpec { # Return a blastz-style range spec unless a null range is passed in. my ($start, $end) = @_; if ($end != 0) { $start += 1; return "[$start,$end]"; } else { return ""; } } sub unpackForBlastz { # If the sequence file is .fa.gz or .2bit, uncompress/extract fa into # a local file; if it's something that blastz already supports like # .fa or .nib, translate the offset spec (if any) into blastz's format. my ($spec, $seqDir, $prefix) = @_; my ($fname, $seq, $start, $end) = &parseFileSpec($spec, $seqDir); my $local = "$TMP/$prefix.$seq.$start.$end.fa"; if ($fname =~ /\.2bit$/) { # Don't use twoBitToFa's range extraction capabilities -- that # confuses blastz/normalizeLav. Just extract the whole thing as # fa and tell blastz what range we're using: &run("$twoBitToFaCmd $fname:$seq $local"); $local .= &blastzRangeSpec($start, $end); } elsif ($fname =~ /\.fa(\.gz)?$/) { if ($1) { &run("$gunzipCmd -c $fname > $local"); } else { $local = $fname; } $local .= &blastzRangeSpec($start, $end); } elsif ($fname =~ /\.nib$/) { $local = $fname; $local .= &blastzRangeSpec($start, $end); } else { &cleanDie("Don't know what type of sequence file this is: $fname"); } return ($seq, $local); } sub parseBlastzSpec { # Extract info back out of blastz spec. Leave start 1-based. my ($spec) = @_; my ($fname, $seq, $start, $end); if ($spec =~ /^(\S*\/)?(\S+)\.(fa|nib)(\[(\d+),(\d+)\])?$/) { $fname = "$1$2.$3"; $seq = $2; if ($4) { $start = $5; $end = $6; } else { $start = 0; $end = 0; } } else { &cleanDie("Can't parse this blastz spec: $spec"); } return ($fname, $seq, $start, $end); } sub emptyLav { # satisify later processing steps with an empty lav result my ($raw) = @_; open (FH,">$raw") || &cleanDie("emptyLav: can not write to $raw $!\n"); print FH <<_EOF_ #:lav d { "blastz target query H=2000 A C G T 91 -114 -31 -123 -114 100 -125 -31 -31 -125 100 -114 -123 -31 -114 91 O = 400, E = 30" } m { n 0 } #:eof _EOF_ ; close (FH); } sub checkSubRange { # given an fa or nib file name: file.fa|nib[one_based_start,end] # verify that bit of sequence has usable sequence for blastz my ($faNibFile) = @_; my ($file, $seq, $one_start, $end) = &parseBlastzSpec($faNibFile); my $start = $one_start - 1; # 1 based becomes 0 based coord my $answer = 0; # default answer is usable sequence exists return($answer) if ((0 == $end) || (($end - $start) > 10000)); if ($file =~ m/.nib$/) { $answer = `nibFrag -masked $file $start $end "+" stdout | faSize stdin | grep " 0 upper " | wc -l`; } else { $answer = `faFrag -mixed $file $start $end stdout | faSize stdin | grep " 0 upper " | wc -l`; } return($answer); } sub plainBlastz { # just run blastz -- target and query can have "[$start,$end]" specifiers # where $start is 1-based. Put output in file name $raw. my ($target, $query, $options, $raw) = @_; my $answer = 1; # default answer is that blastz was run OK my $blastz = $defVars{'BLASTZ'} || "lastz"; # these variables will be 1 if there is zero usable sequence # this could be a more extensive check than just for zero. my $targetNoSequence = 0; if ($target !~ m/.2bit$/) { if ($target =~ m/\[/) { $targetNoSequence = &checkSubRange($target); } else { $targetNoSequence = `cat $target | faSize stdin | grep " 0 upper " | wc -l`; } } chomp $targetNoSequence; my $queryNoSequence = 0; if ($query =~ m/\[/) { $queryNoSequence = &checkSubRange($query); } elsif ($query !~ m/.2bit/) { $queryNoSequence = `cat $query | faSize stdin | grep " 0 upper " | wc -l`; } chomp $queryNoSequence; if ($targetNoSequence || $queryNoSequence) { &emptyLav($raw); $answer = 0; # not really running blastz } else { if ($target =~ m/.2bit$/) { $target .= "[multiple]"; $options .= " --format=axt+"; } &run("$blastz $target $query $options > $raw"); } return($answer); } sub selectRpts { # This used to be an awk, piped to select_rpts, piped to sort -n. # select_rpts is a little perl script, and getting the DEF PATH into the # pipe command seems more trouble than it's worth -- so inline not only # the awk but also the select_rpts here. Still pipe to sort -n just in case. my ($allRepFile, $start, $end, $selRepFile) = @_; &nfsNoodge($allRepFile); open(ALL, "<$allRepFile") || &cleanDie("Can't open $allRepFile: $!\n"); open(SEL, "| sort -n > $selRepFile") || &cleanDie("Can't open pipe to sort -n > $selRepFile: $!\n"); print SEL "\%:repeats\n"; while () { if (/^\s*\d+/) { my @words = split; my ($repStart, $repEnd) = ($words[5], $words[6]); next if ((! defined $repStart) || (! defined $repEnd)); $repStart += 1; next if ($repStart > $repEnd); next if (($repEnd < $start) || ($repStart > $end)); $repStart = $start if ($repStart < $start); $repEnd = $end if ($repEnd > $end); $repStart -= ($start - 1); $repEnd -= ($start - 1); print SEL "$repStart $repEnd rpt\n"; } } close(SEL); close(ALL); } sub abridgeLinSpecReps { # Before running blastz, snip out lineage-specific repeats from each # sequence. Then run blastz only on that strand. After running blastz, # fix up the alignment coords to reflect the original sequence including # the lineage-specific repeats. # Currently this only works on per-chromosome sequence and repeat files. my ($tSpec, $qSpec, $blastzOptions, $raw) = @_; my ($tFname, $tSeq, $tStart, $tEnd) = &parseBlastzSpec($tSpec); my ($qFname, $qSeq, $qStart, $qEnd) = &parseBlastzSpec($qSpec); if ($tEnd == 0 || $qEnd == 0) { &cleanDie("When abridging repeats, sequence filenames must be followed " . "by range specs even if the whole sequence is aligned.\n"); } my $S1X = "$defVars{SEQ1_SMSK}/$tSeq.out.spec"; my $S2X = "$defVars{SEQ2_SMSK}/$qSeq.out.spec"; &cleanDie("$S1X not found") if (! -f $S1X); &cleanDie("$S2X not found") if (! -f $S2X); my $TS1X = "$TMP/s1.rpts"; my $TS2X = "$TMP/s2.rpts"; my $TS1 = "$TMP/s1.fa"; my $TS2 = "$TMP/s2.fa"; my $TS1S = "$TMP/s1.strip.fa"; my $TS2S = "$TMP/s2.strip.fa"; my $TS2R = "$TMP/s2.rev.fa"; my $TS2SR = "$TMP/s2.strip.rev.fa"; my $TZF = "$TMP/fwd.lav"; my $TZR = "$TMP/rev.lav"; my $TZS = "$TMP/std.lav"; # Get repeats in range: &selectRpts($S1X, $tStart, $tEnd, $TS1X); &selectRpts($S2X, $qStart, $qEnd, $TS2X); # Get sequence in range: &run("fasta-subseq $tFname $tStart $tEnd > $TS1"); &run("fasta-subseq $qFname $qStart $qEnd > $TS2"); # Snip out the repeats: &run("strip_rpts $TS1 $TS1X > $TS1S"); &run("strip_rpts $TS2 $TS2X > $TS2S"); # Run blastz on query forward strand and restore repeats: my $blastzOK = &plainBlastz($TS1S, $TS2S, "B=0" . $blastzOptions, $TZF); if ($blastzOK ) { &run("restore_rpts $TZF " . "$TS1 \"\\\"$tFname\\\" $tStart $tEnd\" " . "$TS2 \"\\\"$qFname\\\" $qStart $qEnd\" " . "$TS1X $TS2X > $raw"); } else { &run("$cpCmd -p $TZF $raw"); } # Run blastz on query reverse strand and restore repeats: &run("revcomp $TS2S > $TS2SR"); $blastzOK = &plainBlastz($TS1S, $TS2SR, "B=0" . $blastzOptions, $TZR); if ($blastzOK ) { &run("restore_rpts $TZR " . "$TS1 \"\\\"$tFname\\\" $tStart $tEnd\" " . "$TS2 \"\\\"$qFname-\\\" $qStart $qEnd\" " . "$TS1X $TS2X reverse >> $raw"); } else { &run("$catCmd $TZF >> $raw"); } } sub liftLav { # Run blastz-normalizeLav to lift up chunk coords to sequence level. my ($raw, $out, $tSeq, $qSeq) = @_; my $tLen = $tSizes{$tSeq}; my $qLen = $qSeq ? $qSizes{$qSeq} : "0"; &run("blastz-normalizeLav $tLen $qLen < $raw > $out"); } sub convertOutput { # Convert lav file to psl or axt file, optionally dropping trivial self al's. my ($lav, $out) = @_; - my $tSeq = $defVars{'SEQ1_CTGDIR'} || $defVars{'SEQ1_DIR'}; - my $qSeq = $defVars{'SEQ2_CTGDIR'} || $defVars{'SEQ2_DIR'}; - my $tSizes = $defVars{'SEQ1_CTGLEN'} || $defVars{'SEQ1_LEN'}; - my $qSizes = $defVars{'SEQ2_CTGLEN'} || $defVars{'SEQ2_LEN'}; if ($opt_outFormat eq 'axt') { my $dropSelf = ($opt_dropSelf ? "-dropSelf " : ""); + my $tSeq = $defVars{'SEQ1_CTGDIR'} || $defVars{'SEQ1_DIR'}; + my $qSeq = $defVars{'SEQ2_CTGDIR'} || $defVars{'SEQ2_DIR'}; &run("lavToAxt $dropSelf $lav $tSeq $qSeq $out"); } elsif ($opt_dropSelf) { - # lavToPsl doesn't have lavToAxt's -dropSelf functionality, so pipe - # lavToAxt -dropSelf to axtToPsl. - &run("lavToAxt -dropSelf $lav $tSeq $qSeq stdout " . - "| axtToPsl stdin $tSizes $qSizes $out"); + &run("lavToPsl $lav stdout " . + "| pslDropOverlap stdin $out"); &run("pslCheck $out"); } else { &run("lavToPsl $lav $out"); &run("pslCheck $out"); } } ######################################################################### # # -- main -- # cleanup the defaultPath variable to eliminate non-existent paths: $defaultPath = ""; foreach my $path (@potentialPath) { if ( -d "$path" ) { if (length($defaultPath)) { $defaultPath .= ":$path"; } else { $defaultPath = $path; } } } &checkOptions(); &usage(1) if (scalar(@ARGV) != 4); my ($target, $query, $DEF, $out) = @ARGV; # It is OK to have a previous result existing if ( -f "$out") { + print STDERR "Output file $out exists; exiting.\n"; exit 0; } &loadDef($DEF); # find full pathNames for commands: $rmCmd = &findCommand("$rmCmd"); $cpCmd = &findCommand("$cpCmd"); $mvCmd = &findCommand("$mvCmd"); $shCmd = &findCommand("$shCmd"); $lsCmd = &findCommand("$lsCmd"); $catCmd = &findCommand("$catCmd"); $gzipCmd = &findCommand("$gzipCmd"); $gunzipCmd = &findCommand("$gunzipCmd"); $twoBitToFaCmd = &findCommand("$twoBitToFaCmd"); my $seq1Dir = $defVars{'SEQ1_CTGDIR'} || $defVars{'SEQ1_DIR'}; my $seq2Dir = $defVars{'SEQ2_CTGDIR'} || $defVars{'SEQ2_DIR'}; my $seq1Len = $defVars{'SEQ1_CTGLEN'} || $defVars{'SEQ1_LEN'}; my $seq2Len = $defVars{'SEQ2_CTGLEN'} || $defVars{'SEQ2_LEN'}; &loadSeqSizes($seq1Len, \%tSizes); &loadSeqSizes($seq2Len, \%qSizes); # Make a temporary directory for intermediate files: if (exists($defVars{'TMPDIR'})) { $TMP = `mktemp -d $defVars{'TMPDIR'}/blastz.XXXXXX` || &cleanDie("Can't mktemp"); } else { $TMP = `mktemp -d /tmp/blastz.XXXXXX` || &cleanDie("Can't mktemp"); } chomp $TMP; print STDERR "temp files in $TMP -- will attempt cleanup before die-ing.\n"; # Some internal defaults for blastz params, carried over from # Scott's blastz-run: $defVars{'BLASTZ_H'} = 2000 if (! defined $defVars{'BLASTZ_H'}); my $blastzOptions = &getBlastzParams(%defVars); my @tFileSpecs = &enumerateFileSpecs($target, $seq1Dir); my @qFileSpecs = &enumerateFileSpecs($query, $seq2Dir); if ($defVars{'BLASTZ_ABRIDGE_REPEATS'} && scalar(@qFileSpecs) > 1) { &cleanDie("Sorry, DEF can't have BLASTZ_ABRIDGE_REPEATS set if a list of " . "query sequences is passed in -- not supported.\n"); } my $collapsed = 0; my $query2bit = $query; $query2bit =~ s/.lst$/.2bit/; # if partition step constructed the 2bit file for this query, use it: if ( -s $query2bit ) { &run("$cpCmd -p $query2bit $TMP"); # it is used repeatedly for each tSpec $query2bit =~ s#qParts/##; $query2bit =~ s#tParts/##; # could be this for self alignment @qFileSpecs = undef; $qFileSpecs[0] = "$TMP/$query2bit"; $collapsed = 1; } else { ($collapsed, @qFileSpecs) = &collapseFileSpecs(@qFileSpecs); } # if partition step constructed the 2bit file for this target bit, use it # using a target 2bit can only result in axt or psl output, lav not possible # so don't do this if lav required output if ($opt_outFormat) { my $target2bit = $target; $target2bit =~ s/.lst$/.2bit/; if (-s $target2bit ) { @tFileSpecs = undef; $tFileSpecs[0] = "$target2bit"; # used only once for all qSpec } } # Local file to hold concatenated results of all {target, query} runs: my $localOut = "$TMP/local.lav"; &run("$cpCmd /dev/null $localOut"); # Local temporary files that are overwritten by each {target, query} run: my $littleRaw = "$TMP/little.raw"; my $littleOut = "$TMP/little.lav"; my $littleConv = "$TMP/little.conv"; # Run blastz on each {target, query} pair; lift and collect results. foreach my $tSpec (@tFileSpecs) { my ($tSeq, $tLocal) = ("", ""); my $axtResult = 0; if ($tSpec =~ m/.2bit$/) { $tLocal = $tSpec; # use .2bit file when it exists $axtResult = 1; # lastz can only output axt when using 2bit[multiple] } else { ($tSeq, $tLocal) = &unpackForBlastz($tSpec, $seq1Dir, 't'); } foreach my $qSpec (@qFileSpecs) { my ($qSeq, $qLocal) = ("", ""); if ($qSpec =~ m/.2bit$/) { $qLocal = $qSpec; } else { ($qSeq, $qLocal) = &unpackForBlastz($qSpec, $seq2Dir, 'q'); } if ($defVars{'BLASTZ_ABRIDGE_REPEATS'}) { &abridgeLinSpecReps($tLocal, $qLocal, $blastzOptions, $littleRaw); } else { &plainBlastz($tLocal, $qLocal, $blastzOptions, $littleRaw); } # there may be no lifting involved when axtResult TBD XXX if ($axtResult) { &run("$cpCmd -p $littleRaw $littleOut"); } else { if ($collapsed) { # Lift target side only: &liftLav($littleRaw, $littleOut, $tSeq, undef); } else { &liftLav($littleRaw, $littleOut, $tSeq, $qSeq); if ($qLocal =~ /^$TMP/) { $qLocal =~ s/\[.*\]$//; &run("$rmCmd $qLocal"); } } } if ($opt_outFormat) { if ($axtResult) { if ($opt_outFormat eq 'axt') { + if ($opt_dropSelf) { + die "-dropSelf not supported for AXT output from 2bit"; + } &run("$catCmd $littleOut >> $localOut"); } else { my $tSizes = $defVars{'SEQ1_CTGLEN'} || $defVars{'SEQ1_LEN'}; my $qSizes = $defVars{'SEQ2_CTGLEN'} || $defVars{'SEQ2_LEN'}; + my $dropSelfCmd = $opt_dropSelf ? "| pslDropOverlap stdin stdout" : ""; # carry through comments &run("grep '^#' $littleOut | egrep -v 'identity|coverage|num_masked' > $littleConv"); - &run("axtToPsl $littleOut $tSizes $qSizes stdout >> $littleConv"); + &run("axtToPsl $littleOut $tSizes $qSizes stdout $dropSelfCmd >> $littleConv"); &run("$catCmd $littleConv >> $localOut"); } } else { &convertOutput($littleOut, $littleConv); &run("$catCmd $littleConv >> $localOut"); } } else { &run("$catCmd $littleOut >> $localOut"); } } if ($tLocal =~ /^$TMP/) { $tLocal =~ s/\[.*\]$//; &run("$rmCmd $tLocal"); } } if ($opt_gz) { &run("$gzipCmd $localOut"); $localOut .= ".gz"; } # Move collected results to final location. &run("$mvCmd $localOut $out"); # Clean up temporary directory: &run("$rmCmd -rf $TMP");