src/hg/utils/automation/doHgNearBlastp.pl 1.8
1.8 2009/07/09 22:21:24 angie
Added back the varying blastp -e params that I lost when writing this. Have been using 0.01 for all blastp runs, but the original process used 0.001, 0.005, 0.01 depending on clade distance. Thanks Hiram and b0b!
Index: src/hg/utils/automation/doHgNearBlastp.pl
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/utils/automation/doHgNearBlastp.pl,v
retrieving revision 1.7
retrieving revision 1.8
diff -b -B -U 4 -r1.7 -r1.8
--- src/hg/utils/automation/doHgNearBlastp.pl 8 Jun 2009 18:41:32 -0000 1.7
+++ src/hg/utils/automation/doHgNearBlastp.pl 9 Jul 2009 22:21:24 -0000 1.8
@@ -13,10 +13,10 @@
use HgAutomate;
use HgRemoteScript;
# Hardcoded params:
-my $blastpParams = '-e 0.01 -m 8 -b'; # Keep -b at end -- value provided below.
my $tSplitCount = 1000;
+my $selfE = 0.01;
my $selfMaxPer = 1000;
my $pairwiseMaxPer = 1;
# Option variable names:
@@ -276,21 +276,55 @@
&HgAutomate::run("chmod a+x $bossScript");
&HgAutomate::run("$HgAutomate::runSSH $distrHost nice $bossScript");
} # formatSequence
+# Make a matrix of clade-pair -e (max error) values:
+my ($mammal, $fish, $fly, $worm, $yeast) = (0..5);
+my %dbToClade = ( hg => $mammal, mm => $mammal, rn => $mammal,
+ danRer => $fish,
+ dm => $fly,
+ ce => $worm,
+ sacCer => $yeast, );
+my @cladeEs;
+# Different species within clade (not self alignments -- see $selfE):
+$cladeEs[$mammal][$mammal] = $cladeEs[$fish][$fish] = $cladeEs[$fly][$fly] =
+ $cladeEs[$worm][$worm] = $cladeEs[$yeast][$yeast] = 0.001;
+# Cross-clade:
+$cladeEs[$mammal][$fish] = $cladeEs[$fish][$mammal] = 0.005;
+$cladeEs[$mammal][$fly] = $cladeEs[$fly][$mammal] = 0.01;
+$cladeEs[$mammal][$worm] = $cladeEs[$worm][$mammal] = 0.01;
+$cladeEs[$mammal][$yeast] = $cladeEs[$yeast][$mammal] = 0.01;
+$cladeEs[$fish][$fly] = $cladeEs[$fly][$fish] = 0.01;
+$cladeEs[$fish][$worm] = $cladeEs[$worm][$fish] = 0.01;
+$cladeEs[$fish][$yeast] = $cladeEs[$yeast][$fish] = 0.01;
+$cladeEs[$fly][$worm] = $cladeEs[$worm][$fly] = 0.01;
+$cladeEs[$fly][$yeast] = $cladeEs[$yeast][$fly] = 0.01;
+$cladeEs[$yeast][$yeast] = $cladeEs[$yeast][$yeast] = 0.01;
+
+sub calcE {
+ # Look up the blastp e parameter (max error) by clade distances
+ my ($tDb, $qDb) = @_;
+ (my $t = $tDb) =~ s/\d+$//;
+ defined ($t = $dbToClade{$t}) || die "Can't figure out clade of '$tDb'";
+ (my $q = $qDb) =~ s/\d+$//;
+ defined ($q = $dbToClade{$q}) || die "Can't figure out clade of '$qDb'";
+ my $e = $cladeEs[$t][$q] || die "No e for clades '$t', '$q'";
+ return $e;
+} # calcE
+
sub runPairwiseBlastp {
# Do a pairwise blastp cluster run for the given query.
# $b is the blastp -b param value... note -b is at the end of $blastpParams.
- my ($tDb, $qDb, $b) = @_;
+ my ($tDb, $qDb, $e, $b) = @_;
my $runDir = "$buildDir/run.$tDb.$qDb";
&HgAutomate::mustMkdir($runDir);
my $fh = &HgAutomate::mustOpen(">$runDir/blastSome");
print $fh <<_EOF_
#!/bin/csh -ef
setenv BLASTMAT $blastPath/data
$blastPath/bin/blastall -p blastp -d $scratchDir/$qDb.formatdb/$qDb \\
- -i \$1 -o \$2 $blastpParams $b
+ -i \$1 -o \$2 -e $e -m 8 -b $b
_EOF_
;
close($fh);
@@ -481,29 +515,30 @@
# Self blastp.
&formatSequence($tDb, $tFasta);
if (! $opt_noSelf && ! $opt_queryOnly) {
- &runPairwiseBlastp($tDb, $tDb, $selfMaxPer);
+ &runPairwiseBlastp($tDb, $tDb, $selfE, $selfMaxPer);
&loadPairwise($tDb, $tDb, $tGenesetPrefix, $selfMaxPer);
}
# For each query db, pairwise blastp.
my $tPrefix = &dbToPrefix($tDb);
foreach my $qDb (@qDbs) {
my $qFasta = $dbToFasta{$qDb};
my $qPrefix = &dbToPrefix($qDb);
+ my $e = &calcE($tDb, $qDb);
if ($recipBest{$qDb} || ! $opt_queryOnly) {
# tDb vs qDb
&formatSequence($qDb, $qFasta);
- &runPairwiseBlastp($tDb, $qDb, $pairwiseMaxPer);
+ &runPairwiseBlastp($tDb, $qDb, $e, $pairwiseMaxPer);
}
if (! $recipBest{$qDb} && ! $opt_queryOnly) {
&loadPairwise($tDb, $qDb, $qPrefix, $pairwiseMaxPer);
}
if ($recipBest{$qDb} || ! $opt_targetOnly) {
# qDb vs tDb
&splitSequence($qDb, $qFasta);
- &runPairwiseBlastp($qDb, $tDb, $pairwiseMaxPer);
+ &runPairwiseBlastp($qDb, $tDb, $e, $pairwiseMaxPer);
}
if (! $recipBest{$qDb} && ! $opt_targetOnly) {
&loadPairwise($qDb, $tDb, $tPrefix, $pairwiseMaxPer);
}