src/hg/utils/automation/doHgNearBlastp.pl 1.8

1.8 2009/07/09 22:21:24 angie
Added back the varying blastp -e params that I lost when writing this. Have been using 0.01 for all blastp runs, but the original process used 0.001, 0.005, 0.01 depending on clade distance. Thanks Hiram and b0b!
Index: src/hg/utils/automation/doHgNearBlastp.pl
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/utils/automation/doHgNearBlastp.pl,v
retrieving revision 1.7
retrieving revision 1.8
diff -b -B -U 4 -r1.7 -r1.8
--- src/hg/utils/automation/doHgNearBlastp.pl	8 Jun 2009 18:41:32 -0000	1.7
+++ src/hg/utils/automation/doHgNearBlastp.pl	9 Jul 2009 22:21:24 -0000	1.8
@@ -13,10 +13,10 @@
 use HgAutomate;
 use HgRemoteScript;
 
 # Hardcoded params:
-my $blastpParams = '-e 0.01 -m 8 -b'; # Keep -b at end -- value provided below.
 my $tSplitCount = 1000;
+my $selfE = 0.01;
 my $selfMaxPer = 1000;
 my $pairwiseMaxPer = 1;
 
 # Option variable names:
@@ -276,21 +276,55 @@
   &HgAutomate::run("chmod a+x $bossScript");
   &HgAutomate::run("$HgAutomate::runSSH $distrHost nice $bossScript");
 } # formatSequence
 
+# Make a matrix of clade-pair -e (max error) values:
+my ($mammal, $fish, $fly, $worm, $yeast) = (0..5);
+my %dbToClade = ( hg => $mammal, mm => $mammal, rn => $mammal,
+		  danRer => $fish,
+		  dm => $fly,
+		  ce => $worm,
+		  sacCer => $yeast, );
+my @cladeEs;
+# Different species within clade (not self alignments -- see $selfE):
+$cladeEs[$mammal][$mammal] = $cladeEs[$fish][$fish] = $cladeEs[$fly][$fly] =
+    $cladeEs[$worm][$worm] = $cladeEs[$yeast][$yeast] = 0.001;
+# Cross-clade:
+$cladeEs[$mammal][$fish]  = $cladeEs[$fish][$mammal]  = 0.005;
+$cladeEs[$mammal][$fly]   = $cladeEs[$fly][$mammal]   = 0.01;
+$cladeEs[$mammal][$worm]  = $cladeEs[$worm][$mammal]  = 0.01;
+$cladeEs[$mammal][$yeast] = $cladeEs[$yeast][$mammal] = 0.01;
+$cladeEs[$fish][$fly]   = $cladeEs[$fly][$fish]   = 0.01;
+$cladeEs[$fish][$worm]  = $cladeEs[$worm][$fish]  = 0.01;
+$cladeEs[$fish][$yeast] = $cladeEs[$yeast][$fish] = 0.01;
+$cladeEs[$fly][$worm]  = $cladeEs[$worm][$fly]  = 0.01;
+$cladeEs[$fly][$yeast] = $cladeEs[$yeast][$fly] = 0.01;
+$cladeEs[$yeast][$yeast] = $cladeEs[$yeast][$yeast] = 0.01;
+
+sub calcE {
+  # Look up the blastp e parameter (max error) by clade distances
+  my ($tDb, $qDb) = @_;
+  (my $t = $tDb) =~ s/\d+$//;
+  defined ($t = $dbToClade{$t}) || die "Can't figure out clade of '$tDb'";
+  (my $q = $qDb) =~ s/\d+$//;
+  defined ($q = $dbToClade{$q}) || die "Can't figure out clade of '$qDb'";
+  my $e = $cladeEs[$t][$q] || die "No e for clades '$t', '$q'";
+  return $e;
+} # calcE
+
 sub runPairwiseBlastp {
   # Do a pairwise blastp cluster run for the given query.  
   # $b is the blastp -b param value... note -b is at the end of $blastpParams.
-  my ($tDb, $qDb, $b) = @_;
+  my ($tDb, $qDb, $e, $b) = @_;
 
   my $runDir = "$buildDir/run.$tDb.$qDb";
   &HgAutomate::mustMkdir($runDir);
   my $fh = &HgAutomate::mustOpen(">$runDir/blastSome");
   print $fh <<_EOF_
 #!/bin/csh -ef
 setenv BLASTMAT $blastPath/data
 $blastPath/bin/blastall -p blastp -d $scratchDir/$qDb.formatdb/$qDb \\
-    -i \$1 -o \$2 $blastpParams $b
+    -i \$1 -o \$2 -e $e -m 8 -b $b
 _EOF_
     ;
   close($fh);
 
@@ -481,29 +515,30 @@
 
 # Self blastp.
 &formatSequence($tDb, $tFasta);
 if (! $opt_noSelf && ! $opt_queryOnly) {
-  &runPairwiseBlastp($tDb, $tDb, $selfMaxPer);
+  &runPairwiseBlastp($tDb, $tDb, $selfE, $selfMaxPer);
   &loadPairwise($tDb, $tDb, $tGenesetPrefix, $selfMaxPer);
 }
 
 # For each query db, pairwise blastp.
 my $tPrefix = &dbToPrefix($tDb);
 foreach my $qDb (@qDbs) {
   my $qFasta = $dbToFasta{$qDb};
   my $qPrefix = &dbToPrefix($qDb);
+  my $e = &calcE($tDb, $qDb);
   if ($recipBest{$qDb} || ! $opt_queryOnly) {
     # tDb vs qDb
     &formatSequence($qDb, $qFasta);
-    &runPairwiseBlastp($tDb, $qDb, $pairwiseMaxPer);
+    &runPairwiseBlastp($tDb, $qDb, $e, $pairwiseMaxPer);
   }
   if (! $recipBest{$qDb} && ! $opt_queryOnly) {
     &loadPairwise($tDb, $qDb, $qPrefix, $pairwiseMaxPer);
   }
   if ($recipBest{$qDb} || ! $opt_targetOnly) {
     # qDb vs tDb
     &splitSequence($qDb, $qFasta);
-    &runPairwiseBlastp($qDb, $tDb, $pairwiseMaxPer);
+    &runPairwiseBlastp($qDb, $tDb, $e, $pairwiseMaxPer);
   }
   if (! $recipBest{$qDb} && ! $opt_targetOnly) {
     &loadPairwise($qDb, $tDb, $tPrefix, $pairwiseMaxPer);
   }