src/hg/tcga/pBamBam/pBamBamSNP.c 1.2

1.2 2010/06/01 21:46:13 jsanborn
added new mutation caller
Index: src/hg/tcga/pBamBam/pBamBamSNP.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/tcga/pBamBam/pBamBamSNP.c,v
retrieving revision 1.1
retrieving revision 1.2
diff -b -B -U 4 -r1.1 -r1.2
--- src/hg/tcga/pBamBam/pBamBamSNP.c	28 Apr 2010 00:37:21 -0000	1.1
+++ src/hg/tcga/pBamBam/pBamBamSNP.c	1 Jun 2010 21:46:13 -0000	1.2
@@ -52,8 +52,11 @@
     {"fracGerm", OPTION_FLOAT},
     {NULL, 0}
 };
 
+#define IN_PROB 0.94     // probability of getting base that's member of genotype
+                         // (1.0 - IN_PROB) / 3.0 = prob of getting error
+
 #define BAD_TID -999
 
 /* Option variables */
 char *faidx     = NULL;
@@ -762,8 +765,11 @@
 #define CC 9
 
 #define NUM_GENO 10
 
+#define MAX_COUNT 1000
+
+static long double *logvals;
 static long double **genoPrior;
 static long double **mutProb;
 static long double **mutProbGeno;
 static long double **baseProb;
@@ -774,8 +780,12 @@
 void initProbTables()
 {
 int i, j;
 
+logvals = (long double *)(malloc(MAX_COUNT * sizeof(long double)));
+for (i = 0; i < MAX_COUNT; i++)
+    logvals[i] = logl((long double) i + 1.0);
+
 genoPrior = (long double **)(malloc(NUM_GENO * sizeof(long double*)));
 for (i = 0; i < NUM_GENO; i++)
     genoPrior[i] = (long double *)(malloc(NUM_NUC * sizeof(long double)));
 
@@ -862,9 +872,9 @@
 baseProb = (long double **)(malloc(NUM_GENO * sizeof(long double*)));
 for (i = 0; i < NUM_GENO; i++)
     baseProb[i] = (long double *)(malloc(NUM_NUC * sizeof(long double)));
 
-long double inprob = 0.9;
+long double inprob = IN_PROB;
 long double outprob = (1.0 - inprob) / 3.0;
 
 for (i = 0; i < NUM_GENO; i++)
     {
@@ -896,13 +906,13 @@
     p += (long double) countsR[i] * baseProb[geno][i];
     
     tot += countsR[i];
     for (j = 0; j < countsR[i]; j++)
-	scale -= logl((long double) (j + 1.0));
+	scale -= logvals[j];
     }
 
 for (i = 0; i < tot; i++)
-    scale += logl((long double) (i + 1.0));
+    scale += logvals[i];
 
 return p + scale;
 }
 
@@ -916,13 +926,13 @@
     p += (long double) countsL[i] * (alpha * baseProb[genoG][i] + (1.0 - alpha) * baseProb[genoT][i]);
     
     tot += countsL[i];
     for (j = 0; j < countsL[i]; j++)
-	scale -= logl((long double) (j + 1.0));
+	scale -= logvals[j];
     }
 
 for (i = 0; i < tot; i++)
-    scale += logl((long double) (i + 1.0));
+    scale += logvals[i];
 
 return p + scale;
 }
 
@@ -940,36 +950,59 @@
     totalL += countsL[i];
     totalR += countsR[i];
     }
 
+if (totalL < ap->minSuppSNP || totalR < ap->minSuppSNP)
+    return 0;
+
+/* rescale counts to total to less than MAX_COUNT, if exceeded */
+if (totalL > MAX_COUNT)
+    for (i = 0; i < NUM_NUC; i++)
+	countsL[i] = (int) floor((float) countsL[i] * (float) MAX_COUNT / (float) totalL);
+if (totalR > MAX_COUNT)
+    for (i = 0; i < NUM_NUC; i++)
+	countsR[i] = (int) floor((float) countsR[i] * (float) MAX_COUNT / (float) totalR);
+
 long double alpha = (ap->fracGerm * (long double) totalR) / (long double) totalL;
 if (alpha > ap->fracGerm)
     alpha = ap->fracGerm;
 
-char rb = (d->ref && (int)d->posL < d->len)? toupper(d->ref[d->posL]) : 'N';
+char rb = (d->ref && (int)d->posL < d->len) ? toupper(d->ref[d->posL]) : 'N';
 int ri  = nuc_index(rb);
+int i1, i2, j1, j2;
 
 long double p, ptot = 0.0, pmax = -999999999.99;
 for (i = 0; i < NUM_GENO; i++)
     {  // loop germline genotypes
+    i1 = nuc_index(GENO_1[i]);
+    i2 = nuc_index(GENO_2[i]);   
+    if (!countsL[i1] && !countsL[i2] && !countsR[i1] && !countsR[i2])
+	continue;  // skip genotypes 
+
+    long double germP = 0.0;
+    if (ri >= 0)
+	germP = genoPrior[i][ri];
+    germP += dataProbGerm(i);
+
     for (j = 0; j < NUM_GENO; j++)
 	{  // loop tumor genotypes
-	if (ri < 0)
-	    p = 0.0;
-	else
-	    p = genoPrior[i][ri];
-	p = 0.0;
+	j1 = nuc_index(GENO_1[j]);
+	j2 = nuc_index(GENO_2[j]);   
+	if (!countsL[j1] && !countsL[j2] && !countsR[j1] && !countsR[j2])
+	    continue;
+	
+	p = germP;
 	p += mutProbGeno[i][j];
 	p += dataProbTum(i, j, alpha);
-	p += dataProbGerm(i);
+
+	ptot += expl(p);
 
 	if (p > pmax)
 	    {
 	    pmax = p;
 	    imax = i;
 	    jmax = j;
 	    }
-	ptot += expl(p);
 	}
     }
 
 if (imax < 0 || jmax < 0)