src/hg/tcga/pBamBam/pBamBamSNP.c 1.2
1.2 2010/06/01 21:46:13 jsanborn
added new mutation caller
Index: src/hg/tcga/pBamBam/pBamBamSNP.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/tcga/pBamBam/pBamBamSNP.c,v
retrieving revision 1.1
retrieving revision 1.2
diff -b -B -U 4 -r1.1 -r1.2
--- src/hg/tcga/pBamBam/pBamBamSNP.c 28 Apr 2010 00:37:21 -0000 1.1
+++ src/hg/tcga/pBamBam/pBamBamSNP.c 1 Jun 2010 21:46:13 -0000 1.2
@@ -52,8 +52,11 @@
{"fracGerm", OPTION_FLOAT},
{NULL, 0}
};
+#define IN_PROB 0.94 // probability of getting base that's member of genotype
+ // (1.0 - IN_PROB) / 3.0 = prob of getting error
+
#define BAD_TID -999
/* Option variables */
char *faidx = NULL;
@@ -762,8 +765,11 @@
#define CC 9
#define NUM_GENO 10
+#define MAX_COUNT 1000
+
+static long double *logvals;
static long double **genoPrior;
static long double **mutProb;
static long double **mutProbGeno;
static long double **baseProb;
@@ -774,8 +780,12 @@
void initProbTables()
{
int i, j;
+logvals = (long double *)(malloc(MAX_COUNT * sizeof(long double)));
+for (i = 0; i < MAX_COUNT; i++)
+ logvals[i] = logl((long double) i + 1.0);
+
genoPrior = (long double **)(malloc(NUM_GENO * sizeof(long double*)));
for (i = 0; i < NUM_GENO; i++)
genoPrior[i] = (long double *)(malloc(NUM_NUC * sizeof(long double)));
@@ -862,9 +872,9 @@
baseProb = (long double **)(malloc(NUM_GENO * sizeof(long double*)));
for (i = 0; i < NUM_GENO; i++)
baseProb[i] = (long double *)(malloc(NUM_NUC * sizeof(long double)));
-long double inprob = 0.9;
+long double inprob = IN_PROB;
long double outprob = (1.0 - inprob) / 3.0;
for (i = 0; i < NUM_GENO; i++)
{
@@ -896,13 +906,13 @@
p += (long double) countsR[i] * baseProb[geno][i];
tot += countsR[i];
for (j = 0; j < countsR[i]; j++)
- scale -= logl((long double) (j + 1.0));
+ scale -= logvals[j];
}
for (i = 0; i < tot; i++)
- scale += logl((long double) (i + 1.0));
+ scale += logvals[i];
return p + scale;
}
@@ -916,13 +926,13 @@
p += (long double) countsL[i] * (alpha * baseProb[genoG][i] + (1.0 - alpha) * baseProb[genoT][i]);
tot += countsL[i];
for (j = 0; j < countsL[i]; j++)
- scale -= logl((long double) (j + 1.0));
+ scale -= logvals[j];
}
for (i = 0; i < tot; i++)
- scale += logl((long double) (i + 1.0));
+ scale += logvals[i];
return p + scale;
}
@@ -940,36 +950,59 @@
totalL += countsL[i];
totalR += countsR[i];
}
+if (totalL < ap->minSuppSNP || totalR < ap->minSuppSNP)
+ return 0;
+
+/* rescale counts to total to less than MAX_COUNT, if exceeded */
+if (totalL > MAX_COUNT)
+ for (i = 0; i < NUM_NUC; i++)
+ countsL[i] = (int) floor((float) countsL[i] * (float) MAX_COUNT / (float) totalL);
+if (totalR > MAX_COUNT)
+ for (i = 0; i < NUM_NUC; i++)
+ countsR[i] = (int) floor((float) countsR[i] * (float) MAX_COUNT / (float) totalR);
+
long double alpha = (ap->fracGerm * (long double) totalR) / (long double) totalL;
if (alpha > ap->fracGerm)
alpha = ap->fracGerm;
-char rb = (d->ref && (int)d->posL < d->len)? toupper(d->ref[d->posL]) : 'N';
+char rb = (d->ref && (int)d->posL < d->len) ? toupper(d->ref[d->posL]) : 'N';
int ri = nuc_index(rb);
+int i1, i2, j1, j2;
long double p, ptot = 0.0, pmax = -999999999.99;
for (i = 0; i < NUM_GENO; i++)
{ // loop germline genotypes
+ i1 = nuc_index(GENO_1[i]);
+ i2 = nuc_index(GENO_2[i]);
+ if (!countsL[i1] && !countsL[i2] && !countsR[i1] && !countsR[i2])
+ continue; // skip genotypes
+
+ long double germP = 0.0;
+ if (ri >= 0)
+ germP = genoPrior[i][ri];
+ germP += dataProbGerm(i);
+
for (j = 0; j < NUM_GENO; j++)
{ // loop tumor genotypes
- if (ri < 0)
- p = 0.0;
- else
- p = genoPrior[i][ri];
- p = 0.0;
+ j1 = nuc_index(GENO_1[j]);
+ j2 = nuc_index(GENO_2[j]);
+ if (!countsL[j1] && !countsL[j2] && !countsR[j1] && !countsR[j2])
+ continue;
+
+ p = germP;
p += mutProbGeno[i][j];
p += dataProbTum(i, j, alpha);
- p += dataProbGerm(i);
+
+ ptot += expl(p);
if (p > pmax)
{
pmax = p;
imax = i;
jmax = j;
}
- ptot += expl(p);
}
}
if (imax < 0 || jmax < 0)