src/hg/tcga/pBamBam/asBamBam.c 1.11
1.11 2010/06/01 21:46:13 jsanborn
added new mutation caller
Index: src/hg/tcga/pBamBam/asBamBam.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/tcga/pBamBam/asBamBam.c,v
retrieving revision 1.10
retrieving revision 1.11
diff -b -B -U 4 -r1.10 -r1.11
--- src/hg/tcga/pBamBam/asBamBam.c 22 Apr 2010 18:10:50 -0000 1.10
+++ src/hg/tcga/pBamBam/asBamBam.c 1 Jun 2010 21:46:13 -0000 1.11
@@ -54,8 +54,10 @@
{"readsPerBin", OPTION_INT},
{NULL, 0}
};
+#define IN_PROB 0.94 // probability of getting base that's member of genotype
+ // (1.0 - IN_PROB) / 3.0 = prob of getting error
#define BAD_TID -999
/* Option variables */
char *faidx = NULL;
@@ -113,8 +115,10 @@
int svStopL;
int svStartR;
int svStopR;
+ long double fracGerm;
+
} analysis_params_t;
static int32_t countsL1[4];
static int32_t countsL2[4];
@@ -916,14 +921,14 @@
if (!p)
return;
bam1_core_t *c = &p->b->core;
-int qstrand = bam1_strand(p->b);
-int mstrand = bam1_mstrand(p->b);
int32_t qstart = (c)->pos;
int32_t qstop = qstart + (c)->l_qseq;
int32_t mstart = (c)->mpos;
int32_t mstop = mstart + (c)->l_qseq;
+int qstrand = bam1_strand(p->b);
+int mstrand = bam1_mstrand(p->b);
if (sv->tid == -1) // not set yet
{
sv->tid = (c)->tid;
@@ -1092,13 +1097,11 @@
/* check average mapping quality of discordants */
int qual = ceil((double) intraL.qual / (double) intraL.supp);
if (intraL.supp >= ap->minSuppSV && qual >= ap->avgMapQ)
{ // check mate pair direction trend... should be positive for properly paired (opposite strand)
-
- //return 1;
-
- if ((intraL.qstrand == intraL.mstrand && intraL.mdir < 0) ||
- (intraL.qstrand != intraL.mstrand && intraL.mdir > 0))
+ if ((intraL.qstrand && intraL.mstrand && intraL.mdir < 0) || // + +
+ (!intraL.qstrand && !intraL.mstrand && intraL.mdir < 0) || // - -
+ (intraL.qstrand != intraL.mstrand && intraL.mdir > 0)) // + - or - +
return 1;
}
reset_intra();
@@ -1116,13 +1119,11 @@
/* check average mapping quality of discordants */
int qual = ceil((double) interL.qual / (double) interL.supp);
if (interL.supp >= ap->minSuppSV && qual >= ap->avgMapQ)
{ // check mate pair direction trend... should be positive for properly paired (opposite strand)
-
- //return 1;
-
- if ((interL.qstrand == interL.mstrand && interL.mdir < 0) ||
- (interL.qstrand != interL.mstrand && interL.mdir > 0))
+ if ((interL.qstrand && interL.mstrand && interL.mdir < 0) || // + +
+ (!interL.qstrand && !interL.mstrand && interL.mdir < 0) || // - -
+ (interL.qstrand != interL.mstrand && interL.mdir > 0)) // + - or - +
return 1;
}
reset_inter();
}
@@ -1178,9 +1179,10 @@
if (interR.mstrand)
msR = '-';
if ( interR.supp > 0 && interL.mtid == interR.mtid &&
- (interR.qstart < (interL.qstop + BUF) && interR.qstop > (interL.qstart - BUF)) )
+ (interR.qstart < (interL.qstop + BUF) && interR.qstop > (interL.qstart - BUF)) &&
+ (interR.mstart < (interL.mstop + BUF) && interR.mstop > (interL.mstart - BUF)) )
printf("GIR\t%d\t%3.1f\t%d\t%3.1f\t%c(%s:%d-%d)>%c(%s:%d-%d)\t%c(%s:%d-%d)>%c(%s:%d-%d)",
interR.supp, (double) interR.qual / (double) interR.supp,
interL.supp, (double) interL.qual / (double) interL.supp,
qsR, d->hL->target_name[interR.tid], interR.qstart, interR.qstop,
@@ -1217,9 +1219,10 @@
if (intraR.mstrand)
msR = '-';
if ( intraR.supp > 0 &&
- (intraR.qstart < (intraL.qstop + BUF) && intraR.qstop > (intraL.qstart - BUF)) )
+ (intraR.qstart < (intraL.qstop + BUF) && intraR.qstop > (intraL.qstart - BUF)) &&
+ (intraR.mstart < (intraL.mstop + BUF) && intraR.mstop > (intraL.mstart - BUF)) )
printf("GIA\t%d\t%3.1f\t%d\t%3.1f\t%c(%s:%d-%d)>%c(%s:%d-%d)\t%c(%s:%d-%d)>%c(%s:%d-%d)",
intraR.supp, (double) intraR.qual / (double) intraR.supp,
intraL.supp, (double) intraL.qual / (double) intraL.supp,
qsR, d->hL->target_name[intraR.tid], intraR.qstart, intraR.qstop,
@@ -1364,9 +1367,9 @@
char ref, callL1, callL2, callR1, callR2;
if (!is_mut(d, ap, &ref, &callL1, &callL2, &callR1, &callR2))
return;
-printf("MUT\t%s\t%d\t%d\t",
+printf("MUT1\t%s\t%d\t%d\t",
d->hL->target_name[d->tidL], d->posL, d->posL + 1);
printf("%c\t", ref);
if (callR2 == 'N')
@@ -1383,8 +1386,306 @@
printf("%f\t", chi_sq);
print_nuc_info();
}
+#define A 0
+#define G 1
+#define T 2
+#define C 3
+
+#define NUM_NUC 4
+
+#define AA 0
+#define AG 1
+#define AT 2
+#define AC 3
+#define GA 1
+#define GG 4
+#define GT 5
+#define GC 6
+#define TA 2
+#define TG 5
+#define TT 7
+#define TC 8
+#define CA 3
+#define CG 6
+#define CT 8
+#define CC 9
+
+#define NUM_GENO 10
+
+#define MAX_COUNT 1000
+
+static long double *logvals;
+static long double **genoPrior;
+static long double **mutProb;
+static long double **mutProbGeno;
+static long double **baseProb;
+
+#define GENO_1 "AAAAGGGTTC"
+#define GENO_2 "AGTCGTCTCC"
+
+
+void initProbTables()
+{
+int i, j;
+
+logvals = (long double *)(malloc(MAX_COUNT * sizeof(long double)));
+for (i = 0; i < MAX_COUNT; i++)
+ logvals[i] = logl((long double) i + 1.0);
+
+genoPrior = (long double **)(malloc(NUM_GENO * sizeof(long double*)));
+for (i = 0; i < NUM_GENO; i++)
+ genoPrior[i] = (long double *)(malloc(NUM_NUC * sizeof(long double)));
+
+for (i = 0; i < NUM_GENO; i++)
+ {
+ int i1 = nuc_index(GENO_1[i]);
+ int i2 = nuc_index(GENO_2[i]);
+ for (j = 0; j < NUM_NUC; j++)
+ {
+ long double val;
+ if (i1 == j && i2 == j)
+ val = 0.80;
+ else if (i1 == j || i2 == j)
+ val = 0.10;
+ else
+ val = 0.05;
+
+ genoPrior[i][j] = logl(val);
+ }
+ }
+mutProb = (long double **)(malloc(NUM_NUC * sizeof(long double*)));
+for (i = 0; i < NUM_NUC; i++)
+ mutProb[i] = (long double *)(malloc(NUM_NUC * sizeof(long double)));
+
+long double val, tot;
+for (i = 0; i < NUM_NUC; i++)
+ {
+ tot = 0.0;
+ for (j = 0; j < NUM_NUC; j++)
+ {
+ /* else, transitions 4x more likely than transitions */
+ if (i == j)
+ val = 1.0;
+ else if ((NUCS[i] == 'A' && NUCS[j] == 'G') || (NUCS[i] == 'G' && NUCS[j] == 'A'))
+ val = 0.1;
+ else if ((NUCS[i] == 'C' && NUCS[j] == 'T') || (NUCS[i] == 'T' && NUCS[j] == 'C'))
+ val = 0.1;
+ else
+ val = 0.025;
+
+ mutProb[i][j] = val;
+ tot += val;
+ }
+
+ /* normalize probs, conditioned on germline call */
+ for (j = 0; j < NUM_NUC; j++)
+ {
+ mutProb[i][j] = logl(mutProb[i][j]/tot);
+ }
+ }
+
+mutProbGeno = (long double **)(malloc(NUM_GENO * sizeof(long double*)));
+for (i = 0; i < NUM_GENO; i++)
+ mutProbGeno[i] = (long double *)(malloc(NUM_GENO * sizeof(long double)));
+
+for (i = 0; i < NUM_GENO; i++)
+ {
+ int i1 = nuc_index(GENO_1[i]);
+ int i2 = nuc_index(GENO_2[i]);
+ for (j = 0; j < NUM_GENO; j++)
+ {
+ int j1 = nuc_index(GENO_1[j]);
+ int j2 = nuc_index(GENO_2[j]);
+
+ long double val1 = mutProb[i1][j1] + mutProb[i2][j2];
+ long double val2 = mutProb[i1][j2] + mutProb[i2][j1];
+
+ if (val1 > val2)
+ mutProbGeno[i][j] = val1;
+ else
+ mutProbGeno[i][j] = val2;
+ }
+ }
+baseProb = (long double **)(malloc(NUM_GENO * sizeof(long double*)));
+for (i = 0; i < NUM_GENO; i++)
+ baseProb[i] = (long double *)(malloc(NUM_NUC * sizeof(long double)));
+
+long double inprob = IN_PROB;
+long double outprob = (1.0 - inprob) / 3.0;
+
+for (i = 0; i < NUM_GENO; i++)
+ {
+ int i1 = nuc_index(GENO_1[i]);
+ int i2 = nuc_index(GENO_2[i]);
+ for (j = 0; j < NUM_NUC; j++)
+ {
+ if (i1 == j && i2 == j)
+ baseProb[i][j] = inprob;
+ else if (i1 == j || i2 == j)
+ baseProb[i][j] = (inprob + outprob) / 2.0;
+ else
+ baseProb[i][j] = outprob;
+
+ baseProb[i][j] = logl(baseProb[i][j]);
+ fprintf(stderr, "%c%c-%c %Lf\t", GENO_1[i], GENO_2[i], NUCS[j], baseProb[i][j]);
+ }
+ fprintf(stderr, "\n");
+ }
+}
+
+static long double dataProbGerm(int geno)
+{
+int i, j, tot = 0;
+long double p = 0.0, scale = 0.0;
+
+for (i = 0; i < NUM_NUC; i++)
+ {
+ p += (long double) countsR[i] * baseProb[geno][i];
+
+ tot += countsR[i];
+ for (j = 0; j < countsR[i]; j++)
+ scale -= logvals[j];
+ }
+
+for (i = 0; i < tot; i++)
+ scale += logvals[i];
+
+return p + scale;
+}
+
+static long double dataProbTum(int genoG, int genoT, long double alpha)
+{
+int i, j, tot = 0;
+long double p = 0.0, scale = 0.0;
+
+for (i = 0; i < NUM_NUC; i++)
+ {
+ p += (long double) countsL[i] * (alpha * baseProb[genoG][i] + (1.0 - alpha) * baseProb[genoT][i]);
+
+ tot += countsL[i];
+ for (j = 0; j < countsL[i]; j++)
+ scale -= logvals[j];
+ }
+
+for (i = 0; i < tot; i++)
+ scale += logvals[i];
+
+return p + scale;
+}
+
+
+static int is_mut_prob(dual_pileup_t *d, analysis_params_t *ap, char *ref,
+ char *callL1, char *callL2, char *callR1, char *callR2,
+ long double *conf)
+{
+if (majorR.vote == 0 || majorL.vote == 0)
+ return 0; // most likely a bad quality region.
+
+int i, j, imax = -1, jmax = -1;
+int totalL = 0, totalR = 0;
+for (i = 0; i < NUM_NUC; i++)
+ {
+ totalL += countsL[i];
+ totalR += countsR[i];
+ }
+
+if (totalL < ap->minSuppSNP || totalR < ap->minSuppSNP)
+ return 0;
+
+/* rescale counts to total to less than MAX_COUNT, if exceeded */
+if (totalL > MAX_COUNT)
+ for (i = 0; i < NUM_NUC; i++)
+ countsL[i] = (int) floor((float) countsL[i] * (float) MAX_COUNT / (float) totalL);
+if (totalR > MAX_COUNT)
+ for (i = 0; i < NUM_NUC; i++)
+ countsR[i] = (int) floor((float) countsR[i] * (float) MAX_COUNT / (float) totalR);
+
+long double alpha = (ap->fracGerm * (long double) totalR) / (long double) totalL;
+if (alpha > ap->fracGerm)
+ alpha = ap->fracGerm;
+
+char rb = (d->ref && (int)d->posL < d->len) ? toupper(d->ref[d->posL]) : 'N';
+int ri = nuc_index(rb);
+int i1, i2, j1, j2;
+
+long double p, ptot = 0.0, pmax = -999999999.99;
+for (i = 0; i < NUM_GENO; i++)
+ { // loop germline genotypes
+ i1 = nuc_index(GENO_1[i]);
+ i2 = nuc_index(GENO_2[i]);
+ if (!countsL[i1] && !countsL[i2] && !countsR[i1] && !countsR[i2])
+ continue; // skip genotypes
+
+ long double germP = 0.0;
+ if (ri >= 0)
+ germP = genoPrior[i][ri];
+ germP += dataProbGerm(i);
+
+ for (j = 0; j < NUM_GENO; j++)
+ { // loop tumor genotypes
+ j1 = nuc_index(GENO_1[j]);
+ j2 = nuc_index(GENO_2[j]);
+ if (!countsL[j1] && !countsL[j2] && !countsR[j1] && !countsR[j2])
+ continue;
+
+ p = germP;
+ p += mutProbGeno[i][j];
+ p += dataProbTum(i, j, alpha);
+
+ ptot += expl(p);
+
+ if (p > pmax)
+ {
+ pmax = p;
+ imax = i;
+ jmax = j;
+ }
+ }
+ }
+
+if (imax < 0 || jmax < 0)
+ return 0;
+
+*ref = rb;
+*callL1 = GENO_1[jmax];
+*callL2 = GENO_2[jmax];
+*callR1 = GENO_1[imax];
+*callR2 = GENO_2[imax];
+*conf = expl(pmax) / ptot;
+
+if (imax != jmax)
+ return 1;
+if (GENO_1[imax] != GENO_2[imax] || (rb != GENO_1[imax] && rb != GENO_2[imax]))
+ return 2;
+
+return 0;
+}
+
+
+static void do_mut_prob(dual_pileup_t *d, analysis_params_t *ap)
+{
+int ret;
+char ref, callL1, callL2, callR1, callR2;
+long double conf;
+if ((ret = is_mut_prob(d, ap, &ref, &callL1, &callL2, &callR1, &callR2, &conf)) == 0)
+ return;
+
+printf("MUT2\t%s\t%d\t%d\t",
+ d->hL->target_name[d->tidL], d->posL, d->posL + 1);
+if (ret == 1)
+ printf("SOM\t");
+else
+ printf("GERM\t");
+
+printf("%c\t", ref);
+printf("%c%c>", callR1, callR2);
+printf("%c%c\t", callL1, callL2);
+
+printf("%Lf\t", conf);
+print_nuc_info();
+}
static void perform_analysis(dual_pileup_t *d, void *func_data)
{
@@ -1402,9 +1703,10 @@
/* het analysis: ASCN & phasing */
do_het_analysis(d, ap);
/* mutation discovery */
-do_mut(d, ap);
+do_mut(d, ap); // old way
+do_mut_prob(d, ap); // new way
fflush(stdout);
}