src/hg/tcga/pBamBam/asBamBam.c 1.11

1.11 2010/06/01 21:46:13 jsanborn
added new mutation caller
Index: src/hg/tcga/pBamBam/asBamBam.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/tcga/pBamBam/asBamBam.c,v
retrieving revision 1.10
retrieving revision 1.11
diff -b -B -U 4 -r1.10 -r1.11
--- src/hg/tcga/pBamBam/asBamBam.c	22 Apr 2010 18:10:50 -0000	1.10
+++ src/hg/tcga/pBamBam/asBamBam.c	1 Jun 2010 21:46:13 -0000	1.11
@@ -54,8 +54,10 @@
     {"readsPerBin", OPTION_INT},
     {NULL, 0}
 };
 
+#define IN_PROB 0.94     // probability of getting base that's member of genotype
+                         // (1.0 - IN_PROB) / 3.0 = prob of getting error  
 #define BAD_TID -999
 
 /* Option variables */
 char *faidx     = NULL;
@@ -113,8 +115,10 @@
     int svStopL;
     int svStartR;
     int svStopR;
 
+    long double fracGerm;
+
 } analysis_params_t;
 
 static int32_t countsL1[4];
 static int32_t countsL2[4];
@@ -916,14 +921,14 @@
 if (!p)
     return;
 
 bam1_core_t *c = &p->b->core;
-int qstrand    = bam1_strand(p->b);
-int mstrand    = bam1_mstrand(p->b);
 int32_t qstart = (c)->pos;
 int32_t qstop  = qstart + (c)->l_qseq;
 int32_t mstart = (c)->mpos;
 int32_t mstop  = mstart + (c)->l_qseq;
+int qstrand    = bam1_strand(p->b); 
+int mstrand    = bam1_mstrand(p->b);
 
 if (sv->tid == -1)  // not set yet
     {
     sv->tid     = (c)->tid;
@@ -1092,13 +1097,11 @@
     /* check average mapping quality of discordants */
     int qual = ceil((double) intraL.qual / (double) intraL.supp); 
     if (intraL.supp >= ap->minSuppSV && qual >= ap->avgMapQ)
 	{  // check mate pair direction trend... should be positive for properly paired (opposite strand)
-
-	//return 1;
-
-	if ((intraL.qstrand == intraL.mstrand && intraL.mdir < 0) || 
-	    (intraL.qstrand != intraL.mstrand && intraL.mdir > 0))
+	if ((intraL.qstrand && intraL.mstrand && intraL.mdir < 0) ||    // + +
+	    (!intraL.qstrand && !intraL.mstrand && intraL.mdir < 0) ||  // - -
+	    (intraL.qstrand != intraL.mstrand && intraL.mdir > 0))      // + - or - +
 	    return 1;
 	}
 
     reset_intra();
@@ -1116,13 +1119,11 @@
     /* check average mapping quality of discordants */
     int qual = ceil((double) interL.qual / (double) interL.supp); 
     if (interL.supp >= ap->minSuppSV && qual >= ap->avgMapQ)
 	{  // check mate pair direction trend... should be positive for properly paired (opposite strand)
-
-	//return 1;
-
-	if ((interL.qstrand == interL.mstrand && interL.mdir < 0) || 
-	    (interL.qstrand != interL.mstrand && interL.mdir > 0))
+	if ((interL.qstrand && interL.mstrand && interL.mdir < 0) ||     // + + 
+	    (!interL.qstrand && !interL.mstrand && interL.mdir < 0) ||   // - - 
+	    (interL.qstrand != interL.mstrand && interL.mdir > 0))       // + - or - +
 	    return 1;
 	}
     reset_inter();
     }
@@ -1178,9 +1179,10 @@
     if (interR.mstrand)
 	msR = '-';
 
     if ( interR.supp > 0 && interL.mtid == interR.mtid &&  
-	(interR.qstart < (interL.qstop + BUF) && interR.qstop > (interL.qstart - BUF)) )
+	(interR.qstart < (interL.qstop + BUF) && interR.qstop > (interL.qstart - BUF)) &&
+	(interR.mstart < (interL.mstop + BUF) && interR.mstop > (interL.mstart - BUF)) )
 	printf("GIR\t%d\t%3.1f\t%d\t%3.1f\t%c(%s:%d-%d)>%c(%s:%d-%d)\t%c(%s:%d-%d)>%c(%s:%d-%d)", 
 	       interR.supp, (double) interR.qual / (double) interR.supp, 
 	       interL.supp, (double) interL.qual / (double) interL.supp, 
 	       qsR, d->hL->target_name[interR.tid], interR.qstart, interR.qstop,
@@ -1217,9 +1219,10 @@
     if (intraR.mstrand)
 	msR = '-';
 
     if ( intraR.supp > 0 && 
-	(intraR.qstart < (intraL.qstop + BUF) && intraR.qstop > (intraL.qstart - BUF)) )
+	(intraR.qstart < (intraL.qstop + BUF) && intraR.qstop > (intraL.qstart - BUF)) &&
+	(intraR.mstart < (intraL.mstop + BUF) && intraR.mstop > (intraL.mstart - BUF)) )
 	printf("GIA\t%d\t%3.1f\t%d\t%3.1f\t%c(%s:%d-%d)>%c(%s:%d-%d)\t%c(%s:%d-%d)>%c(%s:%d-%d)", 
 	       intraR.supp, (double) intraR.qual / (double) intraR.supp, 
 	       intraL.supp, (double) intraL.qual / (double) intraL.supp, 
 	       qsR, d->hL->target_name[intraR.tid], intraR.qstart, intraR.qstop,
@@ -1364,9 +1367,9 @@
 char ref, callL1, callL2, callR1, callR2;
 if (!is_mut(d, ap, &ref, &callL1, &callL2, &callR1, &callR2))
     return;
 
-printf("MUT\t%s\t%d\t%d\t",
+printf("MUT1\t%s\t%d\t%d\t",
        d->hL->target_name[d->tidL], d->posL, d->posL + 1);
 
 printf("%c\t", ref);
 if (callR2 == 'N')
@@ -1383,8 +1386,306 @@
 printf("%f\t", chi_sq);
 print_nuc_info();
 }
 
+#define A 0
+#define G 1
+#define T 2
+#define C 3
+
+#define NUM_NUC 4
+
+#define AA 0
+#define AG 1
+#define AT 2
+#define AC 3
+#define GA 1
+#define GG 4
+#define GT 5
+#define GC 6
+#define TA 2
+#define TG 5
+#define TT 7
+#define TC 8
+#define CA 3
+#define CG 6
+#define CT 8
+#define CC 9
+
+#define NUM_GENO 10
+
+#define MAX_COUNT 1000
+
+static long double *logvals;
+static long double **genoPrior;
+static long double **mutProb;
+static long double **mutProbGeno;
+static long double **baseProb;
+
+#define GENO_1 "AAAAGGGTTC"
+#define GENO_2 "AGTCGTCTCC"
+
+
+void initProbTables()
+{
+int i, j;
+
+logvals = (long double *)(malloc(MAX_COUNT * sizeof(long double)));
+for (i = 0; i < MAX_COUNT; i++)
+    logvals[i] = logl((long double) i + 1.0);
+
+genoPrior = (long double **)(malloc(NUM_GENO * sizeof(long double*)));
+for (i = 0; i < NUM_GENO; i++)
+    genoPrior[i] = (long double *)(malloc(NUM_NUC * sizeof(long double)));
+
+for (i = 0; i < NUM_GENO; i++)
+    {
+    int i1 = nuc_index(GENO_1[i]);
+    int i2 = nuc_index(GENO_2[i]);
+    for (j = 0; j < NUM_NUC; j++)
+	{
+	long double val;
+	if (i1 == j && i2 == j)
+	    val = 0.80;
+	else if (i1 == j || i2 == j)
+	    val = 0.10;
+	else
+	    val = 0.05;
+
+	genoPrior[i][j] = logl(val);
+	}
+    }
+mutProb = (long double **)(malloc(NUM_NUC * sizeof(long double*)));
+for (i = 0; i < NUM_NUC; i++)
+    mutProb[i] = (long double *)(malloc(NUM_NUC * sizeof(long double)));
+
+long double val, tot;
+for (i = 0; i < NUM_NUC; i++)
+    {
+    tot = 0.0;
+    for (j = 0; j < NUM_NUC; j++)
+	{
+	/* else, transitions 4x more likely than transitions */
+	if (i == j)
+	    val = 1.0;
+	else if ((NUCS[i] == 'A' && NUCS[j] == 'G') || (NUCS[i] == 'G' && NUCS[j] == 'A'))
+	    val = 0.1;
+	else if ((NUCS[i] == 'C' && NUCS[j] == 'T') || (NUCS[i] == 'T' && NUCS[j] == 'C'))
+	    val = 0.1;
+	else
+	    val = 0.025;
+
+	mutProb[i][j] = val;
+	tot += val;
+	}
+
+    /* normalize probs, conditioned on germline call */
+    for (j = 0; j < NUM_NUC; j++)
+	{
+	mutProb[i][j] = logl(mutProb[i][j]/tot);
+	}
+    }
+
+mutProbGeno = (long double **)(malloc(NUM_GENO * sizeof(long double*)));
+for (i = 0; i < NUM_GENO; i++)
+    mutProbGeno[i] = (long double *)(malloc(NUM_GENO * sizeof(long double)));
+
+for (i = 0; i < NUM_GENO; i++)
+    {
+    int i1 = nuc_index(GENO_1[i]);
+    int i2 = nuc_index(GENO_2[i]);
+    for (j = 0; j < NUM_GENO; j++)
+	{
+	int j1 = nuc_index(GENO_1[j]);
+	int j2 = nuc_index(GENO_2[j]);
+
+	long double val1 = mutProb[i1][j1] + mutProb[i2][j2];
+	long double val2 = mutProb[i1][j2] + mutProb[i2][j1];
+
+	if (val1 > val2)
+	    mutProbGeno[i][j] = val1;
+	else
+	    mutProbGeno[i][j] = val2;
+	}
+    }
+baseProb = (long double **)(malloc(NUM_GENO * sizeof(long double*)));
+for (i = 0; i < NUM_GENO; i++)
+    baseProb[i] = (long double *)(malloc(NUM_NUC * sizeof(long double)));
+
+long double inprob = IN_PROB;
+long double outprob = (1.0 - inprob) / 3.0;
+
+for (i = 0; i < NUM_GENO; i++)
+    {
+    int i1 = nuc_index(GENO_1[i]);
+    int i2 = nuc_index(GENO_2[i]);
+    for (j = 0; j < NUM_NUC; j++)
+	{
+	if (i1 == j && i2 == j)
+	    baseProb[i][j] = inprob;
+	else if (i1 == j || i2 == j)
+	    baseProb[i][j] = (inprob + outprob) / 2.0;
+	else
+	    baseProb[i][j] = outprob;
+
+	baseProb[i][j] = logl(baseProb[i][j]);
+	fprintf(stderr, "%c%c-%c %Lf\t", GENO_1[i], GENO_2[i], NUCS[j], baseProb[i][j]);
+	}
+    fprintf(stderr, "\n");
+    }
+}
+
+static long double dataProbGerm(int geno)
+{
+int i, j, tot = 0;
+long double p = 0.0, scale = 0.0;
+
+for (i = 0; i < NUM_NUC; i++)
+    {
+    p += (long double) countsR[i] * baseProb[geno][i];
+
+    tot += countsR[i];
+    for (j = 0; j < countsR[i]; j++)
+	scale -= logvals[j];
+    }
+
+for (i = 0; i < tot; i++)
+    scale += logvals[i];
+
+return p + scale;
+}
+
+static long double dataProbTum(int genoG, int genoT, long double alpha)
+{
+int i, j, tot = 0;
+long double p = 0.0, scale = 0.0;
+
+for (i = 0; i < NUM_NUC; i++)
+    {
+    p += (long double) countsL[i] * (alpha * baseProb[genoG][i] + (1.0 - alpha) * baseProb[genoT][i]);
+
+    tot += countsL[i];
+    for (j = 0; j < countsL[i]; j++)
+	scale -= logvals[j];
+    }
+
+for (i = 0; i < tot; i++)
+    scale += logvals[i];
+
+return p + scale;
+}
+
+
+static int is_mut_prob(dual_pileup_t *d, analysis_params_t *ap, char *ref,
+		       char *callL1, char *callL2, char *callR1, char *callR2,
+		       long double *conf)
+{
+if (majorR.vote == 0 || majorL.vote == 0)
+    return 0;  // most likely a bad quality region.
+
+int i, j, imax = -1, jmax = -1;
+int totalL = 0, totalR = 0;
+for (i = 0; i < NUM_NUC; i++)
+    {
+    totalL += countsL[i];
+    totalR += countsR[i];
+    }
+
+if (totalL < ap->minSuppSNP || totalR < ap->minSuppSNP)
+    return 0;
+
+/* rescale counts to total to less than MAX_COUNT, if exceeded */
+if (totalL > MAX_COUNT)
+    for (i = 0; i < NUM_NUC; i++)
+	countsL[i] = (int) floor((float) countsL[i] * (float) MAX_COUNT / (float) totalL);
+if (totalR > MAX_COUNT)
+    for (i = 0; i < NUM_NUC; i++)
+	countsR[i] = (int) floor((float) countsR[i] * (float) MAX_COUNT / (float) totalR);
+
+long double alpha = (ap->fracGerm * (long double) totalR) / (long double) totalL;
+if (alpha > ap->fracGerm)
+    alpha = ap->fracGerm;
+
+char rb = (d->ref && (int)d->posL < d->len) ? toupper(d->ref[d->posL]) : 'N';
+int ri  = nuc_index(rb);
+int i1, i2, j1, j2;
+
+long double p, ptot = 0.0, pmax = -999999999.99;
+for (i = 0; i < NUM_GENO; i++)
+    {  // loop germline genotypes
+    i1 = nuc_index(GENO_1[i]);
+    i2 = nuc_index(GENO_2[i]);
+    if (!countsL[i1] && !countsL[i2] && !countsR[i1] && !countsR[i2])
+	continue;  // skip genotypes
+
+    long double germP = 0.0;
+    if (ri >= 0)
+	germP = genoPrior[i][ri];
+    germP += dataProbGerm(i);
+
+    for (j = 0; j < NUM_GENO; j++)
+	{  // loop tumor genotypes
+	j1 = nuc_index(GENO_1[j]);
+	j2 = nuc_index(GENO_2[j]);
+	if (!countsL[j1] && !countsL[j2] && !countsR[j1] && !countsR[j2])
+	    continue;
+
+	p = germP;
+	p += mutProbGeno[i][j];
+	p += dataProbTum(i, j, alpha);
+
+	ptot += expl(p);
+
+	if (p > pmax)
+	    {
+	    pmax = p;
+	    imax = i;
+	    jmax = j;
+	    }
+	}
+    }
+
+if (imax < 0 || jmax < 0)
+    return 0;
+
+*ref    = rb;
+*callL1 = GENO_1[jmax];
+*callL2 = GENO_2[jmax];
+*callR1 = GENO_1[imax];
+*callR2 = GENO_2[imax];
+*conf   = expl(pmax) / ptot;
+
+if (imax != jmax)
+    return 1;
+if (GENO_1[imax] != GENO_2[imax] || (rb != GENO_1[imax] && rb != GENO_2[imax]))
+    return 2;
+
+return 0;
+}
+
+
+static void do_mut_prob(dual_pileup_t *d, analysis_params_t *ap)
+{
+int ret;
+char ref, callL1, callL2, callR1, callR2;
+long double conf;
+if ((ret = is_mut_prob(d, ap, &ref, &callL1, &callL2, &callR1, &callR2, &conf)) == 0)
+    return;
+
+printf("MUT2\t%s\t%d\t%d\t",
+       d->hL->target_name[d->tidL], d->posL, d->posL + 1);
+if (ret == 1)
+    printf("SOM\t");
+else
+    printf("GERM\t");
+
+printf("%c\t", ref);
+printf("%c%c>", callR1, callR2);
+printf("%c%c\t", callL1, callL2);
+
+printf("%Lf\t", conf);
+print_nuc_info();
+}
 
 
 static void perform_analysis(dual_pileup_t *d, void *func_data)
 {
@@ -1402,9 +1703,10 @@
 /* het analysis: ASCN & phasing */
 do_het_analysis(d, ap);
 
 /* mutation discovery */
-do_mut(d, ap);
+do_mut(d, ap);       // old way
+do_mut_prob(d, ap);  // new way
 
 fflush(stdout);
 }