src/hg/tcga/pBamBam/pBamBamSNP.c 1.1

1.1 2010/04/28 00:37:21 jsanborn
initial commit
Index: src/hg/tcga/pBamBam/pBamBamSNP.c
===================================================================
RCS file: src/hg/tcga/pBamBam/pBamBamSNP.c
diff -N src/hg/tcga/pBamBam/pBamBamSNP.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/hg/tcga/pBamBam/pBamBamSNP.c	28 Apr 2010 00:37:21 -0000	1.1
@@ -0,0 +1,1128 @@
+/* asBamBam -- discovering variants from BAM file
+ *   code adapted from Heng Li's Samtools library functions
+ */
+
+#ifdef USE_BAM
+
+#include "common.h"
+#include "hCommon.h"
+#include "linefile.h"
+#include "hash.h"
+#include "hdb.h"
+#include "options.h"
+#include "jksql.h"
+
+#define _IOLIB 2
+#include "bam.h"
+#include "sam.h"
+#include "bam_helper.h"
+
+static char const rcsid[] = "$Id";
+
+void usage()
+/* Explain usage and exit. */
+{
+errAbort(
+  "bamBamSNP - Allele-Specific BamBam.\n"
+  "usage:\n"
+  "   asBamBam [options] <left.bam> <right.bam>\n"
+  "options:\n"
+  "   -fa=file          - FASTA filename of reference [NULL]\n"
+  "   -position=chr1:1-100 - Position to do analysis. [NULL=process all reads]\n" 
+  "   -minSuppSNP=INT   - Minimum Support for SNP/Het Calling. [4]\n"
+  "   -minSuppSV=INT    - Minimum Support for struct vars. [2]\n"
+  "   -minQ=INT         - Minimum acceptable base quality. [10]\n"
+  "   -minMapQ=INT      - Minimum acceptable mapping quality. [20]\n"
+  "   -avgMapQ=INT      - Minimum acceptable average mapping quality. [30]\n"
+  "   -minChiSq=INT     - Minimum acceptable chi square of nucleotide freqs. [250]\n"
+  "   -fracGerm=FLOAT   - Fraction of normal contamination [0.0]\n"
+  "\n"
+  );
+}
+
+static struct optionSpec options[] = {
+    {"fa", OPTION_STRING},
+    {"position", OPTION_STRING},
+    {"minSuppSNP", OPTION_INT},
+    {"minSuppSV", OPTION_INT},
+    {"minQ", OPTION_INT},
+    {"minMapQ", OPTION_INT},
+    {"avgMapQ", OPTION_INT},
+    {"minChiSq", OPTION_INT}, 
+    {"fracGerm", OPTION_FLOAT},
+    {NULL, 0}
+};
+
+#define BAD_TID -999
+
+/* Option variables */
+char *faidx     = NULL;
+char *position  = NULL;
+int minSuppSNP  = 4;
+int minSuppSV   = 2;
+int minQ        = 10;
+int minMapQ     = 20;
+int avgMapQ     = 30;
+int minChiSq    = 250;
+float fracGerm  = 0.0;
+
+typedef struct {
+    int minQ;
+    int minMapQ;
+    int avgMapQ;
+    int maxISize;
+    int minChiSq;
+    int minSuppSNP;
+    int minSuppSV;
+    int sites;
+
+    int readsPerBin;
+
+    int pStartPrev;
+    int pStopPrev;
+    int pStart;
+    int pStop;
+
+    /* Overall Copy Number */
+    int32_t tid;
+    int32_t lstart;
+    int32_t lstop;
+    int lcounts;
+    int32_t rstart;
+    int32_t rstop;
+    int rcounts;
+
+    int32_t prevTid;
+    int32_t prevStart;
+    int32_t prevStop;
+
+    /* ASCN */
+    int32_t tidAS;
+    int32_t startAS;
+    int32_t stopAS;
+    int lcountsMin;
+    int lcountsMax;
+    int rcountsMin;
+    int rcountsMax;
+
+    /* SV */
+    int svStartL;
+    int svStopL;
+    int svStartR;
+    int svStopR;
+    
+    long double fracGerm;
+} analysis_params_t;
+
+static int32_t countsL1[4];
+static int32_t countsL2[4];
+static int32_t countsL3[4];
+
+static int32_t countsR1[4];
+static int32_t countsR2[4];
+static int32_t countsR3[4];
+
+static int32_t votesL[4];
+static int32_t countsL[4];
+static int32_t votesR[4];
+static int32_t countsR[4];
+
+typedef struct {
+    char nuc;
+    int count;
+    int vote;
+} variant_t;
+
+variant_t minorL, majorL;
+variant_t minorR, majorR;
+
+static int32_t insL, delL, insR, delR;
+
+typedef struct {
+    int tid;
+    int mtid;
+
+    int32_t qstart;
+    int32_t qstop;
+    int32_t mstart;
+    int32_t mstop;
+
+    int supp;
+    int qual;
+    int mstrand;
+    int qstrand;
+
+    int mdir;      // trend of mate directions 
+    int pmstart;   // previos start
+} sv_t;
+
+#define BUF 100  // buffer around reads' starts/stops
+
+static int nuc_index(char c)
+{
+switch (c)
+    {
+    case 'A':
+	return 0;
+    case 'G':
+	return 1;
+    case 'T':
+	return 2;
+    case 'C':
+	return 3;
+    default:
+	return -1;
+    }
+return -1;
+}
+#define NUCS "AGTC"
+
+static inline void reset_globals(analysis_params_t *ap)
+{
+/* reset nuc counts */
+int i;
+for (i = 0; i < 4; i++)
+    { // sum up the qualities of each base
+    votesL[i]  = 0;
+    countsL[i] = 0;
+    votesR[i]  = 0; 
+    countsR[i] = 0;
+
+    countsL1[i] = 0;
+    countsL2[i] = 0;
+    countsL3[i] = 0;
+    countsR1[i] = 0;
+    countsR2[i] = 0;
+    countsR3[i] = 0;
+    }
+
+/* reset indel stats */
+insL = 0;
+insR = 0;
+delL = 0;
+delR = 0;
+
+/* reset pileup info */
+ap->pStart = -1, ap->pStop = -1;
+}
+
+static inline int check_read(const bam_pileup1_t *p, analysis_params_t *ap)
+{
+bam1_core_t *c = &p->b->core;
+/* Check if unmapped or PCR optical dupe */
+if (((c)->flag & BAM_FUNMAP) || ((c)->flag & BAM_FDUP) )
+    return 0;
+
+/* check base quality */
+if (bam1_qual(p->b)[p->qpos] < ap->minQ)
+    return 0;
+
+/* check mapping quality */
+if ((c)->qual < ap->minMapQ)
+    return 0;
+
+/* all good */
+return 1;
+}
+
+
+static inline void nuc_counts(dual_pileup_t *d, analysis_params_t *ap)
+{ 
+/* reset global vars */
+reset_globals(ap);
+
+int i, index;
+int32_t start, stop;
+int32_t prevStart = -1;
+const bam_pileup1_t *p;
+
+char best_b1 = 'N';
+int best_q1  = -1;
+char best_b2 = 'N';
+int best_q2  = -1;
+int insertion = 0, deletion = 0;
+ 
+for (i = 0; i < d->nL; ++i)
+    { 
+    p = d->puL + i;
+    if (!check_read(p, ap))
+	continue;
+    bam1_core_t *c = &p->b->core;
+    int qual = bam1_qual(p->b)[p->qpos];    
+
+    /* Check if start & stop has already been visited, to dupes missed in prev step*/    
+    start = (c)->pos;
+    stop  = start + (c)->l_qseq;    
+
+    if (ap->pStart == -1)
+	{
+	ap->pStart = start;
+	ap->pStop  = stop;
+	}
+    else 
+	{
+	if (start < ap->pStart)
+	    ap->pStart = start;
+	if (stop  > ap->pStop)
+	    ap->pStop = stop;
+	}
+
+    if (prevStart == -1)
+	prevStart = start;
+
+    if (start < prevStart)
+	continue;
+
+    if (start == prevStart)
+	{
+	if (p->is_del)
+	    {
+	    if (p->indel > 0)
+		insertion = 1;
+	    else if (p->indel < 0)
+		deletion = 1;
+	    }
+	else
+	    {
+	    int b = toupper(bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]);    
+	    if (best_b1 == 'N')
+		{
+		best_b1 = b;
+		best_q1 = qual;
+		}
+	    else if (qual > best_q1)
+		{
+		best_b2 = best_b1;
+		best_q2 = best_q1;
+		
+		best_b1 = b;
+		best_q1 = qual;
+		}
+	    else if (qual > best_q2)
+		{
+		best_b2 = b;
+		best_q2 = qual;
+		}
+	    }
+	continue;
+	}
+    prevStart = start;
+    
+    if (insertion)
+	insL += 1;
+    else if (deletion)
+	delL += 1;
+
+    double lseq = (double) p->b->core.l_qseq;
+    if ((index = nuc_index(best_b1)) >= 0)
+	{
+	votesL[index] += best_q1;
+	countsL[index]++;
+
+        if (p->qpos < round(0.33 * lseq))
+	    countsL1[index]++;
+	else if (p->qpos < round(0.67 * lseq))
+	    countsL2[index]++;
+	else
+	    countsL3[index]++;
+	}
+    
+    if ((index = nuc_index(best_b2)) >= 0)
+	{
+	votesL[index] += best_q2;
+	countsL[index]++;
+	
+        if (p->qpos < round(0.33 * lseq))
+	    countsL1[index]++;
+	else if (p->qpos < round(0.67 * lseq))
+	    countsL2[index]++;
+	else
+	    countsL3[index]++;
+	}
+    best_b1 = 'N';
+    best_q1 = -1;
+    best_b2 = 'N';
+    best_q2 = -1;
+    
+    insertion = 0, deletion = 0;
+
+    if (p->is_del)
+	{
+	if (p->indel > 0)
+	    insertion = 1;
+	else if (p->indel < 0)
+	    deletion = 1;
+	}
+    else
+	{
+	int b = toupper(bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]);
+	if (best_b1 == 'N')
+	    {
+	    best_b1 = b;
+	    best_q1 = qual;
+	    }
+	else if (qual > best_q1)
+	    {
+	    best_b2 = best_b1;
+	    best_q2 = best_q1;
+
+	    best_b1 = b;
+	    best_q1 = qual;
+	    }
+	else if (qual > best_q2)
+	    {
+	    best_b2 = b;
+	    best_q2 = qual;
+	    }
+	}
+    }
+
+prevStart = -1;
+for (i = 0; i < d->nR; ++i)
+    {
+    p = d->puR + i;
+    if (!check_read(p, ap))
+	continue;
+    bam1_core_t *c = &p->b->core;
+    int qual = bam1_qual(p->b)[p->qpos];    
+
+    /* Check if start & stop has already been visited, to dupes missed in prev step*/    
+    start = (c)->pos;
+    stop  = start + (c)->l_qseq;    
+    if (start <= prevStart)
+	continue;
+    prevStart = start;
+
+    if (ap->pStart == -1)
+	{
+	ap->pStart = start;
+	ap->pStop  = stop;
+	}
+    if (start < ap->pStart)
+	ap->pStart = start;
+    if (stop  > ap->pStop)
+	ap->pStop = stop;
+
+    if (p->indel > 0)
+	insR += 1;
+    else if (p->indel < 0)
+	delR += 1;
+    
+    if (!p->is_del && p->indel == 0)
+	{
+        double lseq = (double) p->b->core.l_qseq;
+	int b = toupper(bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]);
+	if ((index = nuc_index(b)) < 0)
+	    continue;
+	votesR[index] += qual;
+	countsR[index]++;
+
+        if (p->qpos < round(0.33 * lseq))
+	    countsR1[index]++;
+	else if (p->qpos < round(0.67 * lseq))
+	    countsR2[index]++;
+	else
+	    countsR3[index]++;
+	}
+    }
+
+int iL1 = -1, iL2 = -1, maxL1 = 0, maxL2 = 0;
+int iR1 = -1, iR2 = -1, maxR1 = 0, maxR2 = 0;
+int countL1 = -1, countL2 = -1, countR1 = -1, countR2 = -1;
+for (i = 0; i < 4; i++)
+    {
+    if (countsL[i] > countL1)
+	{
+	iL2     = iL1;     // set prev max to max2
+	maxL2   = maxL1;
+	countL2 = countL1;
+
+	iL1     = i;
+	maxL1   = votesL[i];
+	countL1 = countsL[i];
+	}
+    else if (countsL[i] > countL2)
+	{
+	iL2     = i;
+	maxL2   = votesL[i];
+	countL2 = countsL[i];
+	}
+    if (countsR[i] > countR1)
+	{
+	iR2     = iR1;
+	maxR2   = maxR1;
+	countR2 = countR1;
+
+	iR1     = i;
+	maxR1   = votesR[i];
+	countR1 = countsR[i];
+	}
+    else if (countsR[i] > countR2)
+	{
+	iR2     = i;
+	maxR2   = votesR[i];
+	countR2 = countsR[i];
+	}
+    }
+
+majorL.nuc   = 'N';
+majorL.vote  = 0;
+majorL.count = 0;
+if (iL1 != -1 && countL1 >= ap->minSuppSNP)
+    {
+    majorL.nuc   = NUCS[iL1];
+    majorL.vote  = maxL1;
+    majorL.count = countL1;
+    }
+
+minorL.nuc   = 'N';
+minorL.vote  = 0;
+minorL.count = 0;
+if (iL2 != -1 && countL2 >= ap->minSuppSNP)
+    {
+    minorL.nuc   = NUCS[iL2];
+    minorL.vote  = maxL2;
+    minorL.count = countL2;
+    }
+
+majorR.nuc   = 'N';
+majorR.vote  = 0;
+majorR.count = 0;
+if (iR1 != -1 && countR1 >= ap->minSuppSNP)
+    {
+    majorR.nuc   = NUCS[iR1];
+    majorR.vote  = maxR1;
+    majorR.count = countR1;
+    }
+
+minorR.nuc   = 'N';
+minorR.vote  = 0;
+minorR.count = 0;
+if (iR2 != -1 && countR2 >= ap->minSuppSNP)
+    {
+    minorR.nuc   = NUCS[iR2];
+    minorR.vote  = maxR2;
+    minorR.count = countR2;
+    }
+}
+
+double chi_square(int obs1, int obs2, double p)  // p = binomial prob of getting 1
+{
+double pseudo = 1.0;
+int count = obs1 + obs2;
+
+double exp1 = p * ((double) count) + pseudo;
+double exp2 = (1.0 - p) * ((double) count) + pseudo;
+
+// apply Yates correction... will often have small observed values.
+return pow(abs((double)obs1 - exp1) - 0.5, 2.0)/exp1 + pow(abs((double)obs2 - exp2) - 0.5, 2.0)/exp2;
+}
+
+static int sig_diff(dual_pileup_t *d, analysis_params_t *ap)
+{
+int i;
+double pseudo = 1.0;
+int totalL = 0, totalR = 0;
+
+for (i = 0; i < 4; i++)
+    {
+    totalL += countsL[i];
+    totalR += countsR[i];
+    }
+
+double obs, exp, chi_sq = 0.0;
+for (i = 0; i < 4; i++)
+    {
+    obs = ((double) countsL[i] * 100.0) / (double) totalL + pseudo;
+    exp = ((double) countsR[i] * 100.0) / (double) totalR + pseudo;
+    chi_sq += pow(abs(obs - exp) - 0.5, 2.0) / exp;
+    }
+
+return chi_sq;
+}
+
+static int is_mut(dual_pileup_t *d, analysis_params_t *ap, char *ref,
+		  char *callL1, char *callL2, char *callR1, char *callR2)
+{
+if (majorR.vote == 0 || majorL.vote == 0)
+    return 0;  // most likely a bad quality region.
+
+if (sig_diff(d, ap) < ap->minChiSq)
+    return 0;
+
+int i = 0;
+int rCount = 0;
+for (i = 0; i < 4; i++)
+    rCount += countsR1[i] + countsR2[i] + countsR3[i];
+
+if (rCount < 10)
+    return 0;
+
+char rb = (d->ref && (int)d->posL < d->len)? d->ref[d->posL] : 'N';
+
+int retVal1 = 0, retVal2 = 0;
+for (i = 0; i < 4; i++)
+    {
+    if (NUCS[i] != majorL.nuc && NUCS[i] != minorL.nuc)
+	continue;
+
+    if (NUCS[i] == majorL.nuc && majorL.nuc != majorR.nuc && majorL.nuc != minorR.nuc)
+	{
+	if (countsL1[i] > 0 && countsL2[i] > 0 && countsL1[i] + countsL2[i] >= 3)
+	    retVal1 = 1;
+	else if (countsL1[i] > 0 && countsL3[i] > 0 && countsL1[i] + countsL3[i] >= 3)
+	    retVal1 = 1;
+	else if (countsL2[i] > 0 && countsL3[i] > 0 && countsL2[i] + countsL3[i] >= 3)
+	    retVal1 = 1;
+
+	if (retVal1)
+	    {
+	    if (rCount >= 30 && countsR[i] <= 1)
+		retVal1 = 1;
+	    else if (rCount < 30 && countsR[i] == 0)
+		retVal1 = 1;
+	    else
+		retVal1 = 0;
+	    }
+	}
+
+    if (NUCS[i] == minorL.nuc && (minorL.nuc != majorR.nuc && minorL.nuc != minorR.nuc))
+	{
+	if (countsL1[i] > 0 && countsL2[i] > 0 && countsL1[i] + countsL2[i] >= 3)
+	    retVal2 = 1;
+	else if (countsL1[i] > 0 && countsL3[i] > 0 && countsL1[i] + countsL3[i] >= 3)
+	    retVal2 = 1;
+	else if (countsL2[i] > 0 && countsL3[i] > 0 && countsL2[i] + countsL3[i] >= 3)
+	    retVal2 = 1;
+
+	if (retVal2)
+	    {
+	    if (rCount >= 30 && countsR[i] <= 1)
+		retVal2 = 1;
+	    else if (rCount < 30 && countsR[i] == 0)
+		retVal2 = 1;
+	    else
+		retVal2 = 0;
+	    }
+	}
+    }
+
+if (!retVal1 && !retVal2)
+    return 0;
+
+*ref    = rb;
+*callL1 = majorL.nuc;
+*callL2 = minorL.nuc;
+*callR1 = majorR.nuc;
+*callR2 = minorR.nuc;
+
+if (majorL.nuc != majorR.nuc && majorL.nuc != minorR.nuc)
+    return 1;
+if (minorL.nuc != 'N' && (minorL.nuc != majorR.nuc && minorL.nuc != minorR.nuc))
+    return 1;
+
+return 0;
+}
+
+void print_seq(uint32_t tid, uint32_t pos, int n,
+	       const bam_pileup1_t *pu, dual_pileup_t *d)
+{
+int i, rb;
+rb = (d->ref && (int)pos < d->len)? d->ref[pos] : 'N';
+for (i = 0; i < n; ++i)
+    {
+    const bam_pileup1_t *p = pu + i;
+    if (p->b->core.qual < minMapQ)
+	continue;
+
+    int qual = bam1_qual(p->b)[p->qpos];
+    if (qual < minQ)
+	continue;
+
+    if (!p->is_del)
+	{
+	int c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)];
+	c = toupper(c);
+	putchar(c);
+	}
+    }
+return;
+putchar('\t');
+
+// print base quality
+for (i = 0; i < n; ++i)
+    {
+    const bam_pileup1_t *p = pu + i;
+    if (p->b->core.qual < minMapQ)
+	continue;
+
+    int qual = bam1_qual(p->b)[p->qpos];
+    if (qual < minQ)
+	continue;
+
+    int c = qual + 33;
+    if (c > 126)
+	c = 126;
+    putchar(c);
+    }
+putchar('\t');
+
+// print mapping quality
+for (i = 0; i < n; ++i)
+    {
+    const bam_pileup1_t *p = pu + i;
+    int qual = p->b->core.qual;
+    if (qual < minMapQ)
+	continue;
+
+    if (bam1_qual(p->b)[p->qpos] < minQ)
+	continue;
+
+    int c = qual + 33;
+    if (c > 126)
+	c = 126;
+    putchar(c);
+    }
+}
+
+static void print_nuc_info()
+{
+int i, first = 1;
+for (i = 0; i < 4; i++)
+    {
+    if (countsR[i] == 0)
+	continue;
+    if (!first)
+	putchar(' ');
+    printf("%c:%d/%2.1f", NUCS[i], countsR[i], (double) votesR[i] / (double) countsR[i]);
+    first = 0;
+    }
+putchar('\t');
+first = 1;
+for (i = 0; i < 4; i++)
+    {
+    if (countsL[i] == 0)
+	continue;
+    if (!first)
+	putchar(' ');
+    printf("%c:%d/%2.1f", NUCS[i], countsL[i], (double) votesL[i] / (double) countsL[i]);
+    first = 0;
+    }
+
+putchar('\n');
+}
+
+static void do_mut(dual_pileup_t *d, analysis_params_t *ap)
+{
+char ref, callL1, callL2, callR1, callR2;
+if (!is_mut(d, ap, &ref, &callL1, &callL2, &callR1, &callR2))
+    return;
+
+printf("MUT1\t%s\t%d\t%d\t",
+       d->hL->target_name[d->tidL], d->posL, d->posL + 1);
+
+printf("%c\t", ref);
+if (callR2 == 'N')
+    printf("%c>", callR1);
+else
+    printf("%c/%c>", callR1, callR2);
+
+if (callL2 == 'N')
+    printf("%c\t", callL1);
+else
+    printf("%c/%c\t", callL1, callL2);
+
+double chi_sq = sig_diff(d, ap);
+printf("%f\t", chi_sq);
+print_nuc_info();
+}
+
+#define A 0
+#define G 1
+#define T 2
+#define C 3
+
+#define NUM_NUC 4
+
+#define AA 0
+#define AG 1
+#define AT 2
+#define AC 3
+#define GA 1
+#define GG 4
+#define GT 5
+#define GC 6
+#define TA 2
+#define TG 5
+#define TT 7
+#define TC 8
+#define CA 3
+#define CG 6
+#define CT 8
+#define CC 9
+
+#define NUM_GENO 10
+
+static long double **genoPrior;
+static long double **mutProb;
+static long double **mutProbGeno;
+static long double **baseProb;
+
+#define GENO_1 "AAAAGGGTTC"
+#define GENO_2 "AGTCGTCTCC"
+
+void initProbTables()
+{
+int i, j;
+
+genoPrior = (long double **)(malloc(NUM_GENO * sizeof(long double*)));
+for (i = 0; i < NUM_GENO; i++)
+    genoPrior[i] = (long double *)(malloc(NUM_NUC * sizeof(long double)));
+
+for (i = 0; i < NUM_GENO; i++)
+    {
+    int i1 = nuc_index(GENO_1[i]);
+    int i2 = nuc_index(GENO_2[i]);
+    for (j = 0; j < NUM_NUC; j++)
+	{
+	long double val;
+	if (i1 == j && i2 == j)
+	    val = 0.80;
+	else if (i1 == j || i2 == j)
+	    val = 0.10;
+	else
+	    val = 0.05;
+
+	genoPrior[i][j] = logl(val);
+	fprintf(stderr, "%c%c-%c %Lf\t", GENO_1[i], GENO_2[i], NUCS[i], genoPrior[i][j]);
+	}
+    fprintf(stderr, "\n");
+    }
+
+
+mutProb = (long double **)(malloc(NUM_NUC * sizeof(long double*)));
+for (i = 0; i < NUM_NUC; i++)
+    mutProb[i] = (long double *)(malloc(NUM_NUC * sizeof(long double)));
+
+long double val, tot;
+for (i = 0; i < NUM_NUC; i++)
+    {
+    tot = 0.0;
+    for (j = 0; j < NUM_NUC; j++)
+	{
+	/* else, transitions 4x more likely than transitions */
+	if (i == j)
+	    val = 1.0;
+	else if ((NUCS[i] == 'A' && NUCS[j] == 'G') || (NUCS[i] == 'G' && NUCS[j] == 'A'))
+	    val = 0.1;
+	else if ((NUCS[i] == 'C' && NUCS[j] == 'T') || (NUCS[i] == 'T' && NUCS[j] == 'C'))
+	    val = 0.1;
+	else
+	    val = 0.025;
+		
+	mutProb[i][j] = val;
+	tot += val;
+	}
+
+    /* normalize probs, conditioned on germline call */
+    for (j = 0; j < NUM_NUC; j++)
+	{
+	mutProb[i][j] = logl(mutProb[i][j]/tot);
+	fprintf(stderr, "%c%c %Lf\t", NUCS[i], NUCS[j], mutProb[i][j]);
+	}
+    fprintf(stderr, "\n");
+    }
+
+mutProbGeno = (long double **)(malloc(NUM_GENO * sizeof(long double*)));
+for (i = 0; i < NUM_GENO; i++)
+    mutProbGeno[i] = (long double *)(malloc(NUM_GENO * sizeof(long double)));
+
+for (i = 0; i < NUM_GENO; i++)
+    {
+    int i1 = nuc_index(GENO_1[i]);
+    int i2 = nuc_index(GENO_2[i]);
+    for (j = 0; j < NUM_GENO; j++)
+	{
+	int j1 = nuc_index(GENO_1[j]);
+	int j2 = nuc_index(GENO_2[j]);
+
+	long double val1 = mutProb[i1][j1] + mutProb[i2][j2];
+	long double val2 = mutProb[i1][j2] + mutProb[i2][j1];
+
+	if (val1 > val2)
+	    mutProbGeno[i][j] = val1;
+	else
+	    mutProbGeno[i][j] = val2;
+
+	fprintf(stderr, "%c%c->%c%c %Lf\t", GENO_1[i], GENO_2[i], GENO_1[j], GENO_2[j], mutProbGeno[i][j]);
+	}
+    fprintf(stderr, "\n");
+    }
+
+baseProb = (long double **)(malloc(NUM_GENO * sizeof(long double*)));
+for (i = 0; i < NUM_GENO; i++)
+    baseProb[i] = (long double *)(malloc(NUM_NUC * sizeof(long double)));
+
+long double inprob = 0.9;
+long double outprob = (1.0 - inprob) / 3.0;
+
+for (i = 0; i < NUM_GENO; i++)
+    {
+    int i1 = nuc_index(GENO_1[i]);
+    int i2 = nuc_index(GENO_2[i]);
+    for (j = 0; j < NUM_NUC; j++)
+	{
+	if (i1 == j && i2 == j)
+	    baseProb[i][j] = inprob;
+	else if (i1 == j || i2 == j)
+	    baseProb[i][j] = (inprob + outprob) / 2.0;
+	else
+	    baseProb[i][j] = outprob;
+
+	baseProb[i][j] = logl(baseProb[i][j]);
+	fprintf(stderr, "%c%c-%c %Lf\t", GENO_1[i], GENO_2[i], NUCS[j], baseProb[i][j]);
+	}
+    fprintf(stderr, "\n");
+    }
+}
+
+static long double dataProbGerm(int geno)
+{
+int i, j, tot = 0;
+long double p = 0.0, scale = 0.0;
+
+for (i = 0; i < NUM_NUC; i++)
+    {
+    p += (long double) countsR[i] * baseProb[geno][i];
+    
+    tot += countsR[i];
+    for (j = 0; j < countsR[i]; j++)
+	scale -= logl((long double) (j + 1.0));
+    }
+
+for (i = 0; i < tot; i++)
+    scale += logl((long double) (i + 1.0));
+
+return p + scale;
+}
+
+static long double dataProbTum(int genoG, int genoT, long double alpha)
+{
+int i, j, tot = 0;
+long double p = 0.0, scale = 0.0;
+
+for (i = 0; i < NUM_NUC; i++)
+    {
+    p += (long double) countsL[i] * (alpha * baseProb[genoG][i] + (1.0 - alpha) * baseProb[genoT][i]);
+    
+    tot += countsL[i];
+    for (j = 0; j < countsL[i]; j++)
+	scale -= logl((long double) (j + 1.0));
+    }
+
+for (i = 0; i < tot; i++)
+    scale += logl((long double) (i + 1.0));
+
+return p + scale;
+}
+
+static int is_mut_prob(dual_pileup_t *d, analysis_params_t *ap, char *ref,
+		       char *callL1, char *callL2, char *callR1, char *callR2, 
+		       long double *conf)
+{
+if (majorR.vote == 0 || majorL.vote == 0)
+    return 0;  // most likely a bad quality region.
+
+int i, j, imax = -1, jmax = -1;
+int totalL = 0, totalR = 0;
+for (i = 0; i < NUM_NUC; i++)
+    {
+    totalL += countsL[i];
+    totalR += countsR[i];
+    }
+
+long double alpha = (ap->fracGerm * (long double) totalR) / (long double) totalL;
+if (alpha > ap->fracGerm)
+    alpha = ap->fracGerm;
+
+char rb = (d->ref && (int)d->posL < d->len)? toupper(d->ref[d->posL]) : 'N';
+int ri  = nuc_index(rb);
+
+long double p, ptot = 0.0, pmax = -999999999.99;
+for (i = 0; i < NUM_GENO; i++)
+    {  // loop germline genotypes
+    for (j = 0; j < NUM_GENO; j++)
+	{  // loop tumor genotypes
+	if (ri < 0)
+	    p = 0.0;
+	else
+	    p = genoPrior[i][ri];
+	p = 0.0;
+	p += mutProbGeno[i][j];
+	p += dataProbTum(i, j, alpha);
+	p += dataProbGerm(i);
+
+	if (p > pmax)
+	    {
+	    pmax = p;
+	    imax = i;
+	    jmax = j;
+	    }
+	ptot += expl(p);
+	}
+    }
+
+if (imax < 0 || jmax < 0)
+    return 0;
+
+*ref    = rb;
+*callL1 = GENO_1[jmax];
+*callL2 = GENO_2[jmax];
+*callR1 = GENO_1[imax];
+*callR2 = GENO_2[imax];
+*conf   = expl(pmax) / ptot;
+
+if (imax != jmax)
+    return 1;
+if (GENO_1[imax] != GENO_2[imax] || (rb != GENO_1[imax] && rb != GENO_2[imax]))
+    return 2;
+
+return 0;
+}
+
+
+static void do_mut_prob(dual_pileup_t *d, analysis_params_t *ap)
+{
+int ret;
+char ref, callL1, callL2, callR1, callR2;
+long double conf;
+if ((ret = is_mut_prob(d, ap, &ref, &callL1, &callL2, &callR1, &callR2, &conf)) == 0)
+    return;
+
+printf("MUT2\t%s\t%d\t%d\t",
+       d->hL->target_name[d->tidL], d->posL, d->posL + 1);
+if (ret == 1)
+    printf("SOM\t");
+else 
+    printf("GERM\t");
+
+printf("%c\t", ref);
+printf("%c%c>", callR1, callR2);
+printf("%c%c\t", callL1, callL2);
+
+printf("%Lf\t", conf);
+print_nuc_info();
+}
+
+
+
+static void perform_analysis(dual_pileup_t *d, void *func_data)
+{
+analysis_params_t *ap = (analysis_params_t *)func_data;
+
+/* processes pileup for het and mut analysis */
+nuc_counts(d, ap);
+
+/* mutation discovery */
+do_mut(d, ap);  // old way
+
+do_mut_prob(d, ap);  // new way
+
+fflush(stdout);
+}
+
+void bamBam(char *inbamL, char *inbamR, char *faidx, int mask)
+{
+analysis_params_t ap;
+ap.minQ     = minQ;
+ap.minChiSq = minChiSq;
+ap.minMapQ  = minMapQ;
+ap.avgMapQ  = avgMapQ;
+ap.minSuppSNP  = minSuppSNP;
+ap.minSuppSV   = minSuppSV;
+ap.sites    = 0;
+
+ap.pStart = -1;
+ap.pStop  = -1;
+ap.pStartPrev = -1;
+ap.pStopPrev  = -1;
+
+/* Overall Copy Number */
+ap.tid       = BAD_TID;
+ap.lstart    = -1;
+ap.lstop     = -1;
+ap.lcounts   = 0;
+ap.rstart    = -1;
+ap.rstop     = -1;
+ap.rcounts   = 0;
+ap.prevTid   = BAD_TID;
+ap.prevStart = -1;
+ap.prevStop  = -1;
+
+/* ASCN */
+ap.tidAS      = BAD_TID;
+ap.startAS   = -1;
+ap.stopAS    = -1;
+ap.lcountsMin = 0;
+ap.lcountsMax = 0;
+ap.rcountsMin = 0;
+ap.rcountsMax = 0;
+
+/* SV */
+ap.svStartL = -1;
+ap.svStopL  = -1;
+
+ap.svStartR = -1;
+ap.svStopR  = -1;
+
+ap.fracGerm = (long double) fracGerm;
+
+initProbTables();
+
+bam_bam_file(inbamL, inbamR, faidx, position, mask, perform_analysis, &ap);
+}
+
+
+int main(int argc, char *argv[])
+/* Process command line. */
+{
+optionInit(&argc, argv, options);
+if (argc < 3)
+    usage();
+
+if (optionExists("fa"))
+    faidx = optionVal("fa", NULL);
+if (optionExists("position"))
+    position = optionVal("position", NULL);
+if (optionExists("minSuppSNP"))
+    minSuppSNP = optionInt("minSuppSNP", 4);
+if (optionExists("minSuppSV"))
+    minSuppSV = optionInt("minSuppSV", 2);
+if (optionExists("minQ"))
+    minQ = optionInt("minQ", 10);
+if (optionExists("minMapQ"))
+    minMapQ = optionInt("minMapQ", 20);
+if (optionExists("avgMapQ"))
+    avgMapQ = optionInt("avgMapQ", 30);
+if (optionExists("minChiSq"))
+    minChiSq = optionInt("minChiSq", 250);
+if (optionExists("fracGerm"))
+    fracGerm = optionFloat("fracGerm", 0.0);
+
+char *inbamL = argv[1];
+char *inbamR = argv[2];
+bamBam(inbamL, inbamR, faidx, BAM_DEF_MASK);
+
+return 0;
+}
+
+#else // USE_BAM not defined
+
+int main(int argc, char *argv[])
+/* Process command line. */
+{
+printf("BAM not installed");
+
+return 0;
+}
+#endif