src/hg/tcga/pBamBam/asBamBam.c 1.2

1.2 2010/03/06 02:06:48 jsanborn
updated
Index: src/hg/tcga/pBamBam/asBamBam.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/tcga/pBamBam/asBamBam.c,v
retrieving revision 1.1
retrieving revision 1.2
diff -b -B -U 1000000 -r1.1 -r1.2
--- src/hg/tcga/pBamBam/asBamBam.c	5 Mar 2010 19:53:24 -0000	1.1
+++ src/hg/tcga/pBamBam/asBamBam.c	6 Mar 2010 02:06:48 -0000	1.2
@@ -1,944 +1,1357 @@
 /* asBamBam -- discovering variants from BAM file
  *   code adapted from Heng Li's Samtools library functions
  */
 
 #ifdef USE_BAM
 
 #include "common.h"
 #include "hCommon.h"
 #include "linefile.h"
 #include "hash.h"
 #include "hdb.h"
 #include "options.h"
 #include "jksql.h"
 
 #define _IOLIB 2
 #include "bam.h"
 #include "sam.h"
 #include "bam_helper.h"
 
 static char const rcsid[] = "$Id";
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "asBamBam - Allele-specific BamBam.\n"
   "usage:\n"
   "   asBamBam [options] <left.bam> <right.bam>\n"
   "options:\n"
   "   -position=chrX:1-100 - Position to do analysis. [None]\n" 
   "   -minSupport=INT   - Minimum Support in Germline. [10]\n"
   "   -minQ=INT         - Minimum acceptable base quality. [0]\n"
   "   -minMapQ=INT      - Minimum acceptable mapping quality. [0]\n"
+  "   -avgMapQ=INT      - Minimum acceptable average mapping quality. [0]\n"
   "   -minChiSq=INT   - Minimum acceptable chi square of nucleotide freqs. [0]\n"
+  "   -maxISize=INT   - Maximum Insert Size. [10000]\n"
   "   -readsPerBin=INT  - Reads per CNV bin [500]\n"
-  "   -readLength=INT   - Length of read [100]\n"
   "\n"
   );
 }
 
 static struct optionSpec options[] = {
     {"position", OPTION_STRING},
     {"minSupport", OPTION_INT},
     {"minQ", OPTION_INT},
     {"minMapQ", OPTION_INT},
+    {"avgMapQ", OPTION_INT},
+    {"maxISize", OPTION_INT},
     {"minChiSq", OPTION_INT}, 
     {"readsPerBin", OPTION_INT},
-    {"readLength", OPTION_INT},
     {NULL, 0}
 };
 
 #define BAD_TID -999
 
 /* Option variables */
 char *position = NULL;
 int minSupport = 2;
 int minQ     = 0;
 int minMapQ  = 0;
+int avgMapQ  = 0;
+int maxISize = 10000;
 int minChiSq = 0;
 int readsPerBin = 500;
 int readLength = 100;
 
 
 typedef struct {
     int minQ;
     int minMapQ;
+    int avgMapQ;
+    int maxISize;
     int minChiSq;
     int minSupp;
     int sites;
 
     int readLength;
     int readsPerBin;
 
     int pStartPrev;
     int pStopPrev;
     int pStart;
     int pStop;
 
     /* Overall Copy Number */
     int32_t tid;
     int32_t lstart;
     int32_t lstop;
     int lcounts;
     int32_t rstart;
     int32_t rstop;
     int rcounts;
 
     int32_t prevTid;
     int32_t prevStart;
     int32_t prevStop;
 
     /* ASCN */
     int32_t tidAS;
     int32_t startAS;
     int32_t stopAS;
     int lcountsMin;
     int lcountsMax;
     int rcountsMin;
     int rcountsMax;
 
+    /* SV */
+    int svStartL;
+    int svStopL;
+    int svStartR;
+    int svStopR;
+
 } analysis_params_t;
 
 static int32_t countsL1[4];
 static int32_t countsL2[4];
 static int32_t countsL3[4];
 
 static int32_t countsR1[4];
 static int32_t countsR2[4];
 static int32_t countsR3[4];
 
 static int32_t votesL[4];
 static int32_t countsL[4];
 static int32_t votesR[4];
 static int32_t countsR[4];
 
 typedef struct {
     char nuc;
     int count;
     int vote;
 } variant_t;
 
 variant_t minorL, majorL;
 variant_t minorR, majorR;
 
+static int32_t insL, delL, insR, delR;
+
+typedef struct {
+    int tid;
+
+    int32_t qstart;
+    int32_t qstop;
+    int32_t mstart;
+    int32_t mstop;
+
+    int supp;
+    int orientation;
+} sv_t;
+
+#define BUF 500  // 500 base buffer around start/stop
+sv_t intraL, intraR;
+sv_t interL, interR;
+
 static int nuc_index(char c)
 {
 switch (c)
     {
     case 'A':
 	return 0;
     case 'G':
 	return 1;
     case 'C':
 	return 2;
     case 'T':
 	return 3;
     default:
 	return -1;
     }
 return -1;
 }
 #define NUCS "AGCT"
 
-static inline void nuc_counts(dual_pileup_t *d, analysis_params_t *ap)
+static inline void reset_globals(analysis_params_t *ap)
 { 
-int i, index;
+/* reset nuc counts */
+int i;
 for (i = 0; i < 4; i++)
     { // sum up the qualities of each base
     votesL[i]  = 0;
     countsL[i] = 0;
     votesR[i]  = 0; 
     countsR[i] = 0;
 
     countsL1[i] = 0;
     countsL2[i] = 0;
     countsL3[i] = 0;
     countsR1[i] = 0;
     countsR2[i] = 0;
     countsR3[i] = 0;
     }
 
+/* reset indel stats */
+insL = 0;
+insR = 0;
+delL = 0;
+delR = 0;
+
 /* reset pileup info */
 ap->pStart = -1, ap->pStop = -1;
+}
+
+static inline int check_read(const bam_pileup1_t *p, analysis_params_t *ap)
+{
+bam1_core_t *c = &p->b->core;
+/* Check if unmapped or PCR optical dupe */
+if (((c)->flag & BAM_FUNMAP) || ((c)->flag & BAM_FDUP) )
+    return 0;
+
+/* check base quality */
+if (bam1_qual(p->b)[p->qpos] < ap->minQ)
+    return 0;
 
+/* check mapping quality */
+if ((c)->qual < ap->minMapQ)
+    return 0;
+
+/* all good */
+return 1;
+}
+
+
+static inline void nuc_counts(dual_pileup_t *d, analysis_params_t *ap)
+{ 
+/* reset global vars */
+reset_globals(ap);
+
+int i, index;
 int32_t start, stop;
 int32_t prevStart = -1, prevStop = -1;
 const bam_pileup1_t *p;
 for (i = 0; i < d->nL; ++i)
     { 
     p = d->puL + i;
-    bam1_core_t *c = &p->b->core;
-
-    if ((c)->qual < ap->minMapQ)
+    if (!check_read(p, ap))
 	continue;
+    bam1_core_t *c = &p->b->core;
     int qual = bam1_qual(p->b)[p->qpos];    
-    if (qual < ap->minQ)
-	continue;
-
-    /* Check if unmapped or PCR optical dupe */
-    if (((c)->flag & BAM_FUNMAP) || ((c)->flag & BAM_FDUP) )
-	continue;
 
     /* Check if start & stop has already been visited, to dupes missed in prev step*/    
     start = (c)->pos;
     stop  = start + (c)->l_qseq;    
     if (start <= prevStart && stop <= prevStop)
 	continue;
     prevStart = start;
     prevStop  = stop;
 
     if (ap->pStart == -1)
 	{
 	ap->pStart = start;
 	ap->pStop  = stop;
 	}
     if (start < ap->pStart)
 	ap->pStart = start;
     if (stop  > ap->pStop)
 	ap->pStop = stop;
 
+    if (p->indel > 0)
+	insL += 1;
+    else if (p->indel < 0)
+	delL += 1;
+    
     if (!p->is_del && p->indel == 0)
 	{
         double lseq = (double) p->b->core.l_qseq;
 	int b = toupper(bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]);    
 	if ((index = nuc_index(b)) < 0)
 	    continue;
 
 	votesL[index] += qual;
 	countsL[index]++;
 
         if (p->qpos < round(0.33 * lseq))
 	    countsL1[index]++;
 	else if (p->qpos < round(0.67 * lseq))
 	    countsL2[index]++;
 	else
 	    countsL3[index]++;
 	}
     }
 
 prevStart = -1;
 prevStop  = -1;
 for (i = 0; i < d->nR; ++i)
     {
     p = d->puR + i;
-    bam1_core_t *c = &p->b->core;
-
-    if ((c)->qual < ap->minMapQ)
+    if (!check_read(p, ap))
 	continue;
+    bam1_core_t *c = &p->b->core;
     int qual = bam1_qual(p->b)[p->qpos];    
-    if (qual < ap->minQ)
-	continue;
-
-    /* Check if unmapped or PCR optical dupe */
-    if (((c)->flag & BAM_FUNMAP) || ((c)->flag & BAM_FDUP) )
-	continue;
 
     /* Check if start & stop has already been visited, to dupes missed in prev step*/    
     start = (c)->pos;
     stop  = start + (c)->l_qseq;    
     if (start <= prevStart && stop <= prevStop)
 	continue;
     prevStart = start;
     prevStop  = stop;
 
     if (ap->pStart == -1)
 	{
 	ap->pStart = start;
 	ap->pStop  = stop;
 	}
     if (start < ap->pStart)
 	ap->pStart = start;
     if (stop  > ap->pStop)
 	ap->pStop = stop;
 
+    if (p->indel > 0)
+	insR += 1;
+    else if (p->indel < 0)
+	delR += 1;
+    
     if (!p->is_del && p->indel == 0)
 	{
         double lseq = (double) p->b->core.l_qseq;
 	int b = toupper(bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]);
 	if ((index = nuc_index(b)) < 0)
 	    continue;
 	votesR[index] += qual;
 	countsR[index]++;
 
         if (p->qpos < round(0.33 * lseq))
 	    countsR1[index]++;
 	else if (p->qpos < round(0.67 * lseq))
 	    countsR2[index]++;
 	else
 	    countsR3[index]++;
 	}
     }
 
 int iL1 = -1, iL2 = -1, maxL1 = 0, maxL2 = 0;
 int iR1 = -1, iR2 = -1, maxR1 = 0, maxR2 = 0;
 int countL1 = -1, countL2 = -1, countR1 = -1, countR2 = -1;
 for (i = 0; i < 4; i++)
     {
     if (countsL[i] > countL1)
 	{
 	iL2     = iL1;     // set prev max to max2
 	maxL2   = maxL1;
 	countL2 = countL1;
 
 	iL1     = i;
 	maxL1   = votesL[i];
 	countL1 = countsL[i];
 	}
     else if (countsL[i] > countL2)
 	{
 	iL2     = i;
 	maxL2   = votesL[i];
 	countL2 = countsL[i];
 	}
     if (countsR[i] > countR1)
 	{
 	iR2     = iR1;
 	maxR2   = maxR1;
 	countR2 = countR1;
 
 	iR1     = i;
 	maxR1   = votesR[i];
 	countR1 = countsR[i];
 	}
     else if (countsR[i] > countR2)
 	{
 	iR2     = i;
 	maxR2   = votesR[i];
 	countR2 = countsR[i];
 	}
     }
 
 majorL.nuc   = 'N';
 majorL.vote  = 0;
 majorL.count = 0;
 if (iL1 != -1 && countL1 >= ap->minSupp)
     {
     majorL.nuc   = NUCS[iL1];
     majorL.vote  = maxL1;
     majorL.count = countL1;
     }
 
 minorL.nuc   = 'N';
 minorL.vote  = 0;
 minorL.count = 0;
 if (iL2 != -1 && countL2 >= ap->minSupp)
     {
     minorL.nuc   = NUCS[iL2];
     minorL.vote  = maxL2;
     minorL.count = countL2;
     }
 
 majorR.nuc   = 'N';
 majorR.vote  = 0;
 majorR.count = 0;
 if (iR1 != -1 && countR1 >= ap->minSupp)
     {
     majorR.nuc   = NUCS[iR1];
     majorR.vote  = maxR1;
     majorR.count = countR1;
     }
 
 minorR.nuc   = 'N';
 minorR.vote  = 0;
 minorR.count = 0;
 if (iR2 != -1 && countR2 >= ap->minSupp)
     {
     minorR.nuc   = NUCS[iR2];
     minorR.vote  = maxR2;
     minorR.count = countR2;
     }
 }
 
 double chi_square(int obs1, int obs2, double p)  // p = binomial prob of getting 1
 {
 double pseudo = 1.0;
 int count = obs1 + obs2;
 
 double exp1 = p * ((double) count) + pseudo;
 double exp2 = (1.0 - p) * ((double) count) + pseudo;
 
 // apply Yates correction... will often have small observed values.
 return pow(abs((double)obs1 - exp1) - 0.5, 2.0)/exp1 + pow(abs((double)obs2 - exp2) - 0.5, 2.0)/exp2;
 }
 
 static int sig_diff(dual_pileup_t *d, analysis_params_t *ap)
 {
 int i;
 double pseudo = 1.0;
 int totalL = 0, totalR = 0;
 
 for (i = 0; i < 4; i++)
     {
     totalL += countsL[i];
     totalR += countsR[i];
     }
 
 double obs, exp, chi_sq = 0.0;
 for (i = 0; i < 4; i++)
     {
     obs = ((double) countsL[i] * 100.0) / (double) totalL + pseudo;
     exp = ((double) countsR[i] * 100.0) / (double) totalR + pseudo;
     chi_sq += pow(abs(obs - exp) - 0.5, 2.0) / exp;
     }
 
 return chi_sq;
 }
 
 static int is_mut(dual_pileup_t *d, analysis_params_t *ap,
-		  char *callL1, char *callL2, char *callR1, char *callR2,
-		  float *hetL, float *hetR, int *stateL, int *stateR, int *sites)
+		  char *callL1, char *callL2, char *callR1, char *callR2)
 {
-*sites += 1;
-
 if (majorR.vote == 0 || majorL.vote == 0)
     return 0;  // most likely a bad quality region.
 
 if (sig_diff(d, ap) < ap->minChiSq)
     return 0;
 
 int i = 0;
 int rCount = 0;
 for (i = 0; i < 4; i++)
     rCount += countsR1[i] + countsR2[i] + countsR3[i];
 
 if (rCount < 10)
     return 0;
 
 
 int retVal1 = 0, retVal2 = 0;
 for (i = 0; i < 4; i++)
     {
     if (NUCS[i] != majorL.nuc && NUCS[i] != minorL.nuc)
 	continue;
 
     if (NUCS[i] == majorL.nuc && majorL.nuc != majorR.nuc && majorL.nuc != minorR.nuc)
 	{
 	//      if (countsL1[i] >= 2 && countsL2[i] >= 1)
 	//          retVal1 = 1;
 	//      else if (countsL1[i] > 0 && countsL2[i] > 0 && countsL3[i] > 0)
 	//          retVal1 = 1;
 
 	if (countsL1[i] > 0 && countsL2[i] > 0 && countsL1[i] + countsL2[i] >= 3)
 	    retVal1 = 1;
 	else if (countsL1[i] > 0 && countsL3[i] > 0 && countsL1[i] + countsL3[i] >= 3)
 	    retVal1 = 1;
 	else if (countsL2[i] > 0 && countsL3[i] > 0 && countsL2[i] + countsL3[i] >= 3)
 	    retVal1 = 1;
 
 	if (retVal1)
 	    {
 	    if (rCount >= 30 && countsR[i] <= 1)
 		retVal1 = 1;
 	    else if (rCount < 30 && countsR[i] == 0)
 		retVal1 = 1;
 	    else
 		retVal1 = 0;
 	    }
 	}
 
     if (NUCS[i] == minorL.nuc && (minorL.nuc != majorR.nuc && minorL.nuc != minorR.nuc))
 	{
 	//      if (countsL1[i] >= 2 && countsL2[i] >= 1)
 	//          retVal2 = 1;
 	//      else if (countsL1[i] > 0 && countsL2[i] > 0 && countsL3[i] > 0)
 	//          retVal2 = 1;
 
 	if (countsL1[i] > 0 && countsL2[i] > 0 && countsL1[i] + countsL2[i] >= 3)
 	    retVal2 = 1;
 	else if (countsL1[i] > 0 && countsL3[i] > 0 && countsL1[i] + countsL3[i] >= 3)
 	    retVal2 = 1;
 	else if (countsL2[i] > 0 && countsL3[i] > 0 && countsL2[i] + countsL3[i] >= 3)
 	    retVal2 = 1;
 
 	if (retVal2)
 	    {
 	    if (rCount >= 30 && countsR[i] <= 1)
 		retVal2 = 1;
 	    else if (rCount < 30 && countsR[i] == 0)
 		retVal2 = 1;
 	    else
 		retVal2 = 0;
 	    }
 	}
     }
 
 if (!retVal1 && !retVal2)
     return 0;
 
-
-float freqL = ((float) minorL.count) / ((float) majorL.count + (float) minorL.count);
-float freqR = ((float) minorR.count) / ((float) majorR.count + (float) minorR.count);
-
-*hetL = freqL;
-*hetR = freqR;
-
 *callL1 = majorL.nuc;
 *callL2 = minorL.nuc;
 *callR1 = majorR.nuc;
 *callR2 = minorR.nuc;
 
 if (majorL.nuc != majorR.nuc && majorL.nuc != minorR.nuc)
     return 1;
 if (minorL.nuc != 'N' && (minorL.nuc != majorR.nuc && minorL.nuc != minorR.nuc))
     return 1;
 
 return 0;
 }
 
 
 
 
 static int calc_cnv(dual_pileup_t *d, analysis_params_t *ap)
 {
 if ((ap->tid != BAD_TID) && (ap->tid != d->tidL))  // onto next chromosome
     return 2;
 
 int i;
 const bam_pileup1_t *p;
 if ((ap->prevTid != BAD_TID) && (ap->prevTid == d->tidL))
     {
     for (i = 0; i < d->nL; ++i)
 	{
 	p = d->puL + i;
 	bam1_core_t *c = &p->b->core;
 	if ((c)->pos <= ap->prevStop)
 	    return 0;
 	}
 
     for (i = 0; i < d->nR; ++i)
 	{
 	p = d->puR + i;
 	bam1_core_t *c = &p->b->core;
 	if ((c)->pos <= ap->prevStop)
 	    return 0;
 	}
     }
 
 ap->tid = d->tidL;
 
 int32_t start, stop;
 int32_t lstart = ap->lstart, lstop = ap->lstop;
 int32_t prevPos = lstop - 1;
 for (i = 0; i < d->nL; ++i)
     {
     p = d->puL + i;
     bam1_core_t *c = &p->b->core;
     if ((c)->qual < ap->minMapQ)
 	continue;
 
     if ( !((c)->flag & BAM_FUNMAP) && !((c)->flag & BAM_FDUP) )
 	{ // read is mapped and not an optical/PCR dupe
 	start = (c)->pos;
 	stop  = start + (c)->l_qseq;
 
 	if (lstart == -1)
 	    {
 	    lstart = start;
 	    lstop  = start + 1;
 	    ap->lcounts += 1;
 	    }
 	else if ((start > prevPos) && (start > lstart)) // && stop > lstop))
 	    {  // make sure there are no possible dupes by removing all by
 	    // comparing left-most position of each read, only allowing one per pos.
 	    prevPos = start;
 	    lstop = start + 1;
 	    ap->lcounts += 1;
 	    }
 	}
     }
 
 int32_t rstart = ap->rstart, rstop = ap->rstop;
 prevPos = rstop - 1;
 for (i = 0; i < d->nR; ++i)
     {
     p = d->puR + i;
     bam1_core_t *c = &p->b->core;
     if ((c)->qual < ap->minMapQ)
 	continue;
 
     if ( !((c)->flag & BAM_FUNMAP) && !((c)->flag & BAM_FDUP) )
 	{ // read is mapped and not an optical/PCR dupe
 	start = (c)->pos;
 	stop  = start + (c)->l_qseq;
 
 	if (rstart == -1)
 	    {
 	    rstart = start;
 	    rstop  = start + 1;
 	    ap->rcounts += 1;
 	    }
 	else if ((start > prevPos) && (start > rstart)) //  && (stop > rstop))
 	    {  // make sure there are no possible dupes by removing all by
 	    // comparing left-most position of each read, only allowing one per pos.
 	    prevPos = start;
 	    rstop   = start + 1;
 	    ap->rcounts += 1;
 	    }
 	}
     }
 
 ap->lstart = lstart;
 ap->lstop  = lstop;
 
 ap->rstart = rstart;
 ap->rstop  = rstop;
 
 if ((ap->lcounts >= ap->readsPerBin) || (ap->rcounts >= ap->readsPerBin))
     return 1;
 
 return 0;
 }
 
 static int calc_ascn(dual_pileup_t *d, analysis_params_t *ap, 
 		     int maxS, int minS, int maxG, int minG)
 {
 if ((ap->tidAS != BAD_TID) && (ap->tidAS != d->tidL))  // onto next chromosome
     return 2;
 
 if ( ap->pStopPrev != -1 && (d->posL < ap->pStopPrev))
     return 0; // within previous position's territory, avoid double-counting
 
 ap->tidAS = d->tidL;
 
 if (ap->startAS == -1)
     {
     ap->startAS = d->posL;
     ap->stopAS  = d->posL;
     }
 
 if (d->posL > ap->stopAS)
     ap->stopAS = d->posL;
 
 ap->lcountsMin += minS;
 ap->lcountsMax += maxS;
 
 ap->rcountsMin += minG;
 ap->rcountsMax += maxG;
 
 int totalL = ap->lcountsMin + ap->lcountsMax;
 int totalR = ap->rcountsMin + ap->rcountsMax;
 
 if (totalL >= ap->readsPerBin || totalR >= ap->readsPerBin)
     return 1;
 
 return 0;
 }
 
 
 static int is_het(dual_pileup_t *d, analysis_params_t *ap)
 {
 int i = 0;
 int rCount = 0, maxCountR = 0;
 for (i = 0; i < 4; i++)
     {
     if (countsR[i] < ap->minSupp)
 	continue;
     rCount += countsR[i];
     if (countsR[i] > maxCountR)
 	maxCountR = countsR[i];
     }
 
 double het = (double) maxCountR / (double) rCount;
 if (het > 0.4 && het < 0.6)
     return 1;  // need a smarter test here. 
 
 return 0;
 }
 
-static int is_sv(dual_pileup_t *d, analysis_params_t *ap)
-{
-return 0;
-}
-
 
 static void print_nuc_info()
 {
 int i, first = 1;
 for (i = 0; i < 4; i++)
     {
     if (countsR[i] == 0)
 	continue;
     if (!first)
 	putchar(' ');
     printf("%c:%d/%2.1f", NUCS[i], countsR[i], (double) votesR[i] / (double) countsR[i]);
     first = 0;
     }
 putchar('\t');
 first = 1;
 for (i = 0; i < 4; i++)
     {
     if (countsL[i] == 0)
 	continue;
     if (!first)
 	putchar(' ');
     printf("%c:%d/%2.1f", NUCS[i], countsL[i], (double) votesL[i] / (double) countsL[i]);
     first = 0;
     }
 
 putchar('\n');
 }
 
-
-static void perform_analysis(dual_pileup_t *d, void *func_data)
+static void do_cnv(dual_pileup_t *d, analysis_params_t *ap)
 {
-analysis_params_t *ap = (analysis_params_t *)func_data;
-
-int i, ret;
-if ((ret = calc_cnv(d, ap)) > 0)
-    {  // do overall cnv
-    int start = ap->rstart;
-    int stop  = ap->rstop;
-    if (ap->lstart < start)
+int ret;
+if ((ret = calc_cnv(d, ap)) == 0)
+    return;
+
+int start = ap->rstart;
+int stop  = ap->rstop;
+if (ap->lstart < start)
 	start = ap->lstart;
-    if (ap->lstop > stop)
+if (ap->lstop > stop)
 	stop  = ap->lstop;
 
-    printf("CNV\t%s\t%d\t%d\t%f\t%d\t%d\n",
+printf("CNV\t%s\t%d\t%d\t%f\t%d\t%d\n",
 	   d->hL->target_name[ap->tid],
 	   start, stop, (double) ap->lcounts / (double) ap->rcounts, 
 	   ap->rcounts, ap->lcounts);
 
-    ap->prevTid   = ap->tid;
-    ap->prevStart = ap->lstart;
-    ap->prevStop  = ap->lstop;
+ap->prevTid   = ap->tid;
+ap->prevStart = ap->lstart;
+ap->prevStop  = ap->lstop;
 
-    ap->lcounts = 0;
-    ap->rcounts = 0;
+ap->lcounts = 0;
+ap->rcounts = 0;
 
-    if (ret == 2)
+if (ret == 2)
 	{  // switching chroms
 	ap->tid   = BAD_TID;
 
 	ap->lstart = -1;
 	ap->lstop  = -1;
 	ap->rstart = -1;
 	ap->rstop  = -1;
 	}
-    else
+else
 	{ // on same chrom
 	ap->lstart = ap->lstop;
 	ap->rstart = ap->rstop;
 	}
+}
+
+#define NUM_CHROMS 100
+static int mateChromsL[NUM_CHROMS];
+static int mateChromsR[NUM_CHROMS];
+static int somaticSV[NUM_CHROMS];
+static int germlineSV[NUM_CHROMS];
+
+static int inter_struct_variant(dual_pileup_t *d, analysis_params_t *ap)
+{
+int i;
+for (i = 0; i < NUM_CHROMS; ++i)
+    {
+    mateChromsL[i] = 0;
+    mateChromsR[i] = 0;
+    somaticSV[i]   = 0;
+    germlineSV[i]  = 0;
+    }
+
+const bam_pileup1_t *p;
+int avgMapQ = 0;
+for (i = 0; i < d->nL; ++i)
+    {
+    p = d->puL + i;
+    avgMapQ += p->b->core.qual;
+    }
+if (((double)avgMapQ / (double) d->nL) < (double)ap->avgMapQ)
+    return 0;
+
+avgMapQ = 0;
+for (i = 0; i < d->nR; ++i)
+    {
+    p = d->puR + i;
+    avgMapQ += p->b->core.qual;
+    }
+if (((double)avgMapQ / (double) d->nR) < (double) ap->avgMapQ)
+    return 0;
+
+for (i = 0; i < d->nL; ++i)
+    {
+    p = d->puL + i;
+
+    if (p->b->core.qual < ap->minMapQ)
+	continue;
+
+    bam1_core_t *c = &p->b->core;
+    if (((c)->flag & BAM_FPAIRED) && !((c)->flag & BAM_FUNMAP))
+	{ // read is paired and mapped.
+	if (!((c)->flag & BAM_FMUNMAP))
+	    {  // mate is mapped
+	    if ((c)->mtid != (c)->tid) // different chroms.
+		mateChromsL[(c)->mtid]++;
+	    }
+	}
+    }
+
+for (i = 0; i < d->nR; ++i)
+    {
+    p = d->puR + i;
+
+    if (p->b->core.qual < ap->minMapQ)
+	continue;
+
+    bam1_core_t *c = &p->b->core;
+    if (((c)->flag & BAM_FPAIRED) && !((c)->flag & BAM_FUNMAP))
+	{ // read is paired and mapped.
+	if (!((c)->flag & BAM_FMUNMAP))
+	    {  // mate is mapped
+	    if ((c)->mtid != (c)->tid)  // different chroms
+		mateChromsR[(c)->mtid]++;
+	    }
+	}
+    }
+
+int retVal = 0;
+for (i = 0; i < NUM_CHROMS; i++)
+    {
+    if (mateChromsL[i] < ap->minSupp)
+	continue;
+    retVal = 1;
+    if (mateChromsR[i] < ap->minSupp)
+	somaticSV[i]  = mateChromsL[i];
+    else
+	germlineSV[i] = mateChromsL[i] + mateChromsR[i];
+    }
+
+return retVal;
+}
+
+
+
+
+static int intra_struct_variant(dual_pileup_t *d, analysis_params_t *ap)
+{
+intraL.tid   = -1;
+intraL.qstart = -1;
+intraL.qstop  = -1;
+intraL.mstart = -1;
+intraL.mstop  = -1;
+intraL.supp  = 0;
+intraL.orientation = -1;
+
+intraR.tid   = -1;
+intraR.qstart = -1;
+intraR.qstop  = -1;
+intraR.mstart = -1;
+intraR.mstop  = -1;
+intraR.supp  = 0;
+intraR.orientation = -1;
+
+// First check average mapping quality of read pairs
+int i;
+const bam_pileup1_t *p;
+int avgMapQ = 0;
+for (i = 0; i < d->nL; ++i)
+    {
+    p = d->puL + i;
+    avgMapQ += p->b->core.qual;
+    }
+if (((double)avgMapQ / (double) d->nL) < (double)ap->avgMapQ)
+    return 0;
+
+avgMapQ = 0;
+for (i = 0; i < d->nR; ++i)
+    {
+    p = d->puR + i;
+    avgMapQ += p->b->core.qual;
+    }
+if (((double)avgMapQ / (double) d->nR) < (double) ap->avgMapQ)
+    return 0;
+
+// Satisfied average mapping quality constraint
+
+int dupeL = 0, dupeR = 0;
+for (i = 0; i < d->nL; ++i)
+    {
+    p = d->puL + i;
+    if (p->b->core.qual < ap->minMapQ)
+	continue;
+    bam1_core_t *c = &p->b->core;
+    if (!((c)->flag & BAM_FPAIRED) || ((c)->flag & BAM_FUNMAP))
+	continue; // read is not paired or unmapped.
+    if ((c)->flag & BAM_FMUNMAP)
+	continue; // mate is unmapped.
+    if ((c)->mtid != (c)->tid)
+	continue;  // mate is on different chrom. MERGE INTER CODE HERE?
+
+    int orientation = (bam1_strand(p->b) == bam1_mstrand(p->b));
+    
+    int32_t qstart = (c)->pos;
+    int32_t qstop  = qstart + (c)->l_qseq;
+    int32_t mstart = (c)->mpos;
+    int32_t mstop = mstart;
+    if (orientation)
+	mstop -= (c)->l_qseq;
+    else
+	mstop += (c)->l_qseq;
+
+    if (qstart <= ap->svStartL)
+	continue;
+    if (orientation || abs((c)->isize) > maxISize)
+	{
+	if (intraL.tid == -1)
+	    {
+	    intraL.tid = (c)->tid;
+	    intraL.qstart = qstart;
+	    intraL.qstop  = qstop;
+	    intraL.mstart = mstart;
+	    intraL.mstop  = mstop;
+	    intraL.orientation = orientation;
+	    intraL.supp  = 1;
+	    }
+	else if (qstart == intraL.qstart)
+	    { // potential dupe... same start position.
+	    dupeL += 1;
+	    }
+	else if (qstart < (intraL.qstop + BUF) && qstop > (intraL.qstart - BUF) &&
+		 mstart < (intraL.mstop + BUF) && mstop > (intraL.mstart - BUF))  
+	    {
+	    if (intraL.qstart > qstart)
+		intraL.qstart = qstart;
+	    if (intraL.qstop < qstop)
+		intraL.qstop = qstop;
+	    if (intraL.mstart > mstart)
+		intraL.mstart = mstart;
+	    if (intraL.mstop < mstop)
+		intraL.mstop = mstop;
+
+	    intraL.supp += 1;
+	    }
+	}
+    }
+
+for (i = 0; i < d->nR; ++i)
+    {
+    p = d->puR + i;
+    if (p->b->core.qual < ap->minMapQ)
+	continue;
+    bam1_core_t *c = &p->b->core;
+    if (!((c)->flag & BAM_FPAIRED) || ((c)->flag & BAM_FUNMAP))
+	continue; // read is not paired or unmapped.
+    if ((c)->flag & BAM_FMUNMAP)
+	continue; // mate is unmapped.
+    if ((c)->mtid != (c)->tid)
+	continue;  // mate is on different chrom. MERGE INTER CODE HERE?
+
+    int orientation = (bam1_strand(p->b) == bam1_mstrand(p->b));
+    
+    int32_t qstart = (c)->pos;
+    int32_t qstop  = qstart + (c)->l_qseq;
+    int32_t mstart = (c)->mpos;
+    int32_t mstop = mstart;
+    if (orientation)
+	mstop -= (c)->l_qseq;
+    else
+	mstop += (c)->l_qseq;
+
+    if (qstart <= ap->svStartR)
+	continue;
+      
+    if (orientation || abs((c)->isize) > maxISize)
+	{
+	if (intraR.tid == -1)
+	    {
+	    intraR.tid = (c)->tid;
+	    intraR.qstart = qstart;
+	    intraR.qstop  = qstop;
+	    intraR.mstart = mstart;
+	    intraR.mstop  = mstop;
+	    intraR.orientation = orientation;
+	    intraR.supp  = 1;
+	    }
+	else if (qstart == intraR.qstart)
+	    { // potential dupe... same start position.
+	    dupeR += 1;
+	    }
+	else if (qstart < (intraR.qstop + BUF) && qstop > (intraR.qstart - BUF) &&
+		 mstart < (intraR.mstop + BUF) && mstop > (intraR.mstart - BUF))  
+	    {
+	    if (intraR.qstart > qstart)
+		intraR.qstart = qstart;
+	    if (intraR.qstop < qstop)
+		intraR.qstop = qstop;
+	    if (intraR.mstart > mstart)
+		intraR.mstart = mstart;
+	    if (intraR.mstop < mstop)
+		intraR.mstop = mstop;
+
+	    intraR.supp += 1;
+	    }
+	}
+    }
+
+if (intraL.supp > ap->minSupp)
+    {
+    ap->svStartL = intraL.qstart;
+    ap->svStopL  = intraL.qstop;
+
+    ap->svStartR = intraR.qstart;
+    ap->svStopR  = intraR.qstop;
+    return 1;
+    }
+
+return 0;
+}
+
+static int indel_variant(dual_pileup_t *d, analysis_params_t *ap,
+			 int *isIns, int *isDel, int *insGerm, int *delGerm)
+{
+if (insL >= ap->minSupp)
+    {
+    *isIns = 1;
+    if (insR > 0)  // if you see even once, believe it.
+	*insGerm = 1;
+    else
+	*insGerm = 0;
+    return 1;
+    }
+
+if (delL >= ap->minSupp)
+    {
+    *isDel = 1;
+    if (delR > 0)  // if you see even once, believe it.
+	*delGerm = 1;
+    else
+	*delGerm = 0;
+    return 1;
+    }
+
+return 0;
+}
+
+
+static void do_struct_var(dual_pileup_t *d, analysis_params_t *ap)
+{
+int i;
+if (0 && inter_struct_variant(d, ap))
+    {
+    printf("SVS\t%s\t%d\t%d\t",
+	   d->hL->target_name[d->tidL], d->posL, d->posL + 1);
+
+    for (i = 0; i < NUM_CHROMS; ++i)
+	{
+	if (somaticSV[i] > 0)
+	    printf("SSV:%s(%d),", d->hL->target_name[i], somaticSV[i]);
+	if (germlineSV[i] > 0)
+	    printf("GSV:%s(%d),", d->hL->target_name[i], germlineSV[i]);
+	}
+    putchar('\n');
     }
 
-if (is_sv(d, ap))
-    {  // do struct vars
+if (intra_struct_variant(d, ap))
+    {
     printf("SVS\t%s\t%d\t%d\t", 
 	   d->hL->target_name[d->tidL], d->posL, d->posL + 1);
        
-    print_nuc_info();
+    if (intraR.supp > 0)
+	{  // possibly germline, see if they overlap
+	if (intraR.qstart < (intraL.qstop + BUF) && intraR.qstop > (intraL.qstart - BUF))
+	    printf("GIC\t%d,%d(%d-%d)>(%d-%d)\t%d,%d(%d-%d)>(%d-%d)", 
+		   intraR.orientation, intraR.supp, intraR.qstart, intraR.qstop,
+		   intraR.mstart, intraR.mstop,
+		   intraL.orientation, intraL.supp, intraL.qstart, intraL.qstop,
+		   intraL.mstart, intraL.mstop);
     }
+    else
+	printf("SIC\t%d,%d(%d-%d)>(%d-%d)", 
+	       intraL.orientation, intraL.supp, intraL.qstart, intraL.qstop, 
+	       intraL.mstart, intraL.mstop);
 
-nuc_counts(d, ap);
-if (is_het(d, ap))
-    {  // do ascn & phasing
-    char minGa = 'N';
-    char maxGa = 'N';
-    char minSa = 'N';
-    char maxSa = 'N';
-    int minGc = 0;
-    int maxGc = 0;
-    int minSc = 0;
-    int maxSc = 0;
-    for (i = 0; i < 4; i++)
+    putchar('\n');
+    }
+
+int isIns = 0, isDel = 0, insGerm = 0, delGerm = 0;
+if (indel_variant(d, ap, &isIns, &isDel, &insGerm, &delGerm))
+    {
+    printf("IDL\t%s\t%d\t%d\t",
+	   d->hL->target_name[d->tidL], d->posL, d->posL + 1);
+
+    if (isIns && insGerm)
+	printf("GIns\t(%d,%d),", insR, insL);
+    else if (isIns && !insGerm)
+	printf("SIns\t(%d,%d),", insR, insL);
+
+    if (isDel && delGerm)
+	printf("GDel\t(%d,%d)", delR, delL);
+    else if (isDel && !delGerm)
+	printf("SDel\t(%d,%d)", delR, delL);
+
+    putchar('\n');
+    }
+}
+
+static void do_het_analysis(dual_pileup_t *d, analysis_params_t *ap)
+{
+if (!is_het(d, ap))
+    return;
+
+// do ascn & phasing
+char minGa = 'N', maxGa = 'N', minSa = 'N', maxSa = 'N';
+int i, minGc = 0, maxGc = 0, minSc = 0, maxSc = 0;
+for (i = 0; i < 4; i++)
 	{
 	int cr = countsR[i];
 	int cl = countsL[i];
 	if (cr > maxGc)
 	    {
 	    minGc = maxGc;
 	    minGa = maxGa;
  
 	    maxGc = cr;
 	    maxGa = NUCS[i];
 	    }
 	else if (cr > minGc)
 	    {
 	    minGc = cr;
 	    minGa = NUCS[i];
 	    }
 	
 	if (cl > maxSc)
 	    {
 	    minSc = maxSc;
 	    minSa = maxSa;
  
 	    maxSc = cl;
 	    maxSa = NUCS[i];
 	    }
 	else if (cl > minSc)
 	    {
 	    minSc = cl;
 	    minSa = NUCS[i];
 	    }
 	}
     
-    if (minSc < ap->minSupp)
+if (minSc < ap->minSupp)
 	{
 	minSc = 0;
 	minSa = 'N';
 	}
 
-    char g1 = 'N';
-    char g2 = 'N';
-    if (maxSa == maxGa)
+char g1 = 'N', g2 = 'N';
+if (maxSa == maxGa)
 	{
 	g1 = maxGa;
 	g2 = minGa;
 	}
-    else if (maxSa == minGa)
+else if (maxSa == minGa)
 	{
 	g1 = minGa;
 	g2 = maxGa;
 	} 
-    // else, weird case!
+// else, weird case!
     
-    if (g1 != 'N' && g2 != 'N')
-	{
-	printf("HAP\t%s\t%d\t%d\t%c %c\t%c %c\t", 
+if (g1 == 'N' || g2 == 'N')
+    return;
+
+printf("HAP\t%s\t%d\t%d\t%c %c\t%c %c\t", 
 	       d->hL->target_name[d->tidL], d->posL, d->posL + 1, 
 	       g1, g2, maxSa, minSa);
-	print_nuc_info();
+print_nuc_info();
 	
-	if ((ret = calc_ascn(d, ap, maxSc, minSc, maxGc, minGc)) > 0)
-	    {
-	    double minCN = (double) (ap->lcountsMin) / (double) (ap->rcountsMin + ap->rcountsMax);
-	    double maxCN = (double) (ap->lcountsMax) / (double) (ap->rcountsMin + ap->rcountsMax);
+int ret;
+if ((ret = calc_ascn(d, ap, maxSc, minSc, maxGc, minGc)) == 0)
+    return;
+
+double minCN = (double) (ap->lcountsMin) / (double) (ap->rcountsMin + ap->rcountsMax);
+double maxCN = (double) (ap->lcountsMax) / (double) (ap->rcountsMin + ap->rcountsMax);
 	    
-	    printf("ASC\t%s\t%d\t%d\t%f\t%f\t%d\t%d\t%d\t%d\n",
+printf("ASC\t%s\t%d\t%d\t%f\t%f\t%d\t%d\t%d\t%d\n",
 		   d->hL->target_name[ap->tidAS],
 		   ap->startAS, ap->stopAS,
 		   maxCN, minCN, 
 		   ap->rcountsMax, ap->rcountsMin, 
 		   ap->lcountsMax, ap->lcountsMin);
 	    
-	    ap->startAS    = -1;
-	    ap->stopAS     = -1;	
-	    ap->lcountsMax = 0;
-	    ap->lcountsMin = 0;
-	    ap->rcountsMax = 0;
-	    ap->rcountsMin = 0;
+ap->startAS    = -1;
+ap->stopAS     = -1;	
+ap->lcountsMax = 0;
+ap->lcountsMin = 0;
+ap->rcountsMax = 0;
+ap->rcountsMin = 0;
 	    
-	    if (ret == 2)
+ap->pStartPrev = ap->pStart;
+ap->pStopPrev  = ap->pStop;
+
+if (ret == 2)
 		{
 		ap->tidAS  = BAD_TID;
 		ap->pStartPrev = -1;
 		ap->pStopPrev  = -1;
 		}
-	    }
-	ap->pStartPrev = ap->pStart;
-	ap->pStopPrev  = ap->pStop;
-	}
-    }
+}
 
-int stateL, stateR;
+static void do_mut(dual_pileup_t *d, analysis_params_t *ap)
+{
 char callL1, callL2, callR1, callR2;
-float hetL, hetR;
+if (!is_mut(d, ap, &callL1, &callL2, &callR1, &callR2))
+    return;
 
-if (is_mut(d, ap, &callL1, &callL2, &callR1, &callR2,
-	   &hetL, &hetR, &stateL, &stateR, &ap->sites))
-    {
-    printf("MUT\t%s\t%d\t%d\t",
+printf("MUT\t%s\t%d\t%d\t",
 	   d->hL->target_name[d->tidL], d->posL, d->posL + 1);
 
-    if (callR2 == 'N')
+if (callR2 == 'N')
 	printf("%c>", callR1);
-    else
+else
 	printf("%c/%c>", callR1, callR2);
 
-    if (callL2 == 'N')
+if (callL2 == 'N')
 	printf("%c\t", callL1);
-    else
+else
 	printf("%c/%c\t", callL1, callL2);
 
-    double chi_sq = sig_diff(d, ap);
-    printf("%f\t", chi_sq);
-    print_nuc_info();
-    }
+double chi_sq = sig_diff(d, ap);
+printf("%f\t", chi_sq);
+print_nuc_info();
+}
+
+
+
+static void perform_analysis(dual_pileup_t *d, void *func_data)
+{
+analysis_params_t *ap = (analysis_params_t *)func_data;
+
+/* processes pileup for het and mut analysis */
+nuc_counts(d, ap);
+
+/* overall cnv */
+do_cnv(d, ap);
+
+/* structural variation */
+do_struct_var(d, ap);
+
+/* het analysis: ASCN & phasing */
+do_het_analysis(d, ap);
+
+/* mutation discovery */
+do_mut(d, ap);
 
 fflush(stdout);
 }
 
 void bamBam(char *inbamL, char *inbamR, int mask)
 {
 analysis_params_t ap;
 ap.minQ     = minQ;
 ap.minMapQ  = minMapQ;
+ap.avgMapQ  = avgMapQ;
+ap.maxISize = maxISize;
 ap.minChiSq = minChiSq;
 ap.minSupp  = minSupport;
 ap.sites    = 0;
 
-ap.readLength  = readLength;
 ap.readsPerBin = readsPerBin;
 
 ap.pStart = -1;
 ap.pStop  = -1;
 ap.pStartPrev = -1;
 ap.pStopPrev  = -1;
 
 /* Overall Copy Number */
 ap.tid       = BAD_TID;
 ap.lstart    = -1;
 ap.lstop     = -1;
 ap.lcounts   = 0;
 ap.rstart    = -1;
 ap.rstop     = -1;
 ap.rcounts   = 0;
 ap.prevTid   = BAD_TID;
 ap.prevStart = -1;
 ap.prevStop  = -1;
 
 /* ASCN */
 ap.tidAS      = BAD_TID;
 ap.startAS   = -1;
 ap.stopAS    = -1;
 ap.lcountsMin = 0;
 ap.lcountsMax = 0;
 ap.rcountsMin = 0;
 ap.rcountsMax = 0;
 
+/* SV */
+ap.svStartL = -1;
+ap.svStopL  = -1;
+
+ap.svStartR = -1;
+ap.svStopR  = -1;
+
 bam_bam_file(inbamL, inbamR, position, mask, perform_analysis, &ap);
-fprintf(stderr, "# qualified sites = %d\n", ap.sites);
 }
 
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc < 3)
     usage();
 
 if (optionExists("minSupport"))
     minSupport = optionInt("minSupport", 2);
 if (optionExists("minQ"))
     minQ = optionInt("minQ", 0);
 if (optionExists("minMapQ"))
     minMapQ = optionInt("minMapQ", 0);
+if (optionExists("avgMapQ"))
+    avgMapQ = optionInt("avgMapQ", 0);
+if (optionExists("maxISize"))
+    maxISize = optionInt("maxISize", 10000);
 if (optionExists("minChiSq"))
     minChiSq = optionInt("minChiSq", 0);
 if (optionExists("position"))
     position = optionVal("position", NULL);
 if (optionExists("readsPerBin"))
     readsPerBin = optionInt("readsPerBin", 500);
-if (optionExists("readLength"))
-    readLength = optionInt("readLength", 100);
 
 char *inbamL = argv[1];
 char *inbamR = argv[2];
 bamBam(inbamL, inbamR, BAM_DEF_MASK);
 
 return 0;
 }
 
 #else // USE_BAM not defined
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 printf("BAM not installed");
 
 return 0;
 }
 #endif