src/hg/tcga/pBamBam/asBamBam.c 1.3

1.3 2010/03/07 02:36:46 jsanborn
updated
Index: src/hg/tcga/pBamBam/asBamBam.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/tcga/pBamBam/asBamBam.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -b -B -U 4 -r1.2 -r1.3
--- src/hg/tcga/pBamBam/asBamBam.c	6 Mar 2010 02:06:48 -0000	1.2
+++ src/hg/tcga/pBamBam/asBamBam.c	7 Mar 2010 02:36:46 -0000	1.3
@@ -22,20 +22,20 @@
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
-  "asBamBam - Allele-specific BamBam.\n"
+  "asBamBam - Allele-Specific BamBam.\n"
   "usage:\n"
   "   asBamBam [options] <left.bam> <right.bam>\n"
   "options:\n"
   "   -position=chrX:1-100 - Position to do analysis. [None]\n" 
-  "   -minSupport=INT   - Minimum Support in Germline. [10]\n"
-  "   -minQ=INT         - Minimum acceptable base quality. [0]\n"
-  "   -minMapQ=INT      - Minimum acceptable mapping quality. [0]\n"
-  "   -avgMapQ=INT      - Minimum acceptable average mapping quality. [0]\n"
-  "   -minChiSq=INT   - Minimum acceptable chi square of nucleotide freqs. [0]\n"
+  "   -minSupport=INT   - Minimum Support in Germline. [4]\n"
+  "   -minQ=INT         - Minimum acceptable base quality. [10]\n"
+  "   -minMapQ=INT      - Minimum acceptable mapping quality. [20]\n"
+  "   -avgMapQ=INT      - Minimum acceptable average mapping quality. [30]\n"
+  "   -minChiSq=INT     - Minimum acceptable chi square of nucleotide freqs. [250]\n"
   "   -maxISize=INT   - Maximum Insert Size. [10000]\n"
-  "   -readsPerBin=INT  - Reads per CNV bin [500]\n"
+  "   -readsPerBin=INT  - Reads per CNV bin [2500]\n"
   "\n"
   );
 }
 
@@ -54,17 +54,15 @@
 #define BAD_TID -999
 
 /* Option variables */
 char *position = NULL;
-int minSupport = 2;
-int minQ     = 0;
-int minMapQ  = 0;
-int avgMapQ  = 0;
+int minSupport  = 4;
+int minQ        = 10;
+int minMapQ     = 20;
+int avgMapQ     = 30;
 int maxISize = 10000;
-int minChiSq = 0;
-int readsPerBin = 500;
-int readLength = 100;
-
+int minChiSq    = 250;
+int readsPerBin = 2500;
 
 typedef struct {
     int minQ;
     int minMapQ;
@@ -73,9 +71,8 @@
     int minChiSq;
     int minSupp;
     int sites;
 
-    int readLength;
     int readsPerBin;
 
     int pStartPrev;
     int pStopPrev;
@@ -144,13 +141,15 @@
     int32_t mstart;
     int32_t mstop;
 
     int supp;
-    int orientation;
+    int qual;
+    int mstrand;
+    int qstrand;
 } sv_t;
 
-#define BUF 500  // 500 base buffer around start/stop
-sv_t intraL, intraR;
+#define BUF 100  // 500 base buffer around start/stop
+static sv_t intraL, intraR;
 sv_t interL, interR;
 
 static int nuc_index(char c)
 {
@@ -851,53 +850,41 @@
 
 return retVal;
 }
 
-
-
-
-static int intra_struct_variant(dual_pileup_t *d, analysis_params_t *ap)
+static inline void reset_intra()
 {
 intraL.tid   = -1;
 intraL.qstart = -1;
 intraL.qstop  = -1;
 intraL.mstart = -1;
 intraL.mstop  = -1;
 intraL.supp  = 0;
-intraL.orientation = -1;
+intraL.qual    =  0;
+intraL.qstrand = -1;
+intraL.mstrand = -1;
 
 intraR.tid   = -1;
 intraR.qstart = -1;
 intraR.qstop  = -1;
 intraR.mstart = -1;
 intraR.mstop  = -1;
 intraR.supp  = 0;
-intraR.orientation = -1;
+intraR.qual    =  0;
+intraR.qstrand = -1;
+intraR.mstrand = -1;
+}
 
+static int intra_struct_variant(dual_pileup_t *d, analysis_params_t *ap)
+{
 // First check average mapping quality of read pairs
 int i;
 const bam_pileup1_t *p;
-int avgMapQ = 0;
-for (i = 0; i < d->nL; ++i)
-    {
-    p = d->puL + i;
-    avgMapQ += p->b->core.qual;
-    }
-if (((double)avgMapQ / (double) d->nL) < (double)ap->avgMapQ)
-    return 0;
-
-avgMapQ = 0;
-for (i = 0; i < d->nR; ++i)
-    {
-    p = d->puR + i;
-    avgMapQ += p->b->core.qual;
-    }
-if (((double)avgMapQ / (double) d->nR) < (double) ap->avgMapQ)
-    return 0;
-
-// Satisfied average mapping quality constraint
+int deltaSupp = intraL.supp + intraR.supp;
 
-int dupeL = 0, dupeR = 0;
+int maxStartL = -1, maxStartR = -1;
+int nL = 0, nR = 0;
+//int dupeL = 0, dupeR = 0;
 for (i = 0; i < d->nL; ++i)
     {
     p = d->puL + i;
     if (p->b->core.qual < ap->minMapQ)
@@ -909,37 +896,39 @@
 	continue; // mate is unmapped.
     if ((c)->mtid != (c)->tid)
 	continue;  // mate is on different chrom. MERGE INTER CODE HERE?
 
-    int orientation = (bam1_strand(p->b) == bam1_mstrand(p->b));
+    int qstrand = bam1_strand(p->b);
+    int mstrand = bam1_mstrand(p->b);
+    int orientation = (qstrand == mstrand);
     
     int32_t qstart = (c)->pos;
     int32_t qstop  = qstart + (c)->l_qseq;
     int32_t mstart = (c)->mpos;
-    int32_t mstop = mstart;
-    if (orientation)
-	mstop -= (c)->l_qseq;
-    else
-	mstop += (c)->l_qseq;
+    int32_t mstop = mstart + (c)->l_qseq;
+
+    if (qstart > maxStartL)
+	maxStartL = qstart;
 
-    if (qstart <= ap->svStartL)
-	continue;
     if (orientation || abs((c)->isize) > maxISize)
 	{
+	if (qstart <= ap->svStartL)
+	    continue;
+	ap->svStartL = qstart;
+	nL += 1;
+
 	if (intraL.tid == -1)
 	    {
 	    intraL.tid = (c)->tid;
 	    intraL.qstart = qstart;
 	    intraL.qstop  = qstop;
 	    intraL.mstart = mstart;
 	    intraL.mstop  = mstop;
-	    intraL.orientation = orientation;
+	    intraL.qstrand = qstrand;
+	    intraL.mstrand = mstrand;
+	    intraL.qual    = (c)->qual;
 	    intraL.supp  = 1;
 	    }
-	else if (qstart == intraL.qstart)
-	    { // potential dupe... same start position.
-	    dupeL += 1;
-	    }
 	else if (qstart < (intraL.qstop + BUF) && qstop > (intraL.qstart - BUF) &&
 		 mstart < (intraL.mstop + BUF) && mstop > (intraL.mstart - BUF))  
 	    {
 	    if (intraL.qstart > qstart)
@@ -950,8 +939,9 @@
 		intraL.mstart = mstart;
 	    if (intraL.mstop < mstop)
 		intraL.mstop = mstop;
 
+	    intraL.qual += (c)->qual;
 	    intraL.supp += 1;
 	    }
 	}
     }
@@ -968,38 +958,39 @@
 	continue; // mate is unmapped.
     if ((c)->mtid != (c)->tid)
 	continue;  // mate is on different chrom. MERGE INTER CODE HERE?
 
-    int orientation = (bam1_strand(p->b) == bam1_mstrand(p->b));
+    int qstrand = bam1_strand(p->b);
+    int mstrand = bam1_mstrand(p->b);
+    int orientation = (qstrand == mstrand);
     
     int32_t qstart = (c)->pos;
     int32_t qstop  = qstart + (c)->l_qseq;
     int32_t mstart = (c)->mpos;
-    int32_t mstop = mstart;
-    if (orientation)
-	mstop -= (c)->l_qseq;
-    else
-	mstop += (c)->l_qseq;
+    int32_t mstop = mstart + (c)->l_qseq;
 
-    if (qstart <= ap->svStartR)
-	continue;
+    if (qstart > maxStartR)
+	maxStartR = qstart;
       
     if (orientation || abs((c)->isize) > maxISize)
 	{
+	if (qstart <= ap->svStartR)
+	    continue;
+	ap->svStartR = qstart;
+	nR += 1;
+
 	if (intraR.tid == -1)
 	    {
 	    intraR.tid = (c)->tid;
 	    intraR.qstart = qstart;
 	    intraR.qstop  = qstop;
 	    intraR.mstart = mstart;
 	    intraR.mstop  = mstop;
-	    intraR.orientation = orientation;
+	    intraR.mstrand = mstrand;
+	    intraR.qstrand = qstrand;
+	    intraR.qual    = (c)->qual;
 	    intraR.supp  = 1;
 	    }
-	else if (qstart == intraR.qstart)
-	    { // potential dupe... same start position.
-	    dupeR += 1;
-	    }
 	else if (qstart < (intraR.qstop + BUF) && qstop > (intraR.qstart - BUF) &&
 		 mstart < (intraR.mstop + BUF) && mstop > (intraR.mstart - BUF))  
 	    {
 	    if (intraR.qstart > qstart)
@@ -1010,21 +1001,35 @@
 		intraR.mstart = mstart;
 	    if (intraR.mstop < mstop)
 		intraR.mstop = mstop;
 
+	    intraR.qual += (c)->qual;
 	    intraR.supp += 1;
 	    }
 	}
     }
 
-if (intraL.supp > ap->minSupp)
+if (intraL.supp > 0) //  || intraR.supp > 0)
     {
-    ap->svStartL = intraL.qstart;
-    ap->svStopL  = intraL.qstop;
+    if (maxStartL <= intraL.qstop + BUF)
+	return 0;  // there still might be reads to overlap
 
-    ap->svStartR = intraR.qstart;
-    ap->svStopR  = intraR.qstop;
+    /* check average mapping quality of discordants */
+    int qual = ceil((double) intraL.qual / (double) intraL.supp); 
+    if (intraL.supp >= ap->minSupp && qual >= ap->avgMapQ)
     return 1;
+    reset_intra();
+    }
+
+return 0;
+
+deltaSupp = (intraL.supp + intraR.supp) - deltaSupp; 
+if (deltaSupp == 0 && (intraL.supp > 0 || intraR.supp > 0) )
+    {
+    fprintf(stderr, "SIC pos = %d\n", d->posL);
+    if (intraL.supp >= ap->minSupp)
+	return 1;
+    reset_intra();
     }
 
 return 0;
 }
@@ -1078,23 +1083,41 @@
     {
     printf("SVS\t%s\t%d\t%d\t",
 	   d->hL->target_name[d->tidL], d->posL, d->posL + 1);
     
-    if (intraR.supp > 0)
-	{  // possibly germline, see if they overlap
-	if (intraR.qstart < (intraL.qstop + BUF) && intraR.qstop > (intraL.qstart - BUF))
-	    printf("GIC\t%d,%d(%d-%d)>(%d-%d)\t%d,%d(%d-%d)>(%d-%d)", 
-		   intraR.orientation, intraR.supp, intraR.qstart, intraR.qstop,
-		   intraR.mstart, intraR.mstop,
-		   intraL.orientation, intraL.supp, intraL.qstart, intraL.qstop,
-		   intraL.mstart, intraL.mstop);
-	}
+    char qsL = '+';
+    if (intraL.qstrand)
+	qsL = '-';
+    char msL = '+';
+    if (intraL.mstrand)
+	msL = '-';
+
+    char qsR = '+';
+    if (intraR.qstrand)
+	qsR = '-';
+    char msR = '+';
+    if (intraR.mstrand)
+	msR = '-';
+
+    
+    if ( intraR.supp > 0 && 
+	(intraR.qstart < (intraL.qstop + BUF) && intraR.qstop > (intraL.qstart - BUF)) )
+	printf("GIC\t%d\t%3.1f\t%d\t%3.1f\t%c(%d-%d)>%c(%d-%d)\t%c(%d-%d)>%c(%d-%d)", 
+	       intraR.supp, (double) intraR.qual / (double) intraR.supp, 
+	       intraL.supp, (double) intraL.qual / (double) intraL.supp, 
+	       qsR, intraR.qstart, intraR.qstop,
+	       msR, intraR.mstart, intraR.mstop,
+	       qsL, intraL.qstart, intraL.qstop,
+	       msL, intraL.mstart, intraL.mstop);
     else
-	printf("SIC\t%d,%d(%d-%d)>(%d-%d)", 
-	       intraL.orientation, intraL.supp, intraL.qstart, intraL.qstop, 
-	       intraL.mstart, intraL.mstop);
+	printf("SIC\t%d\t%3.1f\t%c(%d-%d)>%c(%d-%d)", 
+	       intraL.supp, (double) intraL.qual / (double) intraL.supp, 
+	       qsL, intraL.qstart, intraL.qstop, 
+	       msL, intraL.mstart, intraL.mstop);
 
     putchar('\n');
+
+    reset_intra();
     }
 
 int isIns = 0, isDel = 0, insGerm = 0, delGerm = 0;
 if (indel_variant(d, ap, &isIns, &isDel, &insGerm, &delGerm))
@@ -1309,8 +1332,10 @@
 
 ap.svStartR = -1;
 ap.svStopR  = -1;
 
+reset_intra();
+
 bam_bam_file(inbamL, inbamR, position, mask, perform_analysis, &ap);
 }
 
 
@@ -1320,24 +1345,24 @@
 optionInit(&argc, argv, options);
 if (argc < 3)
     usage();
 
+if (optionExists("position"))
+    position = optionVal("position", NULL);
 if (optionExists("minSupport"))
-    minSupport = optionInt("minSupport", 2);
+    minSupport = optionInt("minSupport", 4);
 if (optionExists("minQ"))
-    minQ = optionInt("minQ", 0);
+    minQ = optionInt("minQ", 10);
 if (optionExists("minMapQ"))
-    minMapQ = optionInt("minMapQ", 0);
+    minMapQ = optionInt("minMapQ", 20);
 if (optionExists("avgMapQ"))
-    avgMapQ = optionInt("avgMapQ", 0);
+    avgMapQ = optionInt("avgMapQ", 30);
 if (optionExists("maxISize"))
     maxISize = optionInt("maxISize", 10000);
 if (optionExists("minChiSq"))
-    minChiSq = optionInt("minChiSq", 0);
-if (optionExists("position"))
-    position = optionVal("position", NULL);
+    minChiSq = optionInt("minChiSq", 250);
 if (optionExists("readsPerBin"))
-    readsPerBin = optionInt("readsPerBin", 500);
+    readsPerBin = optionInt("readsPerBin", 2500);
 
 char *inbamL = argv[1];
 char *inbamR = argv[2];
 bamBam(inbamL, inbamR, BAM_DEF_MASK);