src/hg/tcga/pBamBam/asBamBam.c 1.10

1.10 2010/04/22 18:10:50 jsanborn
added ref FASTA reading
Index: src/hg/tcga/pBamBam/asBamBam.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/tcga/pBamBam/asBamBam.c,v
retrieving revision 1.9
retrieving revision 1.10
diff -b -B -U 4 -r1.9 -r1.10
--- src/hg/tcga/pBamBam/asBamBam.c	15 Mar 2010 00:24:27 -0000	1.9
+++ src/hg/tcga/pBamBam/asBamBam.c	22 Apr 2010 18:10:50 -0000	1.10
@@ -26,8 +26,9 @@
   "asBamBam - Allele-Specific BamBam.\n"
   "usage:\n"
   "   asBamBam [options] <left.bam> <right.bam>\n"
   "options:\n"
+  "   -fa=file          - FASTA filename of reference [NULL]\n"
   "   -position=chr1:1-100 - Position to do analysis. [NULL=process all reads]\n" 
   "   -minSuppSNP=INT   - Minimum Support for SNP/Het Calling. [4]\n"
   "   -minSuppSV=INT    - Minimum Support for struct vars. [2]\n"
   "   -minQ=INT         - Minimum acceptable base quality. [10]\n"
@@ -40,8 +41,9 @@
   );
 }
 
 static struct optionSpec options[] = {
+    {"fa", OPTION_STRING},
     {"position", OPTION_STRING},
     {"minSuppSNP", OPTION_INT},
     {"minSuppSV", OPTION_INT},
     {"minQ", OPTION_INT},
@@ -55,8 +57,9 @@
 
 #define BAD_TID -999
 
 /* Option variables */
+char *faidx     = NULL;
 char *position  = NULL;
 int minSuppSNP  = 4;
 int minSuppSV   = 2;
 int minQ        = 10;
@@ -149,11 +152,15 @@
     int supp;
     int qual;
     int mstrand;
     int qstrand;
+
+    int mdir;      // trend of mate directions 
+    int pmstart;   // previos start
 } sv_t;
 
-#define BUF 100  // 500 base buffer around start/stop
+#define BUF 100  // buffer around reads' starts/stops
+
 static sv_t intraL, intraR;
 static sv_t interL, interR;
 
 static int nuc_index(char c)
@@ -233,13 +240,12 @@
 int32_t start, stop;
 int32_t prevStart = -1;
 const bam_pileup1_t *p;
 
-char best_n1  = 'N';
+char best_b1 = 'N';
 int best_q1 = -1;
-char best_n2  = 'N';
+char best_b2 = 'N';
 int best_q2 = -1;
-
 int insertion = 0, deletion = 0;
  
 for (i = 0; i < d->nL; ++i)
     { 
@@ -252,10 +258,8 @@
     /* Check if start & stop has already been visited, to dupes missed in prev step*/    
     start = (c)->pos;
     stop  = start + (c)->l_qseq;    
 
-    //if (start <= prevStart && stop <= prevStop)
-//	continue;
     if (ap->pStart == -1)
 	{
 	ap->pStart = start;
 	ap->pStop  = stop;
@@ -285,10 +289,8 @@
 	    }
 	else
 	    {
 	    int b = toupper(bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]);    
-	    if ((index = nuc_index(b)) < 0)
-		continue;
 	    if (best_b1 == 'N')
 		{
 		best_b1 = b;
 		best_q1 = qual;
@@ -316,9 +318,9 @@
     else if (deletion)
 	delL += 1;
 
     double lseq = (double) p->b->core.l_qseq;
-    if (!(index = nuc_index(best_b1)) < 0)
+    if ((index = nuc_index(best_b1)) >= 0)
 	{
 	votesL[index] += best_q1;
 	countsL[index]++;
 
@@ -329,9 +331,9 @@
 	else
 	    countsL3[index]++;
 	}
     
-    if (!(index = nuc_index(best_b2)) < 0)
+    if ((index = nuc_index(best_b2)) >= 0)
 	{
 	votesL[index] += best_q2;
 	countsL[index]++;
 	
@@ -347,8 +349,38 @@
     best_b2 = 'N';
     best_q2 = -1;
     
     insertion = 0, deletion = 0;
+
+    if (p->is_del)
+	{
+	if (p->indel > 0)
+	    insertion = 1;
+	else if (p->indel < 0)
+	    deletion = 1;
+	}
+    else
+	{
+	int b = toupper(bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]);
+	if (best_b1 == 'N')
+	    {
+	    best_b1 = b;
+	    best_q1 = qual;
+	    }
+	else if (qual > best_q1)
+	    {
+	    best_b2 = best_b1;
+	    best_q2 = best_q1;
+
+	    best_b1 = b;
+	    best_q1 = qual;
+	    }
+	else if (qual > best_q2)
+	    {
+	    best_b2 = b;
+	    best_q2 = qual;
+	    }
+	}
     }
 
 prevStart = -1;
 for (i = 0; i < d->nR; ++i)
@@ -361,12 +393,11 @@
 
     /* Check if start & stop has already been visited, to dupes missed in prev step*/    
     start = (c)->pos;
     stop  = start + (c)->l_qseq;    
-    if (start <= prevStart && stop <= prevStop)
+    if (start <= prevStart)
 	continue;
     prevStart = start;
-    prevStop  = stop;
 
     if (ap->pStart == -1)
 	{
 	ap->pStart = start;
@@ -514,9 +545,9 @@
 
 return chi_sq;
 }
 
-static int is_mut(dual_pileup_t *d, analysis_params_t *ap,
+static int is_mut(dual_pileup_t *d, analysis_params_t *ap, char *ref,
 		  char *callL1, char *callL2, char *callR1, char *callR2)
 {
 if (majorR.vote == 0 || majorL.vote == 0)
     return 0;  // most likely a bad quality region.
@@ -531,8 +562,9 @@
 
 if (rCount < 10)
     return 0;
 
+char rb = (d->ref && (int)d->posL < d->len)? d->ref[d->posL] : 'N';
 
 int retVal1 = 0, retVal2 = 0;
 for (i = 0; i < 4; i++)
     {
@@ -582,8 +614,9 @@
 
 if (!retVal1 && !retVal2)
     return 0;
 
+*ref    = rb;
 *callL1 = majorL.nuc;
 *callL2 = minorL.nuc;
 *callR1 = majorR.nuc;
 *callR2 = minorR.nuc;
@@ -836,8 +866,10 @@
 interL.supp    =  0;
 interL.qual    =  0;
 interL.qstrand = -1;
 interL.mstrand = -1;
+interL.mdir    =  0;
+interL.pmstart = -1;
 
 interR.tid     = -1;
 interR.mtid    = -1;
 interR.qstart  = -1;
@@ -847,8 +879,10 @@
 interR.supp    =  0;
 interR.qual    =  0;
 interR.qstrand = -1;
 interR.mstrand = -1;
+interR.mdir    =  0;
+interR.pmstart = -1;
 }
 
 static inline void reset_intra()
 { 
@@ -860,8 +894,10 @@
 intraL.supp    =  0;
 intraL.qual    =  0;
 intraL.qstrand = -1;
 intraL.mstrand = -1;
+intraL.mdir    =  0;
+intraL.pmstart = -1;
 
 intraR.tid     = -1;
 intraR.qstart  = -1;
 intraR.qstop   = -1;
@@ -870,8 +906,10 @@
 intraR.supp    =  0;
 intraR.qual    =  0;
 intraR.qstrand = -1;
 intraR.mstrand = -1;
+intraR.mdir    =  0;
+intraR.pmstart = -1;
 }
 
 static inline void update_sv(const bam_pileup1_t *p, sv_t *sv)
 {
@@ -895,8 +933,10 @@
     sv->mstart  = mstart;
     sv->mstop   = mstop;
     sv->mstrand = mstrand;
     sv->qstrand = qstrand;
+    sv->mdir    = 0;
+    sv->pmstart = mstart;
     sv->qual    = (c)->qual;
     sv->supp    = 1;
     }
 else if ((c)->tid == sv->tid && (c)->mtid == sv->mtid &&
@@ -912,8 +952,11 @@
 	sv->mstart = mstart;
     if (sv->mstop < mstop)
 	sv->mstop = mstop;
     
+    sv->mdir   += (mstart - sv->pmstart);
+    sv->pmstart = mstart; 
+    
     sv->qual += (c)->qual;
     sv->supp += 1;
     }
 }
@@ -1048,9 +1091,17 @@
 
     /* check average mapping quality of discordants */
     int qual = ceil((double) intraL.qual / (double) intraL.supp); 
     if (intraL.supp >= ap->minSuppSV && qual >= ap->avgMapQ)
+	{  // check mate pair direction trend... should be positive for properly paired (opposite strand)
+
+	//return 1;
+
+	if ((intraL.qstrand == intraL.mstrand && intraL.mdir < 0) || 
+	    (intraL.qstrand != intraL.mstrand && intraL.mdir > 0))
 	return 1;
+	}
+
     reset_intra();
     }
 return 0;
 }
@@ -1064,9 +1115,16 @@
 
     /* check average mapping quality of discordants */
     int qual = ceil((double) interL.qual / (double) interL.supp); 
     if (interL.supp >= ap->minSuppSV && qual >= ap->avgMapQ)
+	{  // check mate pair direction trend... should be positive for properly paired (opposite strand)
+
+	//return 1;
+
+	if ((interL.qstrand == interL.mstrand && interL.mdir < 0) || 
+	    (interL.qstrand != interL.mstrand && interL.mdir > 0))
 	return 1;
+	}
     reset_inter();
     }
 return 0;
 }
@@ -1302,15 +1360,16 @@
 }
 
 static void do_mut(dual_pileup_t *d, analysis_params_t *ap)
 {
-char callL1, callL2, callR1, callR2;
-if (!is_mut(d, ap, &callL1, &callL2, &callR1, &callR2))
+char ref, callL1, callL2, callR1, callR2;
+if (!is_mut(d, ap, &ref, &callL1, &callL2, &callR1, &callR2))
     return;
 
 printf("MUT\t%s\t%d\t%d\t",
        d->hL->target_name[d->tidL], d->posL, d->posL + 1);
 
+printf("%c\t", ref);
 if (callR2 == 'N')
     printf("%c>", callR1);
 else
     printf("%c/%c>", callR1, callR2);
@@ -1348,9 +1407,9 @@
 
 fflush(stdout);
 }
 
-void bamBam(char *inbamL, char *inbamR, int mask)
+void bamBam(char *inbamL, char *inbamR, char *faidx, int mask)
 {
 analysis_params_t ap;
 ap.minQ     = minQ;
 ap.minMapQ  = minMapQ;
@@ -1398,9 +1457,9 @@
 
 reset_intra();
 reset_inter();
 
-bam_bam_file(inbamL, inbamR, position, mask, perform_analysis, &ap);
+bam_bam_file(inbamL, inbamR, faidx, position, mask, perform_analysis, &ap);
 }
 
 
 int main(int argc, char *argv[])
@@ -1409,8 +1468,10 @@
 optionInit(&argc, argv, options);
 if (argc < 3)
     usage();
 
+if (optionExists("fa"))
+    faidx = optionVal("fa", NULL);
 if (optionExists("position"))
     position = optionVal("position", NULL);
 if (optionExists("minSuppSNP"))
     minSuppSNP = optionInt("minSuppSNP", 4);
@@ -1430,9 +1491,9 @@
     readsPerBin = optionInt("readsPerBin", 2500);
 
 char *inbamL = argv[1];
 char *inbamR = argv[2];
-bamBam(inbamL, inbamR, BAM_DEF_MASK);
+bamBam(inbamL, inbamR, faidx, BAM_DEF_MASK);
 
 return 0;
 }