src/hg/tcga/bamBam/bamBam.c 1.9

1.9 2009/11/06 07:08:06 jsanborn
updated
Index: src/hg/tcga/bamBam/bamBam.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/tcga/bamBam/bamBam.c,v
retrieving revision 1.8
retrieving revision 1.9
diff -b -B -U 4 -r1.8 -r1.9
--- src/hg/tcga/bamBam/bamBam.c	1 Oct 2009 22:24:58 -0000	1.8
+++ src/hg/tcga/bamBam/bamBam.c	6 Nov 2009 07:08:06 -0000	1.9
@@ -32,8 +32,9 @@
   "   -minQ=INT       - Minimum acceptable base quality. [0]\n"
   "   -minMapQ=INT    - Minimum acceptable mapping quality. [0]\n"
   "   -avgMapQ=INT    - Minimum acceptable avg mapping quality. [0]\n"
   "   -minChiSq=INT    - Minimum acceptable chi square of nucleotide freqs. [0]\n"
+  "   -maxISize=INT   - Maximum Insert Size. [10000]\n"
   "\n"
   );
 }
 
@@ -43,20 +44,20 @@
     {"minQ", OPTION_INT},
     {"minMapQ", OPTION_INT},
     {"avgMapQ", OPTION_INT},
     {"minChiSq", OPTION_INT},
+    {"maxISize", OPTION_INT},
     {NULL, 0}
 };
 
-#define HET_THRESH 0.05
-
 /* Option variables */
 char *position = NULL;
 int minSupport = 2;
 int minQ     = 0;
 int minMapQ  = 0;
 int avgMapQ  = 0;
 int minChiSq = 0;
+int maxISize = 10000;
 
 typedef struct {
     int minQ;
     int minMapQ;
@@ -367,9 +368,9 @@
 static int mateChromsR[NUM_CHROMS];
 static int somaticSV[NUM_CHROMS];
 static int germlineSV[NUM_CHROMS];
 
-static int struct_variant(pileup_data_t *d, analysis_params_t *ap)
+static int inter_struct_variant(pileup_data_t *d, analysis_params_t *ap)
 {
 int i;
 for (i = 0; i < NUM_CHROMS; ++i)
     {
@@ -448,8 +449,106 @@
 
 return retVal;
 }
 
+
+
+static int intra_struct_variant(pileup_data_t *d, analysis_params_t *ap, 
+				int32_t *avgIsizeL, double *avgOrientL, int32_t *retCountL, 
+				int32_t *avgIsizeR, double *avgOrientR, int32_t *retCountR)
+{
+// First check average mapping quality of read pairs
+int i;
+const bam_pileup1_t *p;
+int avgMapQ = 0;
+for (i = 0; i < d->nL; ++i)
+    {
+    p = d->puL + i;    
+    avgMapQ += p->b->core.qual;
+    }
+if (((double)avgMapQ / (double) d->nL) < (double)ap->avgMapQ)
+    return 0;
+
+avgMapQ = 0;
+for (i = 0; i < d->nR; ++i)
+    {
+    p = d->puR + i;    
+    avgMapQ += p->b->core.qual;
+    }
+if (((double)avgMapQ / (double) d->nR) < (double) ap->avgMapQ)
+    return 0;
+
+// Satisfied average mapping quality constraint
+
+int32_t isizeL = 0;
+double orientL = 0.0;
+double countL = 0.0;
+for (i = 0; i < d->nL; ++i)
+    {
+    p = d->puL + i;
+
+    if (p->b->core.qual < ap->minMapQ)
+	continue;
+
+    bam1_core_t *c = &p->b->core;
+    if (((c)->flag & BAM_FPAIRED) && !((c)->flag & BAM_FUNMAP))
+	{ // read is paired and mapped.
+	if (!((c)->flag & BAM_FMUNMAP))
+	    {  // mate is mapped
+	    if ( ((c)->mtid == (c)->tid) && (abs((c)->isize) > maxISize) )
+		{ // same chrom, exceeds maximum isize.
+		isizeL  += ((c)->mpos - (c)->pos);
+		orientL += (double) (bam1_strand(p->b) && bam1_mstrand(p->b));
+		countL  += 1.0;
+		}
+	    }
+	}
+    }
+
+int32_t isizeR = 0;
+double orientR = 0.0;
+double countR = 0.0;
+for (i = 0; i < d->nR; ++i)
+    {
+    p = d->puR + i;
+
+    if (p->b->core.qual < ap->minMapQ)
+	continue;
+
+    bam1_core_t *c = &p->b->core;
+    if (((c)->flag & BAM_FPAIRED) && !((c)->flag & BAM_FUNMAP))
+	{ // read is paired and mapped.
+	if (!((c)->flag & BAM_FMUNMAP))
+	    {  // mate is mapped
+	    if ( ((c)->mtid == (c)->tid) && (abs((c)->isize) > maxISize) ) 
+		{ // same chrom, exceeds maximum insert size.
+		isizeR  += ((c)->mpos - (c)->pos);
+		orientR += (double) (bam1_strand(p->b) && bam1_mstrand(p->b));
+		countR  += 1.0;
+		}
+	    }
+	}
+    }
+
+if (countR > 0.0)
+    { // don't look to minSupport for germline, variants... if you see it once, it's enough
+      // assuming it's found in somatic. 
+    *avgIsizeR  = isizeR / (int32_t) countR;
+    *avgOrientR = orientR / countR;
+    *retCountR  = (int32_t) countR;
+    }
+
+if (countL > minSupport)
+    {
+    *avgIsizeL  = isizeL / (int32_t) countL;
+    *avgOrientL = orientL / countL;
+    *retCountL  = (int32_t) countL; 
+    return 1;   // only return successful if found a putative intra-chrom in somatic.
+    }
+
+return 0;
+}
+
 static int indel_variant(pileup_data_t *d, analysis_params_t *ap, 
 			 int *isIns, int *isDel, int *insGerm, int *delGerm)
 {
 if (insL >= ap->minSupp)
@@ -625,9 +724,9 @@
 
     fflush(stdout);
     }
 
-if (struct_variant(d, ap))
+if (inter_struct_variant(d, ap))
     {
     printf("%s\t%d\t%d\t", 
 	   d->hL->target_name[d->tidL], d->posL, d->posL + 1);
 
@@ -645,8 +744,31 @@
     putchar('\n');
 
     fflush(stdout);
     }
+
+int32_t avgIsizeL = 0, avgIsizeR = 0, countL = 0, countR = 0;
+double sameOrientL = 0.0, sameOrientR = 0.0;
+
+if (intra_struct_variant(d, ap, &avgIsizeL, &sameOrientL, &countL, 
+			 &avgIsizeR, &sameOrientR, &countR))
+    {
+    printf("%s\t%d\t%d\t", 
+	   d->hL->target_name[d->tidL], d->posL, d->posL + 1);
+    if (avgIsizeL != 0.0 && avgIsizeR == 0.0)
+	printf("SIC:%d(%d,%f)", avgIsizeL, countL, sameOrientL);
+    else if (avgIsizeL != 0.0 && avgIsizeR != 0.0)
+	printf("GIC:%d(%d,%f)/%d(%d,%f)", avgIsizeR, countR, sameOrientR, 
+	       avgIsizeL, countL, sameOrientL);
+
+    putchar('\t');
+    print_seq(d->tidR, d->posR, d->nR, d->puR, d);
+    putchar('\t');
+    print_seq(d->tidL, d->posL, d->nL, d->puL, d);
+    putchar('\n');
+
+    fflush(stdout);
+    }
 }
 
 void bamBam(char *inbamL, char *inbamR)
 {
@@ -686,8 +808,10 @@
 if (optionExists("avgMapQ"))
     avgMapQ = optionInt("avgMapQ", 0);
 if (optionExists("minChiSq"))
     minChiSq = optionInt("minChiSq", 0);
+if (optionExists("maxISize"))
+    maxISize = optionInt("maxISize", 10000);
 if (optionExists("position"))
     position = optionVal("position", NULL);
 
 char *inbamL = argv[1];