src/hg/tcga/bamBam/bamBam.c 1.6

1.6 2009/09/29 04:52:56 jsanborn
updated
Index: src/hg/tcga/bamBam/bamBam.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/tcga/bamBam/bamBam.c,v
retrieving revision 1.5
retrieving revision 1.6
diff -b -B -U 4 -r1.5 -r1.6
--- src/hg/tcga/bamBam/bamBam.c	26 Sep 2009 18:29:47 -0000	1.5
+++ src/hg/tcga/bamBam/bamBam.c	29 Sep 2009 04:52:56 -0000	1.6
@@ -40,8 +40,10 @@
     {"minMapQ", OPTION_INT},
     {NULL, 0}
 };
 
+#define HET_THRESH 0.05
+
 /* Option variables */
 char *position = NULL;
 int minSupport = 2;
 int minQ = 0;
@@ -50,15 +52,26 @@
 typedef struct {
     int minQ;
     int minMapQ;
     int minSupp;
-} analysis_params;
+    int sites;
+} analysis_params_t;
 
 static int32_t votesL[4];
 static int32_t countsL[4];
 static int32_t votesR[4];
 static int32_t countsR[4];
 
+
+typedef struct {
+    char nuc;
+    int count;
+    int vote;
+} variant_t;
+
+variant_t minorL, majorL;
+variant_t minorR, majorR;
+
 static int nuc_index(char c)
 {
 switch (c)
     {
@@ -77,10 +90,9 @@
 }
 
 #define NUCS "AGCT"
 
-static int consensus_snp_variant(pileup_data_t *d, analysis_params *ap, 
-				 char *callL, char *callR)
+static inline void nuc_counts(pileup_data_t *d, analysis_params_t *ap)
 {
 int i, index;
 for (i = 0; i < 4; i++)
     { // sum up the qualities of each base
@@ -102,9 +114,8 @@
     int qual = bam1_qual(p->b)[p->qpos];    
     if (qual < ap->minQ)
 	continue;
     
-    index = nuc_index(c);
     if ((index = nuc_index(c)) < 0)
 	continue;
 
     votesL[index] += qual;
@@ -122,76 +133,499 @@
     int qual = bam1_qual(p->b)[p->qpos];    
     if (qual < ap->minQ)
 	continue;
 
-    index = nuc_index(c);
     if ((index = nuc_index(c)) < 0)
 	continue;
 
     votesR[index] += qual;
     countsR[index]++;
     }
 
-int iL = -1, maxL = -1, countL = -1;
-int iR = -1, maxR = -1, countR = -1;
+int iL1 = -1, iL2 = -1, maxL1 = -1, maxL2 = -1; 
+int iR1 = -1, iR2 = -1, maxR1 = -1, maxR2 = -1;
+int countL1 = -1, countL2 = -1, countR1 = -1, countR2 = -1;
 for (i = 0; i < 4; i++)
     {
-    if (votesL[i] > maxL)
+    if (votesL[i] > maxL1)
+	{
+	iL2     = iL1;     // set prev max to max2
+	maxL2   = maxL1;
+	countL2 = countL1;
+
+	iL1     = i;
+	maxL1   = votesL[i];
+	countL1 = countsL[i];
+	}
+    else if (votesL[i] > maxL2)
+	{
+	iL2     = i;
+	maxL2   = votesL[i];
+	countL2 = countsL[i];
+	}
+
+    if (votesR[i] > maxR1)
 	{
-	iL = i;
-	maxL = votesL[i];
-	countL = countsL[i];
+	iR2     = iR1;
+	maxR2   = maxR1;
+	countR2 = countR1;
+
+	iR1     = i;
+	maxR1   = votesR[i];
+	countR1 = countsR[i];
 	}
-    if (votesR[i] > maxR)
+    else if (votesR[i] > maxR2)
 	{
-	iR = i;
-	maxR = votesR[i];
-	countR = countsR[i];
+	iR2     = i;
+	maxR2   = votesR[i];
+	countR2 = countsR[i];
 	}
     }
 	   
-if (iL == -1 || iR == -1 || (iL == iR))  // either no good data or matching consensus.
-    return 0;
-if (countL < ap->minSupp || countR < ap->minSupp)
+majorL.nuc   = 'N';
+majorL.vote  = 0;
+majorL.count = 0;
+if (iL1 != -1 && countL1 >= ap->minSupp)
+    {
+    majorL.nuc   = NUCS[iL1];
+    majorL.vote  = maxL1;
+    majorL.count = countL1;
+    }
+
+minorL.nuc   = 'N';
+minorL.vote  = 0;
+minorL.count = 0;
+if (iL2 != -1 && countL2 >= ap->minSupp)
+    {
+    minorL.nuc   = NUCS[iL2];
+    minorL.vote  = maxL2;
+    minorL.count = countL2;
+    }
+
+majorR.nuc   = 'N';
+majorR.vote  = 0;
+majorR.count = 0;
+if (iR1 != -1 && countR1 >= ap->minSupp)
+    {
+    majorR.nuc   = NUCS[iR1];
+    majorR.vote  = maxR1;
+    majorR.count = countR1;
+    }
+
+minorR.nuc   = 'N';
+minorR.vote  = 0;
+minorR.count = 0;
+if (iR2 != -1 && countR2 >= ap->minSupp)
+    {
+    minorR.nuc   = NUCS[iR2];
+    minorR.vote  = maxR2;
+    minorR.count = countR2;
+    }
+}
+
+
+static double chi_square(int obs1, int obs2, double p)  // p = binomial prob of getting 1
+{
+int pseudo = 5;
+obs1 += pseudo;
+obs2 += pseudo;
+int count = obs1 + obs2;
+
+double exp1 = p * ((double) count) + pseudo;
+double exp2 = (1.0 - p) * ((double) count) + pseudo;
+
+// apply Yates correction... will often have small observed values.
+return pow(abs((double)obs1 - exp1) - 0.5, 2.0) / obs1 
+    + pow(abs((double)obs2 - exp2) - 0.5, 2.0) / obs2;
+}
+
+static int sig_diff(pileup_data_t *d, analysis_params_t *ap)
+{
+int i;
+double pseudo = 1.0;
+int totalL = 0, totalR = 0;
+
+for (i = 0; i < 4; i++)
+    {
+    totalL += votesL[i];
+    totalR += votesR[i];
+    }
+
+double obs, exp, chi_sq = 0.0;
+for (i = 0; i < 4; i++)
+    {
+    obs = ((double) votesL[i] * 100.0) / (double) totalL + pseudo;
+    exp = ((double) votesR[i] * 100.0) / (double) totalR + pseudo;
+    chi_sq += pow(abs(obs - exp) - 0.5, 2.0) / exp;
+    }
+
+return chi_sq;
+}
+
+
+static int which_model(double homo, double het, double tri)
+{
+if (homo <= het && homo <= tri)
+    return 1;
+if (het <= homo && het <= tri)
+    return 2;
+if (tri <= homo && tri <= het)
+    return 3;
+return 0;
+}
+
+
+static int model_select(pileup_data_t *d, analysis_params_t *ap, 
+			char *callL1, char *callL2, char *callR1, char *callR2, 
+			float *hetL, float *hetR, int *stateL, int *stateR, int *sites)
+{
+if (majorR.vote == 0 || majorL.vote == 0)
+    return 0;  // most likely a bad quality region.
+
+// might want to check here if the germline & tumor are significantly different, using
+// votesL & votesR
+
+if (!sig_diff(d, ap))
     return 0;
 
-*callL = NUCS[iL];
-*callR = NUCS[iR];
+float freqL = ((float) minorL.vote) / ((float) majorL.vote + (float) minorL.vote);
+float freqR = ((float) minorR.vote) / ((float) majorR.vote + (float) minorR.vote);
+
+*hetL = freqL;
+*hetR = freqR;
+
+*sites += 1;
+
+double modelR_homo = chi_square(majorR.vote, minorR.vote, 1.0);
+double modelR_het  = chi_square(majorR.vote, minorR.vote, 0.5);
+double modelR_tri  = chi_square(majorR.vote, minorR.vote, 0.66);
+
+double modelL_homo = chi_square(majorL.vote, minorL.vote, 1.0);
+double modelL_het  = chi_square(majorL.vote, minorL.vote, 0.5);
+double modelL_tri  = chi_square(majorL.vote, minorL.vote, 0.66);
+
+int typeL = which_model(modelL_homo, modelL_het, modelL_tri);
+int typeR = which_model(modelR_homo, modelR_het, modelR_tri);
+
+*stateL = typeL;
+*stateR = typeR;
+
+*callL1 = majorL.nuc;
+*callL2 = minorL.nuc;
+*callR1 = majorR.nuc;
+*callR2 = minorR.nuc;
+
+int retVal = 0;
+if (typeR == 1)
+    {  // normal most likely homozygous.
+    if (typeL == 1)
+	{  // tumor is also homozygous.
+	if (majorL.nuc != majorR.nuc) // homozygous SNP
+	    retVal = 1;
+	}
+    else if (typeL == 2)
+	retVal = 1;
+    else if (typeL == 3)
+	retVal = 1;
+    }
+else if (typeR == 2 || typeR == 3)
+    {  // normal most likely heterozygous.
+    if (typeL == 1)
+	retVal = 1; // tumor is homozygous... LOH!
+    else if (typeL == 2 || typeL == 3)
+	{ // tumor also heterozygous.
+	if (majorL.nuc == majorR.nuc && minorL.nuc == minorR.nuc)
+	    retVal = 0;
+	else if (majorL.nuc == minorR.nuc && minorL.nuc == majorR.nuc)
+	    retVal = 0;
+	else
+	    retVal = 1;
+	}
+    }
+
+return retVal;
+}
+
+/** usually set to static, removed to avoid compiler warning/error */
+
+int snp_variant(pileup_data_t *d, analysis_params_t *ap, 
+		       char *callL1, char *callL2, char *callR1, char *callR2, 
+		       float *hetL, float *hetR, int *sites)
+{
+float freqL = ((float) minorL.vote) / ((float) majorL.vote + (float) minorL.vote);
+float freqR = ((float) minorR.vote) / ((float) majorR.vote + (float) minorR.vote);
+
+*hetL = freqL;
+*hetR = freqR;
+
+*sites += 1;
+
+if (freqR > HET_THRESH && freqL <= HET_THRESH) 
+    { // hetero in norm, homo in normal (LOH?)
+    if (majorL.nuc != majorR.nuc)
+	{
+	*callL1 = majorL.nuc;
+	*callL2 = minorL.nuc;  
+	*callR1 = majorR.nuc;
+	*callR2 = minorR.nuc;
+	return 1;
+	}
+    else if (majorL.nuc != minorR.nuc)
+	{
+	*callL1 = majorL.nuc;
+	*callL2 = minorL.nuc;
+	*callR1 = minorR.nuc;
+	*callR2 = majorR.nuc;
+	return 1;
+	}
+    else 
+	return 0;
+    }
+else if (freqR > HET_THRESH && freqL > HET_THRESH) 
+    { // hetero in both tumor and normal
+    if ((majorL.nuc == majorR.nuc) && (minorL.nuc == minorR.nuc))
+	return 0;  // same nucs
+    else if ((minorL.nuc == majorR.nuc) && ( majorL.nuc == minorR.nuc))
+	return 0;  // same nucs, just changed proportions.
+    else if ((majorL.nuc != majorR.nuc) && (minorL.nuc == minorR.nuc))
+	{
+	*callL1 = majorL.nuc;
+	*callL2 = minorL.nuc;
+	*callR1 = majorR.nuc;
+	*callR2 = minorR.nuc;
+	return 1;
+	}
+    else if ((majorL.nuc == majorR.nuc) && (minorL.nuc != minorR.nuc))
+	{
+	*callL1 = minorL.nuc;
+	*callL2 = majorL.nuc;
+	*callR1 = minorR.nuc;
+	*callR2 = majorR.nuc;
+	return 1;
+	}
+    else if ((majorL.nuc == minorR.nuc) && (minorL.nuc != majorR.nuc))
+	{
+	*callL1 = minorL.nuc;
+	*callL2 = majorL.nuc;
+	*callR1 = majorR.nuc;
+	*callR2 = minorR.nuc;
+	return 1;
+	}
+    else if ((minorL.nuc == majorR.nuc) && (majorL.nuc != minorR.nuc))
+	{
+	*callL1 = majorL.nuc;
+	*callL2 = minorL.nuc;
+	*callR1 = minorR.nuc;
+	*callR2 = majorR.nuc;
+	return 1;
+	}
+    else
+	return 0;
+    }
+else if (freqR <= HET_THRESH && freqL > HET_THRESH) 
+    { // homo in right (germline), hetero in left (tumor)
+    if (majorL.nuc != majorR.nuc)
+	{
+	*callL1 = majorL.nuc;
+	*callL2 = minorL.nuc;
+	*callR1 = majorR.nuc;
+	*callR2 = minorR.nuc;
+	return 1;
+	}
+    else if (minorL.nuc != majorR.nuc)
+	{
+	*callL1 = minorL.nuc;
+	*callL2 = majorL.nuc;
+	*callR1 = majorR.nuc;
+	*callR2 = minorR.nuc;
+	return 1;
+	}
+    }
+else if (freqR <= HET_THRESH && freqL <= HET_THRESH) 
+    {  // homozygous in right and left (tumor)
+    if (majorL.nuc != majorR.nuc)
+	{
+	*callL1 = majorL.nuc;
+	*callL2 = minorL.nuc;
+	*callR1 = majorR.nuc;
+	*callR2 = minorR.nuc;
+	return 1;
+	}
+    }
     
-return 1;
+return 0;   // not a SNP variant.
 }
 
+#define NUM_CHROMS 100
+static int mateChromsL[NUM_CHROMS];
+static int mateChromsR[NUM_CHROMS];
+static int somaticSV[NUM_CHROMS];
+static int germlineSV[NUM_CHROMS];
+
+static int struct_variant(pileup_data_t *d, analysis_params_t *ap)
+{
+int i;
+for (i = 0; i < NUM_CHROMS; ++i)
+    {
+    mateChromsL[i] = 0;
+    mateChromsR[i] = 0;
+    somaticSV[i]   = 0;
+    germlineSV[i]  = 0;
+    }
+
+const bam_pileup1_t *p;
+for (i = 0; i < d->nL; ++i)
+    {
+    p = d->puL + i;
+
+    if (p->b->core.qual < minMapQ)
+	continue;
+
+    bam1_core_t *c = &p->b->core;
+    if (((c)->flag & BAM_FPAIRED) && !((c)->flag & BAM_FUNMAP))
+	{ // read is paired and mapped.
+	if (!((c)->flag & BAM_FMUNMAP))
+	    {  // mate is mapped
+	    if ((c)->mtid != (c)->tid) // different chroms.
+		mateChromsL[(c)->mtid]++;
+	    }
+	}
+    }
+
+for (i = 0; i < d->nR; ++i)
+    {
+    p = d->puR + i;
+
+    if (p->b->core.qual < minMapQ)
+	continue;
+
+    bam1_core_t *c = &p->b->core;
+    if (((c)->flag & BAM_FPAIRED) && !((c)->flag & BAM_FUNMAP))
+	{ // read is paired and mapped.
+	if (!((c)->flag & BAM_FMUNMAP))
+	    {  // mate is mapped
+	    if ((c)->mtid != (c)->tid)  // different chroms
+		mateChromsR[(c)->mtid]++;
+	    }
+	}
+    }
+
+int retVal = 0;
+for (i = 0; i < NUM_CHROMS; i++)
+    {
+    if (mateChromsL[i] < ap->minSupp)
+	continue;
+    retVal = 1;
+    if (mateChromsR[i] < ap->minSupp)
+	somaticSV[i]  = mateChromsL[i];
+    else
+	germlineSV[i] = mateChromsL[i] + mateChromsR[i];
+    }
+
+return retVal;
+}
 
 static void perform_analysis(void *data)
 {
 pileup_data_t *d = (pileup_data_t*)data;
-analysis_params *ap = (analysis_params*)d->func_data;
+analysis_params_t *ap = (analysis_params_t*)d->func_data;
 
-char callL, callR;
+int i;
+int stateL, stateR;
+char callL1, callL2, callR1, callR2;
+float hetL, hetR;
 
-if (!consensus_snp_variant(d, ap, &callL, &callR))
-    return;
+nuc_counts(d, ap);
 
 // remember, BAM data is starts at base zero, not base one like browser.
-printf("chr%s\t%d\t%d\t%c,%c\n", 
-       d->hL->target_name[d->tidL], d->posL, d->posL + 1, callL, callR);
-fflush(stdout);
+/***
+if (snp_variant(d, ap, &callL1, &callL2, &callR1, &callR2, &hetL, &hetR, &ap->sites))
+    {
+    printf("chr%s\t%d\t%d\t", 
+	   d->hL->target_name[d->tidL], d->posL, d->posL + 1);
+
+    if (hetR > HET_THRESH && hetL > HET_THRESH) // both hetero
+	printf("SNP:%c/%c(%f)>%c/%c(%f)\t", callR1, callR2, 1.0-hetR, callL1, callL2, 1.0-hetL);
+    else if (hetR <= HET_THRESH && hetL <= HET_THRESH) // both homo
+	printf("SNP:%c(%f)>%c(%f)\t", callR1, 1.0-hetR, callL1, 1.0-hetL);	
+    else if (hetR > HET_THRESH && hetL <= HET_THRESH) // germ hetero, tumor hom
+	printf("LOH:%c/%c(%f)>%c(%f)\t", callR1, callR2, 1.0-hetR, callL1, 1.0-hetL);
+    else if (hetR <= HET_THRESH && hetL > HET_THRESH) // germ homo, tumor hetero
+	printf("GOH:%c(%f)>%c/%c(%f)\t", callR1, 1.0-hetR, callL1, callL2, 1.0-hetL);
+    else
+	printf("error\t");  // should never get here.
+
+
+    fflush(stdout);
+    }
+***/
+
+if (model_select(d, ap, &callL1, &callL2, &callR1, &callR2, 
+		 &hetL, &hetR, &stateL, &stateR, &ap->sites))
+    {
+    printf("chr%s\t%d\t%d\t", 
+	   d->hL->target_name[d->tidL], d->posL, d->posL + 1);
+
+    if (stateR == 1)
+	printf("Hom:%c(%f)>", callR1, 1.0-hetR);
+    else if (stateR == 2)
+	printf("Het:%c/%c(%f)>", callR1, callR2, 1.0-hetR);
+    else if (stateR == 3)
+	printf("Tri:%c/%c(%f)>", callR1, callR2, 1.0-hetR);
+    
+    if (stateL == 1)
+	printf("Hom:%c(%f)\t", callL1, 1.0-hetL);
+    else if (stateL == 2)
+	printf("Het:%c/%c(%f)\t", callL1, callL2, 1.0-hetL);
+    else if (stateL == 3)
+	printf("Tri:%c/%c(%f)\t", callL1, callL2, 1.0-hetL);
+
+    double chi_sq = sig_diff(d, ap);
+    printf("%f\t", chi_sq);
+
+    for (i = 0; i < 4; i++)
+	printf("%c:%d,%d ", NUCS[i], votesR[i], countsR[i]);
+    putchar('\t');
+    
+    for (i = 0; i < 4; i++)
+	printf("%c:%d,%d ", NUCS[i], votesL[i], countsL[i]);
+    putchar('\n');
+    fflush(stdout);
+    }
+
+if (struct_variant(d, ap))
+    {
+    printf("chr%s\t%d\t%d\t", 
+	   d->hL->target_name[d->tidL], d->posL, d->posL + 1);
+
+    for (i = 0; i < NUM_CHROMS; ++i)
+	{
+	if (somaticSV[i] > 0)
+	    printf("SSV:%s(%d),", d->hL->target_name[i], somaticSV[i]);
+	if (germlineSV[i] > 0)
+	    printf("GSV:%s(%d),", d->hL->target_name[i], germlineSV[i]);
+	}
+    putchar('\n');
+    fflush(stdout);
+    }
 }
 
 void bamBam(char *inbamL, char *inbamR)
 {
-analysis_params aps;
-aps.minQ    = minQ;
-aps.minMapQ = minMapQ;
-aps.minSupp = minSupport;
-
-bam_bam_file(inbamL, inbamR, perform_analysis, &aps);
+analysis_params_t ap;
+ap.minQ    = minQ;
+ap.minMapQ = minMapQ;
+ap.minSupp = minSupport;
+ap.sites   = 0;
+bam_bam_file(inbamL, inbamR, perform_analysis, &ap);
+fprintf(stderr, "# qualified sites = %d\n", ap.sites);
 }
 
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
+// Print out command line as a comment.
 int i;
 putchar('#');
 for (i = 0; i < argc; i++)
     printf(" %s", argv[i]);