src/hg/tcga/bamBam/bamBam.c 1.6
1.6 2009/09/29 04:52:56 jsanborn
updated
Index: src/hg/tcga/bamBam/bamBam.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/tcga/bamBam/bamBam.c,v
retrieving revision 1.5
retrieving revision 1.6
diff -b -B -U 4 -r1.5 -r1.6
--- src/hg/tcga/bamBam/bamBam.c 26 Sep 2009 18:29:47 -0000 1.5
+++ src/hg/tcga/bamBam/bamBam.c 29 Sep 2009 04:52:56 -0000 1.6
@@ -40,8 +40,10 @@
{"minMapQ", OPTION_INT},
{NULL, 0}
};
+#define HET_THRESH 0.05
+
/* Option variables */
char *position = NULL;
int minSupport = 2;
int minQ = 0;
@@ -50,15 +52,26 @@
typedef struct {
int minQ;
int minMapQ;
int minSupp;
-} analysis_params;
+ int sites;
+} analysis_params_t;
static int32_t votesL[4];
static int32_t countsL[4];
static int32_t votesR[4];
static int32_t countsR[4];
+
+typedef struct {
+ char nuc;
+ int count;
+ int vote;
+} variant_t;
+
+variant_t minorL, majorL;
+variant_t minorR, majorR;
+
static int nuc_index(char c)
{
switch (c)
{
@@ -77,10 +90,9 @@
}
#define NUCS "AGCT"
-static int consensus_snp_variant(pileup_data_t *d, analysis_params *ap,
- char *callL, char *callR)
+static inline void nuc_counts(pileup_data_t *d, analysis_params_t *ap)
{
int i, index;
for (i = 0; i < 4; i++)
{ // sum up the qualities of each base
@@ -102,9 +114,8 @@
int qual = bam1_qual(p->b)[p->qpos];
if (qual < ap->minQ)
continue;
- index = nuc_index(c);
if ((index = nuc_index(c)) < 0)
continue;
votesL[index] += qual;
@@ -122,76 +133,499 @@
int qual = bam1_qual(p->b)[p->qpos];
if (qual < ap->minQ)
continue;
- index = nuc_index(c);
if ((index = nuc_index(c)) < 0)
continue;
votesR[index] += qual;
countsR[index]++;
}
-int iL = -1, maxL = -1, countL = -1;
-int iR = -1, maxR = -1, countR = -1;
+int iL1 = -1, iL2 = -1, maxL1 = -1, maxL2 = -1;
+int iR1 = -1, iR2 = -1, maxR1 = -1, maxR2 = -1;
+int countL1 = -1, countL2 = -1, countR1 = -1, countR2 = -1;
for (i = 0; i < 4; i++)
{
- if (votesL[i] > maxL)
+ if (votesL[i] > maxL1)
+ {
+ iL2 = iL1; // set prev max to max2
+ maxL2 = maxL1;
+ countL2 = countL1;
+
+ iL1 = i;
+ maxL1 = votesL[i];
+ countL1 = countsL[i];
+ }
+ else if (votesL[i] > maxL2)
+ {
+ iL2 = i;
+ maxL2 = votesL[i];
+ countL2 = countsL[i];
+ }
+
+ if (votesR[i] > maxR1)
{
- iL = i;
- maxL = votesL[i];
- countL = countsL[i];
+ iR2 = iR1;
+ maxR2 = maxR1;
+ countR2 = countR1;
+
+ iR1 = i;
+ maxR1 = votesR[i];
+ countR1 = countsR[i];
}
- if (votesR[i] > maxR)
+ else if (votesR[i] > maxR2)
{
- iR = i;
- maxR = votesR[i];
- countR = countsR[i];
+ iR2 = i;
+ maxR2 = votesR[i];
+ countR2 = countsR[i];
}
}
-if (iL == -1 || iR == -1 || (iL == iR)) // either no good data or matching consensus.
- return 0;
-if (countL < ap->minSupp || countR < ap->minSupp)
+majorL.nuc = 'N';
+majorL.vote = 0;
+majorL.count = 0;
+if (iL1 != -1 && countL1 >= ap->minSupp)
+ {
+ majorL.nuc = NUCS[iL1];
+ majorL.vote = maxL1;
+ majorL.count = countL1;
+ }
+
+minorL.nuc = 'N';
+minorL.vote = 0;
+minorL.count = 0;
+if (iL2 != -1 && countL2 >= ap->minSupp)
+ {
+ minorL.nuc = NUCS[iL2];
+ minorL.vote = maxL2;
+ minorL.count = countL2;
+ }
+
+majorR.nuc = 'N';
+majorR.vote = 0;
+majorR.count = 0;
+if (iR1 != -1 && countR1 >= ap->minSupp)
+ {
+ majorR.nuc = NUCS[iR1];
+ majorR.vote = maxR1;
+ majorR.count = countR1;
+ }
+
+minorR.nuc = 'N';
+minorR.vote = 0;
+minorR.count = 0;
+if (iR2 != -1 && countR2 >= ap->minSupp)
+ {
+ minorR.nuc = NUCS[iR2];
+ minorR.vote = maxR2;
+ minorR.count = countR2;
+ }
+}
+
+
+static double chi_square(int obs1, int obs2, double p) // p = binomial prob of getting 1
+{
+int pseudo = 5;
+obs1 += pseudo;
+obs2 += pseudo;
+int count = obs1 + obs2;
+
+double exp1 = p * ((double) count) + pseudo;
+double exp2 = (1.0 - p) * ((double) count) + pseudo;
+
+// apply Yates correction... will often have small observed values.
+return pow(abs((double)obs1 - exp1) - 0.5, 2.0) / obs1
+ + pow(abs((double)obs2 - exp2) - 0.5, 2.0) / obs2;
+}
+
+static int sig_diff(pileup_data_t *d, analysis_params_t *ap)
+{
+int i;
+double pseudo = 1.0;
+int totalL = 0, totalR = 0;
+
+for (i = 0; i < 4; i++)
+ {
+ totalL += votesL[i];
+ totalR += votesR[i];
+ }
+
+double obs, exp, chi_sq = 0.0;
+for (i = 0; i < 4; i++)
+ {
+ obs = ((double) votesL[i] * 100.0) / (double) totalL + pseudo;
+ exp = ((double) votesR[i] * 100.0) / (double) totalR + pseudo;
+ chi_sq += pow(abs(obs - exp) - 0.5, 2.0) / exp;
+ }
+
+return chi_sq;
+}
+
+
+static int which_model(double homo, double het, double tri)
+{
+if (homo <= het && homo <= tri)
+ return 1;
+if (het <= homo && het <= tri)
+ return 2;
+if (tri <= homo && tri <= het)
+ return 3;
+return 0;
+}
+
+
+static int model_select(pileup_data_t *d, analysis_params_t *ap,
+ char *callL1, char *callL2, char *callR1, char *callR2,
+ float *hetL, float *hetR, int *stateL, int *stateR, int *sites)
+{
+if (majorR.vote == 0 || majorL.vote == 0)
+ return 0; // most likely a bad quality region.
+
+// might want to check here if the germline & tumor are significantly different, using
+// votesL & votesR
+
+if (!sig_diff(d, ap))
return 0;
-*callL = NUCS[iL];
-*callR = NUCS[iR];
+float freqL = ((float) minorL.vote) / ((float) majorL.vote + (float) minorL.vote);
+float freqR = ((float) minorR.vote) / ((float) majorR.vote + (float) minorR.vote);
+
+*hetL = freqL;
+*hetR = freqR;
+
+*sites += 1;
+
+double modelR_homo = chi_square(majorR.vote, minorR.vote, 1.0);
+double modelR_het = chi_square(majorR.vote, minorR.vote, 0.5);
+double modelR_tri = chi_square(majorR.vote, minorR.vote, 0.66);
+
+double modelL_homo = chi_square(majorL.vote, minorL.vote, 1.0);
+double modelL_het = chi_square(majorL.vote, minorL.vote, 0.5);
+double modelL_tri = chi_square(majorL.vote, minorL.vote, 0.66);
+
+int typeL = which_model(modelL_homo, modelL_het, modelL_tri);
+int typeR = which_model(modelR_homo, modelR_het, modelR_tri);
+
+*stateL = typeL;
+*stateR = typeR;
+
+*callL1 = majorL.nuc;
+*callL2 = minorL.nuc;
+*callR1 = majorR.nuc;
+*callR2 = minorR.nuc;
+
+int retVal = 0;
+if (typeR == 1)
+ { // normal most likely homozygous.
+ if (typeL == 1)
+ { // tumor is also homozygous.
+ if (majorL.nuc != majorR.nuc) // homozygous SNP
+ retVal = 1;
+ }
+ else if (typeL == 2)
+ retVal = 1;
+ else if (typeL == 3)
+ retVal = 1;
+ }
+else if (typeR == 2 || typeR == 3)
+ { // normal most likely heterozygous.
+ if (typeL == 1)
+ retVal = 1; // tumor is homozygous... LOH!
+ else if (typeL == 2 || typeL == 3)
+ { // tumor also heterozygous.
+ if (majorL.nuc == majorR.nuc && minorL.nuc == minorR.nuc)
+ retVal = 0;
+ else if (majorL.nuc == minorR.nuc && minorL.nuc == majorR.nuc)
+ retVal = 0;
+ else
+ retVal = 1;
+ }
+ }
+
+return retVal;
+}
+
+/** usually set to static, removed to avoid compiler warning/error */
+
+int snp_variant(pileup_data_t *d, analysis_params_t *ap,
+ char *callL1, char *callL2, char *callR1, char *callR2,
+ float *hetL, float *hetR, int *sites)
+{
+float freqL = ((float) minorL.vote) / ((float) majorL.vote + (float) minorL.vote);
+float freqR = ((float) minorR.vote) / ((float) majorR.vote + (float) minorR.vote);
+
+*hetL = freqL;
+*hetR = freqR;
+
+*sites += 1;
+
+if (freqR > HET_THRESH && freqL <= HET_THRESH)
+ { // hetero in norm, homo in normal (LOH?)
+ if (majorL.nuc != majorR.nuc)
+ {
+ *callL1 = majorL.nuc;
+ *callL2 = minorL.nuc;
+ *callR1 = majorR.nuc;
+ *callR2 = minorR.nuc;
+ return 1;
+ }
+ else if (majorL.nuc != minorR.nuc)
+ {
+ *callL1 = majorL.nuc;
+ *callL2 = minorL.nuc;
+ *callR1 = minorR.nuc;
+ *callR2 = majorR.nuc;
+ return 1;
+ }
+ else
+ return 0;
+ }
+else if (freqR > HET_THRESH && freqL > HET_THRESH)
+ { // hetero in both tumor and normal
+ if ((majorL.nuc == majorR.nuc) && (minorL.nuc == minorR.nuc))
+ return 0; // same nucs
+ else if ((minorL.nuc == majorR.nuc) && ( majorL.nuc == minorR.nuc))
+ return 0; // same nucs, just changed proportions.
+ else if ((majorL.nuc != majorR.nuc) && (minorL.nuc == minorR.nuc))
+ {
+ *callL1 = majorL.nuc;
+ *callL2 = minorL.nuc;
+ *callR1 = majorR.nuc;
+ *callR2 = minorR.nuc;
+ return 1;
+ }
+ else if ((majorL.nuc == majorR.nuc) && (minorL.nuc != minorR.nuc))
+ {
+ *callL1 = minorL.nuc;
+ *callL2 = majorL.nuc;
+ *callR1 = minorR.nuc;
+ *callR2 = majorR.nuc;
+ return 1;
+ }
+ else if ((majorL.nuc == minorR.nuc) && (minorL.nuc != majorR.nuc))
+ {
+ *callL1 = minorL.nuc;
+ *callL2 = majorL.nuc;
+ *callR1 = majorR.nuc;
+ *callR2 = minorR.nuc;
+ return 1;
+ }
+ else if ((minorL.nuc == majorR.nuc) && (majorL.nuc != minorR.nuc))
+ {
+ *callL1 = majorL.nuc;
+ *callL2 = minorL.nuc;
+ *callR1 = minorR.nuc;
+ *callR2 = majorR.nuc;
+ return 1;
+ }
+ else
+ return 0;
+ }
+else if (freqR <= HET_THRESH && freqL > HET_THRESH)
+ { // homo in right (germline), hetero in left (tumor)
+ if (majorL.nuc != majorR.nuc)
+ {
+ *callL1 = majorL.nuc;
+ *callL2 = minorL.nuc;
+ *callR1 = majorR.nuc;
+ *callR2 = minorR.nuc;
+ return 1;
+ }
+ else if (minorL.nuc != majorR.nuc)
+ {
+ *callL1 = minorL.nuc;
+ *callL2 = majorL.nuc;
+ *callR1 = majorR.nuc;
+ *callR2 = minorR.nuc;
+ return 1;
+ }
+ }
+else if (freqR <= HET_THRESH && freqL <= HET_THRESH)
+ { // homozygous in right and left (tumor)
+ if (majorL.nuc != majorR.nuc)
+ {
+ *callL1 = majorL.nuc;
+ *callL2 = minorL.nuc;
+ *callR1 = majorR.nuc;
+ *callR2 = minorR.nuc;
+ return 1;
+ }
+ }
-return 1;
+return 0; // not a SNP variant.
}
+#define NUM_CHROMS 100
+static int mateChromsL[NUM_CHROMS];
+static int mateChromsR[NUM_CHROMS];
+static int somaticSV[NUM_CHROMS];
+static int germlineSV[NUM_CHROMS];
+
+static int struct_variant(pileup_data_t *d, analysis_params_t *ap)
+{
+int i;
+for (i = 0; i < NUM_CHROMS; ++i)
+ {
+ mateChromsL[i] = 0;
+ mateChromsR[i] = 0;
+ somaticSV[i] = 0;
+ germlineSV[i] = 0;
+ }
+
+const bam_pileup1_t *p;
+for (i = 0; i < d->nL; ++i)
+ {
+ p = d->puL + i;
+
+ if (p->b->core.qual < minMapQ)
+ continue;
+
+ bam1_core_t *c = &p->b->core;
+ if (((c)->flag & BAM_FPAIRED) && !((c)->flag & BAM_FUNMAP))
+ { // read is paired and mapped.
+ if (!((c)->flag & BAM_FMUNMAP))
+ { // mate is mapped
+ if ((c)->mtid != (c)->tid) // different chroms.
+ mateChromsL[(c)->mtid]++;
+ }
+ }
+ }
+
+for (i = 0; i < d->nR; ++i)
+ {
+ p = d->puR + i;
+
+ if (p->b->core.qual < minMapQ)
+ continue;
+
+ bam1_core_t *c = &p->b->core;
+ if (((c)->flag & BAM_FPAIRED) && !((c)->flag & BAM_FUNMAP))
+ { // read is paired and mapped.
+ if (!((c)->flag & BAM_FMUNMAP))
+ { // mate is mapped
+ if ((c)->mtid != (c)->tid) // different chroms
+ mateChromsR[(c)->mtid]++;
+ }
+ }
+ }
+
+int retVal = 0;
+for (i = 0; i < NUM_CHROMS; i++)
+ {
+ if (mateChromsL[i] < ap->minSupp)
+ continue;
+ retVal = 1;
+ if (mateChromsR[i] < ap->minSupp)
+ somaticSV[i] = mateChromsL[i];
+ else
+ germlineSV[i] = mateChromsL[i] + mateChromsR[i];
+ }
+
+return retVal;
+}
static void perform_analysis(void *data)
{
pileup_data_t *d = (pileup_data_t*)data;
-analysis_params *ap = (analysis_params*)d->func_data;
+analysis_params_t *ap = (analysis_params_t*)d->func_data;
-char callL, callR;
+int i;
+int stateL, stateR;
+char callL1, callL2, callR1, callR2;
+float hetL, hetR;
-if (!consensus_snp_variant(d, ap, &callL, &callR))
- return;
+nuc_counts(d, ap);
// remember, BAM data is starts at base zero, not base one like browser.
-printf("chr%s\t%d\t%d\t%c,%c\n",
- d->hL->target_name[d->tidL], d->posL, d->posL + 1, callL, callR);
-fflush(stdout);
+/***
+if (snp_variant(d, ap, &callL1, &callL2, &callR1, &callR2, &hetL, &hetR, &ap->sites))
+ {
+ printf("chr%s\t%d\t%d\t",
+ d->hL->target_name[d->tidL], d->posL, d->posL + 1);
+
+ if (hetR > HET_THRESH && hetL > HET_THRESH) // both hetero
+ printf("SNP:%c/%c(%f)>%c/%c(%f)\t", callR1, callR2, 1.0-hetR, callL1, callL2, 1.0-hetL);
+ else if (hetR <= HET_THRESH && hetL <= HET_THRESH) // both homo
+ printf("SNP:%c(%f)>%c(%f)\t", callR1, 1.0-hetR, callL1, 1.0-hetL);
+ else if (hetR > HET_THRESH && hetL <= HET_THRESH) // germ hetero, tumor hom
+ printf("LOH:%c/%c(%f)>%c(%f)\t", callR1, callR2, 1.0-hetR, callL1, 1.0-hetL);
+ else if (hetR <= HET_THRESH && hetL > HET_THRESH) // germ homo, tumor hetero
+ printf("GOH:%c(%f)>%c/%c(%f)\t", callR1, 1.0-hetR, callL1, callL2, 1.0-hetL);
+ else
+ printf("error\t"); // should never get here.
+
+
+ fflush(stdout);
+ }
+***/
+
+if (model_select(d, ap, &callL1, &callL2, &callR1, &callR2,
+ &hetL, &hetR, &stateL, &stateR, &ap->sites))
+ {
+ printf("chr%s\t%d\t%d\t",
+ d->hL->target_name[d->tidL], d->posL, d->posL + 1);
+
+ if (stateR == 1)
+ printf("Hom:%c(%f)>", callR1, 1.0-hetR);
+ else if (stateR == 2)
+ printf("Het:%c/%c(%f)>", callR1, callR2, 1.0-hetR);
+ else if (stateR == 3)
+ printf("Tri:%c/%c(%f)>", callR1, callR2, 1.0-hetR);
+
+ if (stateL == 1)
+ printf("Hom:%c(%f)\t", callL1, 1.0-hetL);
+ else if (stateL == 2)
+ printf("Het:%c/%c(%f)\t", callL1, callL2, 1.0-hetL);
+ else if (stateL == 3)
+ printf("Tri:%c/%c(%f)\t", callL1, callL2, 1.0-hetL);
+
+ double chi_sq = sig_diff(d, ap);
+ printf("%f\t", chi_sq);
+
+ for (i = 0; i < 4; i++)
+ printf("%c:%d,%d ", NUCS[i], votesR[i], countsR[i]);
+ putchar('\t');
+
+ for (i = 0; i < 4; i++)
+ printf("%c:%d,%d ", NUCS[i], votesL[i], countsL[i]);
+ putchar('\n');
+ fflush(stdout);
+ }
+
+if (struct_variant(d, ap))
+ {
+ printf("chr%s\t%d\t%d\t",
+ d->hL->target_name[d->tidL], d->posL, d->posL + 1);
+
+ for (i = 0; i < NUM_CHROMS; ++i)
+ {
+ if (somaticSV[i] > 0)
+ printf("SSV:%s(%d),", d->hL->target_name[i], somaticSV[i]);
+ if (germlineSV[i] > 0)
+ printf("GSV:%s(%d),", d->hL->target_name[i], germlineSV[i]);
+ }
+ putchar('\n');
+ fflush(stdout);
+ }
}
void bamBam(char *inbamL, char *inbamR)
{
-analysis_params aps;
-aps.minQ = minQ;
-aps.minMapQ = minMapQ;
-aps.minSupp = minSupport;
-
-bam_bam_file(inbamL, inbamR, perform_analysis, &aps);
+analysis_params_t ap;
+ap.minQ = minQ;
+ap.minMapQ = minMapQ;
+ap.minSupp = minSupport;
+ap.sites = 0;
+bam_bam_file(inbamL, inbamR, perform_analysis, &ap);
+fprintf(stderr, "# qualified sites = %d\n", ap.sites);
}
int main(int argc, char *argv[])
/* Process command line. */
{
+// Print out command line as a comment.
int i;
putchar('#');
for (i = 0; i < argc; i++)
printf(" %s", argv[i]);