src/hg/tcga/bamBam/bamBam.c 1.7
1.7 2009/09/30 04:45:06 jsanborn
updated
Index: src/hg/tcga/bamBam/bamBam.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/tcga/bamBam/bamBam.c,v
retrieving revision 1.6
retrieving revision 1.7
diff -b -B -U 4 -r1.6 -r1.7
--- src/hg/tcga/bamBam/bamBam.c 29 Sep 2009 04:52:56 -0000 1.6
+++ src/hg/tcga/bamBam/bamBam.c 30 Sep 2009 04:45:06 -0000 1.7
@@ -29,16 +29,20 @@
"options:\n"
" -minSupport=INT - Minimum Support (i.e. num reads) to call a variant. [2]\n"
" -minQ=INT - Minimum acceptable base quality. [0]\n"
" -minMapQ=INT - Minimum acceptable mapping quality. [0]\n"
+ " -avgMapQ=INT - Minimum acceptable avg mapping quality. [0]\n"
+ " -minChiSq=INT - Minimum acceptable chi square of nucleotide freqs. [0]\n"
"\n"
);
}
static struct optionSpec options[] = {
{"minSupport", OPTION_INT},
{"minQ", OPTION_INT},
{"minMapQ", OPTION_INT},
+ {"avgMapQ", OPTION_INT},
+ {"minChiSq", OPTION_INT},
{NULL, 0}
};
#define HET_THRESH 0.05
@@ -47,21 +51,26 @@
char *position = NULL;
int minSupport = 2;
int minQ = 0;
int minMapQ = 0;
+int avgMapQ = 0;
+int minChiSq = 0;
typedef struct {
int minQ;
int minMapQ;
+ int avgMapQ;
int minSupp;
+ int minChiSq;
int sites;
} analysis_params_t;
static int32_t votesL[4];
static int32_t countsL[4];
static int32_t votesR[4];
static int32_t countsR[4];
+static int32_t insL, delL, insR, delR;
typedef struct {
char nuc;
int count;
@@ -100,8 +109,12 @@
countsL[i] = 0;
votesR[i] = 0;
countsR[i] = 0;
}
+insL = 0;
+insR = 0;
+delL = 0;
+delR = 0;
const bam_pileup1_t *p;
for (i = 0; i < d->nL; ++i)
{
@@ -109,18 +122,25 @@
if (p->b->core.qual < ap->minMapQ)
continue;
- int c = toupper(bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]);
int qual = bam1_qual(p->b)[p->qpos];
if (qual < ap->minQ)
continue;
-
+ if (!p->is_del)
+ {
+ int c = toupper(bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]);
if ((index = nuc_index(c)) < 0)
continue;
votesL[index] += qual;
countsL[index]++;
+
+ if (p->indel > 0)
+ insL += 1;
+ else if (p->indel < 0)
+ delL += 1;
+ }
}
for (i = 0; i < d->nR; ++i)
{
@@ -128,22 +148,29 @@
if (p->b->core.qual < ap->minMapQ)
continue;
- int c = toupper(bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]);
int qual = bam1_qual(p->b)[p->qpos];
if (qual < ap->minQ)
continue;
+ if (!p->is_del)
+ {
+ int c = toupper(bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]);
if ((index = nuc_index(c)) < 0)
continue;
-
votesR[index] += qual;
countsR[index]++;
+
+ if (p->indel > 0)
+ insR += 1;
+ else if (p->indel < 0)
+ delR += 1;
+ }
}
-int iL1 = -1, iL2 = -1, maxL1 = -1, maxL2 = -1;
-int iR1 = -1, iR2 = -1, maxR1 = -1, maxR2 = -1;
+int iL1 = -1, iL2 = -1, maxL1 = 0, maxL2 = 0;
+int iR1 = -1, iR2 = -1, maxR1 = 0, maxR2 = 0;
int countL1 = -1, countL2 = -1, countR1 = -1, countR2 = -1;
for (i = 0; i < 4; i++)
{
if (votesL[i] > maxL1)
@@ -224,19 +251,16 @@
static double chi_square(int obs1, int obs2, double p) // p = binomial prob of getting 1
{
-int pseudo = 5;
-obs1 += pseudo;
-obs2 += pseudo;
+double pseudo = 1.0;
int count = obs1 + obs2;
double exp1 = p * ((double) count) + pseudo;
double exp2 = (1.0 - p) * ((double) count) + pseudo;
// apply Yates correction... will often have small observed values.
-return pow(abs((double)obs1 - exp1) - 0.5, 2.0) / obs1
- + pow(abs((double)obs2 - exp2) - 0.5, 2.0) / obs2;
+return pow(abs((double)obs1 - exp1) - 0.5, 2.0)/exp1 + pow(abs((double)obs2 - exp2) - 0.5, 2.0)/exp2;
}
static int sig_diff(pileup_data_t *d, analysis_params_t *ap)
{
@@ -245,32 +269,30 @@
int totalL = 0, totalR = 0;
for (i = 0; i < 4; i++)
{
- totalL += votesL[i];
- totalR += votesR[i];
+ totalL += countsL[i];
+ totalR += countsR[i];
}
double obs, exp, chi_sq = 0.0;
for (i = 0; i < 4; i++)
{
- obs = ((double) votesL[i] * 100.0) / (double) totalL + pseudo;
- exp = ((double) votesR[i] * 100.0) / (double) totalR + pseudo;
+ obs = ((double) countsL[i] * 100.0) / (double) totalL + pseudo;
+ exp = ((double) countsR[i] * 100.0) / (double) totalR + pseudo;
chi_sq += pow(abs(obs - exp) - 0.5, 2.0) / exp;
}
return chi_sq;
}
-static int which_model(double homo, double het, double tri)
+static int which_model(double homo, double het)
{
-if (homo <= het && homo <= tri)
+if (homo <= het)
return 1;
-if (het <= homo && het <= tri)
+else if (homo > het)
return 2;
-if (tri <= homo && tri <= het)
- return 3;
return 0;
}
@@ -280,32 +302,27 @@
{
if (majorR.vote == 0 || majorL.vote == 0)
return 0; // most likely a bad quality region.
-// might want to check here if the germline & tumor are significantly different, using
-// votesL & votesR
-
-if (!sig_diff(d, ap))
+if (sig_diff(d, ap) < ap->minChiSq)
return 0;
-float freqL = ((float) minorL.vote) / ((float) majorL.vote + (float) minorL.vote);
-float freqR = ((float) minorR.vote) / ((float) majorR.vote + (float) minorR.vote);
+float freqL = ((float) minorL.count) / ((float) majorL.count + (float) minorL.count);
+float freqR = ((float) minorR.count) / ((float) majorR.count + (float) minorR.count);
*hetL = freqL;
*hetR = freqR;
*sites += 1;
-double modelR_homo = chi_square(majorR.vote, minorR.vote, 1.0);
-double modelR_het = chi_square(majorR.vote, minorR.vote, 0.5);
-double modelR_tri = chi_square(majorR.vote, minorR.vote, 0.66);
-
-double modelL_homo = chi_square(majorL.vote, minorL.vote, 1.0);
-double modelL_het = chi_square(majorL.vote, minorL.vote, 0.5);
-double modelL_tri = chi_square(majorL.vote, minorL.vote, 0.66);
+double modelR_homo = chi_square(majorR.count, minorR.count, 1.0);
+double modelR_het = chi_square(majorR.count, minorR.count, 0.5);
-int typeL = which_model(modelL_homo, modelL_het, modelL_tri);
-int typeR = which_model(modelR_homo, modelR_het, modelR_tri);
+double modelL_homo = chi_square(majorL.count, minorL.count, 1.0);
+double modelL_het = chi_square(majorL.count, minorL.count, 0.5);
+
+int typeL = which_model(modelL_homo, modelL_het);
+int typeR = which_model(modelR_homo, modelR_het);
*stateL = typeL;
*stateR = typeR;
@@ -323,16 +340,14 @@
retVal = 1;
}
else if (typeL == 2)
retVal = 1;
- else if (typeL == 3)
- retVal = 1;
}
-else if (typeR == 2 || typeR == 3)
+else if (typeR == 2)
{ // normal most likely heterozygous.
if (typeL == 1)
retVal = 1; // tumor is homozygous... LOH!
- else if (typeL == 2 || typeL == 3)
+ else if (typeL == 2)
{ // tumor also heterozygous.
if (majorL.nuc == majorR.nuc && minorL.nuc == minorR.nuc)
retVal = 0;
else if (majorL.nuc == minorR.nuc && minorL.nuc == majorR.nuc)
@@ -344,118 +359,8 @@
return retVal;
}
-/** usually set to static, removed to avoid compiler warning/error */
-
-int snp_variant(pileup_data_t *d, analysis_params_t *ap,
- char *callL1, char *callL2, char *callR1, char *callR2,
- float *hetL, float *hetR, int *sites)
-{
-float freqL = ((float) minorL.vote) / ((float) majorL.vote + (float) minorL.vote);
-float freqR = ((float) minorR.vote) / ((float) majorR.vote + (float) minorR.vote);
-
-*hetL = freqL;
-*hetR = freqR;
-
-*sites += 1;
-
-if (freqR > HET_THRESH && freqL <= HET_THRESH)
- { // hetero in norm, homo in normal (LOH?)
- if (majorL.nuc != majorR.nuc)
- {
- *callL1 = majorL.nuc;
- *callL2 = minorL.nuc;
- *callR1 = majorR.nuc;
- *callR2 = minorR.nuc;
- return 1;
- }
- else if (majorL.nuc != minorR.nuc)
- {
- *callL1 = majorL.nuc;
- *callL2 = minorL.nuc;
- *callR1 = minorR.nuc;
- *callR2 = majorR.nuc;
- return 1;
- }
- else
- return 0;
- }
-else if (freqR > HET_THRESH && freqL > HET_THRESH)
- { // hetero in both tumor and normal
- if ((majorL.nuc == majorR.nuc) && (minorL.nuc == minorR.nuc))
- return 0; // same nucs
- else if ((minorL.nuc == majorR.nuc) && ( majorL.nuc == minorR.nuc))
- return 0; // same nucs, just changed proportions.
- else if ((majorL.nuc != majorR.nuc) && (minorL.nuc == minorR.nuc))
- {
- *callL1 = majorL.nuc;
- *callL2 = minorL.nuc;
- *callR1 = majorR.nuc;
- *callR2 = minorR.nuc;
- return 1;
- }
- else if ((majorL.nuc == majorR.nuc) && (minorL.nuc != minorR.nuc))
- {
- *callL1 = minorL.nuc;
- *callL2 = majorL.nuc;
- *callR1 = minorR.nuc;
- *callR2 = majorR.nuc;
- return 1;
- }
- else if ((majorL.nuc == minorR.nuc) && (minorL.nuc != majorR.nuc))
- {
- *callL1 = minorL.nuc;
- *callL2 = majorL.nuc;
- *callR1 = majorR.nuc;
- *callR2 = minorR.nuc;
- return 1;
- }
- else if ((minorL.nuc == majorR.nuc) && (majorL.nuc != minorR.nuc))
- {
- *callL1 = majorL.nuc;
- *callL2 = minorL.nuc;
- *callR1 = minorR.nuc;
- *callR2 = majorR.nuc;
- return 1;
- }
- else
- return 0;
- }
-else if (freqR <= HET_THRESH && freqL > HET_THRESH)
- { // homo in right (germline), hetero in left (tumor)
- if (majorL.nuc != majorR.nuc)
- {
- *callL1 = majorL.nuc;
- *callL2 = minorL.nuc;
- *callR1 = majorR.nuc;
- *callR2 = minorR.nuc;
- return 1;
- }
- else if (minorL.nuc != majorR.nuc)
- {
- *callL1 = minorL.nuc;
- *callL2 = majorL.nuc;
- *callR1 = majorR.nuc;
- *callR2 = minorR.nuc;
- return 1;
- }
- }
-else if (freqR <= HET_THRESH && freqL <= HET_THRESH)
- { // homozygous in right and left (tumor)
- if (majorL.nuc != majorR.nuc)
- {
- *callL1 = majorL.nuc;
- *callL2 = minorL.nuc;
- *callR1 = majorR.nuc;
- *callR2 = minorR.nuc;
- return 1;
- }
- }
-
-return 0; // not a SNP variant.
-}
-
#define NUM_CHROMS 100
static int mateChromsL[NUM_CHROMS];
static int mateChromsR[NUM_CHROMS];
static int somaticSV[NUM_CHROMS];
@@ -472,13 +377,31 @@
germlineSV[i] = 0;
}
const bam_pileup1_t *p;
+int avgMapQ = 0;
for (i = 0; i < d->nL; ++i)
{
p = d->puL + i;
+ avgMapQ += p->b->core.qual;
+ }
+if (((double)avgMapQ / (double) d->nL) < (double)ap->avgMapQ)
+ return 0;
- if (p->b->core.qual < minMapQ)
+avgMapQ = 0;
+for (i = 0; i < d->nR; ++i)
+ {
+ p = d->puR + i;
+ avgMapQ += p->b->core.qual;
+ }
+if (((double)avgMapQ / (double) d->nR) < (double) ap->avgMapQ)
+ return 0;
+
+for (i = 0; i < d->nL; ++i)
+ {
+ p = d->puL + i;
+
+ if (p->b->core.qual < ap->minMapQ)
continue;
bam1_core_t *c = &p->b->core;
if (((c)->flag & BAM_FPAIRED) && !((c)->flag & BAM_FUNMAP))
@@ -494,9 +417,9 @@
for (i = 0; i < d->nR; ++i)
{
p = d->puR + i;
- if (p->b->core.qual < minMapQ)
+ if (p->b->core.qual < ap->minMapQ)
continue;
bam1_core_t *c = &p->b->core;
if (((c)->flag & BAM_FPAIRED) && !((c)->flag & BAM_FUNMAP))
@@ -523,8 +446,121 @@
return retVal;
}
+static int indel_variant(pileup_data_t *d, analysis_params_t *ap,
+ int *isIns, int *isDel, int *insGerm, int *delGerm)
+{
+if (insL >= ap->minSupp)
+ {
+ *isIns = 1;
+ if (insR >= ap->minSupp)
+ *insGerm = 1;
+ else
+ *insGerm = 0;
+ return 1;
+ }
+
+if (delL >= ap->minSupp)
+ {
+ *isDel = 1;
+ if (delR >= ap->minSupp)
+ *delGerm = 1;
+ else
+ *delGerm = 0;
+ return 1;
+ }
+
+return 0;
+}
+
+static void print_seq(uint32_t tid, uint32_t pos, int n,
+ const bam_pileup1_t *pu, pileup_data_t *d)
+{
+int i, j, rb;
+rb = (d->ref && (int)pos < d->len)? d->ref[pos] : 'N';
+for (i = 0; i < n; ++i)
+ {
+ const bam_pileup1_t *p = pu + i;
+ if (p->b->core.qual < minMapQ)
+ continue;
+
+ int qual = bam1_qual(p->b)[p->qpos];
+ if (qual < minQ)
+ continue;
+
+ if (p->is_head)
+ printf("^%c", p->b->core.qual > 93 ? 126 : p->b->core.qual + 33);
+ if (!p->is_del)
+ {
+ int c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)];
+ if (c == '=' || toupper(c) == toupper(rb))
+ c = bam1_strand(p->b) ? ',' : '.';
+ else
+ c = bam1_strand(p->b) ? tolower(c) : toupper(c);
+ putchar(c);
+ if (p->indel > 0)
+ {
+ printf("+%d", p->indel);
+ for (j = 1; j <= p->indel; ++j)
+ {
+ c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + j)];
+ putchar(bam1_strand(p->b) ? tolower(c) : toupper(c));
+ }
+ }
+ else if (p->indel < 0)
+ {
+ printf("%d", p->indel);
+ for (j = 1; j <= -p->indel; ++j)
+ {
+ c = (d->ref && (int)pos+j < d->len)? d->ref[pos+j] : 'N';
+ putchar(bam1_strand(p->b) ? tolower(c) : toupper(c));
+ }
+ }
+ }
+ else
+ putchar('*');
+ if (p->is_tail)
+ putchar('$');
+ }
+putchar('\t');
+
+// print base quality
+for (i = 0; i < n; ++i)
+ {
+ const bam_pileup1_t *p = pu + i;
+ if (p->b->core.qual < minMapQ)
+ continue;
+
+ int qual = bam1_qual(p->b)[p->qpos];
+ if (qual < minQ)
+ continue;
+
+ int c = qual + 33;
+ if (c > 126)
+ c = 126;
+ putchar(c);
+ }
+putchar('\t');
+
+// print mapping quality
+for (i = 0; i < n; ++i)
+ {
+ const bam_pileup1_t *p = pu + i;
+ int qual = p->b->core.qual;
+ if (qual < minMapQ)
+ continue;
+
+ if (bam1_qual(p->b)[p->qpos] < minQ)
+ continue;
+
+ int c = qual + 33;
+ if (c > 126)
+ c = 126;
+ putchar(c);
+ }
+}
+
static void perform_analysis(void *data)
{
pileup_data_t *d = (pileup_data_t*)data;
analysis_params_t *ap = (analysis_params_t*)d->func_data;
@@ -535,61 +571,57 @@
float hetL, hetR;
nuc_counts(d, ap);
-// remember, BAM data is starts at base zero, not base one like browser.
-/***
-if (snp_variant(d, ap, &callL1, &callL2, &callR1, &callR2, &hetL, &hetR, &ap->sites))
- {
- printf("chr%s\t%d\t%d\t",
- d->hL->target_name[d->tidL], d->posL, d->posL + 1);
-
- if (hetR > HET_THRESH && hetL > HET_THRESH) // both hetero
- printf("SNP:%c/%c(%f)>%c/%c(%f)\t", callR1, callR2, 1.0-hetR, callL1, callL2, 1.0-hetL);
- else if (hetR <= HET_THRESH && hetL <= HET_THRESH) // both homo
- printf("SNP:%c(%f)>%c(%f)\t", callR1, 1.0-hetR, callL1, 1.0-hetL);
- else if (hetR > HET_THRESH && hetL <= HET_THRESH) // germ hetero, tumor hom
- printf("LOH:%c/%c(%f)>%c(%f)\t", callR1, callR2, 1.0-hetR, callL1, 1.0-hetL);
- else if (hetR <= HET_THRESH && hetL > HET_THRESH) // germ homo, tumor hetero
- printf("GOH:%c(%f)>%c/%c(%f)\t", callR1, 1.0-hetR, callL1, callL2, 1.0-hetL);
- else
- printf("error\t"); // should never get here.
-
-
- fflush(stdout);
- }
-***/
-
if (model_select(d, ap, &callL1, &callL2, &callR1, &callR2,
&hetL, &hetR, &stateL, &stateR, &ap->sites))
{
- printf("chr%s\t%d\t%d\t",
+ printf("%s\t%d\t%d\t",
d->hL->target_name[d->tidL], d->posL, d->posL + 1);
if (stateR == 1)
printf("Hom:%c(%f)>", callR1, 1.0-hetR);
else if (stateR == 2)
printf("Het:%c/%c(%f)>", callR1, callR2, 1.0-hetR);
- else if (stateR == 3)
- printf("Tri:%c/%c(%f)>", callR1, callR2, 1.0-hetR);
if (stateL == 1)
printf("Hom:%c(%f)\t", callL1, 1.0-hetL);
else if (stateL == 2)
printf("Het:%c/%c(%f)\t", callL1, callL2, 1.0-hetL);
- else if (stateL == 3)
- printf("Tri:%c/%c(%f)\t", callL1, callL2, 1.0-hetL);
double chi_sq = sig_diff(d, ap);
printf("%f\t", chi_sq);
- for (i = 0; i < 4; i++)
- printf("%c:%d,%d ", NUCS[i], votesR[i], countsR[i]);
+ print_seq(d->tidR, d->posR, d->nR, d->puR, d);
putchar('\t');
+ print_seq(d->tidL, d->posL, d->nL, d->puL, d);
+ putchar('\n');
+
+ fflush(stdout);
+ }
+
+int isIns = 0, isDel = 0, insGerm = 0, delGerm = 0;
+if (indel_variant(d, ap, &isIns, &isDel, &insGerm, &delGerm))
+ {
+ printf("%s\t%d\t%d\t",
+ d->hL->target_name[d->tidL], d->posL, d->posL + 1);
+
+ if (isIns && insGerm)
+ printf("GIns(%d,%d),", insR, insL);
+ else if (isIns && !insGerm)
+ printf("SIns(%d,%d),", insR, insL);
+
+ if (isDel && delGerm)
+ printf("GDel(%d,%d)", delR, delL);
+ else if (isDel && !delGerm)
+ printf("SDel(%d,%d)", delR, delL);
- for (i = 0; i < 4; i++)
- printf("%c:%d,%d ", NUCS[i], votesL[i], countsL[i]);
+ putchar('\t');
+ print_seq(d->tidR, d->posR, d->nR, d->puR, d);
+ putchar('\t');
+ print_seq(d->tidL, d->posL, d->nL, d->puL, d);
putchar('\n');
+
fflush(stdout);
}
if (struct_variant(d, ap))
@@ -593,9 +625,9 @@
}
if (struct_variant(d, ap))
{
- printf("chr%s\t%d\t%d\t",
+ printf("%s\t%d\t%d\t",
d->hL->target_name[d->tidL], d->posL, d->posL + 1);
for (i = 0; i < NUM_CHROMS; ++i)
{
@@ -603,9 +635,14 @@
printf("SSV:%s(%d),", d->hL->target_name[i], somaticSV[i]);
if (germlineSV[i] > 0)
printf("GSV:%s(%d),", d->hL->target_name[i], germlineSV[i]);
}
+ putchar('\t');
+ print_seq(d->tidR, d->posR, d->nR, d->puR, d);
+ putchar('\t');
+ print_seq(d->tidL, d->posL, d->nL, d->puL, d);
putchar('\n');
+
fflush(stdout);
}
}
@@ -613,9 +650,11 @@
{
analysis_params_t ap;
ap.minQ = minQ;
ap.minMapQ = minMapQ;
+ap.avgMapQ = avgMapQ;
ap.minSupp = minSupport;
+ap.minChiSq = minChiSq;
ap.sites = 0;
bam_bam_file(inbamL, inbamR, perform_analysis, &ap);
fprintf(stderr, "# qualified sites = %d\n", ap.sites);
}
@@ -641,8 +679,12 @@
if (optionExists("minQ"))
minQ = optionInt("minQ", 0);
if (optionExists("minMapQ"))
minMapQ = optionInt("minMapQ", 0);
+if (optionExists("avgMapQ"))
+ avgMapQ = optionInt("avgMapQ", 0);
+if (optionExists("minChiSq"))
+ minChiSq = optionInt("minChiSq", 0);
char *inbamL = argv[1];
char *inbamR = argv[2];
bamBam(inbamL, inbamR);