src/hg/tcga/bamBam/bamBam.c 1.9
1.9 2009/11/06 07:08:06 jsanborn
updated
Index: src/hg/tcga/bamBam/bamBam.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/tcga/bamBam/bamBam.c,v
retrieving revision 1.8
retrieving revision 1.9
diff -b -B -U 1000000 -r1.8 -r1.9
--- src/hg/tcga/bamBam/bamBam.c 1 Oct 2009 22:24:58 -0000 1.8
+++ src/hg/tcga/bamBam/bamBam.c 6 Nov 2009 07:08:06 -0000 1.9
@@ -1,709 +1,833 @@
/* bamBam -- discovering variants from BAM file
* code adapted from Heng Li's Samtools library functions
*/
#ifdef USE_BAM
#include "common.h"
#include "hCommon.h"
#include "linefile.h"
#include "hash.h"
#include "hdb.h"
#include "options.h"
#include "jksql.h"
#define _IOLIB 2
#include "bam.h"
#include "sam.h"
#include "bam_dual_pileup.h"
static char const rcsid[] = "$Id";
void usage()
/* Explain usage and exit. */
{
errAbort(
"bamBam - Perform analysis on two BAM files.\n"
"usage:\n"
" bamBam [options] <left.bam> <right.bam>\n"
"options:\n"
" -position=chrX:1-100 - Position to do analysis. [None]\n"
" -minSupport=INT - Minimum Support (i.e. num reads) to call a variant. [2]\n"
" -minQ=INT - Minimum acceptable base quality. [0]\n"
" -minMapQ=INT - Minimum acceptable mapping quality. [0]\n"
" -avgMapQ=INT - Minimum acceptable avg mapping quality. [0]\n"
" -minChiSq=INT - Minimum acceptable chi square of nucleotide freqs. [0]\n"
+ " -maxISize=INT - Maximum Insert Size. [10000]\n"
"\n"
);
}
static struct optionSpec options[] = {
{"position", OPTION_STRING},
{"minSupport", OPTION_INT},
{"minQ", OPTION_INT},
{"minMapQ", OPTION_INT},
{"avgMapQ", OPTION_INT},
{"minChiSq", OPTION_INT},
+ {"maxISize", OPTION_INT},
{NULL, 0}
};
-#define HET_THRESH 0.05
-
/* Option variables */
char *position = NULL;
int minSupport = 2;
int minQ = 0;
int minMapQ = 0;
int avgMapQ = 0;
int minChiSq = 0;
+int maxISize = 10000;
typedef struct {
int minQ;
int minMapQ;
int avgMapQ;
int minSupp;
int minChiSq;
int sites;
} analysis_params_t;
static int32_t votesL[4];
static int32_t countsL[4];
static int32_t votesR[4];
static int32_t countsR[4];
static int32_t insL, delL, insR, delR;
typedef struct {
char nuc;
int count;
int vote;
} variant_t;
variant_t minorL, majorL;
variant_t minorR, majorR;
static int nuc_index(char c)
{
switch (c)
{
case 'A':
return 0;
case 'G':
return 1;
case 'C':
return 2;
case 'T':
return 3;
default:
return -1;
}
return -1;
}
#define NUCS "AGCT"
static inline void nuc_counts(pileup_data_t *d, analysis_params_t *ap)
{
int i, index;
for (i = 0; i < 4; i++)
{ // sum up the qualities of each base
votesL[i] = 0;
countsL[i] = 0;
votesR[i] = 0;
countsR[i] = 0;
}
insL = 0;
insR = 0;
delL = 0;
delR = 0;
const bam_pileup1_t *p;
for (i = 0; i < d->nL; ++i)
{
p = d->puL + i;
if (p->b->core.qual < ap->minMapQ)
continue;
int qual = bam1_qual(p->b)[p->qpos];
if (qual < ap->minQ)
continue;
if (!p->is_del)
{
int c = toupper(bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]);
if ((index = nuc_index(c)) < 0)
continue;
votesL[index] += qual;
countsL[index]++;
if (p->indel > 0)
insL += 1;
else if (p->indel < 0)
delL += 1;
}
}
for (i = 0; i < d->nR; ++i)
{
p = d->puR + i;
if (p->b->core.qual < ap->minMapQ)
continue;
int qual = bam1_qual(p->b)[p->qpos];
if (qual < ap->minQ)
continue;
if (!p->is_del)
{
int c = toupper(bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]);
if ((index = nuc_index(c)) < 0)
continue;
votesR[index] += qual;
countsR[index]++;
if (p->indel > 0)
insR += 1;
else if (p->indel < 0)
delR += 1;
}
}
int iL1 = -1, iL2 = -1, maxL1 = 0, maxL2 = 0;
int iR1 = -1, iR2 = -1, maxR1 = 0, maxR2 = 0;
int countL1 = -1, countL2 = -1, countR1 = -1, countR2 = -1;
for (i = 0; i < 4; i++)
{
if (votesL[i] > maxL1)
{
iL2 = iL1; // set prev max to max2
maxL2 = maxL1;
countL2 = countL1;
iL1 = i;
maxL1 = votesL[i];
countL1 = countsL[i];
}
else if (votesL[i] > maxL2)
{
iL2 = i;
maxL2 = votesL[i];
countL2 = countsL[i];
}
if (votesR[i] > maxR1)
{
iR2 = iR1;
maxR2 = maxR1;
countR2 = countR1;
iR1 = i;
maxR1 = votesR[i];
countR1 = countsR[i];
}
else if (votesR[i] > maxR2)
{
iR2 = i;
maxR2 = votesR[i];
countR2 = countsR[i];
}
}
majorL.nuc = 'N';
majorL.vote = 0;
majorL.count = 0;
if (iL1 != -1 && countL1 >= ap->minSupp)
{
majorL.nuc = NUCS[iL1];
majorL.vote = maxL1;
majorL.count = countL1;
}
minorL.nuc = 'N';
minorL.vote = 0;
minorL.count = 0;
if (iL2 != -1 && countL2 >= ap->minSupp)
{
minorL.nuc = NUCS[iL2];
minorL.vote = maxL2;
minorL.count = countL2;
}
majorR.nuc = 'N';
majorR.vote = 0;
majorR.count = 0;
if (iR1 != -1 && countR1 >= ap->minSupp)
{
majorR.nuc = NUCS[iR1];
majorR.vote = maxR1;
majorR.count = countR1;
}
minorR.nuc = 'N';
minorR.vote = 0;
minorR.count = 0;
if (iR2 != -1 && countR2 >= ap->minSupp)
{
minorR.nuc = NUCS[iR2];
minorR.vote = maxR2;
minorR.count = countR2;
}
}
static double chi_square(int obs1, int obs2, double p) // p = binomial prob of getting 1
{
double pseudo = 1.0;
int count = obs1 + obs2;
double exp1 = p * ((double) count) + pseudo;
double exp2 = (1.0 - p) * ((double) count) + pseudo;
// apply Yates correction... will often have small observed values.
return pow(abs((double)obs1 - exp1) - 0.5, 2.0)/exp1 + pow(abs((double)obs2 - exp2) - 0.5, 2.0)/exp2;
}
static int sig_diff(pileup_data_t *d, analysis_params_t *ap)
{
int i;
double pseudo = 1.0;
int totalL = 0, totalR = 0;
for (i = 0; i < 4; i++)
{
totalL += countsL[i];
totalR += countsR[i];
}
double obs, exp, chi_sq = 0.0;
for (i = 0; i < 4; i++)
{
obs = ((double) countsL[i] * 100.0) / (double) totalL + pseudo;
exp = ((double) countsR[i] * 100.0) / (double) totalR + pseudo;
chi_sq += pow(abs(obs - exp) - 0.5, 2.0) / exp;
}
return chi_sq;
}
static int which_model(double homo, double het)
{
if (homo <= het)
return 1;
else if (homo > het)
return 2;
return 0;
}
static int model_select(pileup_data_t *d, analysis_params_t *ap,
char *callL1, char *callL2, char *callR1, char *callR2,
float *hetL, float *hetR, int *stateL, int *stateR, int *sites)
{
if (majorR.vote == 0 || majorL.vote == 0)
return 0; // most likely a bad quality region.
if (sig_diff(d, ap) < ap->minChiSq)
return 0;
float freqL = ((float) minorL.count) / ((float) majorL.count + (float) minorL.count);
float freqR = ((float) minorR.count) / ((float) majorR.count + (float) minorR.count);
*hetL = freqL;
*hetR = freqR;
*sites += 1;
double modelR_homo = chi_square(majorR.count, minorR.count, 1.0);
double modelR_het = chi_square(majorR.count, minorR.count, 0.5);
double modelL_homo = chi_square(majorL.count, minorL.count, 1.0);
double modelL_het = chi_square(majorL.count, minorL.count, 0.5);
int typeL = which_model(modelL_homo, modelL_het);
int typeR = which_model(modelR_homo, modelR_het);
*stateL = typeL;
*stateR = typeR;
*callL1 = majorL.nuc;
*callL2 = minorL.nuc;
*callR1 = majorR.nuc;
*callR2 = minorR.nuc;
int retVal = 0;
if (typeR == 1)
{ // normal most likely homozygous.
if (typeL == 1)
{ // tumor is also homozygous.
if (majorL.nuc != majorR.nuc) // homozygous SNP
retVal = 1;
}
else if (typeL == 2)
retVal = 1;
}
else if (typeR == 2)
{ // normal most likely heterozygous.
if (typeL == 1)
retVal = 1; // tumor is homozygous... LOH!
else if (typeL == 2)
{ // tumor also heterozygous.
if (majorL.nuc == majorR.nuc && minorL.nuc == minorR.nuc)
retVal = 0;
else if (majorL.nuc == minorR.nuc && minorL.nuc == majorR.nuc)
retVal = 0;
else
retVal = 1;
}
}
return retVal;
}
#define NUM_CHROMS 100
static int mateChromsL[NUM_CHROMS];
static int mateChromsR[NUM_CHROMS];
static int somaticSV[NUM_CHROMS];
static int germlineSV[NUM_CHROMS];
-static int struct_variant(pileup_data_t *d, analysis_params_t *ap)
+static int inter_struct_variant(pileup_data_t *d, analysis_params_t *ap)
{
int i;
for (i = 0; i < NUM_CHROMS; ++i)
{
mateChromsL[i] = 0;
mateChromsR[i] = 0;
somaticSV[i] = 0;
germlineSV[i] = 0;
}
const bam_pileup1_t *p;
int avgMapQ = 0;
for (i = 0; i < d->nL; ++i)
{
p = d->puL + i;
avgMapQ += p->b->core.qual;
}
if (((double)avgMapQ / (double) d->nL) < (double)ap->avgMapQ)
return 0;
avgMapQ = 0;
for (i = 0; i < d->nR; ++i)
{
p = d->puR + i;
avgMapQ += p->b->core.qual;
}
if (((double)avgMapQ / (double) d->nR) < (double) ap->avgMapQ)
return 0;
for (i = 0; i < d->nL; ++i)
{
p = d->puL + i;
if (p->b->core.qual < ap->minMapQ)
continue;
bam1_core_t *c = &p->b->core;
if (((c)->flag & BAM_FPAIRED) && !((c)->flag & BAM_FUNMAP))
{ // read is paired and mapped.
if (!((c)->flag & BAM_FMUNMAP))
{ // mate is mapped
if ((c)->mtid != (c)->tid) // different chroms.
mateChromsL[(c)->mtid]++;
}
}
}
for (i = 0; i < d->nR; ++i)
{
p = d->puR + i;
if (p->b->core.qual < ap->minMapQ)
continue;
bam1_core_t *c = &p->b->core;
if (((c)->flag & BAM_FPAIRED) && !((c)->flag & BAM_FUNMAP))
{ // read is paired and mapped.
if (!((c)->flag & BAM_FMUNMAP))
{ // mate is mapped
if ((c)->mtid != (c)->tid) // different chroms
mateChromsR[(c)->mtid]++;
}
}
}
int retVal = 0;
for (i = 0; i < NUM_CHROMS; i++)
{
if (mateChromsL[i] < ap->minSupp)
continue;
retVal = 1;
if (mateChromsR[i] < ap->minSupp)
somaticSV[i] = mateChromsL[i];
else
germlineSV[i] = mateChromsL[i] + mateChromsR[i];
}
return retVal;
}
+
+
+static int intra_struct_variant(pileup_data_t *d, analysis_params_t *ap,
+ int32_t *avgIsizeL, double *avgOrientL, int32_t *retCountL,
+ int32_t *avgIsizeR, double *avgOrientR, int32_t *retCountR)
+{
+// First check average mapping quality of read pairs
+int i;
+const bam_pileup1_t *p;
+int avgMapQ = 0;
+for (i = 0; i < d->nL; ++i)
+ {
+ p = d->puL + i;
+ avgMapQ += p->b->core.qual;
+ }
+if (((double)avgMapQ / (double) d->nL) < (double)ap->avgMapQ)
+ return 0;
+
+avgMapQ = 0;
+for (i = 0; i < d->nR; ++i)
+ {
+ p = d->puR + i;
+ avgMapQ += p->b->core.qual;
+ }
+if (((double)avgMapQ / (double) d->nR) < (double) ap->avgMapQ)
+ return 0;
+
+// Satisfied average mapping quality constraint
+
+int32_t isizeL = 0;
+double orientL = 0.0;
+double countL = 0.0;
+for (i = 0; i < d->nL; ++i)
+ {
+ p = d->puL + i;
+
+ if (p->b->core.qual < ap->minMapQ)
+ continue;
+
+ bam1_core_t *c = &p->b->core;
+ if (((c)->flag & BAM_FPAIRED) && !((c)->flag & BAM_FUNMAP))
+ { // read is paired and mapped.
+ if (!((c)->flag & BAM_FMUNMAP))
+ { // mate is mapped
+ if ( ((c)->mtid == (c)->tid) && (abs((c)->isize) > maxISize) )
+ { // same chrom, exceeds maximum isize.
+ isizeL += ((c)->mpos - (c)->pos);
+ orientL += (double) (bam1_strand(p->b) && bam1_mstrand(p->b));
+ countL += 1.0;
+ }
+ }
+ }
+ }
+
+int32_t isizeR = 0;
+double orientR = 0.0;
+double countR = 0.0;
+for (i = 0; i < d->nR; ++i)
+ {
+ p = d->puR + i;
+
+ if (p->b->core.qual < ap->minMapQ)
+ continue;
+
+ bam1_core_t *c = &p->b->core;
+ if (((c)->flag & BAM_FPAIRED) && !((c)->flag & BAM_FUNMAP))
+ { // read is paired and mapped.
+ if (!((c)->flag & BAM_FMUNMAP))
+ { // mate is mapped
+ if ( ((c)->mtid == (c)->tid) && (abs((c)->isize) > maxISize) )
+ { // same chrom, exceeds maximum insert size.
+ isizeR += ((c)->mpos - (c)->pos);
+ orientR += (double) (bam1_strand(p->b) && bam1_mstrand(p->b));
+ countR += 1.0;
+ }
+ }
+ }
+ }
+
+if (countR > 0.0)
+ { // don't look to minSupport for germline, variants... if you see it once, it's enough
+ // assuming it's found in somatic.
+ *avgIsizeR = isizeR / (int32_t) countR;
+ *avgOrientR = orientR / countR;
+ *retCountR = (int32_t) countR;
+ }
+
+if (countL > minSupport)
+ {
+ *avgIsizeL = isizeL / (int32_t) countL;
+ *avgOrientL = orientL / countL;
+ *retCountL = (int32_t) countL;
+ return 1; // only return successful if found a putative intra-chrom in somatic.
+ }
+
+return 0;
+}
+
static int indel_variant(pileup_data_t *d, analysis_params_t *ap,
int *isIns, int *isDel, int *insGerm, int *delGerm)
{
if (insL >= ap->minSupp)
{
*isIns = 1;
if (insR > 0) // if you see even once, believe it.
*insGerm = 1;
else
*insGerm = 0;
return 1;
}
if (delL >= ap->minSupp)
{
*isDel = 1;
if (delR > 0) // if you see even once, believe it.
*delGerm = 1;
else
*delGerm = 0;
return 1;
}
return 0;
}
static void print_seq(uint32_t tid, uint32_t pos, int n,
const bam_pileup1_t *pu, pileup_data_t *d)
{
int i, j, rb;
rb = (d->ref && (int)pos < d->len)? d->ref[pos] : 'N';
for (i = 0; i < n; ++i)
{
const bam_pileup1_t *p = pu + i;
if (p->b->core.qual < minMapQ)
continue;
int qual = bam1_qual(p->b)[p->qpos];
if (qual < minQ)
continue;
if (p->is_head)
printf("^%c", p->b->core.qual > 93 ? 126 : p->b->core.qual + 33);
if (!p->is_del)
{
int c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)];
if (c == '=' || toupper(c) == toupper(rb))
c = bam1_strand(p->b) ? ',' : '.';
else
c = bam1_strand(p->b) ? tolower(c) : toupper(c);
putchar(c);
if (p->indel > 0)
{
printf("+%d", p->indel);
for (j = 1; j <= p->indel; ++j)
{
c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + j)];
putchar(bam1_strand(p->b) ? tolower(c) : toupper(c));
}
}
else if (p->indel < 0)
{
printf("%d", p->indel);
for (j = 1; j <= -p->indel; ++j)
{
c = (d->ref && (int)pos+j < d->len)? d->ref[pos+j] : 'N';
putchar(bam1_strand(p->b) ? tolower(c) : toupper(c));
}
}
}
else
putchar('*');
if (p->is_tail)
putchar('$');
}
putchar('\t');
// print base quality
for (i = 0; i < n; ++i)
{
const bam_pileup1_t *p = pu + i;
if (p->b->core.qual < minMapQ)
continue;
int qual = bam1_qual(p->b)[p->qpos];
if (qual < minQ)
continue;
int c = qual + 33;
if (c > 126)
c = 126;
putchar(c);
}
putchar('\t');
// print mapping quality
for (i = 0; i < n; ++i)
{
const bam_pileup1_t *p = pu + i;
int qual = p->b->core.qual;
if (qual < minMapQ)
continue;
if (bam1_qual(p->b)[p->qpos] < minQ)
continue;
int c = qual + 33;
if (c > 126)
c = 126;
putchar(c);
}
}
static void perform_analysis(void *data)
{
pileup_data_t *d = (pileup_data_t*)data;
analysis_params_t *ap = (analysis_params_t*)d->func_data;
int i;
int stateL, stateR;
char callL1, callL2, callR1, callR2;
float hetL, hetR;
nuc_counts(d, ap);
if (model_select(d, ap, &callL1, &callL2, &callR1, &callR2,
&hetL, &hetR, &stateL, &stateR, &ap->sites))
{
printf("%s\t%d\t%d\t",
d->hL->target_name[d->tidL], d->posL, d->posL + 1);
if (stateR == 1)
printf("Hom:%c(%f)>", callR1, 1.0-hetR);
else if (stateR == 2)
printf("Het:%c/%c(%f)>", callR1, callR2, 1.0-hetR);
if (stateL == 1)
printf("Hom:%c(%f)\t", callL1, 1.0-hetL);
else if (stateL == 2)
printf("Het:%c/%c(%f)\t", callL1, callL2, 1.0-hetL);
double chi_sq = sig_diff(d, ap);
printf("%f\t", chi_sq);
print_seq(d->tidR, d->posR, d->nR, d->puR, d);
putchar('\t');
print_seq(d->tidL, d->posL, d->nL, d->puL, d);
putchar('\n');
fflush(stdout);
}
int isIns = 0, isDel = 0, insGerm = 0, delGerm = 0;
if (indel_variant(d, ap, &isIns, &isDel, &insGerm, &delGerm))
{
printf("%s\t%d\t%d\t",
d->hL->target_name[d->tidL], d->posL, d->posL + 1);
if (isIns && insGerm)
printf("GIns(%d,%d),", insR, insL);
else if (isIns && !insGerm)
printf("SIns(%d,%d),", insR, insL);
if (isDel && delGerm)
printf("GDel(%d,%d)", delR, delL);
else if (isDel && !delGerm)
printf("SDel(%d,%d)", delR, delL);
putchar('\t');
print_seq(d->tidR, d->posR, d->nR, d->puR, d);
putchar('\t');
print_seq(d->tidL, d->posL, d->nL, d->puL, d);
putchar('\n');
fflush(stdout);
}
-if (struct_variant(d, ap))
+if (inter_struct_variant(d, ap))
{
printf("%s\t%d\t%d\t",
d->hL->target_name[d->tidL], d->posL, d->posL + 1);
for (i = 0; i < NUM_CHROMS; ++i)
{
if (somaticSV[i] > 0)
printf("SSV:%s(%d),", d->hL->target_name[i], somaticSV[i]);
if (germlineSV[i] > 0)
printf("GSV:%s(%d),", d->hL->target_name[i], germlineSV[i]);
}
putchar('\t');
print_seq(d->tidR, d->posR, d->nR, d->puR, d);
putchar('\t');
print_seq(d->tidL, d->posL, d->nL, d->puL, d);
putchar('\n');
fflush(stdout);
}
+
+int32_t avgIsizeL = 0, avgIsizeR = 0, countL = 0, countR = 0;
+double sameOrientL = 0.0, sameOrientR = 0.0;
+
+if (intra_struct_variant(d, ap, &avgIsizeL, &sameOrientL, &countL,
+ &avgIsizeR, &sameOrientR, &countR))
+ {
+ printf("%s\t%d\t%d\t",
+ d->hL->target_name[d->tidL], d->posL, d->posL + 1);
+ if (avgIsizeL != 0.0 && avgIsizeR == 0.0)
+ printf("SIC:%d(%d,%f)", avgIsizeL, countL, sameOrientL);
+ else if (avgIsizeL != 0.0 && avgIsizeR != 0.0)
+ printf("GIC:%d(%d,%f)/%d(%d,%f)", avgIsizeR, countR, sameOrientR,
+ avgIsizeL, countL, sameOrientL);
+
+ putchar('\t');
+ print_seq(d->tidR, d->posR, d->nR, d->puR, d);
+ putchar('\t');
+ print_seq(d->tidL, d->posL, d->nL, d->puL, d);
+ putchar('\n');
+
+ fflush(stdout);
+ }
}
void bamBam(char *inbamL, char *inbamR)
{
analysis_params_t ap;
ap.minQ = minQ;
ap.minMapQ = minMapQ;
ap.avgMapQ = avgMapQ;
ap.minSupp = minSupport;
ap.minChiSq = minChiSq;
ap.sites = 0;
bam_bam_file(inbamL, inbamR, position, perform_analysis, &ap);
fprintf(stderr, "# qualified sites = %d\n", ap.sites);
}
int main(int argc, char *argv[])
/* Process command line. */
{
// Print out command line as a comment.
int i;
putchar('#');
for (i = 0; i < argc; i++)
printf(" %s", argv[i]);
putchar('\n');
optionInit(&argc, argv, options);
if (argc < 3)
usage();
if (optionExists("minSupport"))
minSupport = optionInt("minSupport", 2);
if (optionExists("minQ"))
minQ = optionInt("minQ", 0);
if (optionExists("minMapQ"))
minMapQ = optionInt("minMapQ", 0);
if (optionExists("avgMapQ"))
avgMapQ = optionInt("avgMapQ", 0);
if (optionExists("minChiSq"))
minChiSq = optionInt("minChiSq", 0);
+if (optionExists("maxISize"))
+ maxISize = optionInt("maxISize", 10000);
if (optionExists("position"))
position = optionVal("position", NULL);
char *inbamL = argv[1];
char *inbamR = argv[2];
bamBam(inbamL, inbamR);
return 0;
}
#else // USE_BAM not defined
int main(int argc, char *argv[])
/* Process command line. */
{
printf("BAM not installed");
return 0;
}
#endif