src/hg/tcga/pBamBam/asBamBam.c 1.2
1.2 2010/03/06 02:06:48 jsanborn
updated
Index: src/hg/tcga/pBamBam/asBamBam.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/tcga/pBamBam/asBamBam.c,v
retrieving revision 1.1
retrieving revision 1.2
diff -b -B -U 1000000 -r1.1 -r1.2
--- src/hg/tcga/pBamBam/asBamBam.c 5 Mar 2010 19:53:24 -0000 1.1
+++ src/hg/tcga/pBamBam/asBamBam.c 6 Mar 2010 02:06:48 -0000 1.2
@@ -1,944 +1,1357 @@
/* asBamBam -- discovering variants from BAM file
* code adapted from Heng Li's Samtools library functions
*/
#ifdef USE_BAM
#include "common.h"
#include "hCommon.h"
#include "linefile.h"
#include "hash.h"
#include "hdb.h"
#include "options.h"
#include "jksql.h"
#define _IOLIB 2
#include "bam.h"
#include "sam.h"
#include "bam_helper.h"
static char const rcsid[] = "$Id";
void usage()
/* Explain usage and exit. */
{
errAbort(
"asBamBam - Allele-specific BamBam.\n"
"usage:\n"
" asBamBam [options] <left.bam> <right.bam>\n"
"options:\n"
" -position=chrX:1-100 - Position to do analysis. [None]\n"
" -minSupport=INT - Minimum Support in Germline. [10]\n"
" -minQ=INT - Minimum acceptable base quality. [0]\n"
" -minMapQ=INT - Minimum acceptable mapping quality. [0]\n"
+ " -avgMapQ=INT - Minimum acceptable average mapping quality. [0]\n"
" -minChiSq=INT - Minimum acceptable chi square of nucleotide freqs. [0]\n"
+ " -maxISize=INT - Maximum Insert Size. [10000]\n"
" -readsPerBin=INT - Reads per CNV bin [500]\n"
- " -readLength=INT - Length of read [100]\n"
"\n"
);
}
static struct optionSpec options[] = {
{"position", OPTION_STRING},
{"minSupport", OPTION_INT},
{"minQ", OPTION_INT},
{"minMapQ", OPTION_INT},
+ {"avgMapQ", OPTION_INT},
+ {"maxISize", OPTION_INT},
{"minChiSq", OPTION_INT},
{"readsPerBin", OPTION_INT},
- {"readLength", OPTION_INT},
{NULL, 0}
};
#define BAD_TID -999
/* Option variables */
char *position = NULL;
int minSupport = 2;
int minQ = 0;
int minMapQ = 0;
+int avgMapQ = 0;
+int maxISize = 10000;
int minChiSq = 0;
int readsPerBin = 500;
int readLength = 100;
typedef struct {
int minQ;
int minMapQ;
+ int avgMapQ;
+ int maxISize;
int minChiSq;
int minSupp;
int sites;
int readLength;
int readsPerBin;
int pStartPrev;
int pStopPrev;
int pStart;
int pStop;
/* Overall Copy Number */
int32_t tid;
int32_t lstart;
int32_t lstop;
int lcounts;
int32_t rstart;
int32_t rstop;
int rcounts;
int32_t prevTid;
int32_t prevStart;
int32_t prevStop;
/* ASCN */
int32_t tidAS;
int32_t startAS;
int32_t stopAS;
int lcountsMin;
int lcountsMax;
int rcountsMin;
int rcountsMax;
+ /* SV */
+ int svStartL;
+ int svStopL;
+ int svStartR;
+ int svStopR;
+
} analysis_params_t;
static int32_t countsL1[4];
static int32_t countsL2[4];
static int32_t countsL3[4];
static int32_t countsR1[4];
static int32_t countsR2[4];
static int32_t countsR3[4];
static int32_t votesL[4];
static int32_t countsL[4];
static int32_t votesR[4];
static int32_t countsR[4];
typedef struct {
char nuc;
int count;
int vote;
} variant_t;
variant_t minorL, majorL;
variant_t minorR, majorR;
+static int32_t insL, delL, insR, delR;
+
+typedef struct {
+ int tid;
+
+ int32_t qstart;
+ int32_t qstop;
+ int32_t mstart;
+ int32_t mstop;
+
+ int supp;
+ int orientation;
+} sv_t;
+
+#define BUF 500 // 500 base buffer around start/stop
+sv_t intraL, intraR;
+sv_t interL, interR;
+
static int nuc_index(char c)
{
switch (c)
{
case 'A':
return 0;
case 'G':
return 1;
case 'C':
return 2;
case 'T':
return 3;
default:
return -1;
}
return -1;
}
#define NUCS "AGCT"
-static inline void nuc_counts(dual_pileup_t *d, analysis_params_t *ap)
+static inline void reset_globals(analysis_params_t *ap)
{
-int i, index;
+/* reset nuc counts */
+int i;
for (i = 0; i < 4; i++)
{ // sum up the qualities of each base
votesL[i] = 0;
countsL[i] = 0;
votesR[i] = 0;
countsR[i] = 0;
countsL1[i] = 0;
countsL2[i] = 0;
countsL3[i] = 0;
countsR1[i] = 0;
countsR2[i] = 0;
countsR3[i] = 0;
}
+/* reset indel stats */
+insL = 0;
+insR = 0;
+delL = 0;
+delR = 0;
+
/* reset pileup info */
ap->pStart = -1, ap->pStop = -1;
+}
+
+static inline int check_read(const bam_pileup1_t *p, analysis_params_t *ap)
+{
+bam1_core_t *c = &p->b->core;
+/* Check if unmapped or PCR optical dupe */
+if (((c)->flag & BAM_FUNMAP) || ((c)->flag & BAM_FDUP) )
+ return 0;
+
+/* check base quality */
+if (bam1_qual(p->b)[p->qpos] < ap->minQ)
+ return 0;
+/* check mapping quality */
+if ((c)->qual < ap->minMapQ)
+ return 0;
+
+/* all good */
+return 1;
+}
+
+
+static inline void nuc_counts(dual_pileup_t *d, analysis_params_t *ap)
+{
+/* reset global vars */
+reset_globals(ap);
+
+int i, index;
int32_t start, stop;
int32_t prevStart = -1, prevStop = -1;
const bam_pileup1_t *p;
for (i = 0; i < d->nL; ++i)
{
p = d->puL + i;
- bam1_core_t *c = &p->b->core;
-
- if ((c)->qual < ap->minMapQ)
+ if (!check_read(p, ap))
continue;
+ bam1_core_t *c = &p->b->core;
int qual = bam1_qual(p->b)[p->qpos];
- if (qual < ap->minQ)
- continue;
-
- /* Check if unmapped or PCR optical dupe */
- if (((c)->flag & BAM_FUNMAP) || ((c)->flag & BAM_FDUP) )
- continue;
/* Check if start & stop has already been visited, to dupes missed in prev step*/
start = (c)->pos;
stop = start + (c)->l_qseq;
if (start <= prevStart && stop <= prevStop)
continue;
prevStart = start;
prevStop = stop;
if (ap->pStart == -1)
{
ap->pStart = start;
ap->pStop = stop;
}
if (start < ap->pStart)
ap->pStart = start;
if (stop > ap->pStop)
ap->pStop = stop;
+ if (p->indel > 0)
+ insL += 1;
+ else if (p->indel < 0)
+ delL += 1;
+
if (!p->is_del && p->indel == 0)
{
double lseq = (double) p->b->core.l_qseq;
int b = toupper(bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]);
if ((index = nuc_index(b)) < 0)
continue;
votesL[index] += qual;
countsL[index]++;
if (p->qpos < round(0.33 * lseq))
countsL1[index]++;
else if (p->qpos < round(0.67 * lseq))
countsL2[index]++;
else
countsL3[index]++;
}
}
prevStart = -1;
prevStop = -1;
for (i = 0; i < d->nR; ++i)
{
p = d->puR + i;
- bam1_core_t *c = &p->b->core;
-
- if ((c)->qual < ap->minMapQ)
+ if (!check_read(p, ap))
continue;
+ bam1_core_t *c = &p->b->core;
int qual = bam1_qual(p->b)[p->qpos];
- if (qual < ap->minQ)
- continue;
-
- /* Check if unmapped or PCR optical dupe */
- if (((c)->flag & BAM_FUNMAP) || ((c)->flag & BAM_FDUP) )
- continue;
/* Check if start & stop has already been visited, to dupes missed in prev step*/
start = (c)->pos;
stop = start + (c)->l_qseq;
if (start <= prevStart && stop <= prevStop)
continue;
prevStart = start;
prevStop = stop;
if (ap->pStart == -1)
{
ap->pStart = start;
ap->pStop = stop;
}
if (start < ap->pStart)
ap->pStart = start;
if (stop > ap->pStop)
ap->pStop = stop;
+ if (p->indel > 0)
+ insR += 1;
+ else if (p->indel < 0)
+ delR += 1;
+
if (!p->is_del && p->indel == 0)
{
double lseq = (double) p->b->core.l_qseq;
int b = toupper(bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]);
if ((index = nuc_index(b)) < 0)
continue;
votesR[index] += qual;
countsR[index]++;
if (p->qpos < round(0.33 * lseq))
countsR1[index]++;
else if (p->qpos < round(0.67 * lseq))
countsR2[index]++;
else
countsR3[index]++;
}
}
int iL1 = -1, iL2 = -1, maxL1 = 0, maxL2 = 0;
int iR1 = -1, iR2 = -1, maxR1 = 0, maxR2 = 0;
int countL1 = -1, countL2 = -1, countR1 = -1, countR2 = -1;
for (i = 0; i < 4; i++)
{
if (countsL[i] > countL1)
{
iL2 = iL1; // set prev max to max2
maxL2 = maxL1;
countL2 = countL1;
iL1 = i;
maxL1 = votesL[i];
countL1 = countsL[i];
}
else if (countsL[i] > countL2)
{
iL2 = i;
maxL2 = votesL[i];
countL2 = countsL[i];
}
if (countsR[i] > countR1)
{
iR2 = iR1;
maxR2 = maxR1;
countR2 = countR1;
iR1 = i;
maxR1 = votesR[i];
countR1 = countsR[i];
}
else if (countsR[i] > countR2)
{
iR2 = i;
maxR2 = votesR[i];
countR2 = countsR[i];
}
}
majorL.nuc = 'N';
majorL.vote = 0;
majorL.count = 0;
if (iL1 != -1 && countL1 >= ap->minSupp)
{
majorL.nuc = NUCS[iL1];
majorL.vote = maxL1;
majorL.count = countL1;
}
minorL.nuc = 'N';
minorL.vote = 0;
minorL.count = 0;
if (iL2 != -1 && countL2 >= ap->minSupp)
{
minorL.nuc = NUCS[iL2];
minorL.vote = maxL2;
minorL.count = countL2;
}
majorR.nuc = 'N';
majorR.vote = 0;
majorR.count = 0;
if (iR1 != -1 && countR1 >= ap->minSupp)
{
majorR.nuc = NUCS[iR1];
majorR.vote = maxR1;
majorR.count = countR1;
}
minorR.nuc = 'N';
minorR.vote = 0;
minorR.count = 0;
if (iR2 != -1 && countR2 >= ap->minSupp)
{
minorR.nuc = NUCS[iR2];
minorR.vote = maxR2;
minorR.count = countR2;
}
}
double chi_square(int obs1, int obs2, double p) // p = binomial prob of getting 1
{
double pseudo = 1.0;
int count = obs1 + obs2;
double exp1 = p * ((double) count) + pseudo;
double exp2 = (1.0 - p) * ((double) count) + pseudo;
// apply Yates correction... will often have small observed values.
return pow(abs((double)obs1 - exp1) - 0.5, 2.0)/exp1 + pow(abs((double)obs2 - exp2) - 0.5, 2.0)/exp2;
}
static int sig_diff(dual_pileup_t *d, analysis_params_t *ap)
{
int i;
double pseudo = 1.0;
int totalL = 0, totalR = 0;
for (i = 0; i < 4; i++)
{
totalL += countsL[i];
totalR += countsR[i];
}
double obs, exp, chi_sq = 0.0;
for (i = 0; i < 4; i++)
{
obs = ((double) countsL[i] * 100.0) / (double) totalL + pseudo;
exp = ((double) countsR[i] * 100.0) / (double) totalR + pseudo;
chi_sq += pow(abs(obs - exp) - 0.5, 2.0) / exp;
}
return chi_sq;
}
static int is_mut(dual_pileup_t *d, analysis_params_t *ap,
- char *callL1, char *callL2, char *callR1, char *callR2,
- float *hetL, float *hetR, int *stateL, int *stateR, int *sites)
+ char *callL1, char *callL2, char *callR1, char *callR2)
{
-*sites += 1;
-
if (majorR.vote == 0 || majorL.vote == 0)
return 0; // most likely a bad quality region.
if (sig_diff(d, ap) < ap->minChiSq)
return 0;
int i = 0;
int rCount = 0;
for (i = 0; i < 4; i++)
rCount += countsR1[i] + countsR2[i] + countsR3[i];
if (rCount < 10)
return 0;
int retVal1 = 0, retVal2 = 0;
for (i = 0; i < 4; i++)
{
if (NUCS[i] != majorL.nuc && NUCS[i] != minorL.nuc)
continue;
if (NUCS[i] == majorL.nuc && majorL.nuc != majorR.nuc && majorL.nuc != minorR.nuc)
{
// if (countsL1[i] >= 2 && countsL2[i] >= 1)
// retVal1 = 1;
// else if (countsL1[i] > 0 && countsL2[i] > 0 && countsL3[i] > 0)
// retVal1 = 1;
if (countsL1[i] > 0 && countsL2[i] > 0 && countsL1[i] + countsL2[i] >= 3)
retVal1 = 1;
else if (countsL1[i] > 0 && countsL3[i] > 0 && countsL1[i] + countsL3[i] >= 3)
retVal1 = 1;
else if (countsL2[i] > 0 && countsL3[i] > 0 && countsL2[i] + countsL3[i] >= 3)
retVal1 = 1;
if (retVal1)
{
if (rCount >= 30 && countsR[i] <= 1)
retVal1 = 1;
else if (rCount < 30 && countsR[i] == 0)
retVal1 = 1;
else
retVal1 = 0;
}
}
if (NUCS[i] == minorL.nuc && (minorL.nuc != majorR.nuc && minorL.nuc != minorR.nuc))
{
// if (countsL1[i] >= 2 && countsL2[i] >= 1)
// retVal2 = 1;
// else if (countsL1[i] > 0 && countsL2[i] > 0 && countsL3[i] > 0)
// retVal2 = 1;
if (countsL1[i] > 0 && countsL2[i] > 0 && countsL1[i] + countsL2[i] >= 3)
retVal2 = 1;
else if (countsL1[i] > 0 && countsL3[i] > 0 && countsL1[i] + countsL3[i] >= 3)
retVal2 = 1;
else if (countsL2[i] > 0 && countsL3[i] > 0 && countsL2[i] + countsL3[i] >= 3)
retVal2 = 1;
if (retVal2)
{
if (rCount >= 30 && countsR[i] <= 1)
retVal2 = 1;
else if (rCount < 30 && countsR[i] == 0)
retVal2 = 1;
else
retVal2 = 0;
}
}
}
if (!retVal1 && !retVal2)
return 0;
-
-float freqL = ((float) minorL.count) / ((float) majorL.count + (float) minorL.count);
-float freqR = ((float) minorR.count) / ((float) majorR.count + (float) minorR.count);
-
-*hetL = freqL;
-*hetR = freqR;
-
*callL1 = majorL.nuc;
*callL2 = minorL.nuc;
*callR1 = majorR.nuc;
*callR2 = minorR.nuc;
if (majorL.nuc != majorR.nuc && majorL.nuc != minorR.nuc)
return 1;
if (minorL.nuc != 'N' && (minorL.nuc != majorR.nuc && minorL.nuc != minorR.nuc))
return 1;
return 0;
}
static int calc_cnv(dual_pileup_t *d, analysis_params_t *ap)
{
if ((ap->tid != BAD_TID) && (ap->tid != d->tidL)) // onto next chromosome
return 2;
int i;
const bam_pileup1_t *p;
if ((ap->prevTid != BAD_TID) && (ap->prevTid == d->tidL))
{
for (i = 0; i < d->nL; ++i)
{
p = d->puL + i;
bam1_core_t *c = &p->b->core;
if ((c)->pos <= ap->prevStop)
return 0;
}
for (i = 0; i < d->nR; ++i)
{
p = d->puR + i;
bam1_core_t *c = &p->b->core;
if ((c)->pos <= ap->prevStop)
return 0;
}
}
ap->tid = d->tidL;
int32_t start, stop;
int32_t lstart = ap->lstart, lstop = ap->lstop;
int32_t prevPos = lstop - 1;
for (i = 0; i < d->nL; ++i)
{
p = d->puL + i;
bam1_core_t *c = &p->b->core;
if ((c)->qual < ap->minMapQ)
continue;
if ( !((c)->flag & BAM_FUNMAP) && !((c)->flag & BAM_FDUP) )
{ // read is mapped and not an optical/PCR dupe
start = (c)->pos;
stop = start + (c)->l_qseq;
if (lstart == -1)
{
lstart = start;
lstop = start + 1;
ap->lcounts += 1;
}
else if ((start > prevPos) && (start > lstart)) // && stop > lstop))
{ // make sure there are no possible dupes by removing all by
// comparing left-most position of each read, only allowing one per pos.
prevPos = start;
lstop = start + 1;
ap->lcounts += 1;
}
}
}
int32_t rstart = ap->rstart, rstop = ap->rstop;
prevPos = rstop - 1;
for (i = 0; i < d->nR; ++i)
{
p = d->puR + i;
bam1_core_t *c = &p->b->core;
if ((c)->qual < ap->minMapQ)
continue;
if ( !((c)->flag & BAM_FUNMAP) && !((c)->flag & BAM_FDUP) )
{ // read is mapped and not an optical/PCR dupe
start = (c)->pos;
stop = start + (c)->l_qseq;
if (rstart == -1)
{
rstart = start;
rstop = start + 1;
ap->rcounts += 1;
}
else if ((start > prevPos) && (start > rstart)) // && (stop > rstop))
{ // make sure there are no possible dupes by removing all by
// comparing left-most position of each read, only allowing one per pos.
prevPos = start;
rstop = start + 1;
ap->rcounts += 1;
}
}
}
ap->lstart = lstart;
ap->lstop = lstop;
ap->rstart = rstart;
ap->rstop = rstop;
if ((ap->lcounts >= ap->readsPerBin) || (ap->rcounts >= ap->readsPerBin))
return 1;
return 0;
}
static int calc_ascn(dual_pileup_t *d, analysis_params_t *ap,
int maxS, int minS, int maxG, int minG)
{
if ((ap->tidAS != BAD_TID) && (ap->tidAS != d->tidL)) // onto next chromosome
return 2;
if ( ap->pStopPrev != -1 && (d->posL < ap->pStopPrev))
return 0; // within previous position's territory, avoid double-counting
ap->tidAS = d->tidL;
if (ap->startAS == -1)
{
ap->startAS = d->posL;
ap->stopAS = d->posL;
}
if (d->posL > ap->stopAS)
ap->stopAS = d->posL;
ap->lcountsMin += minS;
ap->lcountsMax += maxS;
ap->rcountsMin += minG;
ap->rcountsMax += maxG;
int totalL = ap->lcountsMin + ap->lcountsMax;
int totalR = ap->rcountsMin + ap->rcountsMax;
if (totalL >= ap->readsPerBin || totalR >= ap->readsPerBin)
return 1;
return 0;
}
static int is_het(dual_pileup_t *d, analysis_params_t *ap)
{
int i = 0;
int rCount = 0, maxCountR = 0;
for (i = 0; i < 4; i++)
{
if (countsR[i] < ap->minSupp)
continue;
rCount += countsR[i];
if (countsR[i] > maxCountR)
maxCountR = countsR[i];
}
double het = (double) maxCountR / (double) rCount;
if (het > 0.4 && het < 0.6)
return 1; // need a smarter test here.
return 0;
}
-static int is_sv(dual_pileup_t *d, analysis_params_t *ap)
-{
-return 0;
-}
-
static void print_nuc_info()
{
int i, first = 1;
for (i = 0; i < 4; i++)
{
if (countsR[i] == 0)
continue;
if (!first)
putchar(' ');
printf("%c:%d/%2.1f", NUCS[i], countsR[i], (double) votesR[i] / (double) countsR[i]);
first = 0;
}
putchar('\t');
first = 1;
for (i = 0; i < 4; i++)
{
if (countsL[i] == 0)
continue;
if (!first)
putchar(' ');
printf("%c:%d/%2.1f", NUCS[i], countsL[i], (double) votesL[i] / (double) countsL[i]);
first = 0;
}
putchar('\n');
}
-
-static void perform_analysis(dual_pileup_t *d, void *func_data)
+static void do_cnv(dual_pileup_t *d, analysis_params_t *ap)
{
-analysis_params_t *ap = (analysis_params_t *)func_data;
-
-int i, ret;
-if ((ret = calc_cnv(d, ap)) > 0)
- { // do overall cnv
- int start = ap->rstart;
- int stop = ap->rstop;
- if (ap->lstart < start)
+int ret;
+if ((ret = calc_cnv(d, ap)) == 0)
+ return;
+
+int start = ap->rstart;
+int stop = ap->rstop;
+if (ap->lstart < start)
start = ap->lstart;
- if (ap->lstop > stop)
+if (ap->lstop > stop)
stop = ap->lstop;
- printf("CNV\t%s\t%d\t%d\t%f\t%d\t%d\n",
+printf("CNV\t%s\t%d\t%d\t%f\t%d\t%d\n",
d->hL->target_name[ap->tid],
start, stop, (double) ap->lcounts / (double) ap->rcounts,
ap->rcounts, ap->lcounts);
- ap->prevTid = ap->tid;
- ap->prevStart = ap->lstart;
- ap->prevStop = ap->lstop;
+ap->prevTid = ap->tid;
+ap->prevStart = ap->lstart;
+ap->prevStop = ap->lstop;
- ap->lcounts = 0;
- ap->rcounts = 0;
+ap->lcounts = 0;
+ap->rcounts = 0;
- if (ret == 2)
+if (ret == 2)
{ // switching chroms
ap->tid = BAD_TID;
ap->lstart = -1;
ap->lstop = -1;
ap->rstart = -1;
ap->rstop = -1;
}
- else
+else
{ // on same chrom
ap->lstart = ap->lstop;
ap->rstart = ap->rstop;
}
+}
+
+#define NUM_CHROMS 100
+static int mateChromsL[NUM_CHROMS];
+static int mateChromsR[NUM_CHROMS];
+static int somaticSV[NUM_CHROMS];
+static int germlineSV[NUM_CHROMS];
+
+static int inter_struct_variant(dual_pileup_t *d, analysis_params_t *ap)
+{
+int i;
+for (i = 0; i < NUM_CHROMS; ++i)
+ {
+ mateChromsL[i] = 0;
+ mateChromsR[i] = 0;
+ somaticSV[i] = 0;
+ germlineSV[i] = 0;
+ }
+
+const bam_pileup1_t *p;
+int avgMapQ = 0;
+for (i = 0; i < d->nL; ++i)
+ {
+ p = d->puL + i;
+ avgMapQ += p->b->core.qual;
+ }
+if (((double)avgMapQ / (double) d->nL) < (double)ap->avgMapQ)
+ return 0;
+
+avgMapQ = 0;
+for (i = 0; i < d->nR; ++i)
+ {
+ p = d->puR + i;
+ avgMapQ += p->b->core.qual;
+ }
+if (((double)avgMapQ / (double) d->nR) < (double) ap->avgMapQ)
+ return 0;
+
+for (i = 0; i < d->nL; ++i)
+ {
+ p = d->puL + i;
+
+ if (p->b->core.qual < ap->minMapQ)
+ continue;
+
+ bam1_core_t *c = &p->b->core;
+ if (((c)->flag & BAM_FPAIRED) && !((c)->flag & BAM_FUNMAP))
+ { // read is paired and mapped.
+ if (!((c)->flag & BAM_FMUNMAP))
+ { // mate is mapped
+ if ((c)->mtid != (c)->tid) // different chroms.
+ mateChromsL[(c)->mtid]++;
+ }
+ }
+ }
+
+for (i = 0; i < d->nR; ++i)
+ {
+ p = d->puR + i;
+
+ if (p->b->core.qual < ap->minMapQ)
+ continue;
+
+ bam1_core_t *c = &p->b->core;
+ if (((c)->flag & BAM_FPAIRED) && !((c)->flag & BAM_FUNMAP))
+ { // read is paired and mapped.
+ if (!((c)->flag & BAM_FMUNMAP))
+ { // mate is mapped
+ if ((c)->mtid != (c)->tid) // different chroms
+ mateChromsR[(c)->mtid]++;
+ }
+ }
+ }
+
+int retVal = 0;
+for (i = 0; i < NUM_CHROMS; i++)
+ {
+ if (mateChromsL[i] < ap->minSupp)
+ continue;
+ retVal = 1;
+ if (mateChromsR[i] < ap->minSupp)
+ somaticSV[i] = mateChromsL[i];
+ else
+ germlineSV[i] = mateChromsL[i] + mateChromsR[i];
+ }
+
+return retVal;
+}
+
+
+
+
+static int intra_struct_variant(dual_pileup_t *d, analysis_params_t *ap)
+{
+intraL.tid = -1;
+intraL.qstart = -1;
+intraL.qstop = -1;
+intraL.mstart = -1;
+intraL.mstop = -1;
+intraL.supp = 0;
+intraL.orientation = -1;
+
+intraR.tid = -1;
+intraR.qstart = -1;
+intraR.qstop = -1;
+intraR.mstart = -1;
+intraR.mstop = -1;
+intraR.supp = 0;
+intraR.orientation = -1;
+
+// First check average mapping quality of read pairs
+int i;
+const bam_pileup1_t *p;
+int avgMapQ = 0;
+for (i = 0; i < d->nL; ++i)
+ {
+ p = d->puL + i;
+ avgMapQ += p->b->core.qual;
+ }
+if (((double)avgMapQ / (double) d->nL) < (double)ap->avgMapQ)
+ return 0;
+
+avgMapQ = 0;
+for (i = 0; i < d->nR; ++i)
+ {
+ p = d->puR + i;
+ avgMapQ += p->b->core.qual;
+ }
+if (((double)avgMapQ / (double) d->nR) < (double) ap->avgMapQ)
+ return 0;
+
+// Satisfied average mapping quality constraint
+
+int dupeL = 0, dupeR = 0;
+for (i = 0; i < d->nL; ++i)
+ {
+ p = d->puL + i;
+ if (p->b->core.qual < ap->minMapQ)
+ continue;
+ bam1_core_t *c = &p->b->core;
+ if (!((c)->flag & BAM_FPAIRED) || ((c)->flag & BAM_FUNMAP))
+ continue; // read is not paired or unmapped.
+ if ((c)->flag & BAM_FMUNMAP)
+ continue; // mate is unmapped.
+ if ((c)->mtid != (c)->tid)
+ continue; // mate is on different chrom. MERGE INTER CODE HERE?
+
+ int orientation = (bam1_strand(p->b) == bam1_mstrand(p->b));
+
+ int32_t qstart = (c)->pos;
+ int32_t qstop = qstart + (c)->l_qseq;
+ int32_t mstart = (c)->mpos;
+ int32_t mstop = mstart;
+ if (orientation)
+ mstop -= (c)->l_qseq;
+ else
+ mstop += (c)->l_qseq;
+
+ if (qstart <= ap->svStartL)
+ continue;
+ if (orientation || abs((c)->isize) > maxISize)
+ {
+ if (intraL.tid == -1)
+ {
+ intraL.tid = (c)->tid;
+ intraL.qstart = qstart;
+ intraL.qstop = qstop;
+ intraL.mstart = mstart;
+ intraL.mstop = mstop;
+ intraL.orientation = orientation;
+ intraL.supp = 1;
+ }
+ else if (qstart == intraL.qstart)
+ { // potential dupe... same start position.
+ dupeL += 1;
+ }
+ else if (qstart < (intraL.qstop + BUF) && qstop > (intraL.qstart - BUF) &&
+ mstart < (intraL.mstop + BUF) && mstop > (intraL.mstart - BUF))
+ {
+ if (intraL.qstart > qstart)
+ intraL.qstart = qstart;
+ if (intraL.qstop < qstop)
+ intraL.qstop = qstop;
+ if (intraL.mstart > mstart)
+ intraL.mstart = mstart;
+ if (intraL.mstop < mstop)
+ intraL.mstop = mstop;
+
+ intraL.supp += 1;
+ }
+ }
+ }
+
+for (i = 0; i < d->nR; ++i)
+ {
+ p = d->puR + i;
+ if (p->b->core.qual < ap->minMapQ)
+ continue;
+ bam1_core_t *c = &p->b->core;
+ if (!((c)->flag & BAM_FPAIRED) || ((c)->flag & BAM_FUNMAP))
+ continue; // read is not paired or unmapped.
+ if ((c)->flag & BAM_FMUNMAP)
+ continue; // mate is unmapped.
+ if ((c)->mtid != (c)->tid)
+ continue; // mate is on different chrom. MERGE INTER CODE HERE?
+
+ int orientation = (bam1_strand(p->b) == bam1_mstrand(p->b));
+
+ int32_t qstart = (c)->pos;
+ int32_t qstop = qstart + (c)->l_qseq;
+ int32_t mstart = (c)->mpos;
+ int32_t mstop = mstart;
+ if (orientation)
+ mstop -= (c)->l_qseq;
+ else
+ mstop += (c)->l_qseq;
+
+ if (qstart <= ap->svStartR)
+ continue;
+
+ if (orientation || abs((c)->isize) > maxISize)
+ {
+ if (intraR.tid == -1)
+ {
+ intraR.tid = (c)->tid;
+ intraR.qstart = qstart;
+ intraR.qstop = qstop;
+ intraR.mstart = mstart;
+ intraR.mstop = mstop;
+ intraR.orientation = orientation;
+ intraR.supp = 1;
+ }
+ else if (qstart == intraR.qstart)
+ { // potential dupe... same start position.
+ dupeR += 1;
+ }
+ else if (qstart < (intraR.qstop + BUF) && qstop > (intraR.qstart - BUF) &&
+ mstart < (intraR.mstop + BUF) && mstop > (intraR.mstart - BUF))
+ {
+ if (intraR.qstart > qstart)
+ intraR.qstart = qstart;
+ if (intraR.qstop < qstop)
+ intraR.qstop = qstop;
+ if (intraR.mstart > mstart)
+ intraR.mstart = mstart;
+ if (intraR.mstop < mstop)
+ intraR.mstop = mstop;
+
+ intraR.supp += 1;
+ }
+ }
+ }
+
+if (intraL.supp > ap->minSupp)
+ {
+ ap->svStartL = intraL.qstart;
+ ap->svStopL = intraL.qstop;
+
+ ap->svStartR = intraR.qstart;
+ ap->svStopR = intraR.qstop;
+ return 1;
+ }
+
+return 0;
+}
+
+static int indel_variant(dual_pileup_t *d, analysis_params_t *ap,
+ int *isIns, int *isDel, int *insGerm, int *delGerm)
+{
+if (insL >= ap->minSupp)
+ {
+ *isIns = 1;
+ if (insR > 0) // if you see even once, believe it.
+ *insGerm = 1;
+ else
+ *insGerm = 0;
+ return 1;
+ }
+
+if (delL >= ap->minSupp)
+ {
+ *isDel = 1;
+ if (delR > 0) // if you see even once, believe it.
+ *delGerm = 1;
+ else
+ *delGerm = 0;
+ return 1;
+ }
+
+return 0;
+}
+
+
+static void do_struct_var(dual_pileup_t *d, analysis_params_t *ap)
+{
+int i;
+if (0 && inter_struct_variant(d, ap))
+ {
+ printf("SVS\t%s\t%d\t%d\t",
+ d->hL->target_name[d->tidL], d->posL, d->posL + 1);
+
+ for (i = 0; i < NUM_CHROMS; ++i)
+ {
+ if (somaticSV[i] > 0)
+ printf("SSV:%s(%d),", d->hL->target_name[i], somaticSV[i]);
+ if (germlineSV[i] > 0)
+ printf("GSV:%s(%d),", d->hL->target_name[i], germlineSV[i]);
+ }
+ putchar('\n');
}
-if (is_sv(d, ap))
- { // do struct vars
+if (intra_struct_variant(d, ap))
+ {
printf("SVS\t%s\t%d\t%d\t",
d->hL->target_name[d->tidL], d->posL, d->posL + 1);
- print_nuc_info();
+ if (intraR.supp > 0)
+ { // possibly germline, see if they overlap
+ if (intraR.qstart < (intraL.qstop + BUF) && intraR.qstop > (intraL.qstart - BUF))
+ printf("GIC\t%d,%d(%d-%d)>(%d-%d)\t%d,%d(%d-%d)>(%d-%d)",
+ intraR.orientation, intraR.supp, intraR.qstart, intraR.qstop,
+ intraR.mstart, intraR.mstop,
+ intraL.orientation, intraL.supp, intraL.qstart, intraL.qstop,
+ intraL.mstart, intraL.mstop);
}
+ else
+ printf("SIC\t%d,%d(%d-%d)>(%d-%d)",
+ intraL.orientation, intraL.supp, intraL.qstart, intraL.qstop,
+ intraL.mstart, intraL.mstop);
-nuc_counts(d, ap);
-if (is_het(d, ap))
- { // do ascn & phasing
- char minGa = 'N';
- char maxGa = 'N';
- char minSa = 'N';
- char maxSa = 'N';
- int minGc = 0;
- int maxGc = 0;
- int minSc = 0;
- int maxSc = 0;
- for (i = 0; i < 4; i++)
+ putchar('\n');
+ }
+
+int isIns = 0, isDel = 0, insGerm = 0, delGerm = 0;
+if (indel_variant(d, ap, &isIns, &isDel, &insGerm, &delGerm))
+ {
+ printf("IDL\t%s\t%d\t%d\t",
+ d->hL->target_name[d->tidL], d->posL, d->posL + 1);
+
+ if (isIns && insGerm)
+ printf("GIns\t(%d,%d),", insR, insL);
+ else if (isIns && !insGerm)
+ printf("SIns\t(%d,%d),", insR, insL);
+
+ if (isDel && delGerm)
+ printf("GDel\t(%d,%d)", delR, delL);
+ else if (isDel && !delGerm)
+ printf("SDel\t(%d,%d)", delR, delL);
+
+ putchar('\n');
+ }
+}
+
+static void do_het_analysis(dual_pileup_t *d, analysis_params_t *ap)
+{
+if (!is_het(d, ap))
+ return;
+
+// do ascn & phasing
+char minGa = 'N', maxGa = 'N', minSa = 'N', maxSa = 'N';
+int i, minGc = 0, maxGc = 0, minSc = 0, maxSc = 0;
+for (i = 0; i < 4; i++)
{
int cr = countsR[i];
int cl = countsL[i];
if (cr > maxGc)
{
minGc = maxGc;
minGa = maxGa;
maxGc = cr;
maxGa = NUCS[i];
}
else if (cr > minGc)
{
minGc = cr;
minGa = NUCS[i];
}
if (cl > maxSc)
{
minSc = maxSc;
minSa = maxSa;
maxSc = cl;
maxSa = NUCS[i];
}
else if (cl > minSc)
{
minSc = cl;
minSa = NUCS[i];
}
}
- if (minSc < ap->minSupp)
+if (minSc < ap->minSupp)
{
minSc = 0;
minSa = 'N';
}
- char g1 = 'N';
- char g2 = 'N';
- if (maxSa == maxGa)
+char g1 = 'N', g2 = 'N';
+if (maxSa == maxGa)
{
g1 = maxGa;
g2 = minGa;
}
- else if (maxSa == minGa)
+else if (maxSa == minGa)
{
g1 = minGa;
g2 = maxGa;
}
- // else, weird case!
+// else, weird case!
- if (g1 != 'N' && g2 != 'N')
- {
- printf("HAP\t%s\t%d\t%d\t%c %c\t%c %c\t",
+if (g1 == 'N' || g2 == 'N')
+ return;
+
+printf("HAP\t%s\t%d\t%d\t%c %c\t%c %c\t",
d->hL->target_name[d->tidL], d->posL, d->posL + 1,
g1, g2, maxSa, minSa);
- print_nuc_info();
+print_nuc_info();
- if ((ret = calc_ascn(d, ap, maxSc, minSc, maxGc, minGc)) > 0)
- {
- double minCN = (double) (ap->lcountsMin) / (double) (ap->rcountsMin + ap->rcountsMax);
- double maxCN = (double) (ap->lcountsMax) / (double) (ap->rcountsMin + ap->rcountsMax);
+int ret;
+if ((ret = calc_ascn(d, ap, maxSc, minSc, maxGc, minGc)) == 0)
+ return;
+
+double minCN = (double) (ap->lcountsMin) / (double) (ap->rcountsMin + ap->rcountsMax);
+double maxCN = (double) (ap->lcountsMax) / (double) (ap->rcountsMin + ap->rcountsMax);
- printf("ASC\t%s\t%d\t%d\t%f\t%f\t%d\t%d\t%d\t%d\n",
+printf("ASC\t%s\t%d\t%d\t%f\t%f\t%d\t%d\t%d\t%d\n",
d->hL->target_name[ap->tidAS],
ap->startAS, ap->stopAS,
maxCN, minCN,
ap->rcountsMax, ap->rcountsMin,
ap->lcountsMax, ap->lcountsMin);
- ap->startAS = -1;
- ap->stopAS = -1;
- ap->lcountsMax = 0;
- ap->lcountsMin = 0;
- ap->rcountsMax = 0;
- ap->rcountsMin = 0;
+ap->startAS = -1;
+ap->stopAS = -1;
+ap->lcountsMax = 0;
+ap->lcountsMin = 0;
+ap->rcountsMax = 0;
+ap->rcountsMin = 0;
- if (ret == 2)
+ap->pStartPrev = ap->pStart;
+ap->pStopPrev = ap->pStop;
+
+if (ret == 2)
{
ap->tidAS = BAD_TID;
ap->pStartPrev = -1;
ap->pStopPrev = -1;
}
- }
- ap->pStartPrev = ap->pStart;
- ap->pStopPrev = ap->pStop;
- }
- }
+}
-int stateL, stateR;
+static void do_mut(dual_pileup_t *d, analysis_params_t *ap)
+{
char callL1, callL2, callR1, callR2;
-float hetL, hetR;
+if (!is_mut(d, ap, &callL1, &callL2, &callR1, &callR2))
+ return;
-if (is_mut(d, ap, &callL1, &callL2, &callR1, &callR2,
- &hetL, &hetR, &stateL, &stateR, &ap->sites))
- {
- printf("MUT\t%s\t%d\t%d\t",
+printf("MUT\t%s\t%d\t%d\t",
d->hL->target_name[d->tidL], d->posL, d->posL + 1);
- if (callR2 == 'N')
+if (callR2 == 'N')
printf("%c>", callR1);
- else
+else
printf("%c/%c>", callR1, callR2);
- if (callL2 == 'N')
+if (callL2 == 'N')
printf("%c\t", callL1);
- else
+else
printf("%c/%c\t", callL1, callL2);
- double chi_sq = sig_diff(d, ap);
- printf("%f\t", chi_sq);
- print_nuc_info();
- }
+double chi_sq = sig_diff(d, ap);
+printf("%f\t", chi_sq);
+print_nuc_info();
+}
+
+
+
+static void perform_analysis(dual_pileup_t *d, void *func_data)
+{
+analysis_params_t *ap = (analysis_params_t *)func_data;
+
+/* processes pileup for het and mut analysis */
+nuc_counts(d, ap);
+
+/* overall cnv */
+do_cnv(d, ap);
+
+/* structural variation */
+do_struct_var(d, ap);
+
+/* het analysis: ASCN & phasing */
+do_het_analysis(d, ap);
+
+/* mutation discovery */
+do_mut(d, ap);
fflush(stdout);
}
void bamBam(char *inbamL, char *inbamR, int mask)
{
analysis_params_t ap;
ap.minQ = minQ;
ap.minMapQ = minMapQ;
+ap.avgMapQ = avgMapQ;
+ap.maxISize = maxISize;
ap.minChiSq = minChiSq;
ap.minSupp = minSupport;
ap.sites = 0;
-ap.readLength = readLength;
ap.readsPerBin = readsPerBin;
ap.pStart = -1;
ap.pStop = -1;
ap.pStartPrev = -1;
ap.pStopPrev = -1;
/* Overall Copy Number */
ap.tid = BAD_TID;
ap.lstart = -1;
ap.lstop = -1;
ap.lcounts = 0;
ap.rstart = -1;
ap.rstop = -1;
ap.rcounts = 0;
ap.prevTid = BAD_TID;
ap.prevStart = -1;
ap.prevStop = -1;
/* ASCN */
ap.tidAS = BAD_TID;
ap.startAS = -1;
ap.stopAS = -1;
ap.lcountsMin = 0;
ap.lcountsMax = 0;
ap.rcountsMin = 0;
ap.rcountsMax = 0;
+/* SV */
+ap.svStartL = -1;
+ap.svStopL = -1;
+
+ap.svStartR = -1;
+ap.svStopR = -1;
+
bam_bam_file(inbamL, inbamR, position, mask, perform_analysis, &ap);
-fprintf(stderr, "# qualified sites = %d\n", ap.sites);
}
int main(int argc, char *argv[])
/* Process command line. */
{
optionInit(&argc, argv, options);
if (argc < 3)
usage();
if (optionExists("minSupport"))
minSupport = optionInt("minSupport", 2);
if (optionExists("minQ"))
minQ = optionInt("minQ", 0);
if (optionExists("minMapQ"))
minMapQ = optionInt("minMapQ", 0);
+if (optionExists("avgMapQ"))
+ avgMapQ = optionInt("avgMapQ", 0);
+if (optionExists("maxISize"))
+ maxISize = optionInt("maxISize", 10000);
if (optionExists("minChiSq"))
minChiSq = optionInt("minChiSq", 0);
if (optionExists("position"))
position = optionVal("position", NULL);
if (optionExists("readsPerBin"))
readsPerBin = optionInt("readsPerBin", 500);
-if (optionExists("readLength"))
- readLength = optionInt("readLength", 100);
char *inbamL = argv[1];
char *inbamR = argv[2];
bamBam(inbamL, inbamR, BAM_DEF_MASK);
return 0;
}
#else // USE_BAM not defined
int main(int argc, char *argv[])
/* Process command line. */
{
printf("BAM not installed");
return 0;
}
#endif