src/hg/tcga/pBamBam/asBamBam.c 1.10
1.10 2010/04/22 18:10:50 jsanborn
added ref FASTA reading
Index: src/hg/tcga/pBamBam/asBamBam.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/tcga/pBamBam/asBamBam.c,v
retrieving revision 1.9
retrieving revision 1.10
diff -b -B -U 4 -r1.9 -r1.10
--- src/hg/tcga/pBamBam/asBamBam.c 15 Mar 2010 00:24:27 -0000 1.9
+++ src/hg/tcga/pBamBam/asBamBam.c 22 Apr 2010 18:10:50 -0000 1.10
@@ -26,8 +26,9 @@
"asBamBam - Allele-Specific BamBam.\n"
"usage:\n"
" asBamBam [options] <left.bam> <right.bam>\n"
"options:\n"
+ " -fa=file - FASTA filename of reference [NULL]\n"
" -position=chr1:1-100 - Position to do analysis. [NULL=process all reads]\n"
" -minSuppSNP=INT - Minimum Support for SNP/Het Calling. [4]\n"
" -minSuppSV=INT - Minimum Support for struct vars. [2]\n"
" -minQ=INT - Minimum acceptable base quality. [10]\n"
@@ -40,8 +41,9 @@
);
}
static struct optionSpec options[] = {
+ {"fa", OPTION_STRING},
{"position", OPTION_STRING},
{"minSuppSNP", OPTION_INT},
{"minSuppSV", OPTION_INT},
{"minQ", OPTION_INT},
@@ -55,8 +57,9 @@
#define BAD_TID -999
/* Option variables */
+char *faidx = NULL;
char *position = NULL;
int minSuppSNP = 4;
int minSuppSV = 2;
int minQ = 10;
@@ -149,11 +152,15 @@
int supp;
int qual;
int mstrand;
int qstrand;
+
+ int mdir; // trend of mate directions
+ int pmstart; // previos start
} sv_t;
-#define BUF 100 // 500 base buffer around start/stop
+#define BUF 100 // buffer around reads' starts/stops
+
static sv_t intraL, intraR;
static sv_t interL, interR;
static int nuc_index(char c)
@@ -233,13 +240,12 @@
int32_t start, stop;
int32_t prevStart = -1;
const bam_pileup1_t *p;
-char best_n1 = 'N';
+char best_b1 = 'N';
int best_q1 = -1;
-char best_n2 = 'N';
+char best_b2 = 'N';
int best_q2 = -1;
-
int insertion = 0, deletion = 0;
for (i = 0; i < d->nL; ++i)
{
@@ -252,10 +258,8 @@
/* Check if start & stop has already been visited, to dupes missed in prev step*/
start = (c)->pos;
stop = start + (c)->l_qseq;
- //if (start <= prevStart && stop <= prevStop)
-// continue;
if (ap->pStart == -1)
{
ap->pStart = start;
ap->pStop = stop;
@@ -285,10 +289,8 @@
}
else
{
int b = toupper(bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]);
- if ((index = nuc_index(b)) < 0)
- continue;
if (best_b1 == 'N')
{
best_b1 = b;
best_q1 = qual;
@@ -316,9 +318,9 @@
else if (deletion)
delL += 1;
double lseq = (double) p->b->core.l_qseq;
- if (!(index = nuc_index(best_b1)) < 0)
+ if ((index = nuc_index(best_b1)) >= 0)
{
votesL[index] += best_q1;
countsL[index]++;
@@ -329,9 +331,9 @@
else
countsL3[index]++;
}
- if (!(index = nuc_index(best_b2)) < 0)
+ if ((index = nuc_index(best_b2)) >= 0)
{
votesL[index] += best_q2;
countsL[index]++;
@@ -347,8 +349,38 @@
best_b2 = 'N';
best_q2 = -1;
insertion = 0, deletion = 0;
+
+ if (p->is_del)
+ {
+ if (p->indel > 0)
+ insertion = 1;
+ else if (p->indel < 0)
+ deletion = 1;
+ }
+ else
+ {
+ int b = toupper(bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]);
+ if (best_b1 == 'N')
+ {
+ best_b1 = b;
+ best_q1 = qual;
+ }
+ else if (qual > best_q1)
+ {
+ best_b2 = best_b1;
+ best_q2 = best_q1;
+
+ best_b1 = b;
+ best_q1 = qual;
+ }
+ else if (qual > best_q2)
+ {
+ best_b2 = b;
+ best_q2 = qual;
+ }
+ }
}
prevStart = -1;
for (i = 0; i < d->nR; ++i)
@@ -361,12 +393,11 @@
/* Check if start & stop has already been visited, to dupes missed in prev step*/
start = (c)->pos;
stop = start + (c)->l_qseq;
- if (start <= prevStart && stop <= prevStop)
+ if (start <= prevStart)
continue;
prevStart = start;
- prevStop = stop;
if (ap->pStart == -1)
{
ap->pStart = start;
@@ -514,9 +545,9 @@
return chi_sq;
}
-static int is_mut(dual_pileup_t *d, analysis_params_t *ap,
+static int is_mut(dual_pileup_t *d, analysis_params_t *ap, char *ref,
char *callL1, char *callL2, char *callR1, char *callR2)
{
if (majorR.vote == 0 || majorL.vote == 0)
return 0; // most likely a bad quality region.
@@ -531,8 +562,9 @@
if (rCount < 10)
return 0;
+char rb = (d->ref && (int)d->posL < d->len)? d->ref[d->posL] : 'N';
int retVal1 = 0, retVal2 = 0;
for (i = 0; i < 4; i++)
{
@@ -582,8 +614,9 @@
if (!retVal1 && !retVal2)
return 0;
+*ref = rb;
*callL1 = majorL.nuc;
*callL2 = minorL.nuc;
*callR1 = majorR.nuc;
*callR2 = minorR.nuc;
@@ -836,8 +866,10 @@
interL.supp = 0;
interL.qual = 0;
interL.qstrand = -1;
interL.mstrand = -1;
+interL.mdir = 0;
+interL.pmstart = -1;
interR.tid = -1;
interR.mtid = -1;
interR.qstart = -1;
@@ -847,8 +879,10 @@
interR.supp = 0;
interR.qual = 0;
interR.qstrand = -1;
interR.mstrand = -1;
+interR.mdir = 0;
+interR.pmstart = -1;
}
static inline void reset_intra()
{
@@ -860,8 +894,10 @@
intraL.supp = 0;
intraL.qual = 0;
intraL.qstrand = -1;
intraL.mstrand = -1;
+intraL.mdir = 0;
+intraL.pmstart = -1;
intraR.tid = -1;
intraR.qstart = -1;
intraR.qstop = -1;
@@ -870,8 +906,10 @@
intraR.supp = 0;
intraR.qual = 0;
intraR.qstrand = -1;
intraR.mstrand = -1;
+intraR.mdir = 0;
+intraR.pmstart = -1;
}
static inline void update_sv(const bam_pileup1_t *p, sv_t *sv)
{
@@ -895,8 +933,10 @@
sv->mstart = mstart;
sv->mstop = mstop;
sv->mstrand = mstrand;
sv->qstrand = qstrand;
+ sv->mdir = 0;
+ sv->pmstart = mstart;
sv->qual = (c)->qual;
sv->supp = 1;
}
else if ((c)->tid == sv->tid && (c)->mtid == sv->mtid &&
@@ -912,8 +952,11 @@
sv->mstart = mstart;
if (sv->mstop < mstop)
sv->mstop = mstop;
+ sv->mdir += (mstart - sv->pmstart);
+ sv->pmstart = mstart;
+
sv->qual += (c)->qual;
sv->supp += 1;
}
}
@@ -1048,9 +1091,17 @@
/* check average mapping quality of discordants */
int qual = ceil((double) intraL.qual / (double) intraL.supp);
if (intraL.supp >= ap->minSuppSV && qual >= ap->avgMapQ)
+ { // check mate pair direction trend... should be positive for properly paired (opposite strand)
+
+ //return 1;
+
+ if ((intraL.qstrand == intraL.mstrand && intraL.mdir < 0) ||
+ (intraL.qstrand != intraL.mstrand && intraL.mdir > 0))
return 1;
+ }
+
reset_intra();
}
return 0;
}
@@ -1064,9 +1115,16 @@
/* check average mapping quality of discordants */
int qual = ceil((double) interL.qual / (double) interL.supp);
if (interL.supp >= ap->minSuppSV && qual >= ap->avgMapQ)
+ { // check mate pair direction trend... should be positive for properly paired (opposite strand)
+
+ //return 1;
+
+ if ((interL.qstrand == interL.mstrand && interL.mdir < 0) ||
+ (interL.qstrand != interL.mstrand && interL.mdir > 0))
return 1;
+ }
reset_inter();
}
return 0;
}
@@ -1302,15 +1360,16 @@
}
static void do_mut(dual_pileup_t *d, analysis_params_t *ap)
{
-char callL1, callL2, callR1, callR2;
-if (!is_mut(d, ap, &callL1, &callL2, &callR1, &callR2))
+char ref, callL1, callL2, callR1, callR2;
+if (!is_mut(d, ap, &ref, &callL1, &callL2, &callR1, &callR2))
return;
printf("MUT\t%s\t%d\t%d\t",
d->hL->target_name[d->tidL], d->posL, d->posL + 1);
+printf("%c\t", ref);
if (callR2 == 'N')
printf("%c>", callR1);
else
printf("%c/%c>", callR1, callR2);
@@ -1348,9 +1407,9 @@
fflush(stdout);
}
-void bamBam(char *inbamL, char *inbamR, int mask)
+void bamBam(char *inbamL, char *inbamR, char *faidx, int mask)
{
analysis_params_t ap;
ap.minQ = minQ;
ap.minMapQ = minMapQ;
@@ -1398,9 +1457,9 @@
reset_intra();
reset_inter();
-bam_bam_file(inbamL, inbamR, position, mask, perform_analysis, &ap);
+bam_bam_file(inbamL, inbamR, faidx, position, mask, perform_analysis, &ap);
}
int main(int argc, char *argv[])
@@ -1409,8 +1468,10 @@
optionInit(&argc, argv, options);
if (argc < 3)
usage();
+if (optionExists("fa"))
+ faidx = optionVal("fa", NULL);
if (optionExists("position"))
position = optionVal("position", NULL);
if (optionExists("minSuppSNP"))
minSuppSNP = optionInt("minSuppSNP", 4);
@@ -1430,9 +1491,9 @@
readsPerBin = optionInt("readsPerBin", 2500);
char *inbamL = argv[1];
char *inbamR = argv[2];
-bamBam(inbamL, inbamR, BAM_DEF_MASK);
+bamBam(inbamL, inbamR, faidx, BAM_DEF_MASK);
return 0;
}