src/hg/tcga/pBamBam/asBamBam.c 1.3
1.3 2010/03/07 02:36:46 jsanborn
updated
Index: src/hg/tcga/pBamBam/asBamBam.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/tcga/pBamBam/asBamBam.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -b -B -U 4 -r1.2 -r1.3
--- src/hg/tcga/pBamBam/asBamBam.c 6 Mar 2010 02:06:48 -0000 1.2
+++ src/hg/tcga/pBamBam/asBamBam.c 7 Mar 2010 02:36:46 -0000 1.3
@@ -22,20 +22,20 @@
void usage()
/* Explain usage and exit. */
{
errAbort(
- "asBamBam - Allele-specific BamBam.\n"
+ "asBamBam - Allele-Specific BamBam.\n"
"usage:\n"
" asBamBam [options] <left.bam> <right.bam>\n"
"options:\n"
" -position=chrX:1-100 - Position to do analysis. [None]\n"
- " -minSupport=INT - Minimum Support in Germline. [10]\n"
- " -minQ=INT - Minimum acceptable base quality. [0]\n"
- " -minMapQ=INT - Minimum acceptable mapping quality. [0]\n"
- " -avgMapQ=INT - Minimum acceptable average mapping quality. [0]\n"
- " -minChiSq=INT - Minimum acceptable chi square of nucleotide freqs. [0]\n"
+ " -minSupport=INT - Minimum Support in Germline. [4]\n"
+ " -minQ=INT - Minimum acceptable base quality. [10]\n"
+ " -minMapQ=INT - Minimum acceptable mapping quality. [20]\n"
+ " -avgMapQ=INT - Minimum acceptable average mapping quality. [30]\n"
+ " -minChiSq=INT - Minimum acceptable chi square of nucleotide freqs. [250]\n"
" -maxISize=INT - Maximum Insert Size. [10000]\n"
- " -readsPerBin=INT - Reads per CNV bin [500]\n"
+ " -readsPerBin=INT - Reads per CNV bin [2500]\n"
"\n"
);
}
@@ -54,17 +54,15 @@
#define BAD_TID -999
/* Option variables */
char *position = NULL;
-int minSupport = 2;
-int minQ = 0;
-int minMapQ = 0;
-int avgMapQ = 0;
+int minSupport = 4;
+int minQ = 10;
+int minMapQ = 20;
+int avgMapQ = 30;
int maxISize = 10000;
-int minChiSq = 0;
-int readsPerBin = 500;
-int readLength = 100;
-
+int minChiSq = 250;
+int readsPerBin = 2500;
typedef struct {
int minQ;
int minMapQ;
@@ -73,9 +71,8 @@
int minChiSq;
int minSupp;
int sites;
- int readLength;
int readsPerBin;
int pStartPrev;
int pStopPrev;
@@ -144,13 +141,15 @@
int32_t mstart;
int32_t mstop;
int supp;
- int orientation;
+ int qual;
+ int mstrand;
+ int qstrand;
} sv_t;
-#define BUF 500 // 500 base buffer around start/stop
-sv_t intraL, intraR;
+#define BUF 100 // 500 base buffer around start/stop
+static sv_t intraL, intraR;
sv_t interL, interR;
static int nuc_index(char c)
{
@@ -851,53 +850,41 @@
return retVal;
}
-
-
-
-static int intra_struct_variant(dual_pileup_t *d, analysis_params_t *ap)
+static inline void reset_intra()
{
intraL.tid = -1;
intraL.qstart = -1;
intraL.qstop = -1;
intraL.mstart = -1;
intraL.mstop = -1;
intraL.supp = 0;
-intraL.orientation = -1;
+intraL.qual = 0;
+intraL.qstrand = -1;
+intraL.mstrand = -1;
intraR.tid = -1;
intraR.qstart = -1;
intraR.qstop = -1;
intraR.mstart = -1;
intraR.mstop = -1;
intraR.supp = 0;
-intraR.orientation = -1;
+intraR.qual = 0;
+intraR.qstrand = -1;
+intraR.mstrand = -1;
+}
+static int intra_struct_variant(dual_pileup_t *d, analysis_params_t *ap)
+{
// First check average mapping quality of read pairs
int i;
const bam_pileup1_t *p;
-int avgMapQ = 0;
-for (i = 0; i < d->nL; ++i)
- {
- p = d->puL + i;
- avgMapQ += p->b->core.qual;
- }
-if (((double)avgMapQ / (double) d->nL) < (double)ap->avgMapQ)
- return 0;
-
-avgMapQ = 0;
-for (i = 0; i < d->nR; ++i)
- {
- p = d->puR + i;
- avgMapQ += p->b->core.qual;
- }
-if (((double)avgMapQ / (double) d->nR) < (double) ap->avgMapQ)
- return 0;
-
-// Satisfied average mapping quality constraint
+int deltaSupp = intraL.supp + intraR.supp;
-int dupeL = 0, dupeR = 0;
+int maxStartL = -1, maxStartR = -1;
+int nL = 0, nR = 0;
+//int dupeL = 0, dupeR = 0;
for (i = 0; i < d->nL; ++i)
{
p = d->puL + i;
if (p->b->core.qual < ap->minMapQ)
@@ -909,37 +896,39 @@
continue; // mate is unmapped.
if ((c)->mtid != (c)->tid)
continue; // mate is on different chrom. MERGE INTER CODE HERE?
- int orientation = (bam1_strand(p->b) == bam1_mstrand(p->b));
+ int qstrand = bam1_strand(p->b);
+ int mstrand = bam1_mstrand(p->b);
+ int orientation = (qstrand == mstrand);
int32_t qstart = (c)->pos;
int32_t qstop = qstart + (c)->l_qseq;
int32_t mstart = (c)->mpos;
- int32_t mstop = mstart;
- if (orientation)
- mstop -= (c)->l_qseq;
- else
- mstop += (c)->l_qseq;
+ int32_t mstop = mstart + (c)->l_qseq;
+
+ if (qstart > maxStartL)
+ maxStartL = qstart;
- if (qstart <= ap->svStartL)
- continue;
if (orientation || abs((c)->isize) > maxISize)
{
+ if (qstart <= ap->svStartL)
+ continue;
+ ap->svStartL = qstart;
+ nL += 1;
+
if (intraL.tid == -1)
{
intraL.tid = (c)->tid;
intraL.qstart = qstart;
intraL.qstop = qstop;
intraL.mstart = mstart;
intraL.mstop = mstop;
- intraL.orientation = orientation;
+ intraL.qstrand = qstrand;
+ intraL.mstrand = mstrand;
+ intraL.qual = (c)->qual;
intraL.supp = 1;
}
- else if (qstart == intraL.qstart)
- { // potential dupe... same start position.
- dupeL += 1;
- }
else if (qstart < (intraL.qstop + BUF) && qstop > (intraL.qstart - BUF) &&
mstart < (intraL.mstop + BUF) && mstop > (intraL.mstart - BUF))
{
if (intraL.qstart > qstart)
@@ -950,8 +939,9 @@
intraL.mstart = mstart;
if (intraL.mstop < mstop)
intraL.mstop = mstop;
+ intraL.qual += (c)->qual;
intraL.supp += 1;
}
}
}
@@ -968,38 +958,39 @@
continue; // mate is unmapped.
if ((c)->mtid != (c)->tid)
continue; // mate is on different chrom. MERGE INTER CODE HERE?
- int orientation = (bam1_strand(p->b) == bam1_mstrand(p->b));
+ int qstrand = bam1_strand(p->b);
+ int mstrand = bam1_mstrand(p->b);
+ int orientation = (qstrand == mstrand);
int32_t qstart = (c)->pos;
int32_t qstop = qstart + (c)->l_qseq;
int32_t mstart = (c)->mpos;
- int32_t mstop = mstart;
- if (orientation)
- mstop -= (c)->l_qseq;
- else
- mstop += (c)->l_qseq;
+ int32_t mstop = mstart + (c)->l_qseq;
- if (qstart <= ap->svStartR)
- continue;
+ if (qstart > maxStartR)
+ maxStartR = qstart;
if (orientation || abs((c)->isize) > maxISize)
{
+ if (qstart <= ap->svStartR)
+ continue;
+ ap->svStartR = qstart;
+ nR += 1;
+
if (intraR.tid == -1)
{
intraR.tid = (c)->tid;
intraR.qstart = qstart;
intraR.qstop = qstop;
intraR.mstart = mstart;
intraR.mstop = mstop;
- intraR.orientation = orientation;
+ intraR.mstrand = mstrand;
+ intraR.qstrand = qstrand;
+ intraR.qual = (c)->qual;
intraR.supp = 1;
}
- else if (qstart == intraR.qstart)
- { // potential dupe... same start position.
- dupeR += 1;
- }
else if (qstart < (intraR.qstop + BUF) && qstop > (intraR.qstart - BUF) &&
mstart < (intraR.mstop + BUF) && mstop > (intraR.mstart - BUF))
{
if (intraR.qstart > qstart)
@@ -1010,21 +1001,35 @@
intraR.mstart = mstart;
if (intraR.mstop < mstop)
intraR.mstop = mstop;
+ intraR.qual += (c)->qual;
intraR.supp += 1;
}
}
}
-if (intraL.supp > ap->minSupp)
+if (intraL.supp > 0) // || intraR.supp > 0)
{
- ap->svStartL = intraL.qstart;
- ap->svStopL = intraL.qstop;
+ if (maxStartL <= intraL.qstop + BUF)
+ return 0; // there still might be reads to overlap
- ap->svStartR = intraR.qstart;
- ap->svStopR = intraR.qstop;
+ /* check average mapping quality of discordants */
+ int qual = ceil((double) intraL.qual / (double) intraL.supp);
+ if (intraL.supp >= ap->minSupp && qual >= ap->avgMapQ)
return 1;
+ reset_intra();
+ }
+
+return 0;
+
+deltaSupp = (intraL.supp + intraR.supp) - deltaSupp;
+if (deltaSupp == 0 && (intraL.supp > 0 || intraR.supp > 0) )
+ {
+ fprintf(stderr, "SIC pos = %d\n", d->posL);
+ if (intraL.supp >= ap->minSupp)
+ return 1;
+ reset_intra();
}
return 0;
}
@@ -1078,23 +1083,41 @@
{
printf("SVS\t%s\t%d\t%d\t",
d->hL->target_name[d->tidL], d->posL, d->posL + 1);
- if (intraR.supp > 0)
- { // possibly germline, see if they overlap
- if (intraR.qstart < (intraL.qstop + BUF) && intraR.qstop > (intraL.qstart - BUF))
- printf("GIC\t%d,%d(%d-%d)>(%d-%d)\t%d,%d(%d-%d)>(%d-%d)",
- intraR.orientation, intraR.supp, intraR.qstart, intraR.qstop,
- intraR.mstart, intraR.mstop,
- intraL.orientation, intraL.supp, intraL.qstart, intraL.qstop,
- intraL.mstart, intraL.mstop);
- }
+ char qsL = '+';
+ if (intraL.qstrand)
+ qsL = '-';
+ char msL = '+';
+ if (intraL.mstrand)
+ msL = '-';
+
+ char qsR = '+';
+ if (intraR.qstrand)
+ qsR = '-';
+ char msR = '+';
+ if (intraR.mstrand)
+ msR = '-';
+
+
+ if ( intraR.supp > 0 &&
+ (intraR.qstart < (intraL.qstop + BUF) && intraR.qstop > (intraL.qstart - BUF)) )
+ printf("GIC\t%d\t%3.1f\t%d\t%3.1f\t%c(%d-%d)>%c(%d-%d)\t%c(%d-%d)>%c(%d-%d)",
+ intraR.supp, (double) intraR.qual / (double) intraR.supp,
+ intraL.supp, (double) intraL.qual / (double) intraL.supp,
+ qsR, intraR.qstart, intraR.qstop,
+ msR, intraR.mstart, intraR.mstop,
+ qsL, intraL.qstart, intraL.qstop,
+ msL, intraL.mstart, intraL.mstop);
else
- printf("SIC\t%d,%d(%d-%d)>(%d-%d)",
- intraL.orientation, intraL.supp, intraL.qstart, intraL.qstop,
- intraL.mstart, intraL.mstop);
+ printf("SIC\t%d\t%3.1f\t%c(%d-%d)>%c(%d-%d)",
+ intraL.supp, (double) intraL.qual / (double) intraL.supp,
+ qsL, intraL.qstart, intraL.qstop,
+ msL, intraL.mstart, intraL.mstop);
putchar('\n');
+
+ reset_intra();
}
int isIns = 0, isDel = 0, insGerm = 0, delGerm = 0;
if (indel_variant(d, ap, &isIns, &isDel, &insGerm, &delGerm))
@@ -1309,8 +1332,10 @@
ap.svStartR = -1;
ap.svStopR = -1;
+reset_intra();
+
bam_bam_file(inbamL, inbamR, position, mask, perform_analysis, &ap);
}
@@ -1320,24 +1345,24 @@
optionInit(&argc, argv, options);
if (argc < 3)
usage();
+if (optionExists("position"))
+ position = optionVal("position", NULL);
if (optionExists("minSupport"))
- minSupport = optionInt("minSupport", 2);
+ minSupport = optionInt("minSupport", 4);
if (optionExists("minQ"))
- minQ = optionInt("minQ", 0);
+ minQ = optionInt("minQ", 10);
if (optionExists("minMapQ"))
- minMapQ = optionInt("minMapQ", 0);
+ minMapQ = optionInt("minMapQ", 20);
if (optionExists("avgMapQ"))
- avgMapQ = optionInt("avgMapQ", 0);
+ avgMapQ = optionInt("avgMapQ", 30);
if (optionExists("maxISize"))
maxISize = optionInt("maxISize", 10000);
if (optionExists("minChiSq"))
- minChiSq = optionInt("minChiSq", 0);
-if (optionExists("position"))
- position = optionVal("position", NULL);
+ minChiSq = optionInt("minChiSq", 250);
if (optionExists("readsPerBin"))
- readsPerBin = optionInt("readsPerBin", 500);
+ readsPerBin = optionInt("readsPerBin", 2500);
char *inbamL = argv[1];
char *inbamR = argv[2];
bamBam(inbamL, inbamR, BAM_DEF_MASK);