src/hg/tcga/bamBam/bamBamCNV.c 1.1
1.1 2009/11/06 20:27:29 jsanborn
added file
Index: src/hg/tcga/bamBam/bamBamCNV.c
===================================================================
RCS file: src/hg/tcga/bamBam/bamBamCNV.c
diff -N src/hg/tcga/bamBam/bamBamCNV.c
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ src/hg/tcga/bamBam/bamBamCNV.c 6 Nov 2009 20:27:29 -0000 1.1
@@ -0,0 +1,271 @@
+/* bamBam -- discovering variants from BAM file
+ * code adapted from Heng Li's Samtools library functions
+ */
+
+#ifdef USE_BAM
+
+#include "common.h"
+#include "hCommon.h"
+#include "linefile.h"
+#include "hash.h"
+#include "hdb.h"
+#include "options.h"
+#include "jksql.h"
+
+#define _IOLIB 2
+#include "bam.h"
+#include "sam.h"
+#include "bam_dual_pileup.h"
+
+static char const rcsid[] = "$Id";
+
+void usage()
+/* Explain usage and exit. */
+{
+errAbort(
+ "bamBamCNV - Perform copy number analysis on two BAM files.\n"
+ "usage:\n"
+ " bamBamCNV [options] <left.bam> <right.bam>\n"
+ "options:\n"
+ " -position=chrX:1-100 - Position to do analysis. [None]\n"
+ " -minMapQ=INT - Minimum acceptable mapping quality. [0]\n"
+ " -readsPerBin - Number of RIGHT reads to put in each cnv bin. [500]\n"
+ "\n"
+ );
+}
+
+static struct optionSpec options[] = {
+ {"position", OPTION_STRING},
+ {"minMapQ", OPTION_INT},
+ {"readsPerBin", OPTION_INT},
+ {NULL, 0}
+};
+
+#define BAD_TID -9999
+
+/* Option variables */
+char *position = NULL;
+int minMapQ = 0;
+int readsPerBin = 500;
+
+typedef struct {
+ int minMapQ;
+ int readsPerBin;
+ int sites;
+
+ int32_t tid;
+
+ int32_t lstart;
+ int32_t lstop;
+ int lcounts;
+
+ int32_t rstart;
+ int32_t rstop;
+ int rcounts;
+
+ int32_t prevTid;
+ int32_t prevStart;
+ int32_t prevStop;
+} analysis_params_t;
+
+static int calc_cnv(pileup_data_t *d, analysis_params_t *ap)
+{
+if ((ap->tid != BAD_TID) && (ap->tid != d->tidL)) // onto next chromosome
+ return 1;
+
+int i;
+const bam_pileup1_t *p;
+if ((ap->prevTid != BAD_TID) && (ap->prevTid == d->tidL))
+ {
+ for (i = 0; i < d->nL; ++i)
+ {
+ p = d->puL + i;
+ bam1_core_t *c = &p->b->core;
+ if ((c)->pos < ap->prevStop)
+ return 0;
+ }
+
+ for (i = 0; i < d->nR; ++i)
+ {
+ p = d->puR + i;
+ bam1_core_t *c = &p->b->core;
+ if ((c)->pos < ap->prevStop)
+ return 0;
+ }
+ }
+
+ap->tid = d->tidL;
+int32_t start, stop, rstart = ap->rstart, rstop = ap->rstop;
+int32_t prevPos = -1;
+for (i = 0; i < d->nR; ++i)
+ {
+ p = d->puR + i;
+ bam1_core_t *c = &p->b->core;
+ if ((c)->qual < ap->minMapQ)
+ continue;
+
+ if ( !((c)->flag & BAM_FUNMAP) && !((c)->flag & BAM_FDUP) )
+ { // read is mapped and not an optical/PCR dupe
+ start = (c)->pos;
+ stop = start + (c)->l_qseq;
+
+ if (rstart == -1)
+ {
+ rstart = start;
+ rstop = stop;
+ ap->rcounts += 1;
+ }
+ else if ((start > prevPos) && (start > rstart) && (stop > rstop))
+ { // make sure there are no possible dupes by removing all by
+ // comparing left-most position of each read, only allowing one per pos.
+ prevPos = start;
+ rstop = stop;
+ ap->rcounts += 1;
+ }
+ }
+ }
+
+int32_t lstart = ap->lstart, lstop = ap->lstop;
+prevPos = -1;
+for (i = 0; i < d->nL; ++i)
+ {
+ p = d->puL + i;
+ bam1_core_t *c = &p->b->core;
+ if ((c)->qual < ap->minMapQ)
+ continue;
+
+ if ( !((c)->flag & BAM_FUNMAP) && !((c)->flag & BAM_FDUP) )
+ { // read is mapped and not an optical/PCR dupe
+ start = (c)->pos;
+ stop = start + (c)->l_qseq;
+
+ if (!((start >= rstart) && (stop <= rstop)))
+ continue;
+
+ if (lstart == -1)
+ {
+ lstart = start;
+ lstop = stop;
+ ap->lcounts += 1;
+ }
+ else if ((start > prevPos) && (start > lstart && stop > lstop))
+ { // make sure there are no possible dupes by removing all by
+ // comparing left-most position of each read, only allowing one per pos.
+ prevPos = start;
+ lstop = stop;
+ ap->lcounts += 1;
+ }
+ }
+ }
+
+ap->lstart = lstart;
+ap->lstop = lstop;
+
+ap->rstart = rstart;
+ap->rstop = rstop;
+
+if (ap->rcounts >= ap->readsPerBin)
+ return 1;
+
+return 0;
+}
+
+static void perform_analysis(void *data)
+{
+pileup_data_t *d = (pileup_data_t*)data;
+analysis_params_t *ap = (analysis_params_t*)d->func_data;
+
+if (calc_cnv(d, ap))
+ {
+ //int32_t start = min(ap->lstart, ap->rstart);
+ //int32_t stop = max(ap->lstop, ap->rstop);
+ //int32_t len = (stop - start) + 1;
+
+ printf("%s\t%d\t%d\t%d\t%d\t%d\t%d\n",
+ d->hL->target_name[ap->tid],
+ ap->lstart, ap->lstop + 1,
+ ap->rstart, ap->rstop + 1,
+ ap->lcounts, ap->rcounts);
+
+ fflush(stdout);
+
+ ap->prevTid = ap->tid;
+ ap->prevStart = ap->rstart;
+ ap->prevStop = ap->rstop;
+
+ ap->tid = BAD_TID;
+ ap->lstart = -1;
+ ap->lstop = -1;
+ ap->lcounts = 0;
+
+ ap->rstart = -1;
+ ap->rstop = -1;
+ ap->rcounts = 0;
+ }
+
+ap->sites += 1;
+}
+
+void bamBam(char *inbamL, char *inbamR)
+{
+analysis_params_t ap;
+ap.minMapQ = minMapQ;
+ap.sites = 0;
+ap.readsPerBin = readsPerBin;
+
+ap.tid = BAD_TID;
+ap.lstart = -1;
+ap.lstop = -1;
+ap.lcounts = 0;
+
+ap.rstart = -1;
+ap.rstop = -1;
+ap.rcounts = 0;
+
+ap.prevTid = BAD_TID;
+ap.prevStart = -1;
+ap.prevStop = -1;
+
+bam_bam_file(inbamL, inbamR, position, perform_analysis, &ap);
+fprintf(stderr, "# qualified sites = %d\n", ap.sites);
+}
+
+
+int main(int argc, char *argv[])
+/* Process command line. */
+{
+// Print out command line as a comment.
+int i;
+putchar('#');
+for (i = 0; i < argc; i++)
+ printf(" %s", argv[i]);
+putchar('\n');
+
+optionInit(&argc, argv, options);
+if (argc < 3)
+ usage();
+
+if (optionExists("minMapQ"))
+ minMapQ = optionInt("minMapQ", 0);
+if (optionExists("readsPerBin"))
+ readsPerBin = optionInt("readsPerBin", 0);
+if (optionExists("position"))
+ position = optionVal("position", NULL);
+
+char *inbamL = argv[1];
+char *inbamR = argv[2];
+bamBam(inbamL, inbamR);
+
+return 0;
+}
+
+#else // USE_BAM not defined
+
+int main(int argc, char *argv[])
+/* Process command line. */
+{
+printf("BAM not installed");
+
+return 0;
+}
+#endif