src/hg/tcga/bamVariants/bamVariants.c 1.1

1.1 2009/09/22 17:40:54 jsanborn
initial commit
Index: src/hg/tcga/bamVariants/bamVariants.c
===================================================================
RCS file: src/hg/tcga/bamVariants/bamVariants.c
diff -N src/hg/tcga/bamVariants/bamVariants.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/hg/tcga/bamVariants/bamVariants.c	22 Sep 2009 17:40:54 -0000	1.1
@@ -0,0 +1,379 @@
+/* bamVariants -- discovering variants from BAM file
+ *   code adapted from Heng Li's Samtools library functions
+ */
+
+#ifdef USE_BAM
+
+#include "common.h"
+#include "hCommon.h"
+#include "linefile.h"
+#include "hash.h"
+#include "hdb.h"
+#include "options.h"
+#include "jksql.h"
+
+#define _IOLIB 2
+#include "bam.h"
+#include "sam.h"
+
+static char const rcsid[] = "$Id$";
+
+void usage()
+/* Explain usage and exit. */
+{
+errAbort(
+  "bamVariants - Find variants in input BAM file, output position/type of variant.\n"
+  "usage:\n"
+  "   bamVariants [options] <in.bam> <out.results>\n"
+  "options:\n"
+  "   -position=<chrom position> - Specify a region of BAM file to perform pileup\n"
+  "   -minSupport=INT - Minimum Support (i.e. num reads) to call a variant. [2]\n"
+  "   -minQ=INT - Minimum acceptable base quality. [0]\n"
+  "   -minMapQ=INT - Minimum acceptable mapping quality. [0]\n"
+  "   -minISize=INT - Minimum Insert Size [outside of range is variant]. [-1]\n"
+  "   -maxISize=INT - Maximum Insert Size [\" \"]. [-1] \n"
+  "   -ignoreUnmap - Ignore Unmapped mate pairs.\n"
+  "\n"
+  );
+}
+
+static struct optionSpec options[] = {
+    {"position", OPTION_STRING},
+    {"minSupport", OPTION_INT},
+    {"minQ", OPTION_INT}, 
+    {"minMapQ", OPTION_INT},
+    {"minISize", OPTION_INT},
+    {"maxISize", OPTION_INT},
+    {"ignoreUnmap", OPTION_BOOLEAN},
+    {NULL, 0}
+};
+
+/* Option variables */
+char *position = NULL;
+static int minSupport = 2;
+static int minQ = 0;
+static int minMapQ = 0;
+static int minISize = -1;
+static int maxISize = -1;
+static boolean ignoreUnmap = FALSE;
+
+#define VAR_A 0
+#define VAR_G 1
+#define VAR_T 2
+#define VAR_C 3
+#define VAR_DEL 4
+#define VAR_INS 5
+#define VAR_MATE_ISIZE_INS 6
+#define VAR_MATE_ISIZE_DEL 7
+#define VAR_MATE_DIFF_CHROM 8
+#define VAR_MATE_UNMAP 9
+#define NUM_VAR_TYPES 10
+
+static int variantBitMap[NUM_VAR_TYPES];
+
+#define NUM_CHROMS 100
+static int mateChroms[NUM_CHROMS];
+
+typedef struct {
+    int beg, end;
+    samfile_t *in;
+} tmpstruct_t;
+
+typedef struct {
+    bam_header_t *h;
+    uint32_t format;
+    int tid, len, last_pos;
+    int mask;
+    char *ref;
+} pileup_data_t;
+
+
+// callback for bam_fetch()
+static int fetch_func(const bam1_t *b, void *data)
+{
+bam_plbuf_t *buf = (bam_plbuf_t*)data;
+bam_plbuf_push(b, buf);
+return 0;
+}
+
+
+
+static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pu, void *data)
+{  // will put everything to stdout.
+
+pileup_data_t *d = (pileup_data_t*)data;
+int i, j, rb;
+rb = (d->ref && (int)pos < d->len)? d->ref[pos] : 'N';
+
+for (i = 0; i < NUM_VAR_TYPES; ++i)
+    variantBitMap[i] = 0;
+for (i = 0; i < NUM_CHROMS; ++i)
+    mateChroms[i] = 0;
+
+for (i = 0; i < n; ++i)
+    {
+    const bam_pileup1_t *p = pu + i;
+
+    if (p->b->core.qual < minMapQ)
+	continue;
+    
+    bam1_core_t *c = &p->b->core;
+    if (((c)->flag & BAM_FPAIRED) && !((c)->flag & BAM_FUNMAP))
+	{ // read is paired and mapped.
+	if (!((c)->flag & BAM_FMUNMAP))
+	    {  // mate is mapped
+	    if ((c)->mtid != (c)->tid)
+		{ 
+		mateChroms[(c)->mtid]++;
+		if (mateChroms[(c)->mtid] >= minSupport)
+		    variantBitMap[VAR_MATE_DIFF_CHROM]++;
+		}
+	    else
+		{
+		if ((minISize != -1) && (abs((c)->isize) < minISize))
+		    variantBitMap[VAR_MATE_ISIZE_DEL]++;
+		else if ((maxISize != -1) && (abs((c)->isize) > maxISize))
+		    variantBitMap[VAR_MATE_ISIZE_INS]++;
+		}
+	    }
+	else if (!ignoreUnmap && ((c)->flag & BAM_FMUNMAP)) 
+	    variantBitMap[VAR_MATE_UNMAP]++;  // mate is unmapped
+	}
+
+    if (bam1_qual(p->b)[p->qpos] < minQ)
+	continue;    
+    if (p->is_del)
+	variantBitMap[VAR_DEL]++;
+    else if (p->indel > 0)
+	variantBitMap[VAR_INS]++;
+    else
+	{
+	int c = toupper(bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]);
+	switch (c)
+	    {
+	    case 'A':
+		variantBitMap[VAR_A]++;
+		break;
+	    case 'G':	
+		variantBitMap[VAR_G]++;
+		break;
+	    case 'T':
+		variantBitMap[VAR_T]++;
+		break;
+	    case 'C':
+		variantBitMap[VAR_C]++;
+		break;
+	    default:
+		break;
+	    }
+	} 
+    }
+
+int numDetected = 0;
+for (i = 0; i < NUM_VAR_TYPES; ++i)
+    if (variantBitMap[i] >= minSupport)
+	numDetected += 1;
+
+if (numDetected <= 1)
+    return 0;
+
+printf("%s\t%d\t%c\t", d->h->target_name[tid], pos + 1, rb);
+
+int snpCount = 0;
+for (i = 0; i < NUM_VAR_TYPES; ++i)
+    {
+    if (variantBitMap[i] >= minSupport)
+	{
+	switch (i)
+	    {
+	    case VAR_A:
+	    case VAR_G:
+	    case VAR_T:
+	    case VAR_C:
+		snpCount++;
+		break;
+	    case VAR_INS:
+	    case VAR_DEL:
+		printf("IND ");
+		break;
+	    case VAR_MATE_ISIZE_INS:
+		printf("ISI ");
+		break;
+	    case VAR_MATE_ISIZE_DEL:
+		printf("ISD ");
+		break;
+	    case VAR_MATE_UNMAP:
+		printf("MUM ");
+		break;
+	    case VAR_MATE_DIFF_CHROM:
+		printf("MDC:");
+		for (j = 0; j < NUM_CHROMS; ++j)
+		    if (mateChroms[j] >= minSupport)
+			printf("%d(%d),", j, mateChroms[j]);
+		printf(" ");
+		break;
+	    default:
+		break;
+	    }
+	}
+    }
+if (snpCount > 1)
+    printf("SNP(%d) ", snpCount);
+putchar('\t');
+
+for (i = 0; i < n; ++i) 
+    {
+    const bam_pileup1_t *p = pu + i;
+    if (p->is_head) 
+	printf("^%c", p->b->core.qual > 93 ? 126 : p->b->core.qual + 33);
+    if (!p->is_del) 
+	{
+	int c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)];
+	if (c == '=' || toupper(c) == toupper(rb)) 
+	    c = bam1_strand(p->b) ? ',' : '.';
+	else 
+	    c = bam1_strand(p->b) ? tolower(c) : toupper(c);
+	putchar(c);
+	if (p->indel > 0) 
+	    {
+	    printf("+%d", p->indel);
+	    for (j = 1; j <= p->indel; ++j) 
+		{
+		c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + j)];
+		putchar(bam1_strand(p->b) ? tolower(c) : toupper(c));
+		}
+	    } 
+	else if (p->indel < 0) 
+	    {
+	    printf("%d", p->indel);
+	    for (j = 1; j <= -p->indel; ++j) 
+		{
+		c = (d->ref && (int)pos+j < d->len)? d->ref[pos+j] : 'N';
+		putchar(bam1_strand(p->b) ? tolower(c) : toupper(c));
+		}
+	    }
+	} 
+    else 
+	putchar('*');
+    if (p->is_tail) 
+	putchar('$');
+    }
+putchar('\t');
+
+// print quality
+for (i = 0; i < n; ++i) 
+    {
+    const bam_pileup1_t *p = pu + i;
+    int c = bam1_qual(p->b)[p->qpos] + 33;
+    if (c > 126) 
+	c = 126;
+    putchar(c);
+    }
+
+putchar('\t');
+for (i = 0; i < n; ++i) 
+    {
+    int c = pu[i].b->core.qual + 33;
+    if (c > 126) 
+	c = 126;
+    putchar(c);
+    }
+
+putchar('\n');
+return 0;
+}
+
+void bamVariants(char *inbam)
+/* bamVariants - Discover variants in input BAM file. */
+{
+tmpstruct_t tmp;  
+tmp.beg = 0; tmp.end = 0x7fffffff;
+tmp.in = samopen(inbam, "rb", NULL);
+if (tmp.in == NULL)
+    errAbort("samopen(%s, \"rb\") returned NULL", inbam);
+
+pileup_data_t *d = (pileup_data_t*)calloc(1, sizeof(pileup_data_t));
+d->tid = -1; 
+d->mask = BAM_DEF_MASK;
+d->h = tmp.in->header;
+
+bam_plbuf_t *buf;
+int ret;
+bam1_t *b;
+b = bam_init1();
+buf = bam_plbuf_init(pileup_func, d);
+bam_plbuf_set_mask(buf, -1);  // mask = -1, no masking?
+
+if (!position) 
+    {  // if a region is not specified
+    while ((ret = samread(tmp.in, b)) >= 0)
+	bam_plbuf_push(b, buf);   // read through entire file
+    bam_plbuf_push(0, buf);       // finalize pileup
+    bam_plbuf_destroy(buf);
+    bam_destroy1(b);
+    } 
+else 
+    {
+    int ref;
+    bam_index_t *idx;
+    idx = bam_index_load(inbam); // load BAM index
+    if (!idx) 
+	{
+	samclose(tmp.in);
+	errAbort("BAM indexing file is not available.\n");
+	}
+    bam_parse_region(tmp.in->header, position, &ref,
+		     &tmp.beg, &tmp.end); // parse the region
+    if (ref < 0) 
+	{
+	samclose(tmp.in);
+	errAbort("Invalid region %s\n", position);
+	}
+
+    bam_fetch(tmp.in->x.bam, idx, ref, tmp.beg, tmp.end, buf, fetch_func);
+    bam_plbuf_push(0, buf); // finalize pileup
+    bam_index_destroy(idx);
+    bam_plbuf_destroy(buf);
+    }
+
+samclose(tmp.in);
+}
+
+int main(int argc, char *argv[])
+/* Process command line. */
+{
+optionInit(&argc, argv, options);
+if (argc < 2)
+    usage();
+
+if (optionExists("position"))
+    position = optionVal("position", NULL);
+if (optionExists("minSupport"))
+    minSupport = optionInt("minSupport", 2);
+if (optionExists("minQ"))
+    minQ = optionInt("minQ", 0);
+if (optionExists("minMapQ"))
+    minMapQ = optionInt("minMapQ", 0);
+if (optionExists("minISize"))
+    minISize = optionInt("minISize", -1);
+if (optionExists("maxISize"))
+    maxISize = optionInt("maxISize", -1);
+if (optionExists("ignoreUnmap"))
+    ignoreUnmap = TRUE;
+
+char *inbam = argv[1];
+bamVariants(inbam);
+
+return 0;
+}
+
+#else // USE_BAM not defined
+
+int main(int argc, char *argv[])
+/* Process command line. */
+{
+printf("BAM not installed");
+
+return 0;
+}
+#endif