src/hg/tcga/bamVariants/bamVariants.c 1.1
1.1 2009/09/22 17:40:54 jsanborn
initial commit
Index: src/hg/tcga/bamVariants/bamVariants.c
===================================================================
RCS file: src/hg/tcga/bamVariants/bamVariants.c
diff -N src/hg/tcga/bamVariants/bamVariants.c
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ src/hg/tcga/bamVariants/bamVariants.c 22 Sep 2009 17:40:54 -0000 1.1
@@ -0,0 +1,379 @@
+/* bamVariants -- discovering variants from BAM file
+ * code adapted from Heng Li's Samtools library functions
+ */
+
+#ifdef USE_BAM
+
+#include "common.h"
+#include "hCommon.h"
+#include "linefile.h"
+#include "hash.h"
+#include "hdb.h"
+#include "options.h"
+#include "jksql.h"
+
+#define _IOLIB 2
+#include "bam.h"
+#include "sam.h"
+
+static char const rcsid[] = "$Id$";
+
+void usage()
+/* Explain usage and exit. */
+{
+errAbort(
+ "bamVariants - Find variants in input BAM file, output position/type of variant.\n"
+ "usage:\n"
+ " bamVariants [options] <in.bam> <out.results>\n"
+ "options:\n"
+ " -position=<chrom position> - Specify a region of BAM file to perform pileup\n"
+ " -minSupport=INT - Minimum Support (i.e. num reads) to call a variant. [2]\n"
+ " -minQ=INT - Minimum acceptable base quality. [0]\n"
+ " -minMapQ=INT - Minimum acceptable mapping quality. [0]\n"
+ " -minISize=INT - Minimum Insert Size [outside of range is variant]. [-1]\n"
+ " -maxISize=INT - Maximum Insert Size [\" \"]. [-1] \n"
+ " -ignoreUnmap - Ignore Unmapped mate pairs.\n"
+ "\n"
+ );
+}
+
+static struct optionSpec options[] = {
+ {"position", OPTION_STRING},
+ {"minSupport", OPTION_INT},
+ {"minQ", OPTION_INT},
+ {"minMapQ", OPTION_INT},
+ {"minISize", OPTION_INT},
+ {"maxISize", OPTION_INT},
+ {"ignoreUnmap", OPTION_BOOLEAN},
+ {NULL, 0}
+};
+
+/* Option variables */
+char *position = NULL;
+static int minSupport = 2;
+static int minQ = 0;
+static int minMapQ = 0;
+static int minISize = -1;
+static int maxISize = -1;
+static boolean ignoreUnmap = FALSE;
+
+#define VAR_A 0
+#define VAR_G 1
+#define VAR_T 2
+#define VAR_C 3
+#define VAR_DEL 4
+#define VAR_INS 5
+#define VAR_MATE_ISIZE_INS 6
+#define VAR_MATE_ISIZE_DEL 7
+#define VAR_MATE_DIFF_CHROM 8
+#define VAR_MATE_UNMAP 9
+#define NUM_VAR_TYPES 10
+
+static int variantBitMap[NUM_VAR_TYPES];
+
+#define NUM_CHROMS 100
+static int mateChroms[NUM_CHROMS];
+
+typedef struct {
+ int beg, end;
+ samfile_t *in;
+} tmpstruct_t;
+
+typedef struct {
+ bam_header_t *h;
+ uint32_t format;
+ int tid, len, last_pos;
+ int mask;
+ char *ref;
+} pileup_data_t;
+
+
+// callback for bam_fetch()
+static int fetch_func(const bam1_t *b, void *data)
+{
+bam_plbuf_t *buf = (bam_plbuf_t*)data;
+bam_plbuf_push(b, buf);
+return 0;
+}
+
+
+
+static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pu, void *data)
+{ // will put everything to stdout.
+
+pileup_data_t *d = (pileup_data_t*)data;
+int i, j, rb;
+rb = (d->ref && (int)pos < d->len)? d->ref[pos] : 'N';
+
+for (i = 0; i < NUM_VAR_TYPES; ++i)
+ variantBitMap[i] = 0;
+for (i = 0; i < NUM_CHROMS; ++i)
+ mateChroms[i] = 0;
+
+for (i = 0; i < n; ++i)
+ {
+ const bam_pileup1_t *p = pu + i;
+
+ if (p->b->core.qual < minMapQ)
+ continue;
+
+ bam1_core_t *c = &p->b->core;
+ if (((c)->flag & BAM_FPAIRED) && !((c)->flag & BAM_FUNMAP))
+ { // read is paired and mapped.
+ if (!((c)->flag & BAM_FMUNMAP))
+ { // mate is mapped
+ if ((c)->mtid != (c)->tid)
+ {
+ mateChroms[(c)->mtid]++;
+ if (mateChroms[(c)->mtid] >= minSupport)
+ variantBitMap[VAR_MATE_DIFF_CHROM]++;
+ }
+ else
+ {
+ if ((minISize != -1) && (abs((c)->isize) < minISize))
+ variantBitMap[VAR_MATE_ISIZE_DEL]++;
+ else if ((maxISize != -1) && (abs((c)->isize) > maxISize))
+ variantBitMap[VAR_MATE_ISIZE_INS]++;
+ }
+ }
+ else if (!ignoreUnmap && ((c)->flag & BAM_FMUNMAP))
+ variantBitMap[VAR_MATE_UNMAP]++; // mate is unmapped
+ }
+
+ if (bam1_qual(p->b)[p->qpos] < minQ)
+ continue;
+ if (p->is_del)
+ variantBitMap[VAR_DEL]++;
+ else if (p->indel > 0)
+ variantBitMap[VAR_INS]++;
+ else
+ {
+ int c = toupper(bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]);
+ switch (c)
+ {
+ case 'A':
+ variantBitMap[VAR_A]++;
+ break;
+ case 'G':
+ variantBitMap[VAR_G]++;
+ break;
+ case 'T':
+ variantBitMap[VAR_T]++;
+ break;
+ case 'C':
+ variantBitMap[VAR_C]++;
+ break;
+ default:
+ break;
+ }
+ }
+ }
+
+int numDetected = 0;
+for (i = 0; i < NUM_VAR_TYPES; ++i)
+ if (variantBitMap[i] >= minSupport)
+ numDetected += 1;
+
+if (numDetected <= 1)
+ return 0;
+
+printf("%s\t%d\t%c\t", d->h->target_name[tid], pos + 1, rb);
+
+int snpCount = 0;
+for (i = 0; i < NUM_VAR_TYPES; ++i)
+ {
+ if (variantBitMap[i] >= minSupport)
+ {
+ switch (i)
+ {
+ case VAR_A:
+ case VAR_G:
+ case VAR_T:
+ case VAR_C:
+ snpCount++;
+ break;
+ case VAR_INS:
+ case VAR_DEL:
+ printf("IND ");
+ break;
+ case VAR_MATE_ISIZE_INS:
+ printf("ISI ");
+ break;
+ case VAR_MATE_ISIZE_DEL:
+ printf("ISD ");
+ break;
+ case VAR_MATE_UNMAP:
+ printf("MUM ");
+ break;
+ case VAR_MATE_DIFF_CHROM:
+ printf("MDC:");
+ for (j = 0; j < NUM_CHROMS; ++j)
+ if (mateChroms[j] >= minSupport)
+ printf("%d(%d),", j, mateChroms[j]);
+ printf(" ");
+ break;
+ default:
+ break;
+ }
+ }
+ }
+if (snpCount > 1)
+ printf("SNP(%d) ", snpCount);
+putchar('\t');
+
+for (i = 0; i < n; ++i)
+ {
+ const bam_pileup1_t *p = pu + i;
+ if (p->is_head)
+ printf("^%c", p->b->core.qual > 93 ? 126 : p->b->core.qual + 33);
+ if (!p->is_del)
+ {
+ int c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)];
+ if (c == '=' || toupper(c) == toupper(rb))
+ c = bam1_strand(p->b) ? ',' : '.';
+ else
+ c = bam1_strand(p->b) ? tolower(c) : toupper(c);
+ putchar(c);
+ if (p->indel > 0)
+ {
+ printf("+%d", p->indel);
+ for (j = 1; j <= p->indel; ++j)
+ {
+ c = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + j)];
+ putchar(bam1_strand(p->b) ? tolower(c) : toupper(c));
+ }
+ }
+ else if (p->indel < 0)
+ {
+ printf("%d", p->indel);
+ for (j = 1; j <= -p->indel; ++j)
+ {
+ c = (d->ref && (int)pos+j < d->len)? d->ref[pos+j] : 'N';
+ putchar(bam1_strand(p->b) ? tolower(c) : toupper(c));
+ }
+ }
+ }
+ else
+ putchar('*');
+ if (p->is_tail)
+ putchar('$');
+ }
+putchar('\t');
+
+// print quality
+for (i = 0; i < n; ++i)
+ {
+ const bam_pileup1_t *p = pu + i;
+ int c = bam1_qual(p->b)[p->qpos] + 33;
+ if (c > 126)
+ c = 126;
+ putchar(c);
+ }
+
+putchar('\t');
+for (i = 0; i < n; ++i)
+ {
+ int c = pu[i].b->core.qual + 33;
+ if (c > 126)
+ c = 126;
+ putchar(c);
+ }
+
+putchar('\n');
+return 0;
+}
+
+void bamVariants(char *inbam)
+/* bamVariants - Discover variants in input BAM file. */
+{
+tmpstruct_t tmp;
+tmp.beg = 0; tmp.end = 0x7fffffff;
+tmp.in = samopen(inbam, "rb", NULL);
+if (tmp.in == NULL)
+ errAbort("samopen(%s, \"rb\") returned NULL", inbam);
+
+pileup_data_t *d = (pileup_data_t*)calloc(1, sizeof(pileup_data_t));
+d->tid = -1;
+d->mask = BAM_DEF_MASK;
+d->h = tmp.in->header;
+
+bam_plbuf_t *buf;
+int ret;
+bam1_t *b;
+b = bam_init1();
+buf = bam_plbuf_init(pileup_func, d);
+bam_plbuf_set_mask(buf, -1); // mask = -1, no masking?
+
+if (!position)
+ { // if a region is not specified
+ while ((ret = samread(tmp.in, b)) >= 0)
+ bam_plbuf_push(b, buf); // read through entire file
+ bam_plbuf_push(0, buf); // finalize pileup
+ bam_plbuf_destroy(buf);
+ bam_destroy1(b);
+ }
+else
+ {
+ int ref;
+ bam_index_t *idx;
+ idx = bam_index_load(inbam); // load BAM index
+ if (!idx)
+ {
+ samclose(tmp.in);
+ errAbort("BAM indexing file is not available.\n");
+ }
+ bam_parse_region(tmp.in->header, position, &ref,
+ &tmp.beg, &tmp.end); // parse the region
+ if (ref < 0)
+ {
+ samclose(tmp.in);
+ errAbort("Invalid region %s\n", position);
+ }
+
+ bam_fetch(tmp.in->x.bam, idx, ref, tmp.beg, tmp.end, buf, fetch_func);
+ bam_plbuf_push(0, buf); // finalize pileup
+ bam_index_destroy(idx);
+ bam_plbuf_destroy(buf);
+ }
+
+samclose(tmp.in);
+}
+
+int main(int argc, char *argv[])
+/* Process command line. */
+{
+optionInit(&argc, argv, options);
+if (argc < 2)
+ usage();
+
+if (optionExists("position"))
+ position = optionVal("position", NULL);
+if (optionExists("minSupport"))
+ minSupport = optionInt("minSupport", 2);
+if (optionExists("minQ"))
+ minQ = optionInt("minQ", 0);
+if (optionExists("minMapQ"))
+ minMapQ = optionInt("minMapQ", 0);
+if (optionExists("minISize"))
+ minISize = optionInt("minISize", -1);
+if (optionExists("maxISize"))
+ maxISize = optionInt("maxISize", -1);
+if (optionExists("ignoreUnmap"))
+ ignoreUnmap = TRUE;
+
+char *inbam = argv[1];
+bamVariants(inbam);
+
+return 0;
+}
+
+#else // USE_BAM not defined
+
+int main(int argc, char *argv[])
+/* Process command line. */
+{
+printf("BAM not installed");
+
+return 0;
+}
+#endif