src/hg/tcga/bamFindPairs/bamFindPairs.c 1.1

1.1 2009/10/06 05:04:17 jsanborn
init commit
Index: src/hg/tcga/bamFindPairs/bamFindPairs.c
===================================================================
RCS file: src/hg/tcga/bamFindPairs/bamFindPairs.c
diff -N src/hg/tcga/bamFindPairs/bamFindPairs.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/hg/tcga/bamFindPairs/bamFindPairs.c	6 Oct 2009 05:04:17 -0000	1.1
@@ -0,0 +1,331 @@
+/* bamSeq - Reading/processing BAM files (testing). 
+ *   code adapted from Angie's bamFile.c / bamTrack.c
+ */
+
+#ifdef USE_BAM
+
+#include "common.h"
+#include "hCommon.h"
+#include "linefile.h"
+#include "hash.h"
+#include "hdb.h"
+#include "options.h"
+#include "jksql.h"
+
+#define _IOLIB 2
+#include "bam.h"
+#include "sam.h"
+#include "khash.h"
+
+static char const rcsid[] = "$Id$";
+
+void usage()
+/* Explain usage and exit. */
+{
+errAbort(
+  "bamFindPairs - Find reads overlapping BED file in BAM file, then find their mates,\n"
+  "   wherever they are. Intended to find pairs of discordant reads. \n"
+  "   Will store read IDs in memory, so be careful not to request too many reads. \n" 
+  "usage:\n"
+  "   bamFindPairs [options] <overlap.bed> <in.bam> <out.bam>\n"
+  "options:\n"
+  "   -stripPrefix=[chr] - Specify prefix to strip from bed (e.g. chr)\n" 
+  "                           to match entries in BAM.\n"
+  "   -pad=[20] - Specify padding around each bed entry to help merge nearby\n"
+  "                bed entries. Suggest half of read length.\n"
+  "   -qual=[0] - Specify minimum mapping quality for reads.\n"
+  "\n"
+  "TODO: add flag for filtering out concordant reads, to reduce memory footprint.\n"
+  "\n"
+  "  Entries in input overlap.bed file are merged onto an rb-tree to find overlapping\n" 
+  "  regions (after padding is added).\n"
+  "  BAM alignments for each of these regions are written to output bam file\n"
+  "  (or to stdout if no output file is specified).\n"
+  "  It might be necessary to sort the output and create a new index (using \n"
+  "  'samtools sort' and 'samtools index') for random access to output BAM file.\n"
+  );
+}
+
+
+static struct optionSpec options[] = {
+    {"stripPrefix", OPTION_STRING},
+    {"padding", OPTION_INT},
+    {"qual", OPTION_INT},
+    {NULL, 0},
+};
+
+/* Option variables */
+char *stripPrefix = "chr";
+int padding = 20;
+static int mapQuality = 0;
+
+typedef struct {
+    samfile_t *out;
+    bam_header_t *h;
+    struct hash *hash;
+    struct hash *read_hash;
+    int found;
+} fetch_data_t;
+
+
+struct hash *splitBedByChrom(char *fileName)
+{
+struct lineFile *lf = lineFileOpen(fileName, TRUE);
+
+// peek to find number of columns in file.
+char *line;
+if (!lineFileNextReal(lf, &line))
+    return FALSE;
+int numCols = chopByChar(line, '\t', NULL, 0);
+
+// rewind file to beginning (prior to peek).
+lineFileRewind(lf);
+
+int i;
+char *row[numCols];
+struct bed *bed, *bedList = NULL;
+struct hash *hash = hashNew(0);
+
+while (lineFileRow(lf, row))
+    {
+    bed = bedLoadN(row, numCols);
+
+    // add padding to overall chrom start/end.
+    bed->chromStart -= padding;
+    bed->chromEnd += padding;
+
+    // add padding to exons, if there are any.
+    for (i = 0; i < bed->blockCount; i++)
+	{  
+	bed->blockSizes[i] += 2 * padding;
+	bed->chromStarts[i] -= padding;
+	}
+
+    struct hashEl *el = hashLookup(hash, bed->chrom);
+    if (!el)
+	hashAdd(hash, bed->chrom, bed);
+    else
+	{
+	bedList = el->val;
+	slAddTail(&bedList, bed);
+	}
+    }
+lineFileClose(&lf);
+
+return hash;
+}
+
+struct slName *buildPositionList(char *filename)
+{
+struct hashEl *el;
+struct hash *bedHash = splitBedByChrom(filename);
+
+struct slName *slList = NULL;
+for (el = hashElListHash(bedHash); el != NULL; el = el->next)
+    {
+    char *chrom = el->name;
+    if (stripPrefix && startsWith(stripPrefix, chrom))
+	chrom += + strlen(stripPrefix);
+
+    struct bed *bed, *bedList = el->val;
+
+    struct rbTree *rangeTree = rangeTreeNew();
+    for (bed = bedList; bed; bed = bed->next)
+	bedIntoRangeTree(bed, rangeTree);
+    struct range *rn, *rangeList = rangeTreeList(rangeTree);
+
+    char position[128];
+    for (rn = rangeList; rn; rn = rn->next)
+	{
+	safef(position, sizeof(position), "%s:%d-%d", 
+	      chrom, rn->start, rn->end);
+
+	slNameAddHead(&slList, position);
+	}
+    rangeTreeFree(&rangeTree);
+    }
+
+slReverse(&slList);
+return slList;
+}
+
+/* Code adapted from Samtools samview.c */
+static inline int __g_skip_alignment(const bam1_t *b)
+{
+if (b->core.qual < mapQuality)
+    return 1;
+if (!(b->core.flag & BAM_FPAIRED) || (b->core.flag & BAM_FUNMAP))
+    return 1;  // not paired data or read is unmapped. 
+if (b->core.flag & BAM_FMUNMAP)
+    return 1;  // mate is unmapped
+if (b->core.tid == b->core.mtid)
+    return 1;  // mate maps to same chromosome.
+
+// add discordant reads that span large junctions...
+
+return 0;
+}
+
+/* Code adapted from Samtools samview.c */
+static int fetch_func(const bam1_t *b, void *data)
+{
+if (!__g_skip_alignment(b))
+    {
+    fetch_data_t *fd = (fetch_data_t*)data;
+    struct hashEl *el;
+    struct hash *tmpHash;
+    if ((el = hashLookup(fd->hash, fd->h->target_name[b->core.mtid])) == NULL)
+	{
+	tmpHash = hashNew(0);
+	hashAdd(fd->hash, fd->h->target_name[b->core.mtid], tmpHash);
+	}
+    else
+	tmpHash = el->val;
+
+    hashAddInt(tmpHash, bam1_qname(b), 0);
+    samwrite((samfile_t*)fd->out, b);
+    }
+
+return 0;
+}
+
+static int fetch_pair_func(const bam1_t *b, void *data)
+{
+if (!__g_skip_alignment(b))
+    {
+    fetch_data_t *fd = (fetch_data_t*)data;
+    if (!hashLookup(fd->read_hash, bam1_qname(b)))
+	return 0;
+    fd->found += 1;
+    samwrite((samfile_t*)fd->out, b);
+    }
+
+return 0;
+}
+
+void bamFindPairs(char *bedfile, char *inbam, char *outbam)
+/* bamFindPairs - Finding pairs a BAM file for specified region(s) and criteria. */
+{
+samfile_t *in = samopen(inbam, "rb", NULL);
+if (in == NULL)
+    errAbort("samopen(%s, \"rb\") returned NULL", inbam);
+
+/* if outbam is not given, assume user wants data sent to stdout (i.e. filename = '-') 
+ * and in plain text to use in series of piped commands.
+ * if outbam is given, write bam file (binary) 
+ */ 
+
+samfile_t *out;
+if (outbam)
+    out = samopen(outbam, "wbh", in->header);
+else
+    out = samopen("-", "wh", in->header);
+
+if (out == NULL)
+    {
+    samclose(in);
+    errAbort("Failed to open file for writing.\n");
+    }
+
+bam_index_t *idx = bam_index_load(inbam);
+if (idx == 0)
+    {
+    samclose(in);
+    samclose(out);
+    errAbort("Index for %s could not be read," 
+	    "please sort (if necessary) & create index with \n\tsamtools sort / index %s", 
+	    inbam, inbam);
+    }
+
+fetch_data_t fd;
+fd.out   = out;
+fd.hash  = hashNew(0);
+fd.h     = in->header;
+fd.found = 0;
+
+struct slName *pos, *positions = buildPositionList(bedfile);
+
+int ref, beg, end;  // chrom id (does not include 'chr')
+for (pos = positions; pos; pos = pos->next)
+    { 
+    fprintf(stderr, "pos->name = %s\n", pos->name);
+    int ret = bam_parse_region(in->header, pos->name, &ref, &beg, &end);
+    if (ret < 0)
+	continue;
+    bam_fetch(in->x.bam, idx, ref, beg, end, &fd, fetch_func);
+    }
+
+// DO stuff here!
+
+fprintf(stderr, "found reads on %d chromosomes, now finding pairs.\n", 
+	hashNumEntries(fd.hash));
+
+struct hashEl *el;
+struct hashCookie cookie = hashFirst(fd.hash);
+while ((el = hashNext(&cookie)) != NULL)
+    {
+    char *chrom = el->name;
+    fd.read_hash = el->val; 
+
+    if (!fd.read_hash)
+	{
+	fprintf(stderr, "no hash\n");
+	continue;
+	}
+    fprintf(stderr, "checking %s, %d reads\n", chrom, hashNumEntries(fd.read_hash));
+
+    ref = 0;
+    beg = 0;
+    end = 0x7fffffff;
+    int ret = bam_parse_region(in->header, chrom, &ref, &beg, &end);
+    if (ret < 0 && ref < 0)
+	{
+	fprintf(stderr, "couldn't parse region\n");
+	continue;
+	}
+    bam_fetch(in->x.bam, idx, ref, beg, end, &fd, fetch_pair_func);
+
+    fprintf(stderr, "found %d pairs so far.\n", fd.found);
+    }
+
+bam_index_destroy(idx);
+samclose(in);
+samclose(out);
+}
+
+int main(int argc, char *argv[])
+/* Process command line. */
+{
+optionInit(&argc, argv, options);
+if (argc < 3)
+    usage();
+
+if (optionExists("stripPrefix"))
+    stripPrefix = optionVal("stripPrefix", "chr");
+if (optionExists("padding"))
+    padding = optionInt("padding", 20);
+if (optionExists("qual"))
+    mapQuality = optionInt("qual", 0);
+
+char *bedfile = argv[1];
+char *inbam = argv[2];
+
+char *outbam = NULL;
+if (argc == 4)
+    outbam = argv[3];
+
+bamFindPairs(bedfile, inbam, outbam);
+
+return 0;
+}
+
+#else // USE_BAM not defined
+
+int main(int argc, char *argv[])
+/* Process command line. */
+{
+printf("BAM not installed");
+
+return 0;
+}
+#endif