src/hg/tcga/bamFindPairs/bamFindPairs.c 1.1
1.1 2009/10/06 05:04:17 jsanborn
init commit
Index: src/hg/tcga/bamFindPairs/bamFindPairs.c
===================================================================
RCS file: src/hg/tcga/bamFindPairs/bamFindPairs.c
diff -N src/hg/tcga/bamFindPairs/bamFindPairs.c
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ src/hg/tcga/bamFindPairs/bamFindPairs.c 6 Oct 2009 05:04:17 -0000 1.1
@@ -0,0 +1,331 @@
+/* bamSeq - Reading/processing BAM files (testing).
+ * code adapted from Angie's bamFile.c / bamTrack.c
+ */
+
+#ifdef USE_BAM
+
+#include "common.h"
+#include "hCommon.h"
+#include "linefile.h"
+#include "hash.h"
+#include "hdb.h"
+#include "options.h"
+#include "jksql.h"
+
+#define _IOLIB 2
+#include "bam.h"
+#include "sam.h"
+#include "khash.h"
+
+static char const rcsid[] = "$Id$";
+
+void usage()
+/* Explain usage and exit. */
+{
+errAbort(
+ "bamFindPairs - Find reads overlapping BED file in BAM file, then find their mates,\n"
+ " wherever they are. Intended to find pairs of discordant reads. \n"
+ " Will store read IDs in memory, so be careful not to request too many reads. \n"
+ "usage:\n"
+ " bamFindPairs [options] <overlap.bed> <in.bam> <out.bam>\n"
+ "options:\n"
+ " -stripPrefix=[chr] - Specify prefix to strip from bed (e.g. chr)\n"
+ " to match entries in BAM.\n"
+ " -pad=[20] - Specify padding around each bed entry to help merge nearby\n"
+ " bed entries. Suggest half of read length.\n"
+ " -qual=[0] - Specify minimum mapping quality for reads.\n"
+ "\n"
+ "TODO: add flag for filtering out concordant reads, to reduce memory footprint.\n"
+ "\n"
+ " Entries in input overlap.bed file are merged onto an rb-tree to find overlapping\n"
+ " regions (after padding is added).\n"
+ " BAM alignments for each of these regions are written to output bam file\n"
+ " (or to stdout if no output file is specified).\n"
+ " It might be necessary to sort the output and create a new index (using \n"
+ " 'samtools sort' and 'samtools index') for random access to output BAM file.\n"
+ );
+}
+
+
+static struct optionSpec options[] = {
+ {"stripPrefix", OPTION_STRING},
+ {"padding", OPTION_INT},
+ {"qual", OPTION_INT},
+ {NULL, 0},
+};
+
+/* Option variables */
+char *stripPrefix = "chr";
+int padding = 20;
+static int mapQuality = 0;
+
+typedef struct {
+ samfile_t *out;
+ bam_header_t *h;
+ struct hash *hash;
+ struct hash *read_hash;
+ int found;
+} fetch_data_t;
+
+
+struct hash *splitBedByChrom(char *fileName)
+{
+struct lineFile *lf = lineFileOpen(fileName, TRUE);
+
+// peek to find number of columns in file.
+char *line;
+if (!lineFileNextReal(lf, &line))
+ return FALSE;
+int numCols = chopByChar(line, '\t', NULL, 0);
+
+// rewind file to beginning (prior to peek).
+lineFileRewind(lf);
+
+int i;
+char *row[numCols];
+struct bed *bed, *bedList = NULL;
+struct hash *hash = hashNew(0);
+
+while (lineFileRow(lf, row))
+ {
+ bed = bedLoadN(row, numCols);
+
+ // add padding to overall chrom start/end.
+ bed->chromStart -= padding;
+ bed->chromEnd += padding;
+
+ // add padding to exons, if there are any.
+ for (i = 0; i < bed->blockCount; i++)
+ {
+ bed->blockSizes[i] += 2 * padding;
+ bed->chromStarts[i] -= padding;
+ }
+
+ struct hashEl *el = hashLookup(hash, bed->chrom);
+ if (!el)
+ hashAdd(hash, bed->chrom, bed);
+ else
+ {
+ bedList = el->val;
+ slAddTail(&bedList, bed);
+ }
+ }
+lineFileClose(&lf);
+
+return hash;
+}
+
+struct slName *buildPositionList(char *filename)
+{
+struct hashEl *el;
+struct hash *bedHash = splitBedByChrom(filename);
+
+struct slName *slList = NULL;
+for (el = hashElListHash(bedHash); el != NULL; el = el->next)
+ {
+ char *chrom = el->name;
+ if (stripPrefix && startsWith(stripPrefix, chrom))
+ chrom += + strlen(stripPrefix);
+
+ struct bed *bed, *bedList = el->val;
+
+ struct rbTree *rangeTree = rangeTreeNew();
+ for (bed = bedList; bed; bed = bed->next)
+ bedIntoRangeTree(bed, rangeTree);
+ struct range *rn, *rangeList = rangeTreeList(rangeTree);
+
+ char position[128];
+ for (rn = rangeList; rn; rn = rn->next)
+ {
+ safef(position, sizeof(position), "%s:%d-%d",
+ chrom, rn->start, rn->end);
+
+ slNameAddHead(&slList, position);
+ }
+ rangeTreeFree(&rangeTree);
+ }
+
+slReverse(&slList);
+return slList;
+}
+
+/* Code adapted from Samtools samview.c */
+static inline int __g_skip_alignment(const bam1_t *b)
+{
+if (b->core.qual < mapQuality)
+ return 1;
+if (!(b->core.flag & BAM_FPAIRED) || (b->core.flag & BAM_FUNMAP))
+ return 1; // not paired data or read is unmapped.
+if (b->core.flag & BAM_FMUNMAP)
+ return 1; // mate is unmapped
+if (b->core.tid == b->core.mtid)
+ return 1; // mate maps to same chromosome.
+
+// add discordant reads that span large junctions...
+
+return 0;
+}
+
+/* Code adapted from Samtools samview.c */
+static int fetch_func(const bam1_t *b, void *data)
+{
+if (!__g_skip_alignment(b))
+ {
+ fetch_data_t *fd = (fetch_data_t*)data;
+ struct hashEl *el;
+ struct hash *tmpHash;
+ if ((el = hashLookup(fd->hash, fd->h->target_name[b->core.mtid])) == NULL)
+ {
+ tmpHash = hashNew(0);
+ hashAdd(fd->hash, fd->h->target_name[b->core.mtid], tmpHash);
+ }
+ else
+ tmpHash = el->val;
+
+ hashAddInt(tmpHash, bam1_qname(b), 0);
+ samwrite((samfile_t*)fd->out, b);
+ }
+
+return 0;
+}
+
+static int fetch_pair_func(const bam1_t *b, void *data)
+{
+if (!__g_skip_alignment(b))
+ {
+ fetch_data_t *fd = (fetch_data_t*)data;
+ if (!hashLookup(fd->read_hash, bam1_qname(b)))
+ return 0;
+ fd->found += 1;
+ samwrite((samfile_t*)fd->out, b);
+ }
+
+return 0;
+}
+
+void bamFindPairs(char *bedfile, char *inbam, char *outbam)
+/* bamFindPairs - Finding pairs a BAM file for specified region(s) and criteria. */
+{
+samfile_t *in = samopen(inbam, "rb", NULL);
+if (in == NULL)
+ errAbort("samopen(%s, \"rb\") returned NULL", inbam);
+
+/* if outbam is not given, assume user wants data sent to stdout (i.e. filename = '-')
+ * and in plain text to use in series of piped commands.
+ * if outbam is given, write bam file (binary)
+ */
+
+samfile_t *out;
+if (outbam)
+ out = samopen(outbam, "wbh", in->header);
+else
+ out = samopen("-", "wh", in->header);
+
+if (out == NULL)
+ {
+ samclose(in);
+ errAbort("Failed to open file for writing.\n");
+ }
+
+bam_index_t *idx = bam_index_load(inbam);
+if (idx == 0)
+ {
+ samclose(in);
+ samclose(out);
+ errAbort("Index for %s could not be read,"
+ "please sort (if necessary) & create index with \n\tsamtools sort / index %s",
+ inbam, inbam);
+ }
+
+fetch_data_t fd;
+fd.out = out;
+fd.hash = hashNew(0);
+fd.h = in->header;
+fd.found = 0;
+
+struct slName *pos, *positions = buildPositionList(bedfile);
+
+int ref, beg, end; // chrom id (does not include 'chr')
+for (pos = positions; pos; pos = pos->next)
+ {
+ fprintf(stderr, "pos->name = %s\n", pos->name);
+ int ret = bam_parse_region(in->header, pos->name, &ref, &beg, &end);
+ if (ret < 0)
+ continue;
+ bam_fetch(in->x.bam, idx, ref, beg, end, &fd, fetch_func);
+ }
+
+// DO stuff here!
+
+fprintf(stderr, "found reads on %d chromosomes, now finding pairs.\n",
+ hashNumEntries(fd.hash));
+
+struct hashEl *el;
+struct hashCookie cookie = hashFirst(fd.hash);
+while ((el = hashNext(&cookie)) != NULL)
+ {
+ char *chrom = el->name;
+ fd.read_hash = el->val;
+
+ if (!fd.read_hash)
+ {
+ fprintf(stderr, "no hash\n");
+ continue;
+ }
+ fprintf(stderr, "checking %s, %d reads\n", chrom, hashNumEntries(fd.read_hash));
+
+ ref = 0;
+ beg = 0;
+ end = 0x7fffffff;
+ int ret = bam_parse_region(in->header, chrom, &ref, &beg, &end);
+ if (ret < 0 && ref < 0)
+ {
+ fprintf(stderr, "couldn't parse region\n");
+ continue;
+ }
+ bam_fetch(in->x.bam, idx, ref, beg, end, &fd, fetch_pair_func);
+
+ fprintf(stderr, "found %d pairs so far.\n", fd.found);
+ }
+
+bam_index_destroy(idx);
+samclose(in);
+samclose(out);
+}
+
+int main(int argc, char *argv[])
+/* Process command line. */
+{
+optionInit(&argc, argv, options);
+if (argc < 3)
+ usage();
+
+if (optionExists("stripPrefix"))
+ stripPrefix = optionVal("stripPrefix", "chr");
+if (optionExists("padding"))
+ padding = optionInt("padding", 20);
+if (optionExists("qual"))
+ mapQuality = optionInt("qual", 0);
+
+char *bedfile = argv[1];
+char *inbam = argv[2];
+
+char *outbam = NULL;
+if (argc == 4)
+ outbam = argv[3];
+
+bamFindPairs(bedfile, inbam, outbam);
+
+return 0;
+}
+
+#else // USE_BAM not defined
+
+int main(int argc, char *argv[])
+/* Process command line. */
+{
+printf("BAM not installed");
+
+return 0;
+}
+#endif