src/hg/tcga/pBamBam/pBamStats.c 1.3

1.3 2010/01/06 21:15:48 jsanborn
updated
Index: src/hg/tcga/pBamBam/pBamStats.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/tcga/pBamBam/pBamStats.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -b -B -U 4 -r1.2 -r1.3
--- src/hg/tcga/pBamBam/pBamStats.c	5 Jan 2010 04:30:53 -0000	1.2
+++ src/hg/tcga/pBamBam/pBamStats.c	6 Jan 2010 21:15:48 -0000	1.3
@@ -38,8 +38,13 @@
 #define HIGH_Q 30
 #define MAX_ISIZE 10000
 
 typedef struct {
+    int beg, end;
+    samfile_t *in;
+} tmpstruct_t;
+
+typedef struct {
     double avg_isize, n_good_paired;
     long long prev_pos, n_pos, n_align;
     long long n_reads, n_mapped, n_pair_all, n_pair_map, n_pair_good;
     long long n_sgltn, n_read1, n_read2;
@@ -47,9 +52,16 @@
     long long n_interchr, n_interchrhigh;
     long long n_intrachr, n_intrachrhigh;
 } bam_flagstat_t;
 
-void flagstat_loop(bam_flagstat_t *s, bam1_core_t *c)
+typedef struct {
+    bam_plbuf_t *buf;
+    bam_flagstat_t *s;
+} fetch_data_t;
+
+
+
+void flagstat_loop(bam_flagstat_t *s, const bam1_core_t *c)
 {
 ++(s)->n_reads;
 if ((c)->pos > (s)->prev_pos) 
     {
@@ -100,58 +112,121 @@
 if ((c)->flag & BAM_FDUP) 
     ++(s)->n_dup;
 }
 
+typedef struct {
+    bam_header_t *h;
+    uint32_t format;
+    int tid, len, last_pos;
+    int mask;
+    char *ref;
 
-bam_flagstat_t *bam_flagstat_core(bamFile fp)
-{
-bam_flagstat_t *s;
-bam1_t *b;
-bam1_core_t *c;
-int ret;
-s = (bam_flagstat_t*)calloc(1, sizeof(bam_flagstat_t));
+    double avg_cov, n_cov_pos;
+    double avg_cov_uniq, n_cov_pos_uniq;
+} func_data_t;
 
-b = bam_init1();
-c = &b->core;
-int prev_tid = -1;
-while ((ret = bam_read1(fp, b)) >= 0)
+static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pu, void *data)
+{  // will put everything to stdout.
+func_data_t *d = (func_data_t*)data;
+
+int i = 0;
+double n_reads      = 0.0;
+double n_reads_uniq = 0.0;
+int last_pos = -1;
+for (i = 0; i < n; ++i)
     {
-    if (prev_tid == -1)
-	prev_tid = (c)->tid;
-    else if ((c)->tid != prev_tid)
-	break;	
+    const bam_pileup1_t *p = pu + i;
     
-    flagstat_loop(s, c);
+    n_reads += 1.0;
+    bam1_core_t *c = &p->b->core;
+    if ((c)->pos > last_pos)
+	{
+	n_reads_uniq += 1.0;
+	last_pos = (c)->pos;
     }
-bam_destroy1(b);
+    }
+
+d->avg_cov = (d->avg_cov * d->n_cov_pos + n_reads);
+d->n_cov_pos += 1.0;
+d->avg_cov /= d->n_cov_pos;
+
+d->avg_cov_uniq = (d->avg_cov_uniq * d->n_cov_pos_uniq + n_reads_uniq);
+d->n_cov_pos_uniq += 1.0;
+d->avg_cov_uniq /= d->n_cov_pos_uniq;
+
+return 0;
+}
 
-return s;
+// callback for bam_fetch()
+static int fetch_func(const bam1_t *b, void *data)
+{
+fetch_data_t *d = (fetch_data_t*)data;
+
+bam_flagstat_t *s = d->s;
+bam_plbuf_t *buf = d->buf; 
+
+const bam1_core_t *c = &b->core;
+flagstat_loop(s, c);
+
+bam_plbuf_push(b, buf);
+return 0;
 }
 
 void bamStats(char *position, char *inbam)
 /* bamFilter - Filtering a BAM file for specified region(s) and criteria. */
 {
-bamFile fp;
-bam_header_t *header;
 bam_flagstat_t *s;
-fp = strcmp(inbam, "-")? bam_open(inbam, "r") : bam_dopen(fileno(stdin), "r");
-if (!fp)
-    errAbort("Problem opening bamfile %s\n", inbam);
-
-header = bam_header_read(fp);
-
-int ref, beg, end;
-bam_index_t *idx = bam_index_load(inbam); // load BAM index for L file
-bam_parse_region(header, position, &ref, &beg, &end);
+tmpstruct_t tmp;  
+bam_plbuf_t *buf;
+bam1_t *b;
+
+tmp.beg = 0; tmp.end = 0x7fffffff;
+tmp.in = samopen(inbam, "rb", NULL);
+if (tmp.in == NULL)
+    errAbort("samopen(%s, \"rb\") returned NULL", inbam);
+
+func_data_t *d = (func_data_t*)calloc(1, sizeof(func_data_t));
+d->tid = -1; 
+d->mask = BAM_DEF_MASK;
+d->h = tmp.in->header;
+
+b = bam_init1();
+buf = bam_plbuf_init(pileup_func, d);
+
+s = (bam_flagstat_t*)calloc(1, sizeof(bam_flagstat_t));
+
+fetch_data_t *fd = (fetch_data_t*)calloc(1, sizeof(fetch_data_t));
+fd->buf = buf;
+fd->s = s;
+
+bam_plbuf_set_mask(buf, -1);  // mask = -1, no masking?
+
+int ref;
+bam_index_t *idx;
+idx = bam_index_load(inbam); // load BAM index
+if (!idx) 
+    {
+    samclose(tmp.in);
+    errAbort("BAM indexing file is not available.\n");
+    }
+bam_parse_region(tmp.in->header, position, &ref,
+		 &tmp.beg, &tmp.end); // parse the region
 if (ref < 0)
     {
-    fprintf(stderr, "Invalid region for BAM file\n");
-    return;
+    samclose(tmp.in);
+    errAbort("Invalid region %s\n", position);
     }
-bam_file_seek(fp, idx, ref, beg, end);
-s = bam_flagstat_core(fp);
+
+bam_fetch(tmp.in->x.bam, idx, ref, tmp.beg, tmp.end, fd, fetch_func);
+bam_plbuf_push(0, buf); // finalize pileup
+
+samclose(tmp.in);
 
 printf(">%s\n", position);
+printf("avg_cov\t%f\n", d->avg_cov);
+printf("n_cov_pos\t%f\n", d->n_cov_pos);
+printf("avg_cov_uniq\t%f\n", d->avg_cov_uniq);
+printf("n_cov_pos_uniq\t%f\n", d->n_cov_pos_uniq);
 printf("n_reads\t%lld\n", s->n_reads);
 printf("n_pos\t%lld\n", s->n_pos);
 printf("n_align\t%lld\n", s->n_align);
 printf("fail_QA\t%lld\n", s->n_qcfail);
@@ -170,9 +245,10 @@
 printf("intra_pair\t%lld\n", s->n_intrachr);
 printf("intra_pair_gt_30Q\t%lld\n", s->n_intrachrhigh);
 free(s);
 
-bam_close(fp);
+bam_index_destroy(idx);
+bam_plbuf_destroy(buf);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */