src/hg/tcga/pBamBam/pBamStats.c 1.3

1.3 2010/01/06 21:15:48 jsanborn
updated
Index: src/hg/tcga/pBamBam/pBamStats.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/tcga/pBamBam/pBamStats.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -b -B -U 1000000 -r1.2 -r1.3
--- src/hg/tcga/pBamBam/pBamStats.c	5 Jan 2010 04:30:53 -0000	1.2
+++ src/hg/tcga/pBamBam/pBamStats.c	6 Jan 2010 21:15:48 -0000	1.3
@@ -1,201 +1,278 @@
 /* bamStats - Gathering useful information from BAM files. 
  *   code adapted from samtools...
  */
 
 #ifdef USE_BAM
 
 #include "common.h"
 #include "hCommon.h"
 #include "linefile.h"
 #include "hash.h"
 #include "hdb.h"
 #include "options.h"
 #include "jksql.h"
 
 #define _IOLIB 2
 #include "bam.h"
 #include "sam.h"
 #include "bam_helper.h"
 
 static char const rcsid[] = "$Id$";
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
 	 "bamStats - Useful info from BAM file (chroms represented, etc.)\n"
 	 "usage:\n"
 	 "   bamStats chrom <in.bam>\n"
 	 "options:\n"
 	 "\n"
   );
 }
 
 static struct optionSpec options[] = {
     {NULL, 0},
 };
 
 #define HIGH_Q 30
 #define MAX_ISIZE 10000
 
 typedef struct {
+    int beg, end;
+    samfile_t *in;
+} tmpstruct_t;
+
+typedef struct {
     double avg_isize, n_good_paired;
     long long prev_pos, n_pos, n_align;
     long long n_reads, n_mapped, n_pair_all, n_pair_map, n_pair_good;
     long long n_sgltn, n_read1, n_read2;
     long long n_qcfail, n_dup;
     long long n_interchr, n_interchrhigh;
     long long n_intrachr, n_intrachrhigh;
 } bam_flagstat_t;
 
-void flagstat_loop(bam_flagstat_t *s, bam1_core_t *c)
+typedef struct {
+    bam_plbuf_t *buf;
+    bam_flagstat_t *s;
+} fetch_data_t;
+
+
+
+void flagstat_loop(bam_flagstat_t *s, const bam1_core_t *c)
 {
 ++(s)->n_reads;
 if ((c)->pos > (s)->prev_pos) 
     {
     ++(s)->n_pos;                 // number of unique locations
     (s)->n_align += (c)->l_qseq;  // number of aligned positions in unique seq
     }
 (s)->prev_pos = (c)->pos;
 
 if ((c)->flag & BAM_FPAIRED) 
     {					
     ++(s)->n_pair_all;							
     if ((c)->flag & BAM_FPROPER_PAIR) 
 	++(s)->n_pair_good;		
     if ((c)->flag & BAM_FREAD1) 
 	++(s)->n_read1;				
     if ((c)->flag & BAM_FREAD2) 
 	++(s)->n_read2;				
     if (((c)->flag & BAM_FMUNMAP) && !((c)->flag & BAM_FUNMAP)) 
 	++(s)->n_sgltn;
     if (!((c)->flag & BAM_FUNMAP) && !((c)->flag & BAM_FMUNMAP)) 
 	{
 	++(s)->n_pair_map;				
 	if ((c)->mtid != (c)->tid) 
 	    {			
 	    ++(s)->n_interchr;				
 	    if ((c)->qual >= HIGH_Q) 
 		++(s)->n_interchrhigh;		
 	    }
 	else if (abs((c)->mpos - (c)->pos) >= MAX_ISIZE)
 	    {
 	    ++(s)->n_intrachr;				
 	    if ((c)->qual >= HIGH_Q) 
 		++(s)->n_intrachrhigh;		
 	    }
 	else
 	    {
 	    double isize = (double) abs((c)->mpos - (c)->pos);
 	    (s)->avg_isize = ((s)->avg_isize * (s)->n_good_paired + isize); 
 	    (s)->n_good_paired += 1.0;
 	    (s)->avg_isize /= (s)->n_good_paired;
 	    }
 	}							
     }							
 if (!((c)->flag & BAM_FUNMAP)) 
     ++(s)->n_mapped;
 if ((c)->flag & BAM_FQCFAIL) 
     ++(s)->n_qcfail;
 if ((c)->flag & BAM_FDUP) 
     ++(s)->n_dup;
 }
 
+typedef struct {
+    bam_header_t *h;
+    uint32_t format;
+    int tid, len, last_pos;
+    int mask;
+    char *ref;
 
-bam_flagstat_t *bam_flagstat_core(bamFile fp)
-{
-bam_flagstat_t *s;
-bam1_t *b;
-bam1_core_t *c;
-int ret;
-s = (bam_flagstat_t*)calloc(1, sizeof(bam_flagstat_t));
+    double avg_cov, n_cov_pos;
+    double avg_cov_uniq, n_cov_pos_uniq;
+} func_data_t;
 
-b = bam_init1();
-c = &b->core;
-int prev_tid = -1;
-while ((ret = bam_read1(fp, b)) >= 0)
+static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pu, void *data)
+{  // will put everything to stdout.
+func_data_t *d = (func_data_t*)data;
+
+int i = 0;
+double n_reads      = 0.0;
+double n_reads_uniq = 0.0;
+int last_pos = -1;
+for (i = 0; i < n; ++i)
     {
-    if (prev_tid == -1)
-	prev_tid = (c)->tid;
-    else if ((c)->tid != prev_tid)
-	break;	
+    const bam_pileup1_t *p = pu + i;
     
-    flagstat_loop(s, c);
+    n_reads += 1.0;
+    bam1_core_t *c = &p->b->core;
+    if ((c)->pos > last_pos)
+	{
+	n_reads_uniq += 1.0;
+	last_pos = (c)->pos;
     }
-bam_destroy1(b);
+    }
+
+d->avg_cov = (d->avg_cov * d->n_cov_pos + n_reads);
+d->n_cov_pos += 1.0;
+d->avg_cov /= d->n_cov_pos;
+
+d->avg_cov_uniq = (d->avg_cov_uniq * d->n_cov_pos_uniq + n_reads_uniq);
+d->n_cov_pos_uniq += 1.0;
+d->avg_cov_uniq /= d->n_cov_pos_uniq;
+
+return 0;
+}
 
-return s;
+// callback for bam_fetch()
+static int fetch_func(const bam1_t *b, void *data)
+{
+fetch_data_t *d = (fetch_data_t*)data;
+
+bam_flagstat_t *s = d->s;
+bam_plbuf_t *buf = d->buf; 
+
+const bam1_core_t *c = &b->core;
+flagstat_loop(s, c);
+
+bam_plbuf_push(b, buf);
+return 0;
 }
 
 void bamStats(char *position, char *inbam)
 /* bamFilter - Filtering a BAM file for specified region(s) and criteria. */
 {
-bamFile fp;
-bam_header_t *header;
 bam_flagstat_t *s;
-fp = strcmp(inbam, "-")? bam_open(inbam, "r") : bam_dopen(fileno(stdin), "r");
-if (!fp)
-    errAbort("Problem opening bamfile %s\n", inbam);
-
-header = bam_header_read(fp);
-
-int ref, beg, end;
-bam_index_t *idx = bam_index_load(inbam); // load BAM index for L file
-bam_parse_region(header, position, &ref, &beg, &end);
+tmpstruct_t tmp;  
+bam_plbuf_t *buf;
+bam1_t *b;
+
+tmp.beg = 0; tmp.end = 0x7fffffff;
+tmp.in = samopen(inbam, "rb", NULL);
+if (tmp.in == NULL)
+    errAbort("samopen(%s, \"rb\") returned NULL", inbam);
+
+func_data_t *d = (func_data_t*)calloc(1, sizeof(func_data_t));
+d->tid = -1; 
+d->mask = BAM_DEF_MASK;
+d->h = tmp.in->header;
+
+b = bam_init1();
+buf = bam_plbuf_init(pileup_func, d);
+
+s = (bam_flagstat_t*)calloc(1, sizeof(bam_flagstat_t));
+
+fetch_data_t *fd = (fetch_data_t*)calloc(1, sizeof(fetch_data_t));
+fd->buf = buf;
+fd->s = s;
+
+bam_plbuf_set_mask(buf, -1);  // mask = -1, no masking?
+
+int ref;
+bam_index_t *idx;
+idx = bam_index_load(inbam); // load BAM index
+if (!idx) 
+    {
+    samclose(tmp.in);
+    errAbort("BAM indexing file is not available.\n");
+    }
+bam_parse_region(tmp.in->header, position, &ref,
+		 &tmp.beg, &tmp.end); // parse the region
 if (ref < 0)
     {
-    fprintf(stderr, "Invalid region for BAM file\n");
-    return;
+    samclose(tmp.in);
+    errAbort("Invalid region %s\n", position);
     }
-bam_file_seek(fp, idx, ref, beg, end);
-s = bam_flagstat_core(fp);
+
+bam_fetch(tmp.in->x.bam, idx, ref, tmp.beg, tmp.end, fd, fetch_func);
+bam_plbuf_push(0, buf); // finalize pileup
+
+samclose(tmp.in);
 
 printf(">%s\n", position);
+printf("avg_cov\t%f\n", d->avg_cov);
+printf("n_cov_pos\t%f\n", d->n_cov_pos);
+printf("avg_cov_uniq\t%f\n", d->avg_cov_uniq);
+printf("n_cov_pos_uniq\t%f\n", d->n_cov_pos_uniq);
 printf("n_reads\t%lld\n", s->n_reads);
 printf("n_pos\t%lld\n", s->n_pos);
 printf("n_align\t%lld\n", s->n_align);
 printf("fail_QA\t%lld\n", s->n_qcfail);
 printf("dupes\t%lld\n", s->n_dup);
 printf("mapped\t%lld\n", s->n_mapped);
 printf("paired\t%lld\n", s->n_pair_all);
 printf("read1\t%lld\n", s->n_read1);
 printf("read2\t%lld\n", s->n_read2);
 printf("prop_paired\t%lld\n", s->n_pair_good);
 printf("both_mapped\t%lld\n", s->n_pair_map);
 printf("concordant\t%lld\n", (long long) s->n_good_paired);
 printf("avg_isize\t%f\n", s->avg_isize);
 printf("singletons\t%lld\n", s->n_sgltn);
 printf("inter_pair\t%lld\n", s->n_interchr);
 printf("inter_pair_gt_30Q\t%lld\n", s->n_interchrhigh);
 printf("intra_pair\t%lld\n", s->n_intrachr);
 printf("intra_pair_gt_30Q\t%lld\n", s->n_intrachrhigh);
 free(s);
 
-bam_close(fp);
+bam_index_destroy(idx);
+bam_plbuf_destroy(buf);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc < 3)
     usage();
 
 char *position = argv[1];
 char *inbam = argv[2];
 
 bamStats(position, inbam);
 
 return 0;
 }
 
 #else // USE_BAM not defined
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 printf("BAM not installed");
 
 return 0;
 }
 #endif
+