src/hg/tcga/pBamBam/pBamStats.c 1.3
1.3 2010/01/06 21:15:48 jsanborn
updated
Index: src/hg/tcga/pBamBam/pBamStats.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/tcga/pBamBam/pBamStats.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -b -B -U 1000000 -r1.2 -r1.3
--- src/hg/tcga/pBamBam/pBamStats.c 5 Jan 2010 04:30:53 -0000 1.2
+++ src/hg/tcga/pBamBam/pBamStats.c 6 Jan 2010 21:15:48 -0000 1.3
@@ -1,201 +1,278 @@
/* bamStats - Gathering useful information from BAM files.
* code adapted from samtools...
*/
#ifdef USE_BAM
#include "common.h"
#include "hCommon.h"
#include "linefile.h"
#include "hash.h"
#include "hdb.h"
#include "options.h"
#include "jksql.h"
#define _IOLIB 2
#include "bam.h"
#include "sam.h"
#include "bam_helper.h"
static char const rcsid[] = "$Id$";
void usage()
/* Explain usage and exit. */
{
errAbort(
"bamStats - Useful info from BAM file (chroms represented, etc.)\n"
"usage:\n"
" bamStats chrom <in.bam>\n"
"options:\n"
"\n"
);
}
static struct optionSpec options[] = {
{NULL, 0},
};
#define HIGH_Q 30
#define MAX_ISIZE 10000
typedef struct {
+ int beg, end;
+ samfile_t *in;
+} tmpstruct_t;
+
+typedef struct {
double avg_isize, n_good_paired;
long long prev_pos, n_pos, n_align;
long long n_reads, n_mapped, n_pair_all, n_pair_map, n_pair_good;
long long n_sgltn, n_read1, n_read2;
long long n_qcfail, n_dup;
long long n_interchr, n_interchrhigh;
long long n_intrachr, n_intrachrhigh;
} bam_flagstat_t;
-void flagstat_loop(bam_flagstat_t *s, bam1_core_t *c)
+typedef struct {
+ bam_plbuf_t *buf;
+ bam_flagstat_t *s;
+} fetch_data_t;
+
+
+
+void flagstat_loop(bam_flagstat_t *s, const bam1_core_t *c)
{
++(s)->n_reads;
if ((c)->pos > (s)->prev_pos)
{
++(s)->n_pos; // number of unique locations
(s)->n_align += (c)->l_qseq; // number of aligned positions in unique seq
}
(s)->prev_pos = (c)->pos;
if ((c)->flag & BAM_FPAIRED)
{
++(s)->n_pair_all;
if ((c)->flag & BAM_FPROPER_PAIR)
++(s)->n_pair_good;
if ((c)->flag & BAM_FREAD1)
++(s)->n_read1;
if ((c)->flag & BAM_FREAD2)
++(s)->n_read2;
if (((c)->flag & BAM_FMUNMAP) && !((c)->flag & BAM_FUNMAP))
++(s)->n_sgltn;
if (!((c)->flag & BAM_FUNMAP) && !((c)->flag & BAM_FMUNMAP))
{
++(s)->n_pair_map;
if ((c)->mtid != (c)->tid)
{
++(s)->n_interchr;
if ((c)->qual >= HIGH_Q)
++(s)->n_interchrhigh;
}
else if (abs((c)->mpos - (c)->pos) >= MAX_ISIZE)
{
++(s)->n_intrachr;
if ((c)->qual >= HIGH_Q)
++(s)->n_intrachrhigh;
}
else
{
double isize = (double) abs((c)->mpos - (c)->pos);
(s)->avg_isize = ((s)->avg_isize * (s)->n_good_paired + isize);
(s)->n_good_paired += 1.0;
(s)->avg_isize /= (s)->n_good_paired;
}
}
}
if (!((c)->flag & BAM_FUNMAP))
++(s)->n_mapped;
if ((c)->flag & BAM_FQCFAIL)
++(s)->n_qcfail;
if ((c)->flag & BAM_FDUP)
++(s)->n_dup;
}
+typedef struct {
+ bam_header_t *h;
+ uint32_t format;
+ int tid, len, last_pos;
+ int mask;
+ char *ref;
-bam_flagstat_t *bam_flagstat_core(bamFile fp)
-{
-bam_flagstat_t *s;
-bam1_t *b;
-bam1_core_t *c;
-int ret;
-s = (bam_flagstat_t*)calloc(1, sizeof(bam_flagstat_t));
+ double avg_cov, n_cov_pos;
+ double avg_cov_uniq, n_cov_pos_uniq;
+} func_data_t;
-b = bam_init1();
-c = &b->core;
-int prev_tid = -1;
-while ((ret = bam_read1(fp, b)) >= 0)
+static int pileup_func(uint32_t tid, uint32_t pos, int n, const bam_pileup1_t *pu, void *data)
+{ // will put everything to stdout.
+func_data_t *d = (func_data_t*)data;
+
+int i = 0;
+double n_reads = 0.0;
+double n_reads_uniq = 0.0;
+int last_pos = -1;
+for (i = 0; i < n; ++i)
{
- if (prev_tid == -1)
- prev_tid = (c)->tid;
- else if ((c)->tid != prev_tid)
- break;
+ const bam_pileup1_t *p = pu + i;
- flagstat_loop(s, c);
+ n_reads += 1.0;
+ bam1_core_t *c = &p->b->core;
+ if ((c)->pos > last_pos)
+ {
+ n_reads_uniq += 1.0;
+ last_pos = (c)->pos;
}
-bam_destroy1(b);
+ }
+
+d->avg_cov = (d->avg_cov * d->n_cov_pos + n_reads);
+d->n_cov_pos += 1.0;
+d->avg_cov /= d->n_cov_pos;
+
+d->avg_cov_uniq = (d->avg_cov_uniq * d->n_cov_pos_uniq + n_reads_uniq);
+d->n_cov_pos_uniq += 1.0;
+d->avg_cov_uniq /= d->n_cov_pos_uniq;
+
+return 0;
+}
-return s;
+// callback for bam_fetch()
+static int fetch_func(const bam1_t *b, void *data)
+{
+fetch_data_t *d = (fetch_data_t*)data;
+
+bam_flagstat_t *s = d->s;
+bam_plbuf_t *buf = d->buf;
+
+const bam1_core_t *c = &b->core;
+flagstat_loop(s, c);
+
+bam_plbuf_push(b, buf);
+return 0;
}
void bamStats(char *position, char *inbam)
/* bamFilter - Filtering a BAM file for specified region(s) and criteria. */
{
-bamFile fp;
-bam_header_t *header;
bam_flagstat_t *s;
-fp = strcmp(inbam, "-")? bam_open(inbam, "r") : bam_dopen(fileno(stdin), "r");
-if (!fp)
- errAbort("Problem opening bamfile %s\n", inbam);
-
-header = bam_header_read(fp);
-
-int ref, beg, end;
-bam_index_t *idx = bam_index_load(inbam); // load BAM index for L file
-bam_parse_region(header, position, &ref, &beg, &end);
+tmpstruct_t tmp;
+bam_plbuf_t *buf;
+bam1_t *b;
+
+tmp.beg = 0; tmp.end = 0x7fffffff;
+tmp.in = samopen(inbam, "rb", NULL);
+if (tmp.in == NULL)
+ errAbort("samopen(%s, \"rb\") returned NULL", inbam);
+
+func_data_t *d = (func_data_t*)calloc(1, sizeof(func_data_t));
+d->tid = -1;
+d->mask = BAM_DEF_MASK;
+d->h = tmp.in->header;
+
+b = bam_init1();
+buf = bam_plbuf_init(pileup_func, d);
+
+s = (bam_flagstat_t*)calloc(1, sizeof(bam_flagstat_t));
+
+fetch_data_t *fd = (fetch_data_t*)calloc(1, sizeof(fetch_data_t));
+fd->buf = buf;
+fd->s = s;
+
+bam_plbuf_set_mask(buf, -1); // mask = -1, no masking?
+
+int ref;
+bam_index_t *idx;
+idx = bam_index_load(inbam); // load BAM index
+if (!idx)
+ {
+ samclose(tmp.in);
+ errAbort("BAM indexing file is not available.\n");
+ }
+bam_parse_region(tmp.in->header, position, &ref,
+ &tmp.beg, &tmp.end); // parse the region
if (ref < 0)
{
- fprintf(stderr, "Invalid region for BAM file\n");
- return;
+ samclose(tmp.in);
+ errAbort("Invalid region %s\n", position);
}
-bam_file_seek(fp, idx, ref, beg, end);
-s = bam_flagstat_core(fp);
+
+bam_fetch(tmp.in->x.bam, idx, ref, tmp.beg, tmp.end, fd, fetch_func);
+bam_plbuf_push(0, buf); // finalize pileup
+
+samclose(tmp.in);
printf(">%s\n", position);
+printf("avg_cov\t%f\n", d->avg_cov);
+printf("n_cov_pos\t%f\n", d->n_cov_pos);
+printf("avg_cov_uniq\t%f\n", d->avg_cov_uniq);
+printf("n_cov_pos_uniq\t%f\n", d->n_cov_pos_uniq);
printf("n_reads\t%lld\n", s->n_reads);
printf("n_pos\t%lld\n", s->n_pos);
printf("n_align\t%lld\n", s->n_align);
printf("fail_QA\t%lld\n", s->n_qcfail);
printf("dupes\t%lld\n", s->n_dup);
printf("mapped\t%lld\n", s->n_mapped);
printf("paired\t%lld\n", s->n_pair_all);
printf("read1\t%lld\n", s->n_read1);
printf("read2\t%lld\n", s->n_read2);
printf("prop_paired\t%lld\n", s->n_pair_good);
printf("both_mapped\t%lld\n", s->n_pair_map);
printf("concordant\t%lld\n", (long long) s->n_good_paired);
printf("avg_isize\t%f\n", s->avg_isize);
printf("singletons\t%lld\n", s->n_sgltn);
printf("inter_pair\t%lld\n", s->n_interchr);
printf("inter_pair_gt_30Q\t%lld\n", s->n_interchrhigh);
printf("intra_pair\t%lld\n", s->n_intrachr);
printf("intra_pair_gt_30Q\t%lld\n", s->n_intrachrhigh);
free(s);
-bam_close(fp);
+bam_index_destroy(idx);
+bam_plbuf_destroy(buf);
}
int main(int argc, char *argv[])
/* Process command line. */
{
optionInit(&argc, argv, options);
if (argc < 3)
usage();
char *position = argv[1];
char *inbam = argv[2];
bamStats(position, inbam);
return 0;
}
#else // USE_BAM not defined
int main(int argc, char *argv[])
/* Process command line. */
{
printf("BAM not installed");
return 0;
}
#endif
+