src/hg/tcga/pBamBam/pBamStats.c 1.2

1.2 2010/01/05 04:30:53 jsanborn
updated
Index: src/hg/tcga/pBamBam/pBamStats.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/tcga/pBamBam/pBamStats.c,v
retrieving revision 1.1
retrieving revision 1.2
diff -b -B -U 1000000 -r1.1 -r1.2
--- src/hg/tcga/pBamBam/pBamStats.c	5 Jan 2010 04:03:36 -0000	1.1
+++ src/hg/tcga/pBamBam/pBamStats.c	5 Jan 2010 04:30:53 -0000	1.2
@@ -1,198 +1,201 @@
 /* bamStats - Gathering useful information from BAM files. 
  *   code adapted from samtools...
  */
 
 #ifdef USE_BAM
 
 #include "common.h"
 #include "hCommon.h"
 #include "linefile.h"
 #include "hash.h"
 #include "hdb.h"
 #include "options.h"
 #include "jksql.h"
 
 #define _IOLIB 2
 #include "bam.h"
 #include "sam.h"
 #include "bam_helper.h"
 
 static char const rcsid[] = "$Id$";
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
 	 "bamStats - Useful info from BAM file (chroms represented, etc.)\n"
 	 "usage:\n"
 	 "   bamStats chrom <in.bam>\n"
 	 "options:\n"
 	 "\n"
   );
 }
 
 static struct optionSpec options[] = {
     {NULL, 0},
 };
 
 #define HIGH_Q 30
 #define MAX_ISIZE 10000
 
 typedef struct {
   double avg_isize, n_good_paired;
-  long long prev_pos, n_pos;
+    long long prev_pos, n_pos, n_align;
   long long n_reads, n_mapped, n_pair_all, n_pair_map, n_pair_good;
   long long n_sgltn, n_read1, n_read2;
   long long n_qcfail, n_dup;
   long long n_interchr, n_interchrhigh;
   long long n_intrachr, n_intrachrhigh;
 } bam_flagstat_t;
 
 void flagstat_loop(bam_flagstat_t *s, bam1_core_t *c)
 {
 ++(s)->n_reads;
 if ((c)->pos > (s)->prev_pos) 
-    ++(s)->n_pos;
-
+    {
+    ++(s)->n_pos;                 // number of unique locations
+    (s)->n_align += (c)->l_qseq;  // number of aligned positions in unique seq
+    }
 (s)->prev_pos = (c)->pos;
 
 if ((c)->flag & BAM_FPAIRED) 
     {					
     ++(s)->n_pair_all;							
     if ((c)->flag & BAM_FPROPER_PAIR) 
 	++(s)->n_pair_good;		
     if ((c)->flag & BAM_FREAD1) 
 	++(s)->n_read1;				
     if ((c)->flag & BAM_FREAD2) 
 	++(s)->n_read2;				
     if (((c)->flag & BAM_FMUNMAP) && !((c)->flag & BAM_FUNMAP)) 
 	++(s)->n_sgltn;
     if (!((c)->flag & BAM_FUNMAP) && !((c)->flag & BAM_FMUNMAP)) 
 	{
 	++(s)->n_pair_map;				
 	if ((c)->mtid != (c)->tid) 
 	    {			
 	    ++(s)->n_interchr;				
 	    if ((c)->qual >= HIGH_Q) 
 		++(s)->n_interchrhigh;		
 	    }
 	else if (abs((c)->mpos - (c)->pos) >= MAX_ISIZE)
 	    {
 	    ++(s)->n_intrachr;				
 	    if ((c)->qual >= HIGH_Q) 
 		++(s)->n_intrachrhigh;		
 	    }
 	else
 	    {
 	    double isize = (double) abs((c)->mpos - (c)->pos);
 	    (s)->avg_isize = ((s)->avg_isize * (s)->n_good_paired + isize); 
 	    (s)->n_good_paired += 1.0;
 	    (s)->avg_isize /= (s)->n_good_paired;
 	    }
 	}							
     }							
 if (!((c)->flag & BAM_FUNMAP)) 
     ++(s)->n_mapped;
 if ((c)->flag & BAM_FQCFAIL) 
     ++(s)->n_qcfail;
 if ((c)->flag & BAM_FDUP) 
     ++(s)->n_dup;
 }
 
 
 bam_flagstat_t *bam_flagstat_core(bamFile fp)
 {
 bam_flagstat_t *s;
 bam1_t *b;
 bam1_core_t *c;
 int ret;
 s = (bam_flagstat_t*)calloc(1, sizeof(bam_flagstat_t));
 
 b = bam_init1();
 c = &b->core;
 int prev_tid = -1;
 while ((ret = bam_read1(fp, b)) >= 0)
     {
     if (prev_tid == -1)
 	prev_tid = (c)->tid;
     else if ((c)->tid != prev_tid)
 	break;	
     
     flagstat_loop(s, c);
     }
 bam_destroy1(b);
 
 return s;
 }
 
 void bamStats(char *position, char *inbam)
 /* bamFilter - Filtering a BAM file for specified region(s) and criteria. */
 {
 bamFile fp;
 bam_header_t *header;
 bam_flagstat_t *s;
 fp = strcmp(inbam, "-")? bam_open(inbam, "r") : bam_dopen(fileno(stdin), "r");
 if (!fp)
     errAbort("Problem opening bamfile %s\n", inbam);
 
 header = bam_header_read(fp);
 
 int ref, beg, end;
 bam_index_t *idx = bam_index_load(inbam); // load BAM index for L file
 bam_parse_region(header, position, &ref, &beg, &end);
 if (ref < 0)
     {
     fprintf(stderr, "Invalid region for BAM file\n");
     return;
     }
 bam_file_seek(fp, idx, ref, beg, end);
 s = bam_flagstat_core(fp);
 
 printf(">%s\n", position);
 printf("n_reads\t%lld\n", s->n_reads);
 printf("n_pos\t%lld\n", s->n_pos);
+printf("n_align\t%lld\n", s->n_align);
 printf("fail_QA\t%lld\n", s->n_qcfail);
 printf("dupes\t%lld\n", s->n_dup);
 printf("mapped\t%lld\n", s->n_mapped);
 printf("paired\t%lld\n", s->n_pair_all);
 printf("read1\t%lld\n", s->n_read1);
 printf("read2\t%lld\n", s->n_read2);
 printf("prop_paired\t%lld\n", s->n_pair_good);
 printf("both_mapped\t%lld\n", s->n_pair_map);
 printf("concordant\t%lld\n", (long long) s->n_good_paired);
 printf("avg_isize\t%f\n", s->avg_isize);
 printf("singletons\t%lld\n", s->n_sgltn);
 printf("inter_pair\t%lld\n", s->n_interchr);
 printf("inter_pair_gt_30Q\t%lld\n", s->n_interchrhigh);
 printf("intra_pair\t%lld\n", s->n_intrachr);
 printf("intra_pair_gt_30Q\t%lld\n", s->n_intrachrhigh);
 free(s);
 
 bam_close(fp);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc < 3)
     usage();
 
 char *position = argv[1];
 char *inbam = argv[2];
 
 bamStats(position, inbam);
 
 return 0;
 }
 
 #else // USE_BAM not defined
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 printf("BAM not installed");
 
 return 0;
 }
 #endif