src/hg/tcga/bamToFastQ/bamToFastQ.c 1.3

1.3 2010/03/11 04:38:52 jsanborn
updated to correctly output paired ends
Index: src/hg/tcga/bamToFastQ/bamToFastQ.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/tcga/bamToFastQ/bamToFastQ.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -b -B -U 1000000 -r1.2 -r1.3
--- src/hg/tcga/bamToFastQ/bamToFastQ.c	10 Mar 2010 00:21:30 -0000	1.2
+++ src/hg/tcga/bamToFastQ/bamToFastQ.c	11 Mar 2010 04:38:52 -0000	1.3
@@ -1,177 +1,242 @@
 /* bamVariants -- discovering variants from BAM file
  *   code adapted from Heng Li's Samtools library functions
  */
 
 #ifdef USE_BAM
 
 #include "common.h"
 #include "hCommon.h"
 #include "linefile.h"
 #include "hash.h"
 #include "hdb.h"
 #include "options.h"
 #include "jksql.h"
 
 #define _IOLIB 2
 #include "bam.h"
 #include "sam.h"
 
 static char const rcsid[] = "$Id$";
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "bamVariants - Find variants in input BAM file, output position/type of variant.\n"
+  "   BAM files MUST be sorted by name to produce FASTQ files appropriate for \n"
+  "   alignment by MAQ, BWA, etc.\n"
   "usage:\n"
   "   bamVariants [options] <in.bam> <out_read1.fq> <out_read2.fq>\n"
   "options:\n"
   "   -position=<chrom position> - Specify a region of BAM file to perform pileup\n"
   "\n"
   );
 }
 
 static struct optionSpec options[] = {
     {"position", OPTION_STRING},
     {NULL, 0}
 };
 
 /* Option variables */
 char *position = NULL;
 
 typedef struct {
     int beg, end;
     samfile_t *in;
 } tmpstruct_t;
 
 
 typedef struct {
     FILE *f1, *f2;
+    bam1_t *b1, *b2;
 } readfiles_t;
 
-// callback for bam_fetch()
-static int fetch_func(const bam1_t *b, void *data)
+static inline void printForward(FILE *fp, const bam1_t *b)
 {
-int readNum = 0;
-const bam1_core_t *c = &b->core;
-
-readfiles_t *files = (readfiles_t*) data;
-FILE *fp = NULL;
-if ((c)->flag & BAM_FREAD1)
+int i, n = (b)->core.l_qseq;
+for (i = 0; i < n; ++i)
     {
-    fp = files->f1; 
-    readNum = 1;
+    int c = toupper(bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i)]);
+    fputc(c, fp);
     }
-else if ((c)->flag & BAM_FREAD2)
+fputc('\n', fp);
+
+/* sequence base qualities */
+fputc('+', fp);
+fputc('\n', fp);
+for (i = 0; i < n; ++i)
     {
-    fp = files->f2;
-    readNum = 2;
+    int c = bam1_qual(b)[i] + 33;
+    if (c > 126)
+	c = 126;
+    fputc(c, fp);
     }
-if (!fp)
-    return 0;
+fputc('\n', fp);
+}
 
-/* sequence name */
-fprintf(fp, "@%s/%d\n", bam1_qname(b), readNum);
+//char *bam_nt16_rev_table = "=ACMGRSVTWYHKDBN";
 
-/* sequence */
+char *bam_nt16_reverse_table = "=TGMCRSVAWYHKDBN";
+static inline void printReverse(FILE *fp, const bam1_t *b)
+{
 int i, n = (b)->core.l_qseq;
-for (i = 0; i < n; ++i)
+for (i = n-1; i >= 0; --i)
     {
-    int c = toupper(bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i)]);
+    int c = toupper(bam_nt16_reverse_table[bam1_seqi(bam1_seq(b), i)]);
     fputc(c, fp);
     }
 fputc('\n', fp);
 
 /* sequence base qualities */
 fputc('+', fp);
 fputc('\n', fp);
-for (i = 0; i < n; ++i)
+for (i = n-1; i >= 0; --i)
     {
     int c = bam1_qual(b)[i] + 33;
     if (c > 126)
 	c = 126;
     fputc(c, fp);
     }
 fputc('\n', fp);
+}
+
+// callback for bam_fetch()
+static inline int fetch_func(const bam1_t *b, void *data)
+{
+const bam1_core_t *c = &b->core;
+
+if (((c)->flag & BAM_FUNMAP) || ((c)->flag & BAM_FDUP) )
+    return 0;
+if (!((c)->flag & BAM_FPAIRED) || ((c)->flag & BAM_FUNMAP))
+    return 0; // read is not paired or unmapped.
+if ((c)->flag & BAM_FMUNMAP)
+    return 0; // mate is unmapped.
+if ((c)->mtid != (c)->tid)
+    return 0;  // mate is on same chrom. MERGE INTER CODE HERE? 
+
+readfiles_t *files = (readfiles_t *) data;
+if ((c)->flag & BAM_FREAD1)
+    files->b1 = bam_dup1(b);
+else if ((c)->flag & BAM_FREAD2)
+    files->b2 = bam_dup1(b);
+
+if (!files->b1 || !files->b2)
+    return 0;
+
+char *name1 = bam1_qname(files->b1);
+char *name2 = bam1_qname(files->b2);
+
+if (!name1 || !name2)
+    return 0;
+
+if (!sameString(name1, name2))  /* different names */
+    return 0;
+
+/* print read 1 */
+c = &files->b1->core;
+fprintf(files->f1, "@%s/1\n", name1);
+
+/* sequence  & quality */
+if ((c)->flag & BAM_FREVERSE)
+    printReverse(files->f1, files->b1);
+else
+    printForward(files->f1, files->b1);
+
+/* print read 2 */
+c = &files->b2->core;
+fprintf(files->f2, "@%s/2\n", name2);
+
+/* sequence  & quality */
+if ((c)->flag & BAM_FREVERSE)
+    printReverse(files->f2, files->b2);
+else
+    printForward(files->f2, files->b2);
 
 return 0;
 }
 
 
 void bamToFastQ(char *inbam, char *fileRead1, char *fileRead2)
 /* bamToFastQ - Convert BAM -> FastQ format */
 {
 tmpstruct_t tmp;  
 tmp.beg = 0; tmp.end = 0x7fffffff;
 tmp.in = samopen(inbam, "rb", NULL);
 if (tmp.in == NULL)
     errAbort("samopen(%s, \"rb\") returned NULL", inbam);
 
 
 readfiles_t files;
 files.f1 = fopen(fileRead1, "w");
 files.f2 = fopen(fileRead2, "w");
 
+// containers to hold read pairs. 
+files.b1 = bam_init1();
+files.b2 = bam_init1();
+
 if (position) 
     {
     int ref;
     bam_index_t *idx;
     idx = bam_index_load(inbam); // load BAM index
     if (!idx) 
 	{
 	samclose(tmp.in);
 	errAbort("BAM indexing file is not available.\n");
 	}
     bam_parse_region(tmp.in->header, position, &ref,
 		     &tmp.beg, &tmp.end); // parse the region
     if (ref < 0) 
 	{
 	samclose(tmp.in);
 	errAbort("Invalid region %s\n", position);
 	}
     
     bam_fetch(tmp.in->x.bam, idx, ref, tmp.beg, tmp.end, &files, fetch_func);
     bam_index_destroy(idx);
     samclose(tmp.in);
     }
 else
     {  // region not specified 
     int ret;
     bam1_t *b = bam_init1();
     while ((ret = samread(tmp.in, b)) >= 0)
 	fetch_func(b, &files);   // read through entire file
     bam_destroy1(b);
     }
 
 fclose(files.f1);
 fclose(files.f2);
+bam_destroy1(files.b1);
+bam_destroy1(files.b2);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc < 4)
     usage();
 
 if (optionExists("position"))
     position = optionVal("position", NULL);
 
 char *inbam = argv[1];
 char *fileRead1 = argv[2];
 char *fileRead2 = argv[3];
 bamToFastQ(inbam, fileRead1, fileRead2);
 
 return 0;
 }
 
 #else // USE_BAM not defined
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 printf("BAM not installed");
 
 return 0;
 }
 #endif