src/hg/tcga/bamToFastQ/bamToFastQ.c 1.3

1.3 2010/03/11 04:38:52 jsanborn
updated to correctly output paired ends
Index: src/hg/tcga/bamToFastQ/bamToFastQ.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/tcga/bamToFastQ/bamToFastQ.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -b -B -U 4 -r1.2 -r1.3
--- src/hg/tcga/bamToFastQ/bamToFastQ.c	10 Mar 2010 00:21:30 -0000	1.2
+++ src/hg/tcga/bamToFastQ/bamToFastQ.c	11 Mar 2010 04:38:52 -0000	1.3
@@ -22,8 +22,10 @@
 /* Explain usage and exit. */
 {
 errAbort(
   "bamVariants - Find variants in input BAM file, output position/type of variant.\n"
+  "   BAM files MUST be sorted by name to produce FASTQ files appropriate for \n"
+  "   alignment by MAQ, BWA, etc.\n"
   "usage:\n"
   "   bamVariants [options] <in.bam> <out_read1.fq> <out_read2.fq>\n"
   "options:\n"
   "   -position=<chrom position> - Specify a region of BAM file to perform pileup\n"
@@ -46,54 +48,111 @@
 
 
 typedef struct {
     FILE *f1, *f2;
+    bam1_t *b1, *b2;
 } readfiles_t;
 
-// callback for bam_fetch()
-static int fetch_func(const bam1_t *b, void *data)
+static inline void printForward(FILE *fp, const bam1_t *b)
 {
-int readNum = 0;
-const bam1_core_t *c = &b->core;
-
-readfiles_t *files = (readfiles_t*) data;
-FILE *fp = NULL;
-if ((c)->flag & BAM_FREAD1)
+int i, n = (b)->core.l_qseq;
+for (i = 0; i < n; ++i)
     {
-    fp = files->f1; 
-    readNum = 1;
+    int c = toupper(bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i)]);
+    fputc(c, fp);
     }
-else if ((c)->flag & BAM_FREAD2)
+fputc('\n', fp);
+
+/* sequence base qualities */
+fputc('+', fp);
+fputc('\n', fp);
+for (i = 0; i < n; ++i)
     {
-    fp = files->f2;
-    readNum = 2;
+    int c = bam1_qual(b)[i] + 33;
+    if (c > 126)
+	c = 126;
+    fputc(c, fp);
     }
-if (!fp)
-    return 0;
+fputc('\n', fp);
+}
 
-/* sequence name */
-fprintf(fp, "@%s/%d\n", bam1_qname(b), readNum);
+//char *bam_nt16_rev_table = "=ACMGRSVTWYHKDBN";
 
-/* sequence */
+char *bam_nt16_reverse_table = "=TGMCRSVAWYHKDBN";
+static inline void printReverse(FILE *fp, const bam1_t *b)
+{
 int i, n = (b)->core.l_qseq;
-for (i = 0; i < n; ++i)
+for (i = n-1; i >= 0; --i)
     {
-    int c = toupper(bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i)]);
+    int c = toupper(bam_nt16_reverse_table[bam1_seqi(bam1_seq(b), i)]);
     fputc(c, fp);
     }
 fputc('\n', fp);
 
 /* sequence base qualities */
 fputc('+', fp);
 fputc('\n', fp);
-for (i = 0; i < n; ++i)
+for (i = n-1; i >= 0; --i)
     {
     int c = bam1_qual(b)[i] + 33;
     if (c > 126)
 	c = 126;
     fputc(c, fp);
     }
 fputc('\n', fp);
+}
+
+// callback for bam_fetch()
+static inline int fetch_func(const bam1_t *b, void *data)
+{
+const bam1_core_t *c = &b->core;
+
+if (((c)->flag & BAM_FUNMAP) || ((c)->flag & BAM_FDUP) )
+    return 0;
+if (!((c)->flag & BAM_FPAIRED) || ((c)->flag & BAM_FUNMAP))
+    return 0; // read is not paired or unmapped.
+if ((c)->flag & BAM_FMUNMAP)
+    return 0; // mate is unmapped.
+if ((c)->mtid != (c)->tid)
+    return 0;  // mate is on same chrom. MERGE INTER CODE HERE? 
+
+readfiles_t *files = (readfiles_t *) data;
+if ((c)->flag & BAM_FREAD1)
+    files->b1 = bam_dup1(b);
+else if ((c)->flag & BAM_FREAD2)
+    files->b2 = bam_dup1(b);
+
+if (!files->b1 || !files->b2)
+    return 0;
+
+char *name1 = bam1_qname(files->b1);
+char *name2 = bam1_qname(files->b2);
+
+if (!name1 || !name2)
+    return 0;
+
+if (!sameString(name1, name2))  /* different names */
+    return 0;
+
+/* print read 1 */
+c = &files->b1->core;
+fprintf(files->f1, "@%s/1\n", name1);
+
+/* sequence  & quality */
+if ((c)->flag & BAM_FREVERSE)
+    printReverse(files->f1, files->b1);
+else
+    printForward(files->f1, files->b1);
+
+/* print read 2 */
+c = &files->b2->core;
+fprintf(files->f2, "@%s/2\n", name2);
+
+/* sequence  & quality */
+if ((c)->flag & BAM_FREVERSE)
+    printReverse(files->f2, files->b2);
+else
+    printForward(files->f2, files->b2);
 
 return 0;
 }
 
@@ -111,8 +170,12 @@
 readfiles_t files;
 files.f1 = fopen(fileRead1, "w");
 files.f2 = fopen(fileRead2, "w");
 
+// containers to hold read pairs. 
+files.b1 = bam_init1();
+files.b2 = bam_init1();
+
 if (position) 
     {
     int ref;
     bam_index_t *idx;
@@ -144,8 +207,10 @@
     }
 
 fclose(files.f1);
 fclose(files.f2);
+bam_destroy1(files.b1);
+bam_destroy1(files.b2);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */