src/hg/tcga/bamToFastQ/bamToFastQ.c 1.3
1.3 2010/03/11 04:38:52 jsanborn
updated to correctly output paired ends
Index: src/hg/tcga/bamToFastQ/bamToFastQ.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/tcga/bamToFastQ/bamToFastQ.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -b -B -U 1000000 -r1.2 -r1.3
--- src/hg/tcga/bamToFastQ/bamToFastQ.c 10 Mar 2010 00:21:30 -0000 1.2
+++ src/hg/tcga/bamToFastQ/bamToFastQ.c 11 Mar 2010 04:38:52 -0000 1.3
@@ -1,177 +1,242 @@
/* bamVariants -- discovering variants from BAM file
* code adapted from Heng Li's Samtools library functions
*/
#ifdef USE_BAM
#include "common.h"
#include "hCommon.h"
#include "linefile.h"
#include "hash.h"
#include "hdb.h"
#include "options.h"
#include "jksql.h"
#define _IOLIB 2
#include "bam.h"
#include "sam.h"
static char const rcsid[] = "$Id$";
void usage()
/* Explain usage and exit. */
{
errAbort(
"bamVariants - Find variants in input BAM file, output position/type of variant.\n"
+ " BAM files MUST be sorted by name to produce FASTQ files appropriate for \n"
+ " alignment by MAQ, BWA, etc.\n"
"usage:\n"
" bamVariants [options] <in.bam> <out_read1.fq> <out_read2.fq>\n"
"options:\n"
" -position=<chrom position> - Specify a region of BAM file to perform pileup\n"
"\n"
);
}
static struct optionSpec options[] = {
{"position", OPTION_STRING},
{NULL, 0}
};
/* Option variables */
char *position = NULL;
typedef struct {
int beg, end;
samfile_t *in;
} tmpstruct_t;
typedef struct {
FILE *f1, *f2;
+ bam1_t *b1, *b2;
} readfiles_t;
-// callback for bam_fetch()
-static int fetch_func(const bam1_t *b, void *data)
+static inline void printForward(FILE *fp, const bam1_t *b)
{
-int readNum = 0;
-const bam1_core_t *c = &b->core;
-
-readfiles_t *files = (readfiles_t*) data;
-FILE *fp = NULL;
-if ((c)->flag & BAM_FREAD1)
+int i, n = (b)->core.l_qseq;
+for (i = 0; i < n; ++i)
{
- fp = files->f1;
- readNum = 1;
+ int c = toupper(bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i)]);
+ fputc(c, fp);
}
-else if ((c)->flag & BAM_FREAD2)
+fputc('\n', fp);
+
+/* sequence base qualities */
+fputc('+', fp);
+fputc('\n', fp);
+for (i = 0; i < n; ++i)
{
- fp = files->f2;
- readNum = 2;
+ int c = bam1_qual(b)[i] + 33;
+ if (c > 126)
+ c = 126;
+ fputc(c, fp);
}
-if (!fp)
- return 0;
+fputc('\n', fp);
+}
-/* sequence name */
-fprintf(fp, "@%s/%d\n", bam1_qname(b), readNum);
+//char *bam_nt16_rev_table = "=ACMGRSVTWYHKDBN";
-/* sequence */
+char *bam_nt16_reverse_table = "=TGMCRSVAWYHKDBN";
+static inline void printReverse(FILE *fp, const bam1_t *b)
+{
int i, n = (b)->core.l_qseq;
-for (i = 0; i < n; ++i)
+for (i = n-1; i >= 0; --i)
{
- int c = toupper(bam_nt16_rev_table[bam1_seqi(bam1_seq(b), i)]);
+ int c = toupper(bam_nt16_reverse_table[bam1_seqi(bam1_seq(b), i)]);
fputc(c, fp);
}
fputc('\n', fp);
/* sequence base qualities */
fputc('+', fp);
fputc('\n', fp);
-for (i = 0; i < n; ++i)
+for (i = n-1; i >= 0; --i)
{
int c = bam1_qual(b)[i] + 33;
if (c > 126)
c = 126;
fputc(c, fp);
}
fputc('\n', fp);
+}
+
+// callback for bam_fetch()
+static inline int fetch_func(const bam1_t *b, void *data)
+{
+const bam1_core_t *c = &b->core;
+
+if (((c)->flag & BAM_FUNMAP) || ((c)->flag & BAM_FDUP) )
+ return 0;
+if (!((c)->flag & BAM_FPAIRED) || ((c)->flag & BAM_FUNMAP))
+ return 0; // read is not paired or unmapped.
+if ((c)->flag & BAM_FMUNMAP)
+ return 0; // mate is unmapped.
+if ((c)->mtid != (c)->tid)
+ return 0; // mate is on same chrom. MERGE INTER CODE HERE?
+
+readfiles_t *files = (readfiles_t *) data;
+if ((c)->flag & BAM_FREAD1)
+ files->b1 = bam_dup1(b);
+else if ((c)->flag & BAM_FREAD2)
+ files->b2 = bam_dup1(b);
+
+if (!files->b1 || !files->b2)
+ return 0;
+
+char *name1 = bam1_qname(files->b1);
+char *name2 = bam1_qname(files->b2);
+
+if (!name1 || !name2)
+ return 0;
+
+if (!sameString(name1, name2)) /* different names */
+ return 0;
+
+/* print read 1 */
+c = &files->b1->core;
+fprintf(files->f1, "@%s/1\n", name1);
+
+/* sequence & quality */
+if ((c)->flag & BAM_FREVERSE)
+ printReverse(files->f1, files->b1);
+else
+ printForward(files->f1, files->b1);
+
+/* print read 2 */
+c = &files->b2->core;
+fprintf(files->f2, "@%s/2\n", name2);
+
+/* sequence & quality */
+if ((c)->flag & BAM_FREVERSE)
+ printReverse(files->f2, files->b2);
+else
+ printForward(files->f2, files->b2);
return 0;
}
void bamToFastQ(char *inbam, char *fileRead1, char *fileRead2)
/* bamToFastQ - Convert BAM -> FastQ format */
{
tmpstruct_t tmp;
tmp.beg = 0; tmp.end = 0x7fffffff;
tmp.in = samopen(inbam, "rb", NULL);
if (tmp.in == NULL)
errAbort("samopen(%s, \"rb\") returned NULL", inbam);
readfiles_t files;
files.f1 = fopen(fileRead1, "w");
files.f2 = fopen(fileRead2, "w");
+// containers to hold read pairs.
+files.b1 = bam_init1();
+files.b2 = bam_init1();
+
if (position)
{
int ref;
bam_index_t *idx;
idx = bam_index_load(inbam); // load BAM index
if (!idx)
{
samclose(tmp.in);
errAbort("BAM indexing file is not available.\n");
}
bam_parse_region(tmp.in->header, position, &ref,
&tmp.beg, &tmp.end); // parse the region
if (ref < 0)
{
samclose(tmp.in);
errAbort("Invalid region %s\n", position);
}
bam_fetch(tmp.in->x.bam, idx, ref, tmp.beg, tmp.end, &files, fetch_func);
bam_index_destroy(idx);
samclose(tmp.in);
}
else
{ // region not specified
int ret;
bam1_t *b = bam_init1();
while ((ret = samread(tmp.in, b)) >= 0)
fetch_func(b, &files); // read through entire file
bam_destroy1(b);
}
fclose(files.f1);
fclose(files.f2);
+bam_destroy1(files.b1);
+bam_destroy1(files.b2);
}
int main(int argc, char *argv[])
/* Process command line. */
{
optionInit(&argc, argv, options);
if (argc < 4)
usage();
if (optionExists("position"))
position = optionVal("position", NULL);
char *inbam = argv[1];
char *fileRead1 = argv[2];
char *fileRead2 = argv[3];
bamToFastQ(inbam, fileRead1, fileRead2);
return 0;
}
#else // USE_BAM not defined
int main(int argc, char *argv[])
/* Process command line. */
{
printf("BAM not installed");
return 0;
}
#endif