7f5b0b5cf3ed7efc0f9e84aa5bdcc097473c7584 kent Thu Feb 3 21:18:16 2011 -0800 Some preliminary work putting proper bam support in table browser. Still quite a ways to go, but have implemented a way to get a list of alignments in 'sam' format out of a BAM. This I think contains all the info. diff --git src/hg/lib/bamFile.c src/hg/lib/bamFile.c index 45648f7..bb1543d 100644 --- src/hg/lib/bamFile.c +++ src/hg/lib/bamFile.c @@ -1,24 +1,25 @@ /* bam -- interface to binary alignment format files using Heng Li's samtools lib. */ #ifdef USE_BAM #include "common.h" #include "htmshell.h" #include "hdb.h" #include "hgConfig.h" #include "udc.h" +#include "samAlignment.h" #include "bamFile.h" static char const rcsid[] = "$Id: bamFile.c,v 1.22 2010/03/04 05:14:13 angie Exp $"; char *bamFileNameFromTable(struct sqlConnection *conn, char *table, char *bamSeqName) /* Return file name from table. If table has a seqName column, then grab the * row associated with bamSeqName (which is not nec. in chromInfo, e.g. * bam file might have '1' not 'chr1'). */ { boolean checkSeqName = (sqlFieldIndex(conn, table, "seqName") >= 0); if (checkSeqName && bamSeqName == NULL) errAbort("bamFileNameFromTable: table %s has seqName column, but NULL seqName passed in", table); char query[512]; if (checkSeqName) @@ -185,55 +186,59 @@ #ifndef KNETFILE_HOOKS setCurrentDir(runDir); #endif//ndef KNETFILE_HOOKS samclose(fh); if (idx == NULL) { warn("bamFileExists: failed to read index corresponding to %s", bamFileName); return FALSE; } free(idx); // Not freeMem, freez etc -- sam just uses malloc/calloc. return TRUE; } return FALSE; } -void bamFetch(char *fileOrUrl, char *position, bam_fetch_f callbackFunc, void *callbackData) +void bamFetch(char *fileOrUrl, char *position, bam_fetch_f callbackFunc, void *callbackData, + samfile_t **pSamFile) /* Open the .bam file, fetch items in the seq:start-end position range, * and call callbackFunc on each bam item retrieved from the file plus callbackData. - * This handles BAM files with "chr"-less sequence names, e.g. from Ensembl. */ + * This handles BAM files with "chr"-less sequence names, e.g. from Ensembl. + * The pSamFile parameter is optional. If non-NULL it will be filled in, just for + * the benefit of the callback function, with the open samFile. */ { char *bamFileName = samtoolsFileName(fileOrUrl); samfile_t *fh = samopen(bamFileName, "rb", NULL); if (fh == NULL) { boolean usingUrl = (strstr(fileOrUrl, "tp://") || strstr(fileOrUrl, "https://")); struct dyString *urlWarning = dyStringNew(0); if (usingUrl) { char *udcFuseRoot = cfgOption("udcFuse.mountPoint"); boolean usingUdc = (udcFuseRoot != NULL && startsWith(udcFuseRoot, bamFileName)); if (usingUdc) dyStringAppend(urlWarning, " (using udcFuse)"); dyStringAppend(urlWarning, ". If you are able to access the URL with your web browser, " "please try reloading this page."); } warn("failed to open %s%s", fileOrUrl, urlWarning->string); return; } - +if (pSamFile != NULL) + *pSamFile = fh; int chromId, start, end; int ret = bam_parse_region(fh->header, position, &chromId, &start, &end); if (ret != 0 && startsWith("chr", position)) ret = bam_parse_region(fh->header, position+strlen("chr"), &chromId, &start, &end); if (ret != 0) // If the bam file does not cover the current chromosome, OK return; #ifndef KNETFILE_HOOKS // Since samtools' url-handling code saves the .bai file to the current directory, // chdir to a trash directory before calling bam_index_load, then chdir back. char *runDir = getCurrentDir(); char *samDir = getSamDir(); setCurrentDir(samDir); #endif//ndef KNETFILE_HOOKS @@ -269,80 +274,99 @@ unsigned int *cigarPacked = bam1_cigar(bam); const bam1_core_t *core = &bam->core; char op; int n = bamUnpackCigarElement(cigarPacked[0], &op); int low = (op == 'S') ? n : 0; n = bamUnpackCigarElement(cigarPacked[core->n_cigar-1], &op); int high = (op == 'S') ? n : 0; if (retLow != NULL) *retLow = low; if (retHigh != NULL) *retHigh = high; if (retClippedQLen != NULL) *retClippedQLen = (core->l_qseq - low - high); } -char *bamGetQuerySequence(const bam1_t *bam, boolean useStrand) -/* Return the nucleotide sequence encoded in bam. The BAM format + +void bamUnpackQuerySequence(const bam1_t *bam, boolean useStrand, char *qSeq) +/* Fill in qSeq with the nucleotide sequence encoded in bam. The BAM format * reverse-complements query sequence when the alignment is on the - strand, * so if useStrand is given we rev-comp it back to restore the original query * sequence. */ { const bam1_core_t *core = &bam->core; int qLen = core->l_qseq; -char *qSeq = needMem(qLen+1); uint8_t *packedQSeq = bam1_seq(bam); int i; for (i = 0; i < qLen; i++) qSeq[i] = bam_nt16_rev_table[bam1_seqi(packedQSeq, i)]; qSeq[i] = '\0'; if (useStrand && bamIsRc(bam)) reverseComplement(qSeq, qLen); +} + +char *bamGetQuerySequence(const bam1_t *bam, boolean useStrand) +/* Allocate and return the nucleotide sequence encoded in bam. The BAM format + * reverse-complements query sequence when the alignment is on the - strand, + * so if useStrand is given we rev-comp it back to restore the original query + * sequence. */ +{ +const bam1_core_t *core = &bam->core; +int qLen = core->l_qseq; +char *qSeq = needMem(qLen+1); +bamUnpackQuerySequence(bam, useStrand, qSeq); return qSeq; } UBYTE *bamGetQueryQuals(const bam1_t *bam, boolean useStrand) /* Return the base quality scores encoded in bam as an array of ubytes. */ { const bam1_core_t *core = &bam->core; int qLen = core->l_qseq; UBYTE *arr = needMem(qLen); boolean isRc = useStrand && bamIsRc(bam); UBYTE *qualStr = bam1_qual(bam); int i; for (i = 0; i < core->l_qseq; i++) { int offset = isRc ? (qLen - 1 - i) : i; arr[i] = (qualStr[0] == 255) ? 255 : qualStr[offset]; } return arr; } -char *bamGetCigar(const bam1_t *bam) -/* Return a BAM-enhanced CIGAR string, decoded from the packed encoding in bam. */ +static void bamUnpackCigar(const bam1_t *bam, struct dyString *dyCigar) +/* Unpack CIGAR string into dynamic string */ { unsigned int *cigarPacked = bam1_cigar(bam); const bam1_core_t *core = &bam->core; -struct dyString *dyCigar = dyStringNew(min(8, core->n_cigar*4)); int i; for (i = 0; i < core->n_cigar; i++) { char op; int n = bamUnpackCigarElement(cigarPacked[i], &op); dyStringPrintf(dyCigar, "%d", n); dyStringAppendC(dyCigar, op); } +} + +char *bamGetCigar(const bam1_t *bam) +/* Return a BAM-enhanced CIGAR string, decoded from the packed encoding in bam. */ +{ +const bam1_core_t *core = &bam->core; +struct dyString *dyCigar = dyStringNew(min(8, core->n_cigar*4)); +bamUnpackCigar(bam, dyCigar); return dyStringCannibalize(&dyCigar); } void bamShowCigarEnglish(const bam1_t *bam) /* Print out cigar in English e.g. "20 (mis)Match, 1 Deletion, 3 (mis)Match" */ { unsigned int *cigarPacked = bam1_cigar(bam); const bam1_core_t *core = &bam->core; int i; for (i = 0; i < core->n_cigar; i++) { char op; int n = bamUnpackCigarElement(cigarPacked[i], &op); if (i > 0) printf(", "); @@ -575,16 +599,146 @@ else { if (type == 'A' || type == 'C' || type == 'c') { ++s; } else if (type == 'S' || type == 's') { s += 2; } else if (type == 'I' || type == 'i' || type == 'f') { s += 4; } else if (type == 'd') { s += 8; } else if (type == 'Z' || type == 'H') { while (*s++); } } } return val; } +void bamUnpackAux(const bam1_t *bam, struct dyString *dy) +/* Unpack the tag:type:val part of bam into dy */ +{ +// adapted from part of bam.c bam_format1: +uint8_t *s = bam1_aux(bam); +boolean firstTime = TRUE; +while (s < bam->data + bam->data_len) + { + if (firstTime) + firstTime = FALSE; + else + dyStringAppendC(dy, '\t'); + dyStringAppendC(dy, *s++); + dyStringAppendC(dy, *s++); + dyStringAppendC(dy, ':'); + dyStringAppendC(dy, s[0]); + dyStringAppendC(dy, ':'); + uint8_t type = *s++; + if (type == 'A') { dyStringPrintf(dy, "%c", *s); ++s; } + else if (type == 'C') { dyStringPrintf(dy, "%u", *s); ++s; } + else if (type == 'c') { dyStringPrintf(dy, "%d", *s); ++s; } + else if (type == 'S') { dyStringPrintf(dy, "%u", *(uint16_t*)s); s += 2; } + else if (type == 's') { dyStringPrintf(dy, "%d", *(int16_t*)s); s += 2; } + else if (type == 'I') { dyStringPrintf(dy, "%u", *(uint32_t*)s); s += 4; } + else if (type == 'i') { dyStringPrintf(dy, "%d", *(int32_t*)s); s += 4; } + else if (type == 'f') { dyStringPrintf(dy, "%g", *(float*)s); s += 4; } + else if (type == 'd') { dyStringPrintf(dy, "%lg", *(double*)s); s += 8; } + else if (type == 'Z' || type == 'H') + { + dyStringAppend(dy, (char *)s); + s += strlen((char *)s) + 1; + } + } +} + + +struct bamToSamHelper +/* Helper structure to help get sam alignments out of a BAM file */ + { + struct lm *lm; /* Local memory pool to allocate into */ + char *chrom; /* Chromosome name */ + struct dyString *dy; /* Dynamic temporary string */ + samfile_t *samFile; /* Open sam/bam file header. */ + struct samAlignment *samList; /* List of alignments. */ + }; + +static void addToChars(char *s, int size, char amount) +/* Add amount to each char in s of given size. */ +{ +int i; +for (i=0; i<size; ++i) + s[i] += amount; +} + +static boolean isAllSameChar(char *s, int size, char c) +/* Return TRUE if first size chars of s are 0 */ +{ +int i; +for (i=0; i<size; ++i) + if (s[i] != c) + return FALSE; +return TRUE; +} + + +int bamAddOneSamAlignment(const bam1_t *bam, void *data) +/* bam_fetch() calls this on each bam alignment retrieved. Translate each bam + * into a linkedFeaturesSeries item, and either store it until we find its mate + * or add it to tg->items. */ +{ +struct bamToSamHelper *helper = (struct bamToSamHelper *)data; +struct lm *lm = helper->lm; +struct samAlignment *sam; +lmAllocVar(lm, sam); +const bam1_core_t *core = &bam->core; +struct dyString *dy = helper->dy; +sam->qName = lmCloneString(lm, bam1_qname(bam)); +sam->flag = core->flag; +sam->rName = helper->chrom; +sam->pos = core->pos + 1; +sam->mapQ = core->qual; +dyStringClear(dy); +bamUnpackCigar(bam, dy); +sam->cigar = lmCloneStringZ(lm, dy->string, dy->stringSize); +if (core->mtid >= 0) + { + if (core->tid == core->mtid) + sam->rNext = "="; + else + sam->rNext = lmCloneString(lm, helper->samFile->header->target_name[core->mtid]); + } +else + sam->rNext = "*"; +sam->pNext = core->mpos + 1; +sam->tLen = core->isize; +sam->seq = lmAlloc(lm, core->l_qseq + 1); +bamUnpackQuerySequence(bam, FALSE, sam->seq); +char *bamQual = (char *)bam1_qual(bam); +if (isAllSameChar(bamQual, core->l_qseq, -1)) + sam->qual = "*"; +else + { + sam->qual = lmCloneStringZ(lm, bamQual, core->l_qseq); + addToChars(sam->qual, core->l_qseq, 33); + } +dyStringClear(dy); +bamUnpackAux(bam, dy); +sam->tagTypeVals = lmCloneStringZ(lm, dy->string, dy->stringSize); +slAddHead(&helper->samList, sam); +return 0; +} + +struct samAlignment *bamFetchSamAlignment(char *fileOrUrl, char *chrom, int start, int end, + struct lm *lm) +/* Fetch region as a list of samAlignments - which is more or less an unpacked + * bam record. Results is allocated out of lm, since it tends to be large... */ +{ +struct bamToSamHelper helper; +helper.lm = lm; +helper.chrom = chrom; +helper.dy = dyStringNew(0); +helper.samList = NULL; +char posForBam[256]; +safef(posForBam, sizeof(posForBam), "%s:%d-%d", chrom, start, end); +bamFetch(fileOrUrl, posForBam, bamAddOneSamAlignment, &helper, &helper.samFile); +dyStringFree(&helper.dy); +slReverse(&helper.samList); +return helper.samList; +} + #endif//def USE_BAM