b29f1fbf93c9df5aa65fed050d9f8132dc6a2c73 braney Tue Nov 24 14:20:10 2015 -0800 support htslib for sam/bam/cram/tabix support #14717 diff --git src/lib/linefile.c src/lib/linefile.c index 5416c24..08a0e38 100644 --- src/lib/linefile.c +++ src/lib/linefile.c @@ -3,30 +3,33 @@ * * This file is copyright 2002 Jim Kent, but license is hereby * granted for all use - public, private or commercial. */ #include "common.h" #include "hash.h" #include <fcntl.h> #include <signal.h> #include "dystring.h" #include "errAbort.h" #include "linefile.h" #include "pipeline.h" #include "localmem.h" #include "cheapcgi.h" #include "udc.h" +#ifdef USE_HTS +#include "htslib/tbx.h" +#endif char *getFileNameFromHdrSig(char *m) /* Check if header has signature of supported compression stream, and return a phoney filename for it, or NULL if no sig found. */ { char buf[20]; char *ext=NULL; if (startsWith("\x1f\x8b",m)) ext = "gz"; else if (startsWith("\x1f\x9d\x90",m)) ext = "Z"; else if (startsWith("BZ",m)) ext = "bz2"; else if (startsWith("PK\x03\x04",m)) ext = "zip"; if (ext==NULL) return NULL; safef(buf, sizeof(buf), LF_BOGUS_FILE_PREFIX "%s", ext); return cloneString(buf); @@ -215,80 +218,112 @@ #endif struct lineFile *lineFileTabixMayOpen(char *fileOrUrl, bool zTerm) /* Wrap a line file around a data file that has been compressed and indexed * by the tabix command line program. The index file <fileOrUrl>.tbi must be * readable in addition to fileOrUrl. If there's a problem, warn & return NULL. * This works only if kent/src has been compiled with USE_TABIX=1 and linked * with the tabix C library. */ { #ifdef USE_TABIX if (fileOrUrl == NULL) errAbort("lineFileTabixMayOpen: fileOrUrl is NULL"); int tbiNameSize = strlen(fileOrUrl) + strlen(".tbi") + 1; char tbiName[tbiNameSize]; safef(tbiName, sizeof(tbiName), "%s.tbi", fileOrUrl); +#ifdef USE_HTS +htsFile *htsFile = hts_open(fileOrUrl, "r"); +if (htsFile == NULL) + { + warn("Unable to open \"%s\"", fileOrUrl); + return NULL; + } +tbx_t *tabix; +if ((tabix = ti_index_load(tbiName)) == NULL) +#else tabix_t *tabix = ti_open(fileOrUrl, tbiName); if (tabix == NULL) { warn("Unable to open \"%s\"", fileOrUrl); return NULL; } if ((tabix->idx = ti_index_load(tbiName)) == NULL) +#endif { warn("Unable to load tabix index from \"%s\"", tbiName); ti_close(tabix); tabix = NULL; return NULL; } struct lineFile *lf = needMem(sizeof(struct lineFile)); lf->fileName = cloneString(fileOrUrl); lf->fd = -1; lf->bufSize = 64 * 1024; lf->buf = needMem(lf->bufSize); lf->zTerm = zTerm; lf->tabix = tabix; +#ifdef USE_HTS +lf->htsFile = htsFile; +kstring_t *kline; +AllocVar(kline); +kline->s = malloc(8192); +lf->kline = kline; +lf->tabixIter = tbx_itr_queryi(tabix, HTS_IDX_REST, 0, 0); +#else lf->tabixIter = ti_iter_first(); +#endif return lf; #else // no USE_TABIX warn(COMPILE_WITH_TABIX, "lineFileTabixMayOpen"); return NULL; #endif // no USE_TABIX } boolean lineFileSetTabixRegion(struct lineFile *lf, char *seqName, int start, int end) /* Assuming lf was created by lineFileTabixMayOpen, tell tabix to seek to the specified region * and return TRUE (or if there are no items in region, return FALSE). */ { #ifdef USE_TABIX if (lf->tabix == NULL) errAbort("lineFileSetTabixRegion: lf->tabix is NULL. Did you open lf with lineFileTabixMayOpen?"); if (seqName == NULL) return FALSE; +#ifdef USE_HTS +int tabixSeqId = ti_get_tid(lf->tabix, seqName); +if (tabixSeqId < 0 && startsWith("chr", seqName)) + // We will get some files that have chr-less Ensembl chromosome names: + tabixSeqId = ti_get_tid(lf->tabix, seqName+strlen("chr")); +if (tabixSeqId < 0) + return FALSE; +ti_iter_t *iter = ti_queryi((tbx_t *)lf->tabix, tabixSeqId, start, end); +#else int tabixSeqId = ti_get_tid(lf->tabix->idx, seqName); if (tabixSeqId < 0 && startsWith("chr", seqName)) // We will get some files that have chr-less Ensembl chromosome names: tabixSeqId = ti_get_tid(lf->tabix->idx, seqName+strlen("chr")); if (tabixSeqId < 0) return FALSE; ti_iter_t iter = ti_queryi(lf->tabix, tabixSeqId, start, end); +#endif if (iter == NULL) return FALSE; if (lf->tabixIter != NULL) ti_iter_destroy(lf->tabixIter); lf->tabixIter = iter; +#ifndef USE_HTS lf->bufOffsetInFile = bgzf_tell(lf->tabix->fp); +#endif lf->bytesInBuf = 0; lf->lineIx = -1; lf->lineStart = 0; lf->lineEnd = 0; return TRUE; #else // no USE_TABIX warn(COMPILE_WITH_TABIX, "lineFileSetTabixRegion"); return FALSE; #endif // no USE_TABIX } struct lineFile *lineFileUdcMayOpen(char *fileOrUrl, bool zTerm) /* Create a line file object with an underlying UDC cache. NULL if not found. */ { if (fileOrUrl == NULL) @@ -467,42 +502,53 @@ lf->lineIx = -1; lf->lineStart = 0; lf->lineEnd = lineSize; *retStart = line; freeMem(lf->buf); lf->buf = line; lf->bufSize = lineSize; return TRUE; } #ifdef USE_TABIX if (lf->tabix != NULL && lf->tabixIter != NULL) { // Just use line-oriented ti_read: int lineSize = 0; +#ifdef USE_HTS + lineSize = tbx_itr_next(lf->htsFile, lf->tabix, lf->tabixIter, lf->kline); + if (lineSize == -1) + return FALSE; +#else const char *line = ti_read(lf->tabix, lf->tabixIter, &lineSize); if (line == NULL) return FALSE; +#endif lf->bufOffsetInFile = -1; lf->bytesInBuf = lineSize; lf->lineIx = -1; lf->lineStart = 0; lf->lineEnd = lineSize; if (lineSize > lf->bufSize) // shouldn't be! but just in case: lineFileExpandBuf(lf, lineSize * 2); +#ifdef USE_HTS + kstring_t *kline = lf->kline; + safecpy(lf->buf, lf->bufSize, kline->s); +#else safecpy(lf->buf, lf->bufSize, line); +#endif *retStart = lf->buf; if (retSize != NULL) *retSize = lineSize; return TRUE; } #endif // USE_TABIX determineNlType(lf, buf+endIx, bytesInBuf); /* Find next end of line in buffer. */ switch(lf->nlType) { case nlt_unix: case nlt_dos: for (endIx = lf->lineEnd; endIx < bytesInBuf; ++endIx) @@ -536,31 +582,35 @@ int oldEnd = lf->lineEnd; int sizeLeft = bytesInBuf - oldEnd; int bufSize = lf->bufSize; int readSize = bufSize - sizeLeft; if (oldEnd > 0 && sizeLeft > 0) { memmove(buf, buf+oldEnd, sizeLeft); } lf->bufOffsetInFile += oldEnd; if (lf->fd >= 0) readSize = lineFileLongNetRead(lf->fd, buf+sizeLeft, readSize); #ifdef USE_TABIX else if (lf->tabix != NULL && readSize > 0) { +#ifdef USE_HTS + errAbort("bgzf read not supported with htslib (yet)"); +#else readSize = bgzf_read(lf->tabix->fp, buf+sizeLeft, readSize); +#endif if (readSize < 1) return FALSE; } #endif // USE_TABIX else readSize = 0; if ((readSize == 0) && (endIx > oldEnd)) { endIx = sizeLeft; buf[endIx] = 0; lf->bytesInBuf = newStart = lf->lineStart = 0; lf->lineEnd = endIx; ++lf->lineIx; if (retSize != NULL)