16439684a0ecc75ede242ded740c51cf1f60c8a4 angie Tue Feb 22 11:36:43 2011 -0800 Feature #2820 (tabix: add as optional linked library in kent/src):Added lineFile wrapper on tabix: lineFileOnTabix to open; then lineFileNext and its derivatives work as usual. lineFileSetTabixRegion seeks to the given position range, and subsequent lineFileNext's return lines in that position range. lineFileSeek is not supported -- tabix doesn't have linear offsets but rather a block number and offset which are packed into 64 bits. Line numbers are not known after calls to lineFileSetTabixRegion. diff --git src/lib/linefile.c src/lib/linefile.c index b2d9a11..14e1a5c 100644 --- src/lib/linefile.c +++ src/lib/linefile.c @@ -1,32 +1,30 @@ /* lineFile - stuff to rapidly read text files and parse them into * lines. * * This file is copyright 2002 Jim Kent, but license is hereby * granted for all use - public, private or commercial. */ #include "common.h" #include "hash.h" #include <fcntl.h> #include "dystring.h" #include "errabort.h" #include "linefile.h" #include "pipeline.h" #include <signal.h> -static char const rcsid[] = "$Id: linefile.c,v 1.61 2010/06/10 20:13:29 braney Exp $"; - char *getFileNameFromHdrSig(char *m) /* Check if header has signature of supported compression stream, and return a phoney filename for it, or NULL if no sig found. */ { char buf[20]; char *ext=NULL; if (startsWith("\x1f\x8b",m)) ext = "gz"; else if (startsWith("\x1f\x9d\x90",m)) ext = "Z"; else if (startsWith("BZ",m)) ext = "bz2"; else if (startsWith("PK\x03\x04",m)) ext = "zip"; if (ext==NULL) return NULL; safef(buf, sizeof(buf), "somefile.%s", ext); return cloneString(buf); } @@ -186,30 +184,101 @@ struct lineFile *lineFileOnString(char *name, bool zTerm, char *s) /* Wrap a line file object around string in memory. This buffer * have zeroes written into it and be freed when the line file * is closed. */ { struct lineFile *lf; AllocVar(lf); lf->fileName = cloneString(name); lf->fd = -1; lf->bufSize = lf->bytesInBuf = strlen(s); lf->zTerm = zTerm; lf->buf = s; return lf; } +struct lineFile *lineFileOnTabix(char *fileName, bool zTerm) +/* Wrap a line file around a data file that has been compressed and indexed + * by the tabix command line program. The index file <fileName>.tbi must be + * readable in addition to fileName. If there's a problem, warn & return NULL. + * This works only if kent/src has been compiled with USE_TABIX=1 and linked + * with the tabix C library. */ +{ +#ifdef USE_TABIX +int tbiNameSize = strlen(fileName) + strlen(".tbi") + 1; +char *tbiName = needMem(tbiNameSize); +safef(tbiName, tbiNameSize, "%s.tbi", fileName); +tabix_t *tabix = ti_open(fileName, tbiName); +if (tabix == NULL) + { + warn("Unable to open \"%s\"", fileName); + freez(&tbiName); + return NULL; + } +if ((tabix->idx = ti_index_load(tbiName)) == NULL) + { + warn("Unable to load tabix index from \"%s\"", tbiName); + freez(&tbiName); + return NULL; + } +struct lineFile *lf = needMem(sizeof(struct lineFile)); +lf->fileName = cloneString(fileName); +lf->fd = -1; +lf->bufSize = 64 * 1024; +lf->buf = needMem(lf->bufSize); +lf->zTerm = zTerm; +lf->tabix = tabix; +freez(&tbiName); +return lf; +#else // no USE_TABIX +warn(COMPILE_WITH_TABIX, "lineFileOnTabix"); +return NULL; +#endif // no USE_TABIX +} + +boolean lineFileSetTabixRegion(struct lineFile *lf, char *seqName, int start, int end) +/* Assuming lf was created by lineFileOnTabix, tell tabix to seek to the specified region + * and return TRUE (or if unable, return FALSE). */ +{ +#ifdef USE_TABIX +if (lf->tabix == NULL) + errAbort("lineFileSetTabixRegion: lf->tabix is NULL. Did you open lf with lineFileOnTabix?"); +int tabixSeqId = ti_get_tid(lf->tabix->idx, seqName); +if (tabixSeqId < 0 && startsWith("chr", seqName)) + // We will get some files that have chr-less Ensembl chromosome names: + tabixSeqId = ti_get_tid(lf->tabix->idx, seqName+strlen("chr")); +if (tabixSeqId < 0) + return FALSE; +ti_iter_t iter = ti_queryi(lf->tabix, tabixSeqId, start, end); +if (iter == NULL) + return FALSE; +if (lf->tabixIter != NULL) + ti_iter_destroy(lf->tabixIter); +lf->tabixIter = iter; +lf->bufOffsetInFile = ti_bgzf_tell(lf->tabix->fp); +lf->bytesInBuf = 0; +lf->lineIx = -1; +lf->lineStart = 0; +lf->lineEnd = 0; +return TRUE; +#else // no USE_TABIX +warn(COMPILE_WITH_TABIX, "lineFileSetTabixRegion"); +return FALSE; +#endif // no USE_TABIX +} + + void lineFileExpandBuf(struct lineFile *lf, int newSize) /* Expand line file buffer. */ { assert(newSize > lf->bufSize); lf->buf = needMoreMem(lf->buf, lf->bytesInBuf, newSize); lf->bufSize = newSize; } struct lineFile *lineFileStdin(bool zTerm) /* Wrap a line file around stdin. */ { return lineFileAttach("stdin", zTerm, fileno(stdin)); } @@ -233,33 +302,42 @@ /* Open up a lineFile or die trying. */ { struct lineFile *lf = lineFileMayOpen(fileName, zTerm); if (lf == NULL) errAbort("Couldn't open %s , %s", fileName, strerror(errno)); return lf; } void lineFileReuse(struct lineFile *lf) /* Reuse current line. */ { lf->reuse = TRUE; } +INLINE void noTabixSupport(struct lineFile *lf, char *where) +{ +#ifdef USE_TABIX +if (lf->tabix != NULL) + lineFileAbort(lf, "%s: not implemented for lineFile opened with lineFileOnTabix.", where); +#endif // USE_TABIX +} + void lineFileSeek(struct lineFile *lf, off_t offset, int whence) /* Seek to read next line from given position. */ { +noTabixSupport(lf, "lineFileSeek"); if (lf->pl != NULL) errnoAbort("Can't lineFileSeek on a compressed file: %s", lf->fileName); lf->reuse = FALSE; if (whence == SEEK_SET && offset >= lf->bufOffsetInFile && offset < lf->bufOffsetInFile + lf->bytesInBuf) { lf->lineStart = lf->lineEnd = offset - lf->bufOffsetInFile; } else { lf->lineStart = lf->lineEnd = lf->bytesInBuf = 0; if ((lf->bufOffsetInFile = lseek(lf->fd, offset, whence)) == -1) errnoAbort("Couldn't lineFileSeek %s", lf->fileName); } } @@ -321,30 +399,54 @@ int endIx = lf->lineEnd; boolean gotLf = FALSE; int newStart; if (lf->reuse) { lf->reuse = FALSE; if (retSize != NULL) *retSize = lf->lineEnd - lf->lineStart; *retStart = buf + lf->lineStart; if (lf->metaOutput && *retStart[0] == '#') metaDataAdd(lf, *retStart); return TRUE; } +#ifdef USE_TABIX +if (lf->tabix != NULL && lf->tabixIter != NULL) + { + // Just use line-oriented ti_read: + int lineSize = 0; + const char *line = ti_read(lf->tabix, lf->tabixIter, &lineSize); + if (line == NULL) + return FALSE; + lf->bufOffsetInFile = -1; + lf->bytesInBuf = lineSize; + lf->lineIx = -1; + lf->lineStart = 0; + lf->lineEnd = lineSize; + if (lineSize > lf->bufSize) + // shouldn't be! but just in case: + lineFileExpandBuf(lf, lineSize * 2); + safecpy(lf->buf, lf->bufSize, line); + *retStart = lf->buf; + if (retSize != NULL) + *retSize = lineSize; + return TRUE; + } +#endif // USE_TABIX + determineNlType(lf, buf+endIx, bytesInBuf); /* Find next end of line in buffer. */ switch(lf->nlType) { case nlt_unix: case nlt_dos: for (endIx = lf->lineEnd; endIx < bytesInBuf; ++endIx) { if (buf[endIx] == '\n') { gotLf = TRUE; endIx += 1; break; } @@ -368,30 +470,38 @@ /* If not in buffer read in a new buffer's worth. */ while (!gotLf) { int oldEnd = lf->lineEnd; int sizeLeft = bytesInBuf - oldEnd; int bufSize = lf->bufSize; int readSize = bufSize - sizeLeft; if (oldEnd > 0 && sizeLeft > 0) { memmove(buf, buf+oldEnd, sizeLeft); } lf->bufOffsetInFile += oldEnd; if (lf->fd >= 0) readSize = lineFileLongNetRead(lf->fd, buf+sizeLeft, readSize); +#ifdef USE_TABIX + else if (lf->tabix != NULL && readSize > 0) + { + readSize = ti_bgzf_read(lf->tabix->fp, buf+sizeLeft, readSize); + if (readSize < 1) + return FALSE; + } +#endif // USE_TABIX else readSize = 0; if ((readSize == 0) && (endIx > oldEnd)) { endIx = sizeLeft; buf[endIx] = 0; lf->bytesInBuf = newStart = lf->lineStart = 0; lf->lineEnd = endIx; ++lf->lineIx; if (retSize != NULL) *retSize = endIx - newStart; *retStart = buf + newStart; if (*retStart[0] == '#') metaDataAdd(lf, *retStart); @@ -507,30 +617,38 @@ /* Close up a line file. */ { struct lineFile *lf; if ((lf = *pLf) != NULL) { if (lf->pl != NULL) { pipelineWait(lf->pl); pipelineFree(&lf->pl); } else if (lf->fd > 0 && lf->fd != fileno(stdin)) { close(lf->fd); freeMem(lf->buf); } +#ifdef USE_TABIX + else if (lf->tabix != NULL) + { + if (lf->tabixIter != NULL) + ti_iter_destroy(lf->tabixIter); + ti_close(lf->tabix); + } +#endif // USE_TABIX freeMem(lf->fileName); metaDataFree(lf); freez(pLf); } } void lineFileCloseList(struct lineFile **pList) /* Close up a list of line files. */ { struct lineFile *el, *next; for (el = *pList; el != NULL; el = next) { next = el->next; lineFileClose(&el);