16439684a0ecc75ede242ded740c51cf1f60c8a4
angie
  Tue Feb 22 11:36:43 2011 -0800
Feature #2820 (tabix: add as optional linked library in kent/src): Added lineFile wrapper on tabix: lineFileOnTabix to open; then
lineFileNext and its derivatives work as usual.
lineFileSetTabixRegion seeks to the given position range, and
subsequent lineFileNext's return lines in that position range.
lineFileSeek is not supported -- tabix doesn't have linear offsets
but rather a block number and offset which are packed into 64 bits.
Line numbers are not known after calls to lineFileSetTabixRegion.

diff --git src/lib/linefile.c src/lib/linefile.c
index b2d9a11..14e1a5c 100644
--- src/lib/linefile.c
+++ src/lib/linefile.c
@@ -1,32 +1,30 @@
 /* lineFile - stuff to rapidly read text files and parse them into
  * lines. 
  *
  * This file is copyright 2002 Jim Kent, but license is hereby
  * granted for all use - public, private or commercial. */
 
 #include "common.h"
 #include "hash.h"
 #include <fcntl.h>
 #include "dystring.h"
 #include "errabort.h"
 #include "linefile.h"
 #include "pipeline.h"
 #include <signal.h>
 
-static char const rcsid[] = "$Id: linefile.c,v 1.61 2010/06/10 20:13:29 braney Exp $";
-
 char *getFileNameFromHdrSig(char *m)
 /* Compare the start of m against the magic bytes of the supported
  * compression formats.  On the first match, return a cloned placeholder
  * file name ("somefile.<ext>") whose extension names the format; return
  * NULL when no signature matches. */
 {
 static struct
     {
     char *magic;    /* leading bytes identifying the format */
     char *ext;      /* extension used in the placeholder name */
     } sigs[] =
     {
     {"\x1f\x8b", "gz"},
     {"\x1f\x9d\x90", "Z"},
     {"BZ", "bz2"},
     {"PK\x03\x04", "zip"},
     };
 char fakeName[20];
 int i;
 /* Entries are tried in table order, the same order as the signatures above. */
 for (i = 0; i < (int)(sizeof(sigs) / sizeof(sigs[0])); ++i)
     {
     if (startsWith(sigs[i].magic, m))
         {
         safef(fakeName, sizeof(fakeName), "somefile.%s", sigs[i].ext);
         return cloneString(fakeName);
         }
     }
 return NULL;
 }
@@ -186,30 +184,101 @@
 struct lineFile *lineFileOnString(char *name, bool zTerm, char *s)
 /* Build a line file object whose buffer is the in-memory string s itself
  * (no copy is made).  name is cloned and used as the file name.  This
  * buffer will have zeroes written into it and be freed when the line
  * file is closed. */
 {
 struct lineFile *memLf;
 AllocVar(memLf);
 memLf->buf = s;
 /* The whole string is already "read": buffer size and fill level agree. */
 memLf->bufSize = memLf->bytesInBuf = strlen(s);
 memLf->zTerm = zTerm;
 /* No file descriptor stands behind this line file. */
 memLf->fd = -1;
 memLf->fileName = cloneString(name);
 return memLf;
 }
 
+struct lineFile *lineFileOnTabix(char *fileName, bool zTerm)
+/* Wrap a line file around a data file that has been compressed and indexed
+ * by the tabix command line program.  The index file <fileName>.tbi must be
+ * readable in addition to fileName. If there's a problem, warn & return NULL.
+ * This works only if kent/src has been compiled with USE_TABIX=1 and linked
+ * with the tabix C library.
+ * The returned line file has fd set to -1, owns a freshly allocated 64k
+ * buffer and holds the open tabix handle in lf->tabix.  On every failure
+ * path all resources acquired so far are released before returning NULL. */
+{
+#ifdef USE_TABIX
+int tbiNameSize = strlen(fileName) + strlen(".tbi") + 1;
+char *tbiName = needMem(tbiNameSize);
+safef(tbiName, tbiNameSize, "%s.tbi", fileName);
+tabix_t *tabix = ti_open(fileName, tbiName);
+if (tabix == NULL)
+    {
+    warn("Unable to open \"%s\"", fileName);
+    freez(&tbiName);
+    return NULL;
+    }
+if ((tabix->idx = ti_index_load(tbiName)) == NULL)
+    {
+    warn("Unable to load tabix index from \"%s\"", tbiName);
+    // The handle from ti_open is not handed to anyone on this path, so
+    // close it here; otherwise it would leak.
+    ti_close(tabix);
+    tabix = NULL;
+    freez(&tbiName);
+    return NULL;
+    }
+struct lineFile *lf = needMem(sizeof(struct lineFile));
+lf->fileName = cloneString(fileName);
+// No plain file descriptor: reads go through the tabix handle instead.
+lf->fd = -1;
+lf->bufSize = 64 * 1024;
+lf->buf = needMem(lf->bufSize);
+lf->zTerm = zTerm;
+lf->tabix = tabix;
+freez(&tbiName);
+return lf;
+#else // no USE_TABIX
+warn(COMPILE_WITH_TABIX, "lineFileOnTabix");
+return NULL;
+#endif // no USE_TABIX
+}
+
+boolean lineFileSetTabixRegion(struct lineFile *lf, char *seqName, int start, int end)
+/* Assuming lf was created by lineFileOnTabix, tell tabix to seek to the specified region
+ * and return TRUE (or if unable, return FALSE).
+ * seqName is looked up as given; if that fails and it starts with "chr", it is looked
+ * up again without that prefix.  A NULL seqName is treated as not found.
+ * On success any previous iterator is destroyed, buffered data and a pending
+ * lineFileReuse are discarded, and lineIx is set to -1 because line numbers are not
+ * known after a region seek.  On failure lf is left untouched.
+ * Aborts if lf has no tabix handle.  Without USE_TABIX this only warns and returns FALSE. */
+{
+#ifdef USE_TABIX
+if (lf->tabix == NULL)
+    errAbort("lineFileSetTabixRegion: lf->tabix is NULL.  Did you open lf with lineFileOnTabix?");
+if (seqName == NULL)
+    // Nothing to look up; report "unable" instead of passing NULL to the lookups below.
+    return FALSE;
+int tabixSeqId = ti_get_tid(lf->tabix->idx, seqName);
+if (tabixSeqId < 0 && startsWith("chr", seqName))
+    // We will get some files that have chr-less Ensembl chromosome names:
+    tabixSeqId = ti_get_tid(lf->tabix->idx, seqName+strlen("chr"));
+if (tabixSeqId < 0)
+    return FALSE;
+ti_iter_t iter = ti_queryi(lf->tabix, tabixSeqId, start, end);
+if (iter == NULL)
+    return FALSE;
+// Only now that the new iterator exists is the old one given up.
+if (lf->tabixIter != NULL)
+    ti_iter_destroy(lf->tabixIter);
+lf->tabixIter = iter;
+lf->bufOffsetInFile = ti_bgzf_tell(lf->tabix->fp);
+lf->bytesInBuf = 0;
+lf->lineIx = -1;
+lf->lineStart = 0;
+lf->lineEnd = 0;
+// As in lineFileSeek: a line held back for reuse belongs to the old position.
+lf->reuse = FALSE;
+return TRUE;
+#else // no USE_TABIX
+warn(COMPILE_WITH_TABIX, "lineFileSetTabixRegion");
+return FALSE;
+#endif // no USE_TABIX
+}
+
+
 void lineFileExpandBuf(struct lineFile *lf, int newSize)
 /* Enlarge lf's buffer to newSize bytes, carrying over the bytesInBuf bytes
  * currently held.  newSize must exceed the present bufSize (asserted). */
 {
 void *grown;
 assert(lf->bufSize < newSize);
 grown = needMoreMem(lf->buf, lf->bytesInBuf, newSize);
 lf->buf = grown;
 lf->bufSize = newSize;
 }
 
 
 struct lineFile *lineFileStdin(bool zTerm)
 /* Return a line file attached to the standard input descriptor, named
  * "stdin". */
 {
 int stdinFd = fileno(stdin);
 return lineFileAttach("stdin", zTerm, stdinFd);
 }
 
@@ -233,33 +302,42 @@
 /* Open up a lineFile or die trying. */
 {
 struct lineFile *lf = lineFileMayOpen(fileName, zTerm);
 if (lf == NULL)
     errAbort("Couldn't open %s , %s", fileName, strerror(errno));
 return lf;
 }
 
 void lineFileReuse(struct lineFile *lf)
 /* Mark lf so that the current line is handed out again by the next read
  * rather than advancing to a new one. */
 {
 lf->reuse = TRUE;
 }
 
 
+INLINE void noTabixSupport(struct lineFile *lf, char *where)
+/* Guard for operations that cannot work on tabix-backed line files: if lf
+ * carries a tabix handle, abort through lineFileAbort, naming the calling
+ * function given in where.  Does nothing when compiled without USE_TABIX. */
+{
+#ifdef USE_TABIX
+if (lf->tabix == NULL)
+    return;
+lineFileAbort(lf, "%s: not implemented for lineFile opened with lineFileOnTabix.", where);
+#endif // USE_TABIX
+}
+
 void lineFileSeek(struct lineFile *lf, off_t offset, int whence)
 /* Reposition lf so the next line is read starting at offset, with whence
  * interpreted as by lseek.  Not available for line files opened with
  * lineFileOnTabix or backed by a pipeline (compressed); both abort.  Any
  * pending lineFileReuse is cleared. */
 {
+noTabixSupport(lf, "lineFileSeek");
 if (lf->pl != NULL)
     errnoAbort("Can't lineFileSeek on a compressed file: %s", lf->fileName);
 lf->reuse = FALSE;
 /* An absolute target inside the bytes already buffered needs no system
  * call: just move the line cursor within the buffer. */
 if (whence == SEEK_SET
     && lf->bufOffsetInFile <= offset
     && offset < lf->bufOffsetInFile + lf->bytesInBuf)
     {
     lf->lineStart = lf->lineEnd = offset - lf->bufOffsetInFile;
     return;
     }
 /* Otherwise drop the buffer and let lseek establish the new position. */
 lf->lineStart = lf->lineEnd = lf->bytesInBuf = 0;
 lf->bufOffsetInFile = lseek(lf->fd, offset, whence);
 if (lf->bufOffsetInFile == -1)
     errnoAbort("Couldn't lineFileSeek %s", lf->fileName);
 }
@@ -321,30 +399,54 @@
 int endIx = lf->lineEnd;
 boolean gotLf = FALSE;
 int newStart;
 
 if (lf->reuse)
     {
     lf->reuse = FALSE;
     if (retSize != NULL)
 	*retSize = lf->lineEnd - lf->lineStart;
     *retStart = buf + lf->lineStart;
     if (lf->metaOutput && *retStart[0] == '#') 
         metaDataAdd(lf, *retStart); 
     return TRUE;
     }
 
+#ifdef USE_TABIX
+if (lf->tabix != NULL && lf->tabixIter != NULL)
+    {
+    // Just use line-oriented ti_read:
+    int lineSize = 0;
+    const char *line = ti_read(lf->tabix, lf->tabixIter, &lineSize);
+    if (line == NULL)
+	return FALSE;
+    lf->bufOffsetInFile = -1;
+    lf->bytesInBuf = lineSize;
+    lf->lineIx = -1;
+    lf->lineStart = 0;
+    lf->lineEnd = lineSize;
+    if (lineSize > lf->bufSize)
+	// shouldn't be!  but just in case:
+	lineFileExpandBuf(lf, lineSize * 2);
+    safecpy(lf->buf, lf->bufSize, line);
+    *retStart = lf->buf;
+    if (retSize != NULL)
+	*retSize = lineSize;
+    return TRUE;
+    }
+#endif // USE_TABIX
+
 determineNlType(lf, buf+endIx, bytesInBuf);
 
 /* Find next end of line in buffer. */
 switch(lf->nlType)
     {
     case nlt_unix:
     case nlt_dos:
 	for (endIx = lf->lineEnd; endIx < bytesInBuf; ++endIx)
 	    {
 	    if (buf[endIx] == '\n')
 		{
 		gotLf = TRUE;
 		endIx += 1;
 		break;
 		}
@@ -368,30 +470,38 @@
 /* If not in buffer read in a new buffer's worth. */
 while (!gotLf)
     {
     int oldEnd = lf->lineEnd;
     int sizeLeft = bytesInBuf - oldEnd;
     int bufSize = lf->bufSize;
     int readSize = bufSize - sizeLeft;
 
     if (oldEnd > 0 && sizeLeft > 0)
 	{
 	memmove(buf, buf+oldEnd, sizeLeft);
 	}
     lf->bufOffsetInFile += oldEnd;
     if (lf->fd >= 0)
 	readSize = lineFileLongNetRead(lf->fd, buf+sizeLeft, readSize);
+#ifdef USE_TABIX
+    else if (lf->tabix != NULL && readSize > 0)
+	{
+	readSize = ti_bgzf_read(lf->tabix->fp, buf+sizeLeft, readSize);
+	if (readSize < 1)
+	    return FALSE;
+	}
+#endif // USE_TABIX
     else
         readSize = 0;
 
     if ((readSize == 0) && (endIx > oldEnd))
 	{
 	endIx = sizeLeft;
 	buf[endIx] = 0;
 	lf->bytesInBuf = newStart = lf->lineStart = 0;
 	lf->lineEnd = endIx;
 	++lf->lineIx;
 	if (retSize != NULL)
 	    *retSize = endIx - newStart;
 	*retStart = buf + newStart;
         if (*retStart[0] == '#')
             metaDataAdd(lf, *retStart);
@@ -507,30 +617,38 @@
 /* Close up a line file. */
 {
 struct lineFile *lf;
 if ((lf = *pLf) != NULL)
     {
     if (lf->pl != NULL)
         {
         pipelineWait(lf->pl);
         pipelineFree(&lf->pl);
         }
     else if (lf->fd > 0 && lf->fd != fileno(stdin))
 	{
 	close(lf->fd);
 	freeMem(lf->buf);
 	}
+#ifdef USE_TABIX
+    else if (lf->tabix != NULL)
+	{
+	if (lf->tabixIter != NULL)
+	    ti_iter_destroy(lf->tabixIter);
+	ti_close(lf->tabix);
+	}
+#endif // USE_TABIX
     freeMem(lf->fileName);
     metaDataFree(lf);
     freez(pLf);
     }
 }
 
 void lineFileCloseList(struct lineFile **pList)
 /* Close up a list of line files. */
 {
 struct lineFile *el, *next;
 
 for (el = *pList; el != NULL; el = next)
     {
     next = el->next;
     lineFileClose(&el);