src/lib/linefile.c 358232fdd2fe9b78248b54fd38a9cd2f52ff4586

358232fdd2fe9b78248b54fd38a9cd2f52ff4586
angie
  Wed Oct 21 12:49:05 2020 -0700
lineFileUdcMayOpen automatically decompresses local files but not urls.
netLineFileMayOpen does automatically decompress.
For now, just use netLineFileMayOpen, although in the long term it would
be better to add support to lineFileUdcMayOpen to get the benefits of
caching.

diff --git src/lib/linefile.c src/lib/linefile.c
index f2d0960..3e12bcc 100644
--- src/lib/linefile.c
+++ src/lib/linefile.c
@@ -1,1481 +1,1488 @@
 /* lineFile - stuff to rapidly read text files and parse them into
  * lines.
  *
  * This file is copyright 2002 Jim Kent, but license is hereby
  * granted for all use - public, private or commercial. */
 
 #include "common.h"
 #include "hash.h"
 #include <fcntl.h>
 #include <signal.h>
 #include "dystring.h"
 #include "errAbort.h"
 #include "linefile.h"
 #include "pipeline.h"
 #include "localmem.h"
 #include "cheapcgi.h"
 #include "udc.h"
 #include "htslib/tbx.h"
 
 char *getFileNameFromHdrSig(char *m)
 /* Compare the leading bytes of m against the signatures of the supported
  * compression formats.  On a match return a cloned placeholder file name
  * (LF_BOGUS_FILE_PREFIX followed by the matching extension); return NULL
  * when none of the signatures is present. */
 {
 char *suffix = NULL;
 if (startsWith("\x1f\x8b", m))
     suffix = "gz";
 else if (startsWith("\x1f\x9d\x90", m))
     suffix = "Z";
 else if (startsWith("BZ", m))
     suffix = "bz2";
 else if (startsWith("PK\x03\x04", m))
     suffix = "zip";
 if (suffix == NULL)
     return NULL;
 char phonyName[20];
 safef(phonyName, sizeof(phonyName), LF_BOGUS_FILE_PREFIX "%s", suffix);
 return cloneString(phonyName);
 }
 
 static char **getDecompressor(char *fileName)
 /* if a file is compressed, return the command to decompress the
  * approriate format, otherwise return NULL */
 {
 static char *GZ_READ[] = {"gzip", "-dc", NULL};
 static char *Z_READ[] = {"gzip", "-dc", NULL};
 static char *BZ2_READ[] = {"bzip2", "-dc", NULL};
 static char *ZIP_READ[] = {"gzip", "-dc", NULL};
 
 char **result = NULL;
 char *fileNameDecoded = cloneString(fileName);
 if (startsWith("http://" , fileName)
  || startsWith("https://", fileName)
  || startsWith("ftp://",   fileName))
     cgiDecode(fileName, fileNameDecoded, strlen(fileName));
 
 if      (endsWith(fileNameDecoded, ".gz"))
     result = GZ_READ;
 else if (endsWith(fileNameDecoded, ".Z"))
     result = Z_READ;
 else if (endsWith(fileNameDecoded, ".bz2"))
     result = BZ2_READ;
 else if (endsWith(fileNameDecoded, ".zip"))
     result = ZIP_READ;
 
 freeMem(fileNameDecoded);
 return result;
 
 }
 
 static void metaDataAdd(struct lineFile *lf, char *line)
 /* Print one metadata line to every metadata output file registered on lf.
  * In unique mode a line already present in lf->metaLines is dropped and a
  * new one is recorded there first.  Internal helper for lineFileNext. */
 {
 if (lf->isMetaUnique)
     {
     /* A comment seen before is not repeated. */
     if (hashLookup(lf->metaLines, line))
         return;
     hashAdd(lf->metaLines, line, NULL);
     }
 struct metaOutput *out;
 for (out = lf->metaOutput; out != NULL; out = out->next)
     {
     if (line == NULL || out->metaFile == NULL)
         continue;
     fprintf(out->metaFile,"%s\n", line);
     }
 }
 
 static void metaDataFree(struct lineFile *lf)
 /* Release the hash of remembered metadata lines, if unique mode made one. */
 {
 if (!lf->isMetaUnique)
     return;
 if (lf->metaLines)
     freeHash(&lf->metaLines);
 }
 
 void lineFileSetMetaDataOutput(struct lineFile *lf, FILE *f)
 /* Register f as a destination for metadata lines read from lf.  Does nothing
  * when lf is NULL.  Should be called before reading from the input file. */
 {
 if (lf == NULL)
     return;
 struct metaOutput *newOut;
 AllocVar(newOut);
 newOut->metaFile = f;
 newOut->next = NULL;
 slAddHead(&lf->metaOutput, newOut);
 }
 
 void lineFileSetUniqueMetaData(struct lineFile *lf)
 /* Turn on unique mode for metadata: create the hash that remembers the
  * metadata lines already written so that repeats can be suppressed. */
 {
 lf->metaLines = hashNew(8);
 lf->isMetaUnique = TRUE;
 }
 
 static char * headerBytes(char *fileName, int numbytes)
 /* Read the first numbytes bytes of fileName and return them as a
  * zero-terminated string that the caller should free.  Returns NULL if the
  * file can't be opened or a single read yields fewer than numbytes bytes. */
 {
 int fd = open(fileName, O_RDONLY);
 if (fd < 0)
     return NULL;
 char *header = needMem(numbytes+1);
 int got = read(fd, header, numbytes);
 if (got >= numbytes)
     header[numbytes] = 0;
 else
     freez(&header);  /* file too short? can't read numbytes */
 close(fd);
 return header;
 }
 
 
 struct lineFile *lineFileDecompress(char *fileName, bool zTerm)
 /* Open a line file on fileName, reading through a decompression pipeline
  * chosen by getDecompressor() from the file name.
  * Returns NULL if fileName is NULL, if the first four bytes of the file
  * can't be read, or if those bytes carry no recognized compression
  * signature; checking the signature first avoids an error from the
  * pipeline. */
 {
 struct pipeline *pl;
 struct lineFile *lf;
 char *testName = NULL;
 char *testbytes = NULL;    /* the header signatures for .gz, .bz2, .Z,
 			    * .zip are all 2-4 bytes only */
 if (fileName==NULL)
   return NULL;
 testbytes=headerBytes(fileName,4);
 if (!testbytes)
     return NULL;  /* avoid error from pipeline */
 testName=getFileNameFromHdrSig(testbytes);
 freez(&testbytes);
 if (!testName)
     return NULL;  /* avoid error from pipeline */
 /* testName only served to prove a signature is present.  It is a cloned
  * string, so release it here instead of leaking it. */
 freez(&testName);
 pl = pipelineOpen1(getDecompressor(fileName), pipelineRead|pipelineSigpipe, fileName, NULL);
 lf = lineFileAttach(fileName, zTerm, pipelineFd(pl));
 lf->pl = pl;
 return lf;
 }
 
 struct lineFile *lineFileDecompressFd(char *name, bool zTerm, int fd)
 /* Wrap a line file around the output of a decompression pipeline that reads
  * from the already-open file or socket descriptor fd.  The decompressor is
  * picked from the extension of name; the pipeline's stderr goes to
  * STDERR_FILENO. */
 {
 char **decompressCmd = getDecompressor(name);
 struct pipeline *decompPipe = pipelineOpenFd1(decompressCmd, pipelineRead|pipelineSigpipe, fd, STDERR_FILENO);
 struct lineFile *lf = lineFileAttach(name, zTerm, pipelineFd(decompPipe));
 lf->pl = decompPipe;
 return lf;
 }
 
 
 
 struct lineFile *lineFileDecompressMem(bool zTerm, char *mem, long size)
 /* Open a line file that decompresses the size bytes at mem.  The compression
  * format is detected from the signature at the start of mem; returns NULL
  * when no supported signature is found. */
 {
 struct pipeline *pl;
 struct lineFile *lf;
 char *fileName = getFileNameFromHdrSig(mem);
 if (fileName==NULL)
   return NULL;
 pl = pipelineOpenMem1(getDecompressor(fileName), pipelineRead|pipelineSigpipe, mem, size, STDERR_FILENO);
 lf = lineFileAttach(fileName, zTerm, pipelineFd(pl));
 lf->pl = pl;
 /* lineFileAttach stores its own clone of the name, and the placeholder name
  * returned by getFileNameFromHdrSig is itself a clone: free it here so it
  * does not leak. */
 freeMem(fileName);
 return lf;
 }
 
 
 
 struct lineFile *lineFileAttach(char *fileName, bool zTerm, int fd)
 /* Build a line file around the already-open descriptor fd.  The name is
  * cloned and a 64k read buffer (plus one spare byte) is allocated. */
 {
 struct lineFile *lf;
 AllocVar(lf);
 lf->fd = fd;
 lf->zTerm = zTerm;
 lf->fileName = cloneString(fileName);
 lf->bufSize = 64*1024;
 lf->buf = needMem(lf->bufSize+1);
 return lf;
 }
 
 struct lineFile *lineFileOnString(char *name, bool zTerm, char *s)
 /* Wrap a line file object around the string s in memory.  The string is
  * used in place as the line buffer rather than copied, and no file
  * descriptor is attached (fd is -1). */
 {
 struct lineFile *lf;
 AllocVar(lf);
 lf->fd = -1;
 lf->zTerm = zTerm;
 lf->fileName = cloneString(name);
 lf->buf = s;
 lf->bytesInBuf = strlen(s);
 lf->bufSize = lf->bytesInBuf;
 return lf;
 }
 
 
 struct lineFile *lineFileTabixMayOpen(char *fileOrUrl, bool zTerm)
 /* Open a tabix-compressed and indexed data file as a line file, using the
  * default index location <fileOrUrl>.tbi.  This is
  * lineFileTabixAndIndexMayOpen with no explicit index name; on a problem
  * that function warns and returns NULL. */
 {
 char *defaultIndex = NULL;
 return lineFileTabixAndIndexMayOpen(fileOrUrl, defaultIndex, zTerm);
 }
 
 
 struct lineFile *lineFileTabixAndIndexMayOpen(char *fileOrUrl, char *tbiFileOrUrl, bool zTerm)
 /* Wrap a line file around a data file that has been compressed and indexed
  * by the tabix command line program. tbiFileOrUrl can be NULL, it defaults to <fileOrUrl>.tbi.
  * It must be readable in addition to fileOrUrl. If there's a problem, warn & return NULL.
  * Aborts if fileOrUrl is NULL.
  * The returned line file starts with an iterator over the whole file; use
  * lineFileSetTabixRegion to restrict it to a region. */
 {
 if (fileOrUrl == NULL)
     errAbort("lineFileTabixMayOpen: fileOrUrl is NULL");
 
 char tbiName[4096];
 if (tbiFileOrUrl==NULL)
     safef(tbiName, sizeof(tbiName), "%s.tbi", fileOrUrl);
 else
     safef(tbiName, sizeof(tbiName), "%s", tbiFileOrUrl);
 
 htsFile *htsFile = hts_open(fileOrUrl, "r");
 if (htsFile == NULL)
     {
     warn("Unable to open \"%s\"", fileOrUrl);
     return NULL;
     }
 tbx_t *tabix = tbx_index_load2(fileOrUrl, tbiName);
 if (tabix == NULL)
     {
     warn("Unable to load tabix index from \"%s\"", tbiName);
     /* The data file is already open at this point; close it so that a
      * missing or unreadable index does not leak the handle. */
     hts_close(htsFile);
     return NULL;
     }
 struct lineFile *lf = needMem(sizeof(struct lineFile));
 lf->fileName = cloneString(fileOrUrl);
 lf->fd = -1;
 lf->bufSize = 64 * 1024;
 lf->buf = needMem(lf->bufSize);
 lf->zTerm = zTerm;
 lf->tabix = tabix;
 lf->htsFile = htsFile;
 /* kline is the string that tbx_itr_next fills in lineFileNext. */
 kstring_t *kline;
 AllocVar(kline);
 kline->s = malloc(8192);
 lf->kline = kline;
 lf->tabixIter = tbx_itr_queryi(tabix, HTS_IDX_REST, 0, 0);
 return lf;
 }
 
 boolean lineFileSetTabixRegion(struct lineFile *lf, char *seqName, int start, int end)
 /* For a line file opened with lineFileTabixMayOpen: position the tabix
  * iterator on seqName:start-end and return TRUE.  Returns FALSE when seqName
  * is NULL, when the sequence can't be found in the index, or when no iterator
  * can be made for the region.  Aborts if lf has no tabix index. */
 {
 if (lf->tabix == NULL)
     errAbort("lineFileSetTabixRegion: lf->tabix is NULL.  Did you open lf with lineFileTabixMayOpen?");
 if (seqName == NULL)
     return FALSE;
 int seqId = ti_get_tid(lf->tabix, seqName);
 /* Some files use chr-less Ensembl chromosome names; retry without "chr". */
 if (seqId < 0 && startsWith("chr", seqName))
     seqId = ti_get_tid(lf->tabix, seqName+strlen("chr"));
 /* SARS-CoV-2 VCF may use the GenBank or RefSeq ID rather than our chromified RefSeq ID. */
 if (seqId < 0 && sameString(seqName, "NC_045512v2"))
     {
     seqId = ti_get_tid(lf->tabix, "MN908947.3");
     if (seqId < 0)
         seqId = ti_get_tid(lf->tabix, "NC_045512.2");
     }
 if (seqId < 0)
     return FALSE;
 ti_iter_t *regionIter = ti_queryi((tbx_t *)lf->tabix, seqId, start, end);
 if (regionIter == NULL)
     return FALSE;
 /* Swap in the new iterator and reset the buffer bookkeeping. */
 if (lf->tabixIter != NULL)
     ti_iter_destroy(lf->tabixIter);
 lf->tabixIter = regionIter;
 lf->bytesInBuf = 0;
 lf->lineIx = -1;
 lf->lineStart = 0;
 lf->lineEnd = 0;
 return TRUE;
 }
 
 struct lineFile *lineFileUdcMayOpen(char *fileOrUrl, bool zTerm)
 /* Create a line file object with an underlying UDC cache. NULL if not found.
  * Aborts if fileOrUrl is NULL.
  * A name that udcIsLocal() reports as local is handed to lineFileOpen, which
  * aborts rather than returning NULL when the open fails.
  * A non-local name for which getDecompressor() finds a compression extension
  * is not supported here: a warning is issued and NULL is returned.
  * Otherwise the returned line file reads through the udcFile; it has no file
  * descriptor (fd is -1) and no buffer is allocated here. */
 {
 if (fileOrUrl == NULL)
     errAbort("lineFileUdcMayOpen: fileOrUrl is NULL");
 
 if (udcIsLocal(fileOrUrl))
      return lineFileOpen(fileOrUrl, zTerm);
 else
     {
+    if (getDecompressor(fileOrUrl) != NULL)
+        {
+        warn("lineFileUdcMayOpen: can't open %s, support for compressed files not implemented. "
+             "[developer: use netLineFileMayOpen for compressed remote files.]",
+             fileOrUrl);
+        return NULL;
+        }
     struct udcFile *udcFile = udcFileMayOpen(fileOrUrl, NULL);
     if (udcFile == NULL)
 	return NULL;
     struct lineFile *lf;
     AllocVar(lf);
     lf->fileName = cloneString(fileOrUrl);
     lf->fd = -1;
     lf->bufSize = 0;
     lf->buf = NULL;
     lf->zTerm = zTerm;
     lf->udcFile = udcFile;
     return lf;
     }
 }
 
 
 void lineFileExpandBuf(struct lineFile *lf, int newSize)
 /* Grow the buffer of lf to newSize bytes, which must exceed the current
  * buffer size.  The bytesInBuf bytes in use are passed to needMoreMem as the
  * old size. */
 {
 assert(newSize > lf->bufSize);
 char *bigger = needMoreMem(lf->buf, lf->bytesInBuf, newSize);
 lf->buf = bigger;
 lf->bufSize = newSize;
 }
 
 
 struct lineFile *lineFileStdin(bool zTerm)
 /* Return a line file named "stdin" attached to the descriptor of stdin. */
 {
 int stdinFd = fileno(stdin);
 return lineFileAttach("stdin", zTerm, stdinFd);
 }
 
 struct lineFile *lineFileMayOpen(char *fileName, bool zTerm)
 /* Try to open fileName as a line file.  The name "stdin" selects standard
  * input, a name with a compression extension is opened through
  * lineFileDecompress, and anything else is opened read-only.  Returns NULL
  * if the plain open fails. */
 {
 if (sameString(fileName, "stdin"))
     return lineFileStdin(zTerm);
 if (getDecompressor(fileName) != NULL)
     return lineFileDecompress(fileName, zTerm);
 int fd = open(fileName, O_RDONLY);
 return (fd == -1) ? NULL : lineFileAttach(fileName, zTerm, fd);
 }
 
 struct lineFile *lineFileOpen(char *fileName, bool zTerm)
 /* Open fileName as a line file; abort with the errno message if
  * lineFileMayOpen returns NULL. */
 {
 struct lineFile *lf = lineFileMayOpen(fileName, zTerm);
 if (lf != NULL)
     return lf;
 errAbort("Couldn't open %s , %s", fileName, strerror(errno));
 return lf;
 }
 
 void lineFileReuse(struct lineFile *lf)
 /* Flag the current line for reuse, so that the next lineFileNext call
  * returns it again instead of reading a new one. */
 {
 lf->reuse = TRUE;
 }
 
 
 INLINE void noTabixSupport(struct lineFile *lf, char *where)
 /* Abort with a message naming 'where' if lf was opened through tabix. */
 {
 if (lf->tabix == NULL)
     return;
 lineFileAbort(lf, "%s: not implemented for lineFile opened with lineFileTabixMayOpen.", where);
 }
 
 void lineFileSeek(struct lineFile *lf, off_t offset, int whence)
 /* Arrange for the next line to be read from the given position.  Not
  * available for tabix or pipeline (compressed) line files, which abort.
  * For a udc-backed file only offset is handed to udcSeek; otherwise the
  * buffer is emptied and lseek is applied to the descriptor. */
 {
 noTabixSupport(lf, "lineFileSeek");
 if (lf->checkSupport)
     lf->checkSupport(lf, "lineFileSeek");
 if (lf->pl != NULL)
     errnoAbort("Can't lineFileSeek on a compressed file: %s", lf->fileName);
 lf->reuse = FALSE;
 if (lf->udcFile)
     udcSeek(lf->udcFile, offset);
 else
     {
     lf->bytesInBuf = 0;
     lf->lineEnd = 0;
     lf->lineStart = 0;
     lf->bufOffsetInFile = lseek(lf->fd, offset, whence);
     if (lf->bufOffsetInFile == -1)
         errnoAbort("Couldn't lineFileSeek %s", lf->fileName);
     }
 }
 
 void lineFileRewind(struct lineFile *lf)
 /* Seek back to the beginning of the file and reset the line counter. */
 {
 off_t fileStart = 0;
 lineFileSeek(lf, fileStart, SEEK_SET);
 lf->lineIx = 0;
 }
 
 int lineFileLongNetRead(int fd, char *buf, int size)
 /* Call read() on fd repeatedly until size bytes have been stored in buf or a
  * read returns zero or less.  Returns the number of bytes actually read. */
 {
 int totalRead = 0;
 int remaining = size;
 char *dest = buf;
 while (remaining > 0)
     {
     int got = read(fd, dest, remaining);
     if (got <= 0)
         break;
     dest += got;
     remaining -= got;
     totalRead += got;
     }
 return totalRead;
 }
 
 void lineFileCarefulNewlines(struct lineFile *lf)
 /* Switch lf to the mixed newline mode: a less efficient scan for the next
  * line end that copes with files mixing several newline conventions. */
 {
 lf->nlType = nlt_mixed;
 }
 
 static void determineNlType(struct lineFile *lf, char *buf, int bufSize)
 /* determine type of newline used for the file, assumes buffer not empty */
 {
 char *c = buf;
 if (bufSize==0) return;
 if (lf->nlType != nlt_undet) return;  /* if already determined just exit */
 while (c < buf+bufSize)
     {
     if (*c=='\r')
 	{
     	lf->nlType = nlt_mac;
 	if (++c < buf+bufSize)
     	    if (*c == '\n')
     		lf->nlType = nlt_dos;
 	return;
 	}
     if (*(c++) == '\n')
 	{
         lf->nlType = nlt_unix;
 	return;
 	}
     }
 }
 
 static boolean findNextNewline(struct lineFile *lf, char *buf, int bytesInBuf, int *pEndIx)
 /* Return TRUE if able to find next end of line in buf, starting at buf[*pEndIx], up to bytesInBuf.
  * When done set *pEndIx to the start of the next line if applicable, otherwise bytesInBuf.
  * Which byte ends a line depends on lf->nlType: '\n' for unix and dos, '\r'
  * for mac, and either one (with "\r\n" consumed as a pair) for mixed or
  * undetermined.  Only the mixed/undetermined case zero-terminates the line
  * here, and only when lf->zTerm is set. */
 {
 boolean gotLf = FALSE;
 int endIx = *pEndIx;
 switch (lf->nlType)
     {
     case nlt_unix:
     case nlt_dos:
         for (endIx = *pEndIx; endIx < bytesInBuf; ++endIx)
             {
             if (buf[endIx] == '\n')
                 {
                 gotLf = TRUE;
                 endIx += 1;
                 break;
                 }
             }
         break;
     case nlt_mac:
         for (endIx = *pEndIx; endIx < bytesInBuf; ++endIx)
             {
             if (buf[endIx] == '\r')
                 {
                 gotLf = TRUE;
                 endIx += 1;
                 break;
                 }
             }
         break;
     case nlt_mixed:
     case nlt_undet:
         for (endIx = *pEndIx; endIx < bytesInBuf; ++endIx)
             {
             char c = buf[endIx];
             if (c == '\r' || c == '\n')
                 {
                 gotLf = TRUE;
                 if (lf->zTerm)
                     buf[endIx] = '\0';
                 endIx += 1;
                 /* Only look for the '\n' of a "\r\n" pair inside the valid
                  * data.  When '\r' is the last byte, buf[endIx] is not part
                  * of the data, and matching it would move endIx past
                  * bytesInBuf. */
                 if (c == '\r' && endIx < bytesInBuf && buf[endIx] == '\n')
                     {
                     if (lf->zTerm)
                         buf[endIx] = '\0';
                     endIx += 1;
                     }
                 break;
                 }
             }
         break;
     }
 *pEndIx = endIx;
 return gotLf;
 }
 
 boolean lineFileNext(struct lineFile *lf, char **retStart, int *retSize)
 /* Fetch next line from file.
  * On success returns TRUE with *retStart pointing at the line inside a buffer
  * owned by lf and, if retSize is not NULL, the line size in *retSize.
  * Returns FALSE when there are no more lines.
  * Sources are tried in this order: a line flagged for reuse, the
  * nextCallBack hook, the udc file, the tabix iterator, and finally the
  * buffered descriptor (or in-memory string) reader.
  * Lines starting with '#' are passed to metaDataAdd. */
 {
 int newStart;
 
 if (lf->reuse)
     {
     lf->reuse = FALSE;
     if (retSize != NULL)
 	*retSize = lf->lineEnd - lf->lineStart;
     *retStart = lf->buf + lf->lineStart;
     if (lf->metaOutput && *retStart[0] == '#')
         metaDataAdd(lf, *retStart);
     return TRUE;
     }
 
 if (lf->nextCallBack)
     return lf->nextCallBack(lf, retStart, retSize);
 
 if (lf->udcFile)
     {
     lf->bufOffsetInFile = udcTell(lf->udcFile);
     char *line = udcReadLine(lf->udcFile);
     if (line==NULL)
         return FALSE;
     int lineSize = strlen(line);
     lf->bytesInBuf = lineSize;
     ++lf->lineIx;
     lf->lineStart = 0;
     lf->lineEnd = lineSize;
     *retStart = line;
     /* The line just read replaces the previous one as the buffer. */
     freeMem(lf->buf);
     lf->buf = line;
     lf->bufSize = lineSize;
     if (retSize != NULL)
 	*retSize = lineSize;
     return TRUE;
     }
 
 if (lf->tabix != NULL && lf->tabixIter != NULL)
     {
     // Just use line-oriented ti_read:
     int lineSize = 0;
     lineSize = tbx_itr_next(lf->htsFile, lf->tabix, lf->tabixIter, lf->kline);
     // -1 marks the end of the iterator and values below -1 report a read
     // error; neither yields a line, so both end the iteration here.
     if (lineSize < 0)
 	return FALSE;
     // Grow the buffer BEFORE updating bytesInBuf: lineFileExpandBuf hands
     // bytesInBuf to needMoreMem as the old size, so it must not exceed the
     // old buffer.  Use >= so there is room for the terminating zero that
     // safecpy needs.
     if (lineSize >= lf->bufSize)
 	// shouldn't be!  but just in case:
 	lineFileExpandBuf(lf, (lineSize + 1) * 2);
     lf->bufOffsetInFile = -1;
     lf->bytesInBuf = lineSize;
     lf->lineIx = -1;
     lf->lineStart = 0;
     lf->lineEnd = lineSize;
     kstring_t *kline = lf->kline;
     safecpy(lf->buf, lf->bufSize, kline->s);
     *retStart = lf->buf;
     if (retSize != NULL)
 	*retSize = lineSize;
     return TRUE;
     }
 
 char *buf = lf->buf;
 int endIx = lf->lineEnd;
 int bytesInBuf = lf->bytesInBuf;
 determineNlType(lf, buf+endIx, bytesInBuf-endIx);
 boolean gotLf = findNextNewline(lf, buf, bytesInBuf, &endIx);
 
 /* If not in buffer read in a new buffer's worth. */
 while (!gotLf)
     {
     int oldEnd = lf->lineEnd;
     int sizeLeft = bytesInBuf - oldEnd;
     int bufSize = lf->bufSize;
     int readSize = bufSize - sizeLeft;
 
     /* Slide the unfinished line to the front of the buffer. */
     if (oldEnd > 0 && sizeLeft > 0)
 	{
 	memmove(buf, buf+oldEnd, sizeLeft);
 	}
     lf->bufOffsetInFile += oldEnd;
     if (lf->fd >= 0)
 	readSize = lineFileLongNetRead(lf->fd, buf+sizeLeft, readSize);
     else if (lf->tabix != NULL && readSize > 0)
 	{
         errAbort("bgzf read not supported with htslib (yet)");
 	if (readSize < 1)
 	    return FALSE;
 	}
     else
         readSize = 0;
 
     if ((readSize == 0) && (endIx > oldEnd))
 	{
 	/* Nothing more to read: return the last, unterminated line. */
 	endIx = sizeLeft;
 	buf[endIx] = 0;
 	lf->bytesInBuf = newStart = lf->lineStart = 0;
 	lf->lineEnd = endIx;
 	++lf->lineIx;
 	if (retSize != NULL)
 	    *retSize = endIx - newStart;
 	*retStart = buf + newStart;
         if (*retStart[0] == '#')
             metaDataAdd(lf, *retStart);
 	return TRUE;
 	}
     else if (readSize <= 0)
 	{
 	lf->bytesInBuf = lf->lineStart = lf->lineEnd = 0;
 	return FALSE;
 	}
     else
         endIx = sizeLeft;
 
     bytesInBuf = lf->bytesInBuf = readSize + sizeLeft;
     lf->lineEnd = 0;
 
     determineNlType(lf, buf+endIx, bytesInBuf-endIx);
     gotLf = findNextNewline(lf, buf, bytesInBuf, &endIx);
 
     /* A full buffer without a line end: double the buffer, up to a limit. */
     if (!gotLf && bytesInBuf == lf->bufSize)
         {
 	if (bufSize >= 512*1024*1024)
 	    {
 	    errAbort("Line too long (more than %d chars) line %d of %s",
 		lf->bufSize, lf->lineIx+1, lf->fileName);
 	    }
 	else
 	    {
 	    lineFileExpandBuf(lf, bufSize*2);
 	    buf = lf->buf;
 	    }
 	}
     }
 
 if (lf->zTerm)
     {
     buf[endIx-1] = 0;
     if ((lf->nlType == nlt_dos) && (buf[endIx-2]=='\r'))
 	{
 	buf[endIx-2] = 0;
 	}
     }
 
 lf->lineStart = newStart = lf->lineEnd;
 lf->lineEnd = endIx;
 ++lf->lineIx;
 if (retSize != NULL)
     *retSize = endIx - newStart;
 *retStart = buf + newStart;
 if (*retStart[0] == '#')
     metaDataAdd(lf, *retStart);
 return TRUE;
 }
 
 void lineFileVaAbort(struct lineFile *lf, char *format, va_list args)
 /* Print file name, line number, and error message, and abort. */
 {
 struct dyString *dy = dyStringNew(0);
 dyStringPrintf(dy,  "Error line %d of %s: ", lf->lineIx, lf->fileName);
 dyStringVaPrintf(dy, format, args);
 errAbort("%s", dy->string);
 dyStringFree(&dy);
 }
 
 void lineFileAbort(struct lineFile *lf, char *format, ...)
 /* Variadic front end to lineFileVaAbort: report file name, line number and
  * the formatted message, then abort. */
 {
 va_list formatArgs;
 va_start(formatArgs, format);
 lineFileVaAbort(lf, format, formatArgs);
 va_end(formatArgs);
 }
 
 void lineFileUnexpectedEnd(struct lineFile *lf)
 /* Abort, reporting that the file behind lf ended before it should have. */
 {
 char *name = lf->fileName;
 errAbort("Unexpected end of file in %s", name);
 }
 
 void lineFileNeedNext(struct lineFile *lf, char **retStart, int *retSize)
 /* Fetch next line from file.  Squawk and die if it's not there. */
 {
 if (!lineFileNext(lf, retStart, retSize))
     lineFileUnexpectedEnd(lf);
 }
 
 void lineFileClose(struct lineFile **pLf)
 /* Close up a line file and set *pLf to NULL.  Does nothing if *pLf is
  * already NULL.
  * Releases the resource that backs the line file (pipeline, descriptor,
  * tabix handles or udc file) along with the line buffer that goes with it,
  * then runs the closeCallBack hook if any and frees the remaining storage.
  * A line file that has none of these backings (for example one made by
  * lineFileOnString) keeps its buffer untouched. */
 {
 struct lineFile *lf;
 if ((lf = *pLf) != NULL)
     {
     struct pipeline *pl = lf->pl;
     if (pl != NULL)
         {
         pipelineClose(&lf->pl);
         /* Pipeline line files get their buffer from lineFileAttach. */
         freeMem(lf->buf);
         }
     else if (lf->fd > 0 && lf->fd != fileno(stdin))
 	{
 	close(lf->fd);
 	freeMem(lf->buf);
 	}
     else if (lf->tabix != NULL)
 	{
 	if (lf->tabixIter != NULL)
 	    ti_iter_destroy(lf->tabixIter);
 	ti_close(lf->tabix);
         hts_close(lf->htsFile);
         /* Free the string storage, then the kstring_t holder itself and the
          * line buffer allocated when the tabix file was opened. */
         kstring_t *kline = lf->kline;
         free(kline->s);
         freeMem(kline);
         freeMem(lf->buf);
 	}
     else if (lf->udcFile != NULL)
         {
         udcFileClose(&lf->udcFile);
         /* buf holds the last line handed out by udcReadLine (or NULL). */
         freeMem(lf->buf);
         }
 
     if (lf->closeCallBack)
         lf->closeCallBack(lf);
     freeMem(lf->fileName);
     metaDataFree(lf);
     /* Buffers created on demand by lineFileNextFull. */
     dyStringFree(&lf->fullLine);
     dyStringFree(&lf->rawLines);
     freez(pLf);
     }
 }
 
 void lineFileCloseList(struct lineFile **pList)
 /* Close every line file on the list and leave *pList NULL. */
 {
 struct lineFile *cur = *pList;
 while (cur != NULL)
     {
     /* Grab the successor first: closing frees the element. */
     struct lineFile *following = cur->next;
     lineFileClose(&cur);
     cur = following;
     }
 *pList = NULL;
 }
 
 void lineFileExpectWordsMesg(struct lineFile *lf, int expecting, int got, char* extraMessage)
 /* Abort unless got equals expecting.  The error message names the line and
  * file and ends with extraMessage. */
 {
 if (expecting == got)
     return;
 errAbort("Expecting %d words line %d of %s got %d. %s",
          expecting, lf->lineIx, lf->fileName, got, extraMessage);
 }
 
 void lineFileExpectWords(struct lineFile *lf, int expecting, int got)
 /* Abort unless got equals expecting; same as lineFileExpectWordsMesg with an
  * empty extra message. */
 {
 char *noExtra = "";
 lineFileExpectWordsMesg(lf, expecting, got, noExtra);
 }
 
 void lineFileExpectAtLeast(struct lineFile *lf, int expecting, int got)
 /* Abort if the line has fewer than the expected number of words. */
 {
 if (got >= expecting)
     return;
 errAbort("Expecting at least %d words line %d of %s got %d",
          expecting, lf->lineIx, lf->fileName, got);
 }
 
 void lineFileShort(struct lineFile *lf)
 /* Abort, reporting that the current line of lf is too short. */
 {
 int lineNumber = lf->lineIx;
 errAbort("Short line %d of %s", lineNumber, lf->fileName);
 }
 
 void lineFileReuseFull(struct lineFile *lf)
 // Flag the last full (joined) line for reuse by the next lineFileNextFull
 // call.  Unlike lineFileReuse this needs a previous lineFileNextFull call,
 // which is what creates lf->fullLine.
 {
 assert(lf->fullLine != NULL);
 lf->fullLineReuse = TRUE;
 }
 
 
 boolean lineFileNextFull(struct lineFile *lf, char **retFull, int *retFullSize,
                         char **retRaw, int *retRawSize)
 // Fetch next line from file joining up any that are continued by ending '\'
 // If requested, and was joined, the unjoined raw lines are also returned
 // NOTE: comment lines can't be continued!  ("# comment \ \n more comment" is 2 lines.)
 // retFull receives the joined line and, if retFullSize is not NULL, its length.
 // retRaw and retRawSize may be NULL; when retRaw is given it receives the raw
 // lines separated by newlines, or NULL if the line read was not continued.
 // The returned strings live in lf->fullLine and lf->rawLines, which are
 // cleared at the start of the next call.
 // Returns FALSE when lineFileNext has no more lines; a continued line cut
 // short by the end of the file is not returned.
 {
 // May have requested reusing the last full line.
 if (lf->fullLineReuse)
     {
     lf->fullLineReuse = FALSE;
     assert(lf->fullLine != NULL);
     *retFull = dyStringContents(lf->fullLine);
     if (retFullSize)
         *retFullSize = dyStringLen(lf->fullLine);
     if (retRaw != NULL)
         {
         assert(lf->rawLines != NULL);
         *retRaw = dyStringContents(lf->rawLines);
         if (retRawSize)
             *retRawSize = dyStringLen(lf->rawLines);
         }
     return TRUE;
     }
 
 // Empty pointers
 *retFull = NULL;
 if (retRaw != NULL)
     *retRaw = NULL;
 
 // Prepare lf buffers
 if (lf->fullLine == NULL)
     {
     lf->fullLine = dyStringNew(1024);
     lf->rawLines = dyStringNew(1024); // Better to always create it than test every time
     }
 else
     {
     dyStringClear(lf->fullLine);
     dyStringClear(lf->rawLines);
     }
 
 char *line;
 while (lineFileNext(lf, &line, NULL))
     {
     char *start = skipLeadingSpaces(line);
 
     // Will the next line continue this one?
     // After this scan, 'end' points at the continuation '\' if there is one,
     // otherwise at the terminating zero of the line.
     char *end = start;
     if (*start == '#')  // Comment lines can't be continued!
         end = start + strlen(start);
     else
         {
         while (*end != '\0')  // walking forward for efficiency (avoid strlens())
             {
             for (;*end != '\0' && *end != '\\'; end++) ; // Tight loop to find '\'
             if (*end == '\0')
                 break;
 
             // This could be a continuation
             char *slash = end;
             if (*(++end) == '\\')  // escaped
                 continue;
             end = skipLeadingSpaces(end);
 
             if (*end == '\0') // Just whitespace after '\', so true continuation mark
                 {
                 if (retRaw != NULL) // Only if actually requested.
                     {
                     dyStringAppendN(lf->rawLines,line,(end - line));
                     dyStringAppendC(lf->rawLines,'\n'); // New lines delimit raw lines.
                     }
                 end = slash; // Don't need to zero, because of appending by length
                 break;
                 }
             }
         }
 
     // Stitch together full lines
     if (dyStringLen(lf->fullLine) == 0)
         dyStringAppendN(lf->fullLine,line,(end - line)); // includes first line's whitespace
     else if (start < end)             // don't include continued line's leading spaces
         dyStringAppendN(lf->fullLine,start,(end - start));
 
     // 'end' still on a '\' means a continuation was found: read the next line too.
     if (*end == '\\')
         continue;
 
     // Got a full line now!
     *retFull = dyStringContents(lf->fullLine);
     if (retFullSize)
         *retFullSize = dyStringLen(lf->fullLine);
 
     if (retRaw != NULL && dyStringLen(lf->rawLines) > 0) // Only if actually requested & continued
         {
         // This is the final line which doesn't have a continuation char
         dyStringAppendN(lf->rawLines,line,(end - line));
         *retRaw = dyStringContents(lf->rawLines);
         if (retRawSize)
             *retRawSize = dyStringLen(lf->rawLines);
         }
     return TRUE;
     }
 return FALSE;
 }
 
 boolean lineFileNextReal(struct lineFile *lf, char **retStart)
 /* Advance to the next line whose first non-space character exists and is
  * not '#'.  Returns TRUE with the line in *retStart, or FALSE when the file
  * runs out. */
 {
 while (lineFileNext(lf, retStart, NULL))
     {
     char *text = skipLeadingSpaces(*retStart);
     char first = text[0];
     if (first == 0 || first == '#')
         continue;
     return TRUE;
     }
 return FALSE;
 }
 
 boolean lineFileNextFullReal(struct lineFile *lf, char **retStart)
 // Advance to the next joined line (continuation lines ending in '\' are
 // merged by lineFileNextFull) that is neither blank nor a '#' comment.
 // Returns FALSE when the file runs out.
 {
 while (lineFileNextFull(lf, retStart, NULL, NULL, NULL))
     {
     char *text = skipLeadingSpaces(*retStart);
     if (text[0] == '\0' || text[0] == '#')
         continue;
     return TRUE;
     }
 return FALSE;
 }
 
 
 int lineFileChopNext(struct lineFile *lf, char *words[], int maxWords)
 /* Read lines until one that does not start with '#' yields at least one
  * white-space separated word; store the words and return their count.
  * Returns 0 at end of file. */
 {
 char *line;
 int lineSize;
 while (lineFileNext(lf, &line, &lineSize))
     {
     if (line[0] != '#')
         {
         int wordCount = chopByWhite(line, words, maxWords);
         if (wordCount != 0)
             return wordCount;
         }
     }
 return 0;
 }
 
 int lineFileChopCharNext(struct lineFile *lf, char sep, char *words[], int maxWords)
 /* Read lines until one that does not start with '#' yields at least one
  * word when chopped at sep; store the words and return their count.
  * Returns 0 at end of file. */
 {
 char *line;
 int lineSize;
 while (lineFileNext(lf, &line, &lineSize))
     {
     if (line[0] != '#')
         {
         int wordCount = chopByChar(line, sep, words, maxWords);
         if (wordCount != 0)
             return wordCount;
         }
     }
 return 0;
 }
 
 int lineFileChopNextTab(struct lineFile *lf, char *words[], int maxWords)
 /* Return the next line that doesn't start with '#' and yields at least one
  * word, chopped into words at tabs.  Returns 0 at end of file.  This is
  * lineFileChopCharNext with a tab as the separator. */
 {
 return lineFileChopCharNext(lf, '\t', words, maxWords);
 }
 
 boolean lineFileNextCharRow(struct lineFile *lf, char sep, char *words[], int wordCount)
 /* Fetch the next data line (see lineFileChopCharNext) chopped at sep into
  * words.  Returns FALSE at EOF.  A line with fewer than wordCount words is
  * reported through lineFileExpectWords, which aborts. */
 {
 int wordsRead = lineFileChopCharNext(lf, sep, words, wordCount);
 if (wordsRead == 0)
     return FALSE;
 if (wordsRead < wordCount)
     lineFileExpectWords(lf, wordCount, wordsRead);
 return TRUE;
 }
 
 boolean lineFileNextRow(struct lineFile *lf, char *words[], int wordCount)
 /* Fetch the next data line (see lineFileChopNext) chopped at white space
  * into words.  Returns FALSE at EOF.  A line with fewer than wordCount words
  * is reported through lineFileExpectWords, which aborts. */
 {
 int wordsRead = lineFileChopNext(lf, words, wordCount);
 if (wordsRead == 0)
     return FALSE;
 if (wordsRead < wordCount)
     lineFileExpectWords(lf, wordCount, wordsRead);
 return TRUE;
 }
 
 boolean lineFileNextRowTab(struct lineFile *lf, char *words[], int wordCount)
 /* Fetch the next data line (see lineFileChopNextTab) chopped at tabs into
  * words.  Returns FALSE at EOF.  A line with fewer than wordCount words is
  * reported through lineFileExpectWords, which aborts. */
 {
 int wordsRead = lineFileChopNextTab(lf, words, wordCount);
 if (wordsRead == 0)
     return FALSE;
 if (wordsRead < wordCount)
     lineFileExpectWords(lf, wordCount, wordsRead);
 return TRUE;
 }
 
 int lineFileNeedFullNum(struct lineFile *lf, char *words[], int wordIx)
 /* Make sure that words[wordIx] is an ascii integer, and return
  * binary representation of it. Require all chars in word to be digits
  * (or '-').  Aborts, naming the field, line and file, on any other
  * character; the conversion itself is done by lineFileNeedNum. */
 {
 char *c;
 for (c = words[wordIx]; *c; c++)
     {
     /* Cast before calling isdigit: passing a negative char value (any byte
      * above 0x7f where char is signed) is undefined behavior. */
     if (*c == '-' || isdigit((unsigned char)*c))
         /* NOTE: embedded '-' will be caught by lineFileNeedNum */
         continue;
     errAbort("Expecting integer field %d line %d of %s, got %s",
             wordIx+1, lf->lineIx, lf->fileName, words[wordIx]);
     }
 return lineFileNeedNum(lf, words, wordIx);
 }
 
 int lineFileNeedNum(struct lineFile *lf, char *words[], int wordIx)
 /* Make sure that words[wordIx] is an ascii integer, and return
  * binary representation of it. Conversion stops at first non-digit char.
  * Only the first character is validated ('-' or a digit); aborts, naming the
  * field, line and file, when it is anything else. */
 {
 char *ascii = words[wordIx];
 char c = ascii[0];
 /* Cast before calling isdigit: passing a negative char value (any byte
  * above 0x7f where char is signed) is undefined behavior. */
 if (c != '-' && !isdigit((unsigned char)c))
     errAbort("Expecting number field %d line %d of %s, got %s",
     	wordIx+1, lf->lineIx, lf->fileName, ascii);
 return atoi(ascii);
 }
 
 int lineFileCheckAllIntsNoAbort(char *s, void *val, 
     boolean isSigned, int byteCount, char *typeString, boolean noNeg, 
     char *errMsg, int errMsgSize)
 /* Convert string to (signed) integer of the size specified.  
  * Unlike atol assumes all of string is number, no trailing trash allowed.
  * Returns 0 if conversion possible, and value is returned in 'val'
  * Otherwise 1 for empty string or trailing chars, and 2 for numeric overflow,
  * and 3 for (-) sign in unsigned number.
  * Error messages if any are written into the provided buffer.
  * Pass NULL val if you only want validation.
  * Use noNeg if negative values are not allowed despite the type being signed,
  * returns 4.
  * byteCount must be 1, 2, 4 or 8; any other value aborts. */
 {
 unsigned long long res = 0, oldRes = 0;
 boolean isMinus = FALSE;
 
 if ((byteCount != 1) 
  && (byteCount != 2)
  && (byteCount != 4)
  && (byteCount != 8))
     errAbort("Unexpected error: Invalid byte count for integer size in lineFileCheckAllIntsNoAbort, expected 1 2 4 or 8, got %d.", byteCount);
 
 /* Largest magnitude that fits the requested width. */
 unsigned long long limit = 0xFFFFFFFFFFFFFFFFULL >> (8*(8-byteCount));
 
 if (isSigned) 
     limit >>= 1;
 
 /* Largest value that can still be multiplied by 10 without wrapping the
  * 64-bit accumulator. */
 const unsigned long long maxBeforeTimes10 = 0xFFFFFFFFFFFFFFFFULL / 10;
 
 char *p, *p0 = s;
 
 if (*p0 == '-')
     {
     if (isSigned)
         {
         if (noNeg)
             {
             safef(errMsg, errMsgSize, "Negative value not allowed");
             return 4; 
             }
         p0++;
         ++limit;  /* the negative range reaches one further than the positive */
         isMinus = TRUE;
         }
     else
         {
         safef(errMsg, errMsgSize, "Unsigned %s may not begin with minus sign (-)", typeString);
         return 3; 
         }
     }
 p = p0;
 while ((*p >= '0') && (*p <= '9'))
     {
     /* Test before multiplying.  Comparing the product with the old value
      * after the fact misses wraparounds whose wrapped result happens to be
      * larger than the old value (for example 3000000000000000000 * 10). */
     if (res > maxBeforeTimes10)
         {
         safef(errMsg, errMsgSize, "%s%s overflowed", isSigned ? "signed ":"", typeString);
         return 2; 
         }
     res *= 10;
     oldRes = res;
     res += *p - '0';
     /* Adding a single digit can wrap at most once, so this test is exact. */
     if (res < oldRes)
         {
         safef(errMsg, errMsgSize, "%s%s overflowed", isSigned ? "signed ":"", typeString);
         return 2; 
         }
     if (res > limit)
         {
         safef(errMsg, errMsgSize, "%s%s overflowed, limit=%s%llu", isSigned ? "signed ":"", typeString, isMinus ? "-" : "", limit);
         return 2; 
         }
     p++;
     }
 /* test for invalid character, empty, or just a minus */
 if (*p != '\0')
     {
     safef(errMsg, errMsgSize, "Trailing characters parsing %s%s", isSigned ? "signed ":"", typeString);
     return 1;
     }
 if (p == p0)
     {
     safef(errMsg, errMsgSize, "Empty string parsing %s%s", isSigned ? "signed ":"", typeString);
     return 1;
     }
 
 if (!val)
     return 0;  // only validation required
 
 /* Store through a pointer of the width and signedness requested. */
 switch (byteCount)
     {
     case 1:
         if (isSigned)
             {
             if (isMinus)
                 *(char *)val = -res;
             else
                 *(char *)val = res;
             }
         else
             *(unsigned char *)val = res;
         break;
     case 2:
         if (isSigned)
             {
             if (isMinus)
                 *(short *)val = -res;
             else
                 *(short *)val = res;
             }
         else
             *(unsigned short *)val = res;
         break;
     case 4:
         if (isSigned)
             {
             if (isMinus)
                 *(int *)val = -res;
             else
                 *(int *)val = res;
             }
         else
             *(unsigned *)val = res;
         break;
     case 8:
         if (isSigned)
             {
             if (isMinus)
                 *(long long *)val = -res;
             else
                 *(long long *) val =res;
             }
         else
             *(unsigned long long *)val = res;
         break;
     }
 
 
 return 0;
 }
 
 void lineFileAllInts(struct lineFile *lf, char *words[], int wordIx, void *val,
   boolean isSigned,  int byteCount, char *typeString, boolean noNeg)
 /* Returns long long integer from converting the input string. Aborts on error. */
 {
 char *s = words[wordIx];
 char errMsg[256];
 int res = lineFileCheckAllIntsNoAbort(s, val, isSigned, byteCount, typeString, noNeg, errMsg, sizeof errMsg);
 if (res > 0)
     {
     errAbort("%s in field %d line %d of %s, got %s",
 	errMsg, wordIx+1, lf->lineIx, lf->fileName, s);
     }
 }
 
 int lineFileAllIntsArray(struct lineFile *lf, char *words[], int wordIx, void *array, int arraySize,
   boolean isSigned,  int byteCount, char *typeString, boolean noNeg)
 /* Convert comma separated list of numbers to an array.  Pass in
  * array and max size of array. Aborts on error. Returns number of elements in parsed array. */
 {
 char *s = words[wordIx];
 char errMsg[256];
 unsigned count = 0;
 char *cArray = array;
 for (;;)
     {
     char *e;
     if (s == NULL || s[0] == 0 || count == arraySize)
         break;
     e = strchr(s, ',');
     if (e)
         *e = 0;
     int res = lineFileCheckAllIntsNoAbort(s, cArray, isSigned, byteCount, typeString, noNeg, errMsg, sizeof errMsg);
     if (res > 0)
 	{
 	errAbort("%s in column %d of array field %d line %d of %s, got %s",
 	    errMsg, count, wordIx+1, lf->lineIx, lf->fileName, s);
 	}
     if (cArray) // NULL means validation only.
 	cArray += byteCount;  
     count++;
     if (e)  // restore input string
         *e++ = ',';
     s = e;
     }
 return count;
 }
 
 
 double lineFileNeedDouble(struct lineFile *lf, char *words[], int wordIx)
 /* Make sure that words[wordIx] is an ascii double value, and return
  * binary representation of it. */
 {
 char *valEnd;
 char *val = words[wordIx];
 double doubleValue;
 
 doubleValue = strtod(val, &valEnd);
 if ((*val == '\0') || (*valEnd != '\0'))
     errAbort("Expecting double field %d line %d of %s, got %s",
     	wordIx+1, lf->lineIx, lf->fileName, val);
 return doubleValue;
 }
 
 void lineFileSkip(struct lineFile *lf, int lineCount)
 /* Skip a number of lines. */
 {
 int i, lineSize;
 char *line;
 
 for (i=0; i<lineCount; ++i)
     {
     if (!lineFileNext(lf, &line, &lineSize))
         errAbort("Premature end of file in %s", lf->fileName);
     }
 }
 
 char *lineFileSkipToLineStartingWith(struct lineFile *lf, char *start, int maxCount)
 /* Skip to next line that starts with given string.  Return NULL
  * if no such line found, otherwise return the line. */
 {
 char *line;
 while (lineFileNext(lf, &line, NULL) && --maxCount >= 0)
     {
     if (startsWith(start, line))
         return line;
     }
 return NULL;
 }
 
 char *lineFileReadAll(struct lineFile *lf)
 /* Read remainder of lineFile and return it as a string. */
 {
 struct dyString *dy = dyStringNew(1024*4);
 lf->zTerm = 0;
 int size;
 char *line;
 while (lineFileNext(lf, &line, &size))
     dyStringAppendN(dy, line, size);
 return dyStringCannibalize(&dy);
 }
 
 boolean lineFileParseHttpHeader(struct lineFile *lf, char **hdr,
 				boolean *chunked, int *contentLength)
 /* Extract HTTP response header from lf into hdr, tell if it's
  * "Transfer-Encoding: chunked" or if it has a contentLength. */
 {
   struct dyString *header = newDyString(1024);
   char *line;
   int lineSize;
 
   if (chunked != NULL)
     *chunked = FALSE;
   if (contentLength != NULL)
     *contentLength = -1;
   dyStringClear(header);
   if (lineFileNext(lf, &line, &lineSize))
     {
       if (startsWith("HTTP/", line))
 	{
 	char *version, *code;
 	dyStringAppendN(header, line, lineSize-1);
 	dyStringAppendC(header, '\n');
 	version = nextWord(&line);
 	code = nextWord(&line);
 	if (code == NULL)
 	    {
 	    warn("%s: Expecting HTTP/<version> <code> header line, got this: %s\n", lf->fileName, header->string);
 	    *hdr = cloneString(header->string);
 	    dyStringFree(&header);
 	    return FALSE;
 	    }
 	if (!sameString(code, "200"))
 	    {
 	    warn("%s: Errored HTTP response header: %s %s %s\n", lf->fileName, version, code, line);
 	    *hdr = cloneString(header->string);
 	    dyStringFree(&header);
 	    return FALSE;
 	    }
 	while (lineFileNext(lf, &line, &lineSize))
 	    {
 	    /* blank line means end of HTTP header */
 	    if ((line[0] == '\r' && line[1] == 0) || line[0] == 0)
 	        break;
 	    if (strstr(line, "Transfer-Encoding: chunked") && chunked != NULL)
 	        *chunked = TRUE;
 	    dyStringAppendN(header, line, lineSize-1);
 	    dyStringAppendC(header, '\n');
 	    if (strstr(line, "Content-Length:"))
 	      {
 		code = nextWord(&line);
 		code = nextWord(&line);
 		if (contentLength != NULL)
 		    *contentLength = atoi(code);
 	      }
 	    }
 	}
       else
 	{
 	  /* put the line back, don't put it in header/hdr */
 	  lineFileReuse(lf);
 	  warn("%s: Expecting HTTP/<version> <code> header line, got this: %s\n", lf->fileName, header->string);
 	  *hdr = cloneString(header->string);
 	  dyStringFree(&header);
 	  return FALSE;
 	}
     }
   else
     {
       *hdr = cloneString(header->string);
       dyStringFree(&header);
       return FALSE;
     }
 
   *hdr = cloneString(header->string);
   dyStringFree(&header);
   return TRUE;
 } /* lineFileParseHttpHeader */
 
 struct dyString *lineFileSlurpHttpBody(struct lineFile *lf,
 				       boolean chunked, int contentLength)
 /* Return a dyString that contains the http response body in lf.  Handle
  * chunk-encoding and content-length. */
 {
   struct dyString *body = newDyString(64*1024);
   char *line;
   int lineSize;
 
   dyStringClear(body);
   if (chunked)
     {
       /* Handle "Transfer-Encoding: chunked" body */
       /* Procedure from RFC2068 section 19.4.6 */
       char *csword;
       unsigned chunkSize = 0;
       unsigned size;
       do
 	{
 	  /* Read line that has chunk size (in hex) as first word. */
 	  if (lineFileNext(lf, &line, NULL))
 	    csword = nextWord(&line);
 	  else break;
 	  if (sscanf(csword, "%x", &chunkSize) < 1)
 	    {
 	      warn("%s: chunked transfer-encoding chunk size parse error.\n",
 		   lf->fileName);
 	      break;
 	    }
 	  /* If chunk size is 0, read in a blank line & then we're done. */
 	  if (chunkSize == 0)
 	    {
 	      lineFileNext(lf, &line, NULL);
 	      if (line == NULL || (line[0] != '\r' && line[0] != 0))
 		warn("%s: chunked transfer-encoding: expected blank line, got %s\n",
 		     lf->fileName, line);
 
 	      break;
 	    }
 	  /* Read (and save) lines until we have read in chunk. */
 	  for (size = 0;  size < chunkSize;  size += lineSize)
 	    {
 	      if (! lineFileNext(lf, &line, &lineSize))
 		break;
 	      dyStringAppendN(body, line, lineSize-1);
 	      dyStringAppendC(body, '\n');
 	    }
 	  /* Read blank line - or extra CRLF inserted in the middle of the
 	   * current line, in which case we need to trim it. */
 	  if (size > chunkSize)
 	    {
 	      body->stringSize -= (size - chunkSize);
 	      body->string[body->stringSize] = 0;
 	    }
 	  else if (size == chunkSize)
 	    {
 	      lineFileNext(lf, &line, NULL);
 	      if (line == NULL || (line[0] != '\r' && line[0] != 0))
 		warn("%s: chunked transfer-encoding: expected blank line, got %s\n",
 		     lf->fileName, line);
 	    }
 	} while (chunkSize > 0);
       /* Try to read in next line.  If it's an HTTP header, put it back. */
       /* If there is a next line but it's not an HTTP header, it's a footer. */
       if (lineFileNext(lf, &line, NULL))
 	{
 	  if (startsWith("HTTP/", line))
 	    lineFileReuse(lf);
 	  else
 	    {
 	      /* Got a footer -- keep reading until blank line */
 	      warn("%s: chunked transfer-encoding: got footer %s, discarding it.\n",
 		   lf->fileName, line);
 	      while (lineFileNext(lf, &line, NULL))
 		{
 		  if ((line[0] == '\r' && line[1] == 0) || line[0] == 0)
 		    break;
 		  warn("discarding footer line: %s\n", line);
 		}
 	    }
 	}
     }
   else if (contentLength >= 0)
     {
       /* Read in known length */
       int size;
       for (size = 0;  size < contentLength;  size += lineSize)
 	{
 	  if (! lineFileNext(lf, &line, &lineSize))
 	    break;
 	  dyStringAppendN(body, line, lineSize-1);
 	  dyStringAppendC(body, '\n');
 	}
     }
   else
     {
       /* Read in to end of file (assume it's not a persistent connection) */
       while (lineFileNext(lf, &line, &lineSize))
 	{
 	  dyStringAppendN(body, line, lineSize-1);
 	  dyStringAppendC(body, '\n');
 	}
     }
 
   return(body);
 } /* lineFileSlurpHttpBody */
 
 void lineFileRemoveInitialCustomTrackLines(struct lineFile *lf)
 /* remove initial browser and track lines */
 {
 char *line;
 while (lineFileNextReal(lf, &line))
     {
     if (!(startsWith("browser", line) || startsWith("track", line) ))
         {
         verbose(2, "found line not browser or track: %s\n", line);
         lineFileReuse(lf);
         break;
         }
     verbose(2, "skipping %s\n", line);
     }
 }