bbeeeeb5d888089a025aa547c05bfc9b443dc39f
angie
  Thu Dec 20 14:06:39 2018 -0800
Adding support for files that may have a mix of newline styles (\r\n, \r, \n), enabled by calling lineFileCarefulNewlines.  refs #22638
Scanning for any type of newline is not quite as efficient as scanning for only one pre-determined type, but it's necessary to deal with the kind of garbage data that has snuck into some saved sessions.
I also fixed a couple subtle cases that have not caused any trouble in our day-to-day dealings with nice \n-separated input with line sizes shorter than the default lf->buf size (64k):
* determineNlType initialized lf->nlType to UNIX, but if the first non-empty buffer did not contain any newline, UNIX may or may not have been the correct type.
* The second time determineNlType was called, it was using an outdated endIx.  Note the second instance of scanning for newlines used < sizeLeft as a test instead of endIx; that needed to be applied to determineNlType too.
* determineNlType was called with buf+endIx, but with a byte limit that didn't account for endIx.
I tested lineFile with an initial buf size of 16 (in lineFileAttach) to test the looping on gotLf.

diff --git src/inc/linefile.h src/inc/linefile.h
index 1481c98..0c0092e 100644
--- src/inc/linefile.h
+++ src/inc/linefile.h
@@ -1,308 +1,313 @@
 /* lineFile - stuff to rapidly read text files and parse them into
  * lines.
  *
  * This file is copyright 2002 Jim Kent, but license is hereby
  * granted for all use - public, private or commercial. */
 
 #ifndef LINEFILE_H
 #define LINEFILE_H
 
 #include "dystring.h"
 #include "udc.h"
 
 #define tabix_t tbx_t
 #define ti_iter_t hts_itr_t
 #define ti_open hts_open
 #define ti_index_load tbx_index_load
 #define ti_close tbx_destroy
 #define ti_get_tid tbx_name2id
 #define ti_queryi tbx_itr_queryi
 #define ti_iter_destroy tbx_itr_destroy
 
 #define LF_BOGUS_FILE_PREFIX "somefile."
 
 enum nlType {
  nlt_undet, /* undetermined */
  nlt_unix,  /* lf   */
  nlt_dos,   /* crlf */
- nlt_mac    /* cr   */
+ nlt_mac,   /* cr   */
+ nlt_mixed  /* file may contain any mix of lf, crlf and cr line endings */
 };
 
 struct metaOutput
 /* List node holding one file handle that meta data is written to;
  * meta data is the text after '#'. */
     {
     struct metaOutput *next;    /* next file handle */
     FILE *metaFile;             /* file to write metadata to */
     };
 
 struct lineFile
 /* Structure to handle fast, line-oriented
  * file I/O. */
     {
     struct lineFile *next;	/* Might need to be on a list. */
     char *fileName;		/* Name of file. */
     int fd;			/* File handle.  -1 for 'memory' files. */
     int bufSize;		/* Size of buffer. */
     off_t bufOffsetInFile;	/* Offset in file of first buffer byte. */
     int bytesInBuf;		/* Bytes read into buffer. */
     int reserved;		/* Reserved (zero for now). */
     int lineIx;			/* Current line. */
     int lineStart;		/* Offset of line in buffer. */
     int lineEnd;		/* End of line in buffer. */
     bool zTerm;			/* Replace '\n' with zero? */
     enum nlType nlType;         /* type of line endings: undet, unix, dos, mac or mixed */
     bool reuse;			/* Set if reusing input. */
     char *buf;			/* Buffer. */
     struct pipeline *pl;        /* pipeline if reading compressed */
     struct metaOutput *metaOutput;   /* list of FILE handles to write metaData to */
     bool isMetaUnique;          /* if set, do not repeat comments in output */
     struct hash *metaLines;     /* save lines to suppress repetition */
     void *htsFile;              /* HTS file handle */
     void *tabix;		/* A tabix-compressed file and its binary index file (.tbi) */
     void *tabixIter;	        /* An iterator to get decompressed indexed lines of text */
     void *kline;                /* A buffer used for reading from htsfile. */
     struct udcFile *udcFile;    /* udc file if using caching */
     struct dyString *fullLine;  // Filled with full line when a lineFileNextFull is called
     struct dyString *rawLines;  // Filled with raw lines used to create the full line
     boolean fullLineReuse;      // If TRUE, next call to lineFileNextFull will get
                                 // already built fullLine
     void *dataForCallBack;                                 // ptr to data needed for callbacks
     void(*checkSupport)(struct lineFile *lf, char *where); // check if operation supported 
     boolean(*nextCallBack)(struct lineFile *lf, char **retStart, int *retSize); // next line callback
     void(*closeCallBack)(struct lineFile *lf);             // close callback
     };
 
 char *getFileNameFromHdrSig(char *m);
 /* Check if header has signature of supported compression stream,
    and return a phoney filename for it, or NULL if no sig found. */
 
 struct lineFile *lineFileDecompressFd(char *name, bool zTerm, int fd);
 /* open a linefile with decompression from a file or socket descriptor */
 
 struct lineFile *lineFileDecompressMem(bool zTerm, char *mem, long size);
 /* open a linefile with decompression from a memory stream */
 
 struct lineFile *lineFileMayOpen(char *fileName, bool zTerm);
 /* Try and open up a lineFile. If fileName ends in .gz, .Z, or .bz2,
  * it will be read from a decompress pipeline. */
 
 struct lineFile *lineFileUdcMayOpen(char *fileName, bool zTerm);
 /* Open a lineFile through the UDC */
 
 struct lineFile *lineFileOpen(char *fileName, bool zTerm);
 /* Open up a lineFile or die trying. If fileName ends in .gz, .Z, or .bz2,
  * it will be read from a decompress pipeline. */
 
 struct lineFile *lineFileAttach(char *fileName, bool zTerm, int fd);
 /* Wrap a line file around an open'd file. */
 
 struct lineFile *lineFileStdin(bool zTerm);
 /* Wrap a line file around stdin. */
 
 struct lineFile *lineFileOnString(char *name, bool zTerm, char *s);
 /* Wrap a line file object around a string in memory. This buffer will
  * have zeroes written into it if zTerm is non-zero.  It will
  * be freed when the line file is closed. */
 
 struct lineFile *lineFileOnBigBed(char *bigBedFileName);
 /* Wrap a line file object around a BigBed. */
 
 void lineFileClose(struct lineFile **pLf);
 /* Close up a line file. */
 
 void lineFileCloseList(struct lineFile **pList);
 /* Close up a list of line files. */
 
 boolean lineFileNext(struct lineFile *lf, char **retStart, int *retSize);
 /* Fetch next line from file. */
 
 boolean lineFileNextFull(struct lineFile *lf, char **retFull, int *retFullSize,
                         char **retRaw, int *retRawSize);
 // Fetch next line from file joining up any that are continued by ending '\'
 // If requested, and was joined, the unjoined raw lines are also returned
 // NOTE: comment lines can't be continued!  ("# comment \ \n more comment" is 2 lines.)
 
 boolean lineFileNextReal(struct lineFile *lf, char **retStart);
 /* Fetch next line from file that is not blank and
  * does not start with a '#'. */
 
 boolean lineFileNextFullReal(struct lineFile *lf, char **retStart);
 // Fetch next line from file that is not blank and does not start with a '#'.
 // Continuation lines (ending in '\') are joined into a single line.
 
 void lineFileNeedNext(struct lineFile *lf, char **retStart, int *retSize);
 /* Fetch next line from file.  Squawk and die if it's not there. */
 
 void lineFileReuse(struct lineFile *lf);
 /* Reuse current line. */
 
 void lineFileReuseFull(struct lineFile *lf);
 // Reuse last full line read.  Unlike lineFileReuse,
 // lineFileReuseFull only works with previous lineFileNextFull call
 
 #define lineFileString(lf) ((lf)->buf + (lf)->lineStart)
 /* Current string in line file. */
 
 #define lineFileTell(lf) ((lf)->bufOffsetInFile + (lf)->lineStart)
 /* Current offset (of string start) in file. */
 
 void lineFileSeek(struct lineFile *lf, off_t offset, int whence);
 /* Seek to read next line from given position. */
 
 void lineFileRewind(struct lineFile *lf);
 /* Return lineFile to start. */
 
 void lineFileAbort(struct lineFile *lf, char *format, ...)
 /* Print file name, line number, and error message, and abort. */
 #if defined(__GNUC__)
 __attribute__((format(printf, 2, 3)))
 #endif
 ;
 
 void lineFileVaAbort(struct lineFile *lf, char *format, va_list args);
 /* Print file name, line number, and error message, and abort. */
 
 void lineFileUnexpectedEnd(struct lineFile *lf);
 /* Complain about unexpected end of file. */
 
 void lineFileExpectWords(struct lineFile *lf, int expecting, int got);
 /* Check line has right number of words. */
 
 void lineFileExpectAtLeast(struct lineFile *lf, int expecting, int got);
 /* Check line has at least the expected number of words. */
 
 void lineFileShort(struct lineFile *lf);
 /* Complain that line is too short. */
 
 boolean lineFileNextRow(struct lineFile *lf, char *words[], int wordCount);
 /* Return next non-blank line that doesn't start with '#' chopped into words.
  * Returns FALSE at EOF.  Aborts on error. */
 
 #define lineFileRow(lf, words) lineFileNextRow(lf, words, ArraySize(words))
 /* Read in line chopped into fixed size word array. */
 
 boolean lineFileNextCharRow(struct lineFile *lf, char sep, char *words[], int wordCount);
 /* Return next non-blank line that doesn't start with '#' chopped into words
  * delimited by sep. Returns FALSE at EOF.  Aborts on error. */
 
 boolean lineFileNextRowTab(struct lineFile *lf, char *words[], int wordCount);
 /* Return next non-blank line that doesn't start with '#' chopped into words
  * at tabs. Returns FALSE at EOF.  Aborts on error. */
 
 #define lineFileRowTab(lf, words) \
 	lineFileNextRowTab(lf, words, ArraySize(words))
 /* Read in line chopped by tab into fixed size word array. */
 
 int lineFileChopNext(struct lineFile *lf, char *words[], int maxWords);
 /* Return next non-blank line that doesn't start with '#' chopped into words. */
 
 #define lineFileChop(lf, words) lineFileChopNext(lf, words, ArraySize(words))
 /* Ease-of-use macro for lineFileChopNext above. */
 
 int lineFileChopCharNext(struct lineFile *lf, char sep, char *words[], int maxWords);
 /* Return next non-blank line that doesn't start with '#' chopped into
    words delimited by sep. */
 
 int lineFileChopNextTab(struct lineFile *lf, char *words[], int maxWords);
 /* Return next non-blank line that doesn't start with '#' chopped into words
  * on tabs */
 
 #define lineFileChopTab(lf, words) lineFileChopNextTab(lf, words, ArraySize(words))
 /* Ease-of-use macro for lineFileChopNextTab above. */
 
 int lineFileCheckAllIntsNoAbort(char *s, void *val, 
     boolean isSigned, int byteCount, char *typeString, boolean noNeg, 
     char *errMsg, int errMsgSize);
 /* Convert string to (signed) integer of the size specified.  
  * Unlike atol assumes all of string is number, no trailing trash allowed.
  * Returns 0 if conversion possible, and value is returned in 'val'
  * Otherwise 1 for empty string or trailing chars, and 2 for numeric overflow,
  * and 3 for (-) sign in unsigned number.
  * Error messages if any are written into the provided buffer.
  * Pass NULL val if you only want validation.
  * Use noNeg if negative values are not allowed despite the type being signed,
  * returns 4. */
 
 void lineFileAllInts(struct lineFile *lf, char *words[], int wordIx, void *val,
   boolean isSigned,  int byteCount, char *typeString, boolean noNeg);
 /* Returns long long integer from converting the input string. Aborts on error. */
 
 int lineFileAllIntsArray(struct lineFile *lf, char *words[], int wordIx, void *array, int arraySize,
   boolean isSigned,  int byteCount, char *typeString, boolean noNeg);
 /* Convert comma separated list of numbers to an array.  Pass in
  * array and max size of array. Aborts on error. Returns number of elements in parsed array. */
 
 int lineFileNeedNum(struct lineFile *lf, char *words[], int wordIx);
 /* Make sure that words[wordIx] is an ascii integer, and return
  * binary representation of it. */
 
 int lineFileNeedFullNum(struct lineFile *lf, char *words[], int wordIx);
 /* Make sure that words[wordIx] is an ascii integer, and return
  * binary representation of it. Require all chars in word to be digits.*/
 
 double lineFileNeedDouble(struct lineFile *lf, char *words[], int wordIx);
 /* Make sure that words[wordIx] is an ascii double value, and return
  * binary representation of it. */
 
 void lineFileSkip(struct lineFile *lf, int lineCount);
 /* Skip a number of lines. */
 
 char *lineFileSkipToLineStartingWith(struct lineFile *lf, char *start, int maxCount);
 /* Skip to next line that starts with given string.  Return NULL
  * if no such line found, otherwise return the line. */
 
 char *lineFileReadAll(struct lineFile *lf);
 /* Read remainder of lineFile and return it as a string. */
 
 boolean lineFileParseHttpHeader(struct lineFile *lf, char **hdr,
 				boolean *chunked, int *contentLength);
 /* Extract HTTP response header from lf into hdr, tell if it's
  * "Transfer-Encoding: chunked" or if it has a contentLength. */
 
 struct dyString *lineFileSlurpHttpBody(struct lineFile *lf,
 				       boolean chunked, int contentLength);
 /* Return a dyString that contains the http response body in lf.  Handle
  * chunk-encoding and content-length. */
 
 void lineFileSetMetaDataOutput(struct lineFile *lf, FILE *f);
 /* set file to write meta data to,
  * should be called before reading from input file */
 
 void lineFileSetUniqueMetaData(struct lineFile *lf);
 /* suppress duplicate lines in metadata */
 
 void lineFileExpandBuf(struct lineFile *lf, int newSize);
 /* Expand line file buffer. */
 
 void lineFileRemoveInitialCustomTrackLines(struct lineFile *lf);
 /* remove initial browser and track lines */
 
 /*----- Optionally-compiled wrapper on tabix (compression + indexing): -----*/
 
 #define COMPILE_WITH_TABIX "%s: Sorry, this functionality is available only when\n" \
     "you have installed the tabix library from\n" \
      "http://samtools.sourceforge.net/ and rebuilt kent/src with USE_TABIX=1\n" \
      "(see http://genomewiki.ucsc.edu/index.php/Build_Environment_Variables)."
 
 struct lineFile *lineFileTabixAndIndexMayOpen(char *fileOrUrl, char *tbiFileOrUrl, bool zTerm);
 /* Wrap a line file around a data file that has been compressed and indexed
  * by the tabix command line program. tbiFileOrUrl can be NULL, it defaults to <fileOrUrl>.tbi.
  * It must be readable in addition to fileOrUrl. If there's a problem, warn & return NULL.
  * This works only if kent/src has been compiled with USE_TABIX=1 and linked
  * with the tabix C library. */
 
 struct lineFile *lineFileTabixMayOpen(char *fileOrUrl, bool zTerm);
 /* Wrap a line file around a data file that has been compressed and indexed
  * by the tabix command line program.  The index file <fileName>.tbi must be
  * readable in addition to fileName. If there's a problem, warn & return NULL.
  * This works only if kent/src has been compiled with USE_TABIX=1 and linked
  * with the tabix C library. */
 
 boolean lineFileSetTabixRegion(struct lineFile *lf, char *seqName, int start, int end);
 /* Assuming lf was created by lineFileTabixMayOpen, tell tabix to seek to the specified region
  * and return TRUE (or if there are no items in region, return FALSE). */
 
+void lineFileCarefulNewlines(struct lineFile *lf);
+/* Tell lf to use a less efficient method of scanning for the next newline that can handle
+ * files with a mix of newline conventions. */
+
 #endif /* LINEFILE_H */