bbeeeeb5d888089a025aa547c05bfc9b443dc39f
angie
  Thu Dec 20 14:06:39 2018 -0800
Adding support for files that may have a mix of newline styles (\r\n, \r, \n), enabled by calling lineFileCarefulNewlines.  refs #22638
Scanning for any type of newline is not quite as efficient as scanning for only one pre-determined type, but it's necessary to deal with the kind of garbage data that has snuck into some saved sessions.
I also fixed a couple of subtle cases that have not caused any trouble in our day-to-day dealings with nice \n-separated input whose lines are shorter than the default lf->buf size (64k):
* determineNlType initialized lf->nlType to UNIX, but if the first non-empty buffer did not contain any newline, UNIX may or may not have been the correct type.
* The second time determineNlType was called, it was using an outdated endIx.  Note that the second newline-scanning loop started at sizeLeft rather than at the stale endIx; determineNlType needed to start from that same position.
* determineNlType was called with buf+endIx, but with a byte limit (bytesInBuf) that didn't account for endIx; the limit passed is now bytesInBuf-endIx.
I tested lineFile with an initial buf size of 16 (in lineFileAttach) to test the looping on gotLf.

diff --git src/lib/linefile.c src/lib/linefile.c
index 55176f7..6b77efc 100644
--- src/lib/linefile.c
+++ src/lib/linefile.c
@@ -406,69 +406,129 @@
 {
 int oneSize, totalRead = 0;
 
 while (size > 0)
     {
     oneSize = read(fd, buf, size);
     if (oneSize <= 0)
         break;
     totalRead += oneSize;
     buf += oneSize;
     size -= oneSize;
     }
 return totalRead;
 }
 
+void lineFileCarefulNewlines(struct lineFile *lf)
+/* Tell lf to use a less efficient method of scanning for the next newline that can handle
+ * files with a mix of newline conventions. */
+{
+lf->nlType = nlt_mixed;
+}
+
 static void determineNlType(struct lineFile *lf, char *buf, int bufSize)
 /* determine type of newline used for the file, assumes buffer not empty */
 {
 char *c = buf;
 if (bufSize==0) return;
 if (lf->nlType != nlt_undet) return;  /* if already determined just exit */
-lf->nlType = nlt_unix;  /* start with default of unix lf type */
 while (c < buf+bufSize)
     {
     if (*c=='\r')
 	{
     	lf->nlType = nlt_mac;
 	if (++c < buf+bufSize)
     	    if (*c == '\n')
     		lf->nlType = nlt_dos;
 	return;
 	}
     if (*(c++) == '\n')
 	{
+        lf->nlType = nlt_unix;
 	return;
 	}
     }
 }
 
+static boolean findNextNewline(struct lineFile *lf, char *buf, int bytesInBuf, int *pEndIx)
+/* Return TRUE if able to find next end of line in buf, starting at buf[*pEndIx], up to bytesInBuf.
+ * When done set *pEndIx to the start of the next line if applicable, otherwise bytesInBuf. */
+{
+boolean gotLf = FALSE;
+int endIx = *pEndIx;
+switch (lf->nlType)
+    {
+    case nlt_unix:
+    case nlt_dos:
+        for (endIx = *pEndIx; endIx < bytesInBuf; ++endIx)
+            {
+            if (buf[endIx] == '\n')
+                {
+                gotLf = TRUE;
+                endIx += 1;
+                break;
+                }
+            }
+        break;
+    case nlt_mac:
+        for (endIx = *pEndIx; endIx < bytesInBuf; ++endIx)
+            {
+            if (buf[endIx] == '\r')
+                {
+                gotLf = TRUE;
+                endIx += 1;
+                break;
+                }
+            }
+        break;
+    case nlt_mixed:
+    case nlt_undet:
+        for (endIx = *pEndIx; endIx < bytesInBuf; ++endIx)
+            {
+            char c = buf[endIx];
+            if (c == '\r' || c == '\n')
+                {
+                gotLf = TRUE;
+                if (lf->zTerm)
+                    buf[endIx] = '\0';
+                endIx += 1;
+                if (c == '\r' && buf[endIx] == '\n')
+                    {
+                    if (lf->zTerm)
+                        buf[endIx] = '\0';
+                    endIx += 1;
+                    }
+                break;
+                }
+            }
+        break;
+    }
+*pEndIx = endIx;
+return gotLf;
+}
+
 boolean lineFileNext(struct lineFile *lf, char **retStart, int *retSize)
 /* Fetch next line from file. */
 {
-char *buf = lf->buf;
-int bytesInBuf = lf->bytesInBuf;
-int endIx = lf->lineEnd;
-boolean gotLf = FALSE;
 int newStart;
 
 if (lf->reuse)
     {
     lf->reuse = FALSE;
     if (retSize != NULL)
 	*retSize = lf->lineEnd - lf->lineStart;
-    *retStart = buf + lf->lineStart;
+    *retStart = lf->buf + lf->lineStart;
     if (lf->metaOutput && *retStart[0] == '#')
         metaDataAdd(lf, *retStart);
     return TRUE;
     }
 
 if (lf->nextCallBack)
     return lf->nextCallBack(lf, retStart, retSize);
 
 if (lf->udcFile)
     {
     lf->bufOffsetInFile = udcTell(lf->udcFile);
     char *line = udcReadLine(lf->udcFile);
     if (line==NULL)
         return FALSE;
     int lineSize = strlen(line);
@@ -494,61 +554,35 @@
     lf->bytesInBuf = lineSize;
     lf->lineIx = -1;
     lf->lineStart = 0;
     lf->lineEnd = lineSize;
     if (lineSize > lf->bufSize)
 	// shouldn't be!  but just in case:
 	lineFileExpandBuf(lf, lineSize * 2);
     kstring_t *kline = lf->kline;
     safecpy(lf->buf, lf->bufSize, kline->s);
     *retStart = lf->buf;
     if (retSize != NULL)
 	*retSize = lineSize;
     return TRUE;
     }
 
-determineNlType(lf, buf+endIx, bytesInBuf);
-
-/* Find next end of line in buffer. */
-switch(lf->nlType)
-    {
-    case nlt_unix:
-    case nlt_dos:
-	for (endIx = lf->lineEnd; endIx < bytesInBuf; ++endIx)
-	    {
-	    if (buf[endIx] == '\n')
-		{
-		gotLf = TRUE;
-		endIx += 1;
-		break;
-		}
-	    }
-	break;
-    case nlt_mac:
-	for (endIx = lf->lineEnd; endIx < bytesInBuf; ++endIx)
-	    {
-	    if (buf[endIx] == '\r')
-		{
-		gotLf = TRUE;
-		endIx += 1;
-		break;
-		}
-	    }
-	break;
-    case nlt_undet:
-	break;
-    }
+char *buf = lf->buf;
+int endIx = lf->lineEnd;
+int bytesInBuf = lf->bytesInBuf;
+determineNlType(lf, buf+endIx, bytesInBuf-endIx);
+boolean gotLf = findNextNewline(lf, buf, bytesInBuf, &endIx);
 
 /* If not in buffer read in a new buffer's worth. */
 while (!gotLf)
     {
     int oldEnd = lf->lineEnd;
     int sizeLeft = bytesInBuf - oldEnd;
     int bufSize = lf->bufSize;
     int readSize = bufSize - sizeLeft;
 
     if (oldEnd > 0 && sizeLeft > 0)
 	{
 	memmove(buf, buf+oldEnd, sizeLeft);
 	}
     lf->bufOffsetInFile += oldEnd;
     if (lf->fd >= 0)
@@ -569,64 +603,39 @@
 	lf->bytesInBuf = newStart = lf->lineStart = 0;
 	lf->lineEnd = endIx;
 	++lf->lineIx;
 	if (retSize != NULL)
 	    *retSize = endIx - newStart;
 	*retStart = buf + newStart;
         if (*retStart[0] == '#')
             metaDataAdd(lf, *retStart);
 	return TRUE;
 	}
     else if (readSize <= 0)
 	{
 	lf->bytesInBuf = lf->lineStart = lf->lineEnd = 0;
 	return FALSE;
 	}
+    else
+        endIx = sizeLeft;
+
     bytesInBuf = lf->bytesInBuf = readSize + sizeLeft;
     lf->lineEnd = 0;
 
-    determineNlType(lf, buf+endIx, bytesInBuf);
+    determineNlType(lf, buf+endIx, bytesInBuf-endIx);
+    gotLf = findNextNewline(lf, buf, bytesInBuf, &endIx);
 
-    /* Look for next end of line.  */
-    switch(lf->nlType)
-	{
-    	case nlt_unix:
-	case nlt_dos:
-	    for (endIx = sizeLeft; endIx <bytesInBuf; ++endIx)
-		{
-		if (buf[endIx] == '\n')
-		    {
-		    endIx += 1;
-		    gotLf = TRUE;
-		    break;
-		    }
-		}
-	    break;
-	case nlt_mac:
-	    for (endIx = sizeLeft; endIx <bytesInBuf; ++endIx)
-		{
-		if (buf[endIx] == '\r')
-		    {
-		    endIx += 1;
-		    gotLf = TRUE;
-		    break;
-		    }
-		}
-	    break;
-	case nlt_undet:
-	    break;
-	}
     if (!gotLf && bytesInBuf == lf->bufSize)
         {
 	if (bufSize >= 512*1024*1024)
 	    {
 	    errAbort("Line too long (more than %d chars) line %d of %s",
 		lf->bufSize, lf->lineIx+1, lf->fileName);
 	    }
 	else
 	    {
 	    lineFileExpandBuf(lf, bufSize*2);
 	    buf = lf->buf;
 	    }
 	}
     }