bbeeeeb5d888089a025aa547c05bfc9b443dc39f angie Thu Dec 20 14:06:39 2018 -0800 Adding support for files that may have a mix of newline styles (\r\n, \r, \n), enabled by calling lineFileCarefulNewlines. refs #22638 Scanning for any type of newline is not quite as efficient as scanning for only one pre-determined type, but it's necessary to deal with the kind of garbage data that has snuck into some saved sessions. I also fixed a couple subtle cases that have not caused any trouble in our day-to-day dealings with nice \n-separated input with line sizes shorter than the default lf->buf size (64k): * determineNlType initialized lf->nlType to UNIX, but if the first non-empty buffer did not contain any newline, UNIX may or may not have been the correct type. * The second time determineNlType was called, it was using an outdated endIx. Note the second instance of scanning for newlines used < sizeLeft as a test instead of endIx; that needed to be applied to determineNlType too. * determineNlType was called with buf+endIx, but with a byte limit that didn't account for endIx. I tested lineFile with an initial buf size of 16 (in lineFileAttach) to test the looping on gotLf. diff --git src/lib/linefile.c src/lib/linefile.c index 55176f7..6b77efc 100644 --- src/lib/linefile.c +++ src/lib/linefile.c @@ -406,69 +406,129 @@ { int oneSize, totalRead = 0; while (size > 0) { oneSize = read(fd, buf, size); if (oneSize <= 0) break; totalRead += oneSize; buf += oneSize; size -= oneSize; } return totalRead; } +void lineFileCarefulNewlines(struct lineFile *lf) +/* Tell lf to use a less efficient method of scanning for the next newline that can handle + * files with a mix of newline conventions. */ +{ +lf->nlType = nlt_mixed; +} + static void determineNlType(struct lineFile *lf, char *buf, int bufSize) /* determine type of newline used for the file, assumes buffer not empty */ { char *c = buf; if (bufSize==0) return; if (lf->nlType != nlt_undet) return; /* if already determined just exit */ -lf->nlType = nlt_unix; /* start with default of unix lf type */ while (c < buf+bufSize) { if (*c=='\r') { lf->nlType = nlt_mac; if (++c < buf+bufSize) if (*c == '\n') lf->nlType = nlt_dos; return; } if (*(c++) == '\n') { + lf->nlType = nlt_unix; return; } } } +static boolean findNextNewline(struct lineFile *lf, char *buf, int bytesInBuf, int *pEndIx) +/* Return TRUE if able to find next end of line in buf, starting at buf[*pEndIx], up to bytesInBuf. + * When done set *pEndIx to the start of the next line if applicable, otherwise bytesInBuf. */ +{ +boolean gotLf = FALSE; +int endIx = *pEndIx; +switch (lf->nlType) + { + case nlt_unix: + case nlt_dos: + for (endIx = *pEndIx; endIx < bytesInBuf; ++endIx) + { + if (buf[endIx] == '\n') + { + gotLf = TRUE; + endIx += 1; + break; + } + } + break; + case nlt_mac: + for (endIx = *pEndIx; endIx < bytesInBuf; ++endIx) + { + if (buf[endIx] == '\r') + { + gotLf = TRUE; + endIx += 1; + break; + } + } + break; + case nlt_mixed: + case nlt_undet: + for (endIx = *pEndIx; endIx < bytesInBuf; ++endIx) + { + char c = buf[endIx]; + if (c == '\r' || c == '\n') + { + gotLf = TRUE; + if (lf->zTerm) + buf[endIx] = '\0'; + endIx += 1; + if (c == '\r' && buf[endIx] == '\n') + { + if (lf->zTerm) + buf[endIx] = '\0'; + endIx += 1; + } + break; + } + } + break; + } +*pEndIx = endIx; +return gotLf; +} + boolean lineFileNext(struct lineFile *lf, char **retStart, int *retSize) /* Fetch next line from file. */ { -char *buf = lf->buf; -int bytesInBuf = lf->bytesInBuf; -int endIx = lf->lineEnd; -boolean gotLf = FALSE; int newStart; if (lf->reuse) { lf->reuse = FALSE; if (retSize != NULL) *retSize = lf->lineEnd - lf->lineStart; - *retStart = buf + lf->lineStart; + *retStart = lf->buf + lf->lineStart; if (lf->metaOutput && *retStart[0] == '#') metaDataAdd(lf, *retStart); return TRUE; } if (lf->nextCallBack) return lf->nextCallBack(lf, retStart, retSize); if (lf->udcFile) { lf->bufOffsetInFile = udcTell(lf->udcFile); char *line = udcReadLine(lf->udcFile); if (line==NULL) return FALSE; int lineSize = strlen(line); @@ -494,61 +554,35 @@ lf->bytesInBuf = lineSize; lf->lineIx = -1; lf->lineStart = 0; lf->lineEnd = lineSize; if (lineSize > lf->bufSize) // shouldn't be! but just in case: lineFileExpandBuf(lf, lineSize * 2); kstring_t *kline = lf->kline; safecpy(lf->buf, lf->bufSize, kline->s); *retStart = lf->buf; if (retSize != NULL) *retSize = lineSize; return TRUE; } -determineNlType(lf, buf+endIx, bytesInBuf); - -/* Find next end of line in buffer. */ -switch(lf->nlType) - { - case nlt_unix: - case nlt_dos: - for (endIx = lf->lineEnd; endIx < bytesInBuf; ++endIx) - { - if (buf[endIx] == '\n') - { - gotLf = TRUE; - endIx += 1; - break; - } - } - break; - case nlt_mac: - for (endIx = lf->lineEnd; endIx < bytesInBuf; ++endIx) - { - if (buf[endIx] == '\r') - { - gotLf = TRUE; - endIx += 1; - break; - } - } - break; - case nlt_undet: - break; - } +char *buf = lf->buf; +int endIx = lf->lineEnd; +int bytesInBuf = lf->bytesInBuf; +determineNlType(lf, buf+endIx, bytesInBuf-endIx); +boolean gotLf = findNextNewline(lf, buf, bytesInBuf, &endIx); /* If not in buffer read in a new buffer's worth. */ while (!gotLf) { int oldEnd = lf->lineEnd; int sizeLeft = bytesInBuf - oldEnd; int bufSize = lf->bufSize; int readSize = bufSize - sizeLeft; if (oldEnd > 0 && sizeLeft > 0) { memmove(buf, buf+oldEnd, sizeLeft); } lf->bufOffsetInFile += oldEnd; if (lf->fd >= 0) @@ -569,64 +603,39 @@ lf->bytesInBuf = newStart = lf->lineStart = 0; lf->lineEnd = endIx; ++lf->lineIx; if (retSize != NULL) *retSize = endIx - newStart; *retStart = buf + newStart; if (*retStart[0] == '#') metaDataAdd(lf, *retStart); return TRUE; } else if (readSize <= 0) { lf->bytesInBuf = lf->lineStart = lf->lineEnd = 0; return FALSE; } + else + endIx = sizeLeft; + bytesInBuf = lf->bytesInBuf = readSize + sizeLeft; lf->lineEnd = 0; - determineNlType(lf, buf+endIx, bytesInBuf); + determineNlType(lf, buf+endIx, bytesInBuf-endIx); + gotLf = findNextNewline(lf, buf, bytesInBuf, &endIx); - /* Look for next end of line. */ - switch(lf->nlType) - { - case nlt_unix: - case nlt_dos: - for (endIx = sizeLeft; endIx <bytesInBuf; ++endIx) - { - if (buf[endIx] == '\n') - { - endIx += 1; - gotLf = TRUE; - break; - } - } - break; - case nlt_mac: - for (endIx = sizeLeft; endIx <bytesInBuf; ++endIx) - { - if (buf[endIx] == '\r') - { - endIx += 1; - gotLf = TRUE; - break; - } - } - break; - case nlt_undet: - break; - } if (!gotLf && bytesInBuf == lf->bufSize) { if (bufSize >= 512*1024*1024) { errAbort("Line too long (more than %d chars) line %d of %s", lf->bufSize, lf->lineIx+1, lf->fileName); } else { lineFileExpandBuf(lf, bufSize*2); buf = lf->buf; } } }