bbeeeeb5d888089a025aa547c05bfc9b443dc39f angie Thu Dec 20 14:06:39 2018 -0800 Adding support for files that may have a mix of newline styles (\r\n, \r, \n), enabled by calling lineFileCarefulNewlines. refs #22638 Scanning for any type of newline is not quite as efficient as scanning for only one predetermined type, but it's necessary to deal with the kind of garbage data that has snuck into some saved sessions. I also fixed a couple of subtle cases that have not caused any trouble in our day-to-day dealings with nice \n-separated input with line sizes shorter than the default lf->buf size (64k): * determineNlType initialized lf->nlType to UNIX, but if the first non-empty buffer did not contain any newline, UNIX may or may not have been the correct type. * The second time determineNlType was called, it was using an outdated endIx. Note that the second instance of scanning for newlines used < sizeLeft as a test instead of endIx; that needed to be applied to determineNlType too. * determineNlType was called with buf+endIx, but with a byte limit that didn't account for endIx. I tested lineFile with an initial buf size of 16 (in lineFileAttach) to test the looping on gotLf. diff --git src/inc/linefile.h src/inc/linefile.h index 1481c98..0c0092e 100644 --- src/inc/linefile.h +++ src/inc/linefile.h @@ -13,31 +13,32 @@ #define tabix_t tbx_t #define ti_iter_t hts_itr_t #define ti_open hts_open #define ti_index_load tbx_index_load #define ti_close tbx_destroy #define ti_get_tid tbx_name2id #define ti_queryi tbx_itr_queryi #define ti_iter_destroy tbx_itr_destroy #define LF_BOGUS_FILE_PREFIX "somefile." 
enum nlType { nlt_undet, /* undetermined */ nlt_unix, /* lf */ nlt_dos, /* crlf */ - nlt_mac /* cr */ + nlt_mac, /* cr */ + nlt_mixed /* could be any or all of the above */ }; struct metaOutput /* struct to store list of file handles to output meta data to * meta data is text after # */ { struct metaOutput *next; /* next file handle */ FILE *metaFile; /* file to write metadata to */ }; struct lineFile /* Structure to handle fast, line oriented * fileIo. */ { struct lineFile *next; /* Might need to be on a list. */ @@ -291,18 +292,22 @@ * It must be readable in addition to fileOrUrl. If there's a problem, warn & return NULL. * This works only if kent/src has been compiled with USE_TABIX=1 and linked * with the tabix C library. */ struct lineFile *lineFileTabixMayOpen(char *fileOrUrl, bool zTerm); /* Wrap a line file around a data file that has been compressed and indexed * by the tabix command line program. The index file <fileName>.tbi must be * readable in addition to fileName. If there's a problem, warn & return NULL. * This works only if kent/src has been compiled with USE_TABIX=1 and linked * with the tabix C library. */ boolean lineFileSetTabixRegion(struct lineFile *lf, char *seqName, int start, int end); /* Assuming lf was created by lineFileTabixMayOpen, tell tabix to seek to the specified region * and return TRUE (or if there are no items in region, return FALSE). */ +void lineFileCarefulNewlines(struct lineFile *lf); +/* Tell lf to use a less efficient method of scanning for the next newline that can handle + * files with a mix of newline conventions. */ + #endif /* LINEFILE_H */