2217111185161d4c10fb3f3e87e817bdebdd53a7 kent Thu Oct 24 17:18:15 2013 -0700 Making this tolerate new lines in fastq files between stanzas. diff --git src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c index 049919e..7c02a7d 100644 --- src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c +++ src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c @@ -35,31 +35,31 @@ {"sampleSize", OPTION_INT}, {"seed", OPTION_INT}, {"smallOk", OPTION_BOOLEAN}, {NULL, 0}, }; /* Estimate base count from file size based on this. */ #define ZIPPED_BYTES_PER_BASE 0.80 #define UNZIPPED_BYTES_PER_BASE 2.5 static boolean nextLineMustMatchChar(struct lineFile *lf, char match, boolean noEof) /* Get next line and make sure, other than whitespace, it matches 'match'. * Return FALSE on EOF, unless noEof is set, in which case abort */ { char *line; -if (!lineFileNext(lf, &line, NULL)) +if (!lineFileNextReal(lf, &line)) { if (noEof) errAbort("Expecting %c got end of file in %s", match, lf->fileName); else return FALSE; } if (line[0] != match) errAbort("Expecting %c got %s line %d of %s", match, line, lf->lineIx, lf->fileName); return TRUE; } static int averageReadSize(char *fileName, int maxReads) /* Read up to maxReads from fastq file and return average # of reads. */ { struct lineFile *lf = lineFileOpen(fileName, FALSE); @@ -165,31 +165,31 @@ boolean oneFastqRecord(struct lineFile *lf, FILE *f, boolean copy, boolean firstTime) /* Read next fastq record from LF, and optionally copy it to f. Return FALSE at end of file * Do a _little_ error checking on record while we're at it. The format has already been * validated on the client side fairly thoroughly. */ { char *line; int lineSize; /* Treat NULL file same as non-copy, so only have one condition to check on . */ if (f == NULL) copy = FALSE; /* Deal with initial line starting with '@' */ -if (!lineFileNext(lf, &line, &lineSize)) +if (!lineFileNextRealWithSize(lf, &line, &lineSize)) return FALSE; if (line[0] != '@') errAbort("Expecting line starting with '@' got %s line %d of %s", line, lf->lineIx, lf->fileName); if (copy) mustWrite(f, line, lineSize); /* Deal with line containing sequence. */ if (!lineFileNext(lf, &line, &lineSize)) errAbort("%s truncated in middle of record", lf->fileName); /* Get size and add it to stats */ int seqSize = lineSize-1; if (seqSize > MAX_READ_SIZE) errAbort("Sequence size %d too long line %d of %s. Max is %d", seqSize, @@ -280,31 +280,31 @@ return TRUE; } boolean maybeCopyFastqRecord(struct lineFile *lf, FILE *f, boolean copy, int *retSeqSize) /* Read next fastq record from LF, and optionally copy it to f. Return FALSE at end of file * Do a _little_ error checking on record while we're at it. The format has already been * validated on the client side fairly thoroughly. Similar to oneFastq record but with * fewer side effects. */ { char *line; int lineSize; /* Deal with initial line starting with '@' */ -if (!lineFileNext(lf, &line, &lineSize)) +if (!lineFileNextRealWithSize(lf, &line, &lineSize)) return FALSE; if (line[0] != '@') errAbort("Expecting line starting with '@' got %s line %d of %s", line, lf->lineIx, lf->fileName); if (copy) mustWrite(f, line, lineSize); /* Deal with line containing sequence. */ if (!lineFileNext(lf, &line, &lineSize)) errAbort("%s truncated in middle of record", lf->fileName); if (copy) mustWrite(f, line, lineSize); int seqSize = lineSize-1;