6b6763ede646c2be8b8929c2a8700a4a68565cf1 kent Mon Jul 10 09:58:08 2017 -0700 Creating a json output option. diff --git src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c index 5dc5d98..1bf7081 100644 --- src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c +++ src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c @@ -1,71 +1,76 @@ /* fastqStatsAndSubsample - Go through a fastq file doing sanity checks and collecting * statistics, and also producing a smaller fastq out of a sample of the data. */ /* Copyright (C) 2013 The Regents of the University of California * See README in this or parent directory for licensing information. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "portable.h" #include "obscure.h" #include "hmmstats.h" +#include "jsonWrite.h" /* A note on randomness: This program is used on paired end data. This data is represented * as two separate fastq files where the forward reads are in one file and the reverse in * the other. The files are in the same order, which is how we know which forward read goes * with which reverse read. As a result it is very important that this program sample * the same records from files that have the same number of records. * * This is implemented in two passes - the first pass calculates the statistics and * produces a file with 1/10 the number of reads in it. The second pass produces the * final output by downsampling the 1/10 size file if it is big enough, or the original * file if not. * * Earlier versions of this program estimated the amount to reduce in the first pass * and were more efficient, but the estimates were based on the file sizes, and thus * sometimes varied when dealing with compressed input files, and this would break the * correspondence between read pairs, so now the estimate is always 1/10. */ int sampleSize = 100000; int seed = 0; boolean smallOk = FALSE; +boolean json = FALSE; void usage() /* Explain usage and exit. */ { errAbort( "fastqStatsAndSubsample v2 - Go through a fastq file doing sanity checks and collecting stats\n" "and also producing a smaller fastq out of a sample of the data. The fastq input may be\n" "compressed with gzip or bzip2.\n" "usage:\n" " fastqStatsAndSubsample in.fastq out.stats out.fastq\n" "options:\n" " -sampleSize=N - default %d\n" " -seed=N - Use given seed for random number generator. Default %d.\n" " -smallOk - Not an error if less than sampleSize reads. out.fastq will be entire in.fastq\n" + " -json - out.stats will be in json rather than text format\n" + "Use /dev/null for out.fastq and/or out.stats if not interested in these outputs\n" , sampleSize, seed ); } /* Command line validation table. */ static struct optionSpec options[] = { {"sampleSize", OPTION_INT}, {"seed", OPTION_INT}, {"smallOk", OPTION_BOOLEAN}, + {"json", OPTION_BOOLEAN}, {NULL, 0}, }; static boolean nextLineMustMatchChar(struct lineFile *lf, char match, boolean noEof) /* Get next line and make sure, other than whitespace, it matches 'match'. * Return FALSE on EOF, unless noEof is set, in which case abort */ { char *line; if (!lineFileNextReal(lf, &line)) { if (noEof) errAbort("Expecting %c got end of file in %s", match, lf->fileName); else return FALSE; } @@ -92,49 +97,73 @@ for (i=0; i= 0) { char c = *s++; if (!isspace(c)) return FALSE; } return TRUE; } boolean lineFileNextRealWithSize(struct lineFile *lf, char **retStart, int *retSize) /* Fetch next line from file that is not blank and @@ -157,31 +186,31 @@ * Do a _little_ error checking on record while we're at it. The format has already been * validated on the client side fairly thoroughly. */ { char *line; int lineSize; /* Treat NULL file same as non-copy, so only have one condition to check on . */ if (f == NULL) copy = FALSE; /* Deal with initial line starting with '@' */ if (!lineFileNextRealWithSize(lf, &line, &lineSize)) return FALSE; if (line[0] != '@') { - errAbort("Expecting line starting with '@' got %s line %d of %s (ugh!)", + errAbort("Expecting line starting with '@' got '%s' line %d of %s", line, lf->lineIx, lf->fileName); } if (copy) mustWrite(f, line, lineSize); /* Deal with line containing sequence. */ if (!lineFileNext(lf, &line, &lineSize)) errAbort("%s truncated in middle of record", lf->fileName); /* Get size and add it to stats */ int seqSize = lineSize-1; if (seqSize > MAX_READ_SIZE) errAbort("Sequence size %d too long line %d of %s. Max is %d", seqSize, lf->lineIx, lf->fileName, MAX_READ_SIZE); if (firstTime) @@ -334,30 +363,57 @@ int i; for (i=0; idy->string); + } +carefulClose(&f); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 4) usage(); sampleSize = optionInt("sampleSize", sampleSize); seed = optionInt("seed", seed); srand(seed); smallOk = optionExists("smallOk"); +json = optionExists("json"); fastqStatsAndSubsample(argv[1], argv[2], argv[3]); return 0; }