6b6763ede646c2be8b8929c2a8700a4a68565cf1 kent Mon Jul 10 09:58:08 2017 -0700 Creating a json output option. diff --git src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c index 5dc5d98..1bf7081 100644 --- src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c +++ src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c @@ -1,71 +1,76 @@ /* fastqStatsAndSubsample - Go through a fastq file doing sanity checks and collecting * statistics, and also producing a smaller fastq out of a sample of the data. */ /* Copyright (C) 2013 The Regents of the University of California * See README in this or parent directory for licensing information. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "portable.h" #include "obscure.h" #include "hmmstats.h" +#include "jsonWrite.h" /* A note on randomness: This program is used on paired end data. This data is represented * as two separate fastq files where the forward reads are in one file and the reverse in * the other. The files are in the same order, which is how we know which forward read goes * with which reverse read. As a result it is very important that this program sample * the same records from files that have the same number of records. * * This is implemented in two passes - the first pass calculates the statistics and * produces a file with 1/10 the number of reads in it. The second pass produces the * final output by downsampling the 1/10 size file if it is big enough, or the original * file if not. * * Earlier versions of this program estimated the amount to reduce in the first pass * and were more efficient, but the estimates were based on the file sizes, and thus * sometimes varied when dealing with compressed input files, and this would break the * correspondence between read pairs, so now the estimate is always 1/10. */ int sampleSize = 100000; int seed = 0; boolean smallOk = FALSE; +boolean json = FALSE; void usage() /* Explain usage and exit. */ { errAbort( "fastqStatsAndSubsample v2 - Go through a fastq file doing sanity checks and collecting stats\n" "and also producing a smaller fastq out of a sample of the data. The fastq input may be\n" "compressed with gzip or bzip2.\n" "usage:\n" " fastqStatsAndSubsample in.fastq out.stats out.fastq\n" "options:\n" " -sampleSize=N - default %d\n" " -seed=N - Use given seed for random number generator. Default %d.\n" " -smallOk - Not an error if less than sampleSize reads. out.fastq will be entire in.fastq\n" + " -json - out.stats will be in json rather than text format\n" + "Use /dev/null for out.fastq and/or out.stats if not interested in these outputs\n" , sampleSize, seed ); } /* Command line validation table. */ static struct optionSpec options[] = { {"sampleSize", OPTION_INT}, {"seed", OPTION_INT}, {"smallOk", OPTION_BOOLEAN}, + {"json", OPTION_BOOLEAN}, {NULL, 0}, }; static boolean nextLineMustMatchChar(struct lineFile *lf, char match, boolean noEof) /* Get next line and make sure, other than whitespace, it matches 'match'. * Return FALSE on EOF, unless noEof is set, in which case abort */ { char *line; if (!lineFileNextReal(lf, &line)) { if (noEof) errAbort("Expecting %c got end of file in %s", match, lf->fileName); else return FALSE; } @@ -92,49 +97,73 @@ for (i=0; i<arraySize; ++i) total += array[i]; return total; } long long sumIntArray(int *array, int arraySize) /* Return sum of all items in array */ { long long total = 0; int i; for (i=0; i<arraySize; ++i) total += array[i]; return total; } -void printAveDoubleArray(FILE *f, char *label, double *a, long long *totalAtPos, int aSize) +void printAveDoubleArray(struct jsonWrite *jw, + FILE *f, char *label, double *a, long long *totalAtPos, int aSize) /* Print a[i]/counts[i] for all elements in array */ { +if (jw != NULL) + { + jsonWriteListStart(jw, label); + int i; + for (i=0; i<aSize; ++i) + jsonWriteDouble(jw, NULL, a[i]/totalAtPos[i]); + jsonWriteListEnd(jw); + } +else + { fprintf(f, "%s ", label); int i; for (i=0; i<aSize; ++i) fprintf(f, "%g,", a[i]/totalAtPos[i]); fprintf(f, "\n"); } +} -void printAveIntArray(FILE *f, char *label, int *a, long long *totalAtPos, int aSize) +void printAveIntArray(struct jsonWrite *jw, + FILE *f, char *label, int *a, long long *totalAtPos, int aSize) /* Print a[i]/totalAtPos[i] for all elements in array */ { +if (jw != NULL) + { + jsonWriteListStart(jw, label); + int i; + for (i=0; i<aSize; ++i) + jsonWriteDouble(jw, NULL, (double)a[i]/totalAtPos[i]); + jsonWriteListEnd(jw); + } +else + { fprintf(f, "%s ", label); int i; for (i=0; i<aSize; ++i) fprintf(f, "%g,", ((double)a[i])/totalAtPos[i]); fprintf(f, "\n"); } +} static boolean isAllSpace(char *s, int size) /* Return TRUE if all characters in s are whitespace */ { while (--size >= 0) { char c = *s++; if (!isspace(c)) return FALSE; } return TRUE; } boolean lineFileNextRealWithSize(struct lineFile *lf, char **retStart, int *retSize) /* Fetch next line from file that is not blank and @@ -157,31 +186,31 @@ * Do a _little_ error checking on record while we're at it. The format has already been * validated on the client side fairly thoroughly. */ { char *line; int lineSize; /* Treat NULL file same as non-copy, so only have one condition to check on . */ if (f == NULL) copy = FALSE; /* Deal with initial line starting with '@' */ if (!lineFileNextRealWithSize(lf, &line, &lineSize)) return FALSE; if (line[0] != '@') { - errAbort("Expecting line starting with '@' got %s line %d of %s (ugh!)", + errAbort("Expecting line starting with '@' got '%s' line %d of %s", line, lf->lineIx, lf->fileName); } if (copy) mustWrite(f, line, lineSize); /* Deal with line containing sequence. */ if (!lineFileNext(lf, &line, &lineSize)) errAbort("%s truncated in middle of record", lf->fileName); /* Get size and add it to stats */ int seqSize = lineSize-1; if (seqSize > MAX_READ_SIZE) errAbort("Sequence size %d too long line %d of %s. Max is %d", seqSize, lf->lineIx, lf->fileName, MAX_READ_SIZE); if (firstTime) @@ -334,30 +363,57 @@ int i; for (i=0; i<oldSize; ++i) { int seqSize; boolean doIt = randomizer[i]; if (!maybeCopyFastqRecord(lf, f, doIt, &seqSize)) internalErr(); if (doIt) basesInSample += seqSize; } freez(&randomizer); lineFileClose(&lf); return basesInSample; } +void saveString(struct jsonWrite *jw, FILE *f, char *label, char *val) +/* Write labeled string to jw if non-NULL, else f. */ +{ +if (jw != NULL) + jsonWriteString(jw, label, val); +else + fprintf(f, "%s %s\n", label, val); +} + +void saveNumber(struct jsonWrite *jw, FILE *f, char *label, long long val) +/* Write labeled number to jw if non-NULL, else f. */ +{ +if (jw != NULL) + jsonWriteNumber(jw, label, val); +else + fprintf(f, "%s %lld\n", label, val); +} + +void saveDouble(struct jsonWrite *jw, FILE *f, char *label, double val) +/* Write labeled number to jw if non-NULL, else f. */ +{ +if (jw != NULL) + jsonWriteDouble(jw, label, val); +else + fprintf(f, "%s %g\n", label, val); +} + void fastqStatsAndSubsample(char *inFastq, char *outStats, char *outFastq) /* fastqStatsAndSubsample - Go through a fastq file doing sanity checks and collecting * statistics, and also producing a smaller fastq out of a sample of the data. */ { /* Temporary file if any */ FILE *smallF = NULL; /* Make this work without making input. */ if (sameString("/dev/null", outFastq)) outFastq = NULL; /* Open up temp output file. This one will be for the initial scaling. We'll do * a second round of scaling as well. */ char smallFastqName[PATH_LEN] = ""; @@ -432,85 +488,99 @@ if (minQual <= 58) { qualType = "sanger"; qualZero = 33; } if (outFastq != NULL) { FILE *f = mustOpen(outFastq, "w"); basesInSample = reduceFastqSample(smallishName, f, readsCopied, sampleSize); carefulClose(&f); remove(smallFastqName); } FILE *f = mustOpen(outStats, "w"); +struct jsonWrite *jw = NULL; +if (json) + { + jw = jsonWriteNew(); + jsonWriteObjectStart(jw, NULL); + } + int posCount = maxReadBases; -fprintf(f, "readCount %d\n", totalReads); -fprintf(f, "baseCount %lld\n", sumReadBases); -fprintf(f, "sampleCount %d\n", sampleSize); -fprintf(f, "basesInSample %lld\n", basesInSample); -fprintf(f, "readSizeMean %g\n", (double)sumReadBases/totalReads); +saveNumber(jw, f, "readCount", totalReads); +saveNumber(jw, f, "baseCount", sumReadBases); +saveNumber(jw, f, "sampleCount", sampleSize); +saveNumber(jw, f, "basesInSample", basesInSample); +saveDouble(jw, f, "readSizeMean", (double)sumReadBases/totalReads); if (minReadBases != maxReadBases) - fprintf(f, "readSizeStd %g\n", calcStdFromSums(sumReadBases, sumSquaredReadBases, totalReads)); + saveDouble(jw, f, "readSizeStd", + calcStdFromSums(sumReadBases, sumSquaredReadBases, totalReads)); else - fprintf(f, "readSizeStd 0\n"); -fprintf(f, "readSizeMin %d\n", minReadBases); -fprintf(f, "readSizeMax %d\n", maxReadBases); + saveDouble(jw, f, "readSizeStd", 0); +saveNumber(jw, f, "readSizeMin", minReadBases); +saveNumber(jw, f, "readSizeMax", maxReadBases); double qSum = sumDoubleArray(sumQuals, maxReadBases); double qSumSquared = sumDoubleArray(sumSquaredQuals, maxReadBases); -fprintf(f, "qualMean %g\n", qSum/sumReadBases - qualZero); +saveDouble(jw, f, "qualMean", qSum/sumReadBases - qualZero); if (minQual != maxQual) - fprintf(f, "qualStd %g\n", calcStdFromSums(qSum, qSumSquared, sumReadBases)); + saveDouble(jw, f, "qualStd", calcStdFromSums(qSum, qSumSquared, sumReadBases)); else - fprintf(f, "qualStd 0\n"); -fprintf(f, "qualMin %d\n", minQual - qualZero); -fprintf(f, "qualMax %d\n", maxQual - qualZero); -fprintf(f, "qualType %s\n", qualType); -fprintf(f, "qualZero %d\n", qualZero); + saveDouble(jw, f, "qualStd", 0); +saveNumber(jw, f, "qualMin", minQual - qualZero); +saveNumber(jw, f, "qualMax", maxQual - qualZero); +saveString(jw, f, "qualType", qualType); +saveNumber(jw, f, "qualZero", qualZero); /* Compute overall total nucleotide stats from count arrays. */ long long aSum = sumIntArray(aCount, maxReadBases); long long cSum = sumIntArray(cCount, maxReadBases); long long gSum = sumIntArray(gCount, maxReadBases); long long tSum = sumIntArray(tCount, maxReadBases); long long nSum = sumIntArray(nCount, maxReadBases); -fprintf(f, "atRatio %g\n", (double)(aSum + tSum)/(aSum + cSum + gSum + tSum)); -fprintf(f, "aRatio %g\n", (double)aSum/sumReadBases); -fprintf(f, "cRatio %g\n", (double)cSum/sumReadBases); -fprintf(f, "gRatio %g\n", (double)gSum/sumReadBases); -fprintf(f, "tRatio %g\n", (double)tSum/sumReadBases); -fprintf(f, "nRatio %g\n", (double)nSum/sumReadBases); +saveDouble(jw, f, "atRatio", (double)(aSum + tSum)/(aSum + cSum + gSum + tSum)); +saveDouble(jw, f, "aRatio", (double)aSum/sumReadBases); +saveDouble(jw, f, "cRatio", (double)cSum/sumReadBases); +saveDouble(jw, f, "gRatio", (double)gSum/sumReadBases); +saveDouble(jw, f, "tRatio", (double)tSum/sumReadBases); +saveDouble(jw, f, "nRatio", (double)nSum/sumReadBases); /* Now deal with array outputs. First make up count of all bases we've seen. */ -fprintf(f, "posCount %d\n", posCount); +saveNumber(jw, f, "posCount", posCount); long long totalAtPos[posCount]; int pos; for (pos=0; pos<posCount; ++pos) totalAtPos[pos] = aCount[pos] + cCount[pos] + gCount[pos] + tCount[pos] + nCount[pos]; /* Offset quality by scale */ for (pos=0; pos<posCount; ++pos) sumQuals[pos] -= totalAtPos[pos] * qualZero; -printAveDoubleArray(f, "qualPos", sumQuals, totalAtPos, posCount); -printAveIntArray(f, "aAtPos", aCount, totalAtPos, posCount); -printAveIntArray(f, "cAtPos", cCount, totalAtPos, posCount); -printAveIntArray(f, "gAtPos", gCount, totalAtPos, posCount); -printAveIntArray(f, "tAtPos", tCount, totalAtPos, posCount); -printAveIntArray(f, "nAtPos", nCount, totalAtPos, posCount); - +printAveDoubleArray(jw, f, "qualPos", sumQuals, totalAtPos, posCount); +printAveIntArray(jw, f, "aAtPos", aCount, totalAtPos, posCount); +printAveIntArray(jw, f, "cAtPos", cCount, totalAtPos, posCount); +printAveIntArray(jw, f, "gAtPos", gCount, totalAtPos, posCount); +printAveIntArray(jw, f, "tAtPos", tCount, totalAtPos, posCount); +printAveIntArray(jw, f, "nAtPos", nCount, totalAtPos, posCount); +if (json) + { + jsonWriteObjectEnd(jw); + fprintf(f, "%s\n", jw->dy->string); + } +carefulClose(&f); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 4) usage(); sampleSize = optionInt("sampleSize", sampleSize); seed = optionInt("seed", seed); srand(seed); smallOk = optionExists("smallOk"); +json = optionExists("json"); fastqStatsAndSubsample(argv[1], argv[2], argv[3]); return 0; }