src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c 6b6763ede646c2be8b8929c2a8700a4a68565cf1

6b6763ede646c2be8b8929c2a8700a4a68565cf1
kent
  Mon Jul 10 09:58:08 2017 -0700
Creating a json output option.

diff --git src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c
index 5dc5d98..1bf7081 100644
--- src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c
+++ src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c
@@ -1,71 +1,76 @@
 /* fastqStatsAndSubsample - Go through a fastq file doing sanity checks and collecting 
  * statistics,  and also producing a smaller fastq out of a sample of the data. */
 
 /* Copyright (C) 2013 The Regents of the University of California 
  * See README in this or parent directory for licensing information. */
 
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "portable.h"
 #include "obscure.h"
 #include "hmmstats.h"
+#include "jsonWrite.h"
 
 /* A note on randomness: This program is used on paired end data.  This data is represented
  * as two separate fastq files where the forward reads are in one file and the reverse in
  * the other.  The files are in the same order, which is how we know which forward read goes
  * with which reverse read.   As a result it is very important that this program sample
  * the same records from files that have the same number of records. 
  *
  * This is implemented in two passes - the first pass calculates the statistics and
  * produces a file with 1/10 the number of reads in it.  The second pass produces the
  * final output by downsampling the 1/10 size file if it is big enough, or the original
  * file if not. 
  *
  * Earlier versions of this program estimated the amount to reduce in the first pass
  * and were more efficient, but the estimates were based on the file sizes, and thus
  * sometimes varied when dealing with compressed input files, and this would break the
  * correspondence between read pairs, so now the estimate is always 1/10. */
 
 int sampleSize = 100000;
 int seed = 0;
 boolean smallOk = FALSE;
+boolean json = FALSE;
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "fastqStatsAndSubsample v2 - Go through a fastq file doing sanity checks and collecting stats\n"
   "and also producing a smaller fastq out of a sample of the data.  The fastq input may be\n"
   "compressed with gzip or bzip2.\n"
   "usage:\n"
   "   fastqStatsAndSubsample in.fastq out.stats out.fastq\n"
   "options:\n"
   "   -sampleSize=N - default %d\n"
   "   -seed=N - Use given seed for random number generator.  Default %d.\n"
   "   -smallOk - Not an error if less than sampleSize reads.  out.fastq will be entire in.fastq\n"
+  "   -json - out.stats will be in json rather than text format\n"
+  "Use /dev/null for out.fastq and/or out.stats if not interested in these outputs\n"
   , sampleSize, seed
   );
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {"sampleSize", OPTION_INT},
    {"seed", OPTION_INT},
    {"smallOk", OPTION_BOOLEAN},
+   {"json", OPTION_BOOLEAN},
    {NULL, 0},
 };
 
 static boolean nextLineMustMatchChar(struct lineFile *lf, char match, boolean noEof)
 /* Get next line and make sure, other than whitespace, it matches 'match'.
  * Return FALSE on EOF, unless noEof is set, in which case abort */
 {
 char *line;
 if (!lineFileNextReal(lf, &line))
     {
     if (noEof)
         errAbort("Expecting %c got end of file in %s", match, lf->fileName);
     else
         return FALSE;
     }
@@ -92,49 +97,73 @@
 for (i=0; i<arraySize; ++i)
     total += array[i];
 return total;
 }
 
 long long sumIntArray(int *array, int arraySize)
 /* Return sum of all items in array */
 {
 long long total = 0;
 int i;
 for (i=0; i<arraySize; ++i)
     total += array[i];
 return total;
 }
 
-void printAveDoubleArray(FILE *f, char *label, double *a, long long *totalAtPos, int aSize)
+void printAveDoubleArray(struct jsonWrite *jw, 
+    FILE *f, char *label, double *a, long long *totalAtPos, int aSize)
 /* Print a[i]/counts[i] for all elements in array */
 {
+if (jw != NULL)
+    {
+    jsonWriteListStart(jw, label);
+    int i;
+    for (i=0; i<aSize; ++i)
+	jsonWriteDouble(jw, NULL, a[i]/totalAtPos[i]);
+    jsonWriteListEnd(jw);
+    }
+else
+    {
     fprintf(f, "%s ", label);
     int i;
     for (i=0; i<aSize; ++i)
 	fprintf(f, "%g,", a[i]/totalAtPos[i]);
     fprintf(f, "\n");
     }
+}
 
-void printAveIntArray(FILE *f, char *label, int *a, long long *totalAtPos, int aSize)
+void printAveIntArray(struct jsonWrite *jw, 
+    FILE *f, char *label, int *a, long long *totalAtPos, int aSize)
 /* Print a[i]/totalAtPos[i] for all elements in array */
 {
+if (jw != NULL)
+    {
+    jsonWriteListStart(jw, label);
+    int i;
+    for (i=0; i<aSize; ++i)
+	jsonWriteDouble(jw, NULL, (double)a[i]/totalAtPos[i]);
+    jsonWriteListEnd(jw);
+    }
+else
+    {
     fprintf(f, "%s ", label);
     int i;
     for (i=0; i<aSize; ++i)
 	fprintf(f, "%g,", ((double)a[i])/totalAtPos[i]);
     fprintf(f, "\n");
     }
+}
 
 static boolean isAllSpace(char *s, int size)
 /* Return TRUE if all characters in s are whitespace */
 {
 while (--size >= 0)
     {
     char c = *s++;
     if (!isspace(c))
         return FALSE;
     }
 return TRUE;
 }
 
 boolean lineFileNextRealWithSize(struct lineFile *lf, char **retStart, int *retSize)
 /* Fetch next line from file that is not blank and
@@ -157,31 +186,31 @@
  * Do a _little_ error checking on record while we're at it.  The format has already been
  * validated on the client side fairly thoroughly. */
 {
 char *line;
 int lineSize;
 
 /* Treat NULL file same as non-copy, so only have one condition to check on . */
 if (f == NULL)
     copy = FALSE;
 
 /* Deal with initial line starting with '@' */
 if (!lineFileNextRealWithSize(lf, &line, &lineSize))
     return FALSE;
 if (line[0] != '@')
     {
-    errAbort("Expecting line starting with '@' got %s line %d of %s (ugh!)", 
+    errAbort("Expecting line starting with '@' got '%s' line %d of %s", 
 	line, lf->lineIx, lf->fileName);
     }
 if (copy)
     mustWrite(f, line, lineSize);
 
 /* Deal with line containing sequence. */
 if (!lineFileNext(lf, &line, &lineSize))
     errAbort("%s truncated in middle of record", lf->fileName);
 
 /* Get size and add it to stats */
 int seqSize = lineSize-1;
 if (seqSize > MAX_READ_SIZE)
     errAbort("Sequence size %d too long line %d of %s.  Max is %d", seqSize, 
 	lf->lineIx, lf->fileName, MAX_READ_SIZE);
 if (firstTime)
@@ -334,30 +363,57 @@
 int i;
 for (i=0; i<oldSize; ++i)
     {
     int seqSize;
     boolean doIt = randomizer[i];
     if (!maybeCopyFastqRecord(lf, f, doIt, &seqSize))
          internalErr();
     if (doIt)
 	basesInSample += seqSize;
     }
 freez(&randomizer);
 lineFileClose(&lf);
 return basesInSample;
 }
 
+void saveString(struct jsonWrite *jw, FILE *f, char *label, char *val)
+/* Write labeled string to jw if non-NULL, else f. */
+{
+if (jw != NULL)
+    jsonWriteString(jw, label, val);
+else
+    fprintf(f, "%s %s\n", label, val);
+}
+
+void saveNumber(struct jsonWrite *jw, FILE *f, char *label, long long val)
+/* Write labeled number to jw if non-NULL, else f. */
+{
+if (jw != NULL)
+    jsonWriteNumber(jw, label, val);
+else
+    fprintf(f, "%s %lld\n", label, val);
+}
+
+void saveDouble(struct jsonWrite *jw, FILE *f, char *label, double val)
+/* Write labeled number to jw if non-NULL, else f. */
+{
+if (jw != NULL)
+    jsonWriteDouble(jw, label, val);
+else
+    fprintf(f, "%s %g\n", label, val);
+}
+
 void fastqStatsAndSubsample(char *inFastq, char *outStats, char *outFastq)
 /* fastqStatsAndSubsample - Go through a fastq file doing sanity checks and collecting 
  * statistics,  and also producing a smaller fastq out of a sample of the data. */
 {
 /* Temporary file if any */
 FILE *smallF = NULL;
 
 /* Make this work without making input. */
 if (sameString("/dev/null", outFastq))
     outFastq = NULL;
 
 
 /* Open up temp output file.  This one will be for the initial scaling.  We'll do
  * a second round of scaling as well. */
 char smallFastqName[PATH_LEN] = "";
@@ -432,85 +488,99 @@
 if (minQual <= 58)
     {
     qualType = "sanger";
     qualZero = 33;
     }
 
 if (outFastq != NULL)
     {
     FILE *f = mustOpen(outFastq, "w");
     basesInSample = reduceFastqSample(smallishName, f, readsCopied, sampleSize);
     carefulClose(&f);
     remove(smallFastqName);
     }
 
 FILE *f = mustOpen(outStats, "w");
+struct jsonWrite *jw = NULL;
+if (json)
+    {
+    jw = jsonWriteNew();
+    jsonWriteObjectStart(jw, NULL);
+    }
+
 int posCount = maxReadBases;
-fprintf(f, "readCount %d\n", totalReads);
-fprintf(f, "baseCount %lld\n", sumReadBases);
-fprintf(f, "sampleCount %d\n", sampleSize);
-fprintf(f, "basesInSample %lld\n", basesInSample);
-fprintf(f, "readSizeMean %g\n", (double)sumReadBases/totalReads);
+saveNumber(jw, f, "readCount", totalReads);
+saveNumber(jw, f, "baseCount", sumReadBases);
+saveNumber(jw, f, "sampleCount", sampleSize);
+saveNumber(jw, f, "basesInSample", basesInSample);
+saveDouble(jw, f, "readSizeMean", (double)sumReadBases/totalReads);
 if (minReadBases != maxReadBases)
-    fprintf(f, "readSizeStd %g\n", calcStdFromSums(sumReadBases, sumSquaredReadBases, totalReads));
+    saveDouble(jw, f, "readSizeStd", 
+	calcStdFromSums(sumReadBases, sumSquaredReadBases, totalReads));
 else
-    fprintf(f, "readSizeStd 0\n");
-fprintf(f, "readSizeMin %d\n", minReadBases);
-fprintf(f, "readSizeMax %d\n", maxReadBases);
+    saveDouble(jw, f, "readSizeStd", 0);
+saveNumber(jw, f, "readSizeMin", minReadBases);
+saveNumber(jw, f, "readSizeMax", maxReadBases);
 double qSum = sumDoubleArray(sumQuals, maxReadBases);
 double qSumSquared = sumDoubleArray(sumSquaredQuals, maxReadBases);
-fprintf(f, "qualMean %g\n", qSum/sumReadBases - qualZero);
+saveDouble(jw, f, "qualMean", qSum/sumReadBases - qualZero);
 if (minQual != maxQual)
-    fprintf(f, "qualStd %g\n", calcStdFromSums(qSum, qSumSquared, sumReadBases));
+    saveDouble(jw, f, "qualStd", calcStdFromSums(qSum, qSumSquared, sumReadBases));
 else
-    fprintf(f, "qualStd 0\n");
-fprintf(f, "qualMin %d\n", minQual - qualZero);
-fprintf(f, "qualMax %d\n", maxQual - qualZero);
-fprintf(f, "qualType %s\n", qualType);
-fprintf(f, "qualZero %d\n", qualZero);
+    saveDouble(jw, f, "qualStd",  0);
+saveNumber(jw, f, "qualMin", minQual - qualZero);
+saveNumber(jw, f, "qualMax", maxQual - qualZero);
+saveString(jw, f, "qualType", qualType);
+saveNumber(jw, f, "qualZero", qualZero);
 
 /* Compute overall total nucleotide stats from count arrays. */
 long long aSum = sumIntArray(aCount, maxReadBases);
 long long cSum = sumIntArray(cCount, maxReadBases);
 long long gSum = sumIntArray(gCount, maxReadBases);
 long long tSum = sumIntArray(tCount, maxReadBases);
 long long nSum = sumIntArray(nCount, maxReadBases);
-fprintf(f, "atRatio %g\n", (double)(aSum + tSum)/(aSum + cSum + gSum + tSum));
-fprintf(f, "aRatio %g\n", (double)aSum/sumReadBases);
-fprintf(f, "cRatio %g\n", (double)cSum/sumReadBases);
-fprintf(f, "gRatio %g\n", (double)gSum/sumReadBases);
-fprintf(f, "tRatio %g\n", (double)tSum/sumReadBases);
-fprintf(f, "nRatio %g\n", (double)nSum/sumReadBases);
+saveDouble(jw, f, "atRatio", (double)(aSum + tSum)/(aSum + cSum + gSum + tSum));
+saveDouble(jw, f, "aRatio", (double)aSum/sumReadBases);
+saveDouble(jw, f, "cRatio", (double)cSum/sumReadBases);
+saveDouble(jw, f, "gRatio", (double)gSum/sumReadBases);
+saveDouble(jw, f, "tRatio", (double)tSum/sumReadBases);
+saveDouble(jw, f, "nRatio", (double)nSum/sumReadBases);
 
 /* Now deal with array outputs.   First make up count of all bases we've seen. */
-fprintf(f, "posCount %d\n", posCount);
+saveNumber(jw, f, "posCount", posCount);
 long long totalAtPos[posCount];
 int pos;
 for (pos=0; pos<posCount; ++pos)
     totalAtPos[pos] = aCount[pos] + cCount[pos] + gCount[pos] + tCount[pos] + nCount[pos];
 
 /* Offset quality by scale */
 for (pos=0; pos<posCount; ++pos)
     sumQuals[pos] -= totalAtPos[pos] * qualZero;
 
-printAveDoubleArray(f, "qualPos", sumQuals, totalAtPos, posCount);
-printAveIntArray(f, "aAtPos", aCount, totalAtPos, posCount);
-printAveIntArray(f, "cAtPos", cCount, totalAtPos, posCount);
-printAveIntArray(f, "gAtPos", gCount, totalAtPos, posCount);
-printAveIntArray(f, "tAtPos", tCount, totalAtPos, posCount);
-printAveIntArray(f, "nAtPos", nCount, totalAtPos, posCount);
-
+printAveDoubleArray(jw, f, "qualPos", sumQuals, totalAtPos, posCount);
+printAveIntArray(jw, f, "aAtPos", aCount, totalAtPos, posCount);
+printAveIntArray(jw, f, "cAtPos", cCount, totalAtPos, posCount);
+printAveIntArray(jw, f, "gAtPos", gCount, totalAtPos, posCount);
+printAveIntArray(jw, f, "tAtPos", tCount, totalAtPos, posCount);
+printAveIntArray(jw, f, "nAtPos", nCount, totalAtPos, posCount);
+if (json)
+    {
+    jsonWriteObjectEnd(jw);
+    fprintf(f, "%s\n", jw->dy->string);
+    }
+carefulClose(&f);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 4)
     usage();
 sampleSize = optionInt("sampleSize", sampleSize);
 seed = optionInt("seed", seed);
 srand(seed);
 smallOk = optionExists("smallOk");
+json = optionExists("json");
 fastqStatsAndSubsample(argv[1], argv[2], argv[3]);
 return 0;
 }