src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c 3dd6cd1eed6c6f52a3506dd53d1f4a81a6e4532b

3dd6cd1eed6c6f52a3506dd53d1f4a81a6e4532b
kent
  Tue Aug 6 12:09:34 2013 -0700
Added basesInSample output.
diff --git src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c
index 1f3e7a7..4b23fdd 100644
--- src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c
+++ src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c
@@ -1,444 +1,518 @@
 /* fastqStatsAndSubsample - Go through a fastq file doing sanity checks and collecting 
  * statistics,  and also producing a smaller fastq out of a sample of the data. */
 
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "portable.h"
 #include "obscure.h"
 #include "hmmstats.h"
 
 int sampleSize = 100000;
 int seed = 0;
 boolean smallOk = FALSE;
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "fastqStatsAndSubsample - Go through a fastq file doing sanity checks and collecting statistics\n"
   "and also producing a smaller fastq out of a sample of the data.  The fastq input may be\n"
   "compressed with gzip or bzip2.  Unfortunately the fastq input can't be in a pipe\n"
   "usage:\n"
   "   fastqStatsAndSubsample in.fastq out.stats out.fastq\n"
   "options:\n"
   "   -sampleSize=N - default %d\n"
-  "   -seed=N - use given seed for random number generator.  Default %d.\n"
+  "   -seed=N - Use given seed for random number generator.  Default %d.\n"
   "   -smallOk - Not an error if less than sampleSize reads.  out.fastq will be entire in.fastq\n"
   , sampleSize, seed
   );
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {"sampleSize", OPTION_INT},
    {"seed", OPTION_INT},
    {"smallOk", OPTION_BOOLEAN},
    {NULL, 0},
 };
 
 /* Estimate base count from file size based on this. */
 #define ZIPPED_BYTES_PER_BASE 0.80
 #define UNZIPPED_BYTES_PER_BASE 2.5
-#define READ_SIZE 100
 
 static boolean nextLineMustMatchChar(struct lineFile *lf, char match, boolean noEof)
 /* Get next line and make sure, other than whitespace, it matches 'match'.
  * Return FALSE on EOF, unless noEof is set, in which case abort */
 {
 char *line;
 if (!lineFileNext(lf, &line, NULL))
     {
     if (noEof)
         errAbort("Expecting %c got end of file in %s", match, lf->fileName);
     else
         return FALSE;
     }
 if (line[0] != match)
     errAbort("Expecting %c got %s line %d of %s", match, line, lf->lineIx, lf->fileName);
 return TRUE;
 }
 
 static int averageReadSize(char *fileName, int maxReads)
 /* Read up to maxReads from fastq file and return average # of reads. */
 {
 struct lineFile *lf = lineFileOpen(fileName, FALSE);
 int i;
 long total = 0;
 int count = 0;
 
 for (i=0; i<maxReads; ++i)
     {
     /* Deal with initial line starting with '@' */
     if (!nextLineMustMatchChar(lf, '@', FALSE))
 	break;
 
     /* Deal with line containing sequence. */
     char *line;
     int lineSize = 0;
     if (!lineFileNext(lf, &line, &lineSize))
 	errAbort("%s truncated in middle of record", lf->fileName);
 
     /* Get size and add it to stats */
     int seqSize = lineSize-1;
     total += seqSize;
     count += 1;
 
     /* Deal with next two lines '+' and quality lines. */
     nextLineMustMatchChar(lf, '+', TRUE);
     lineFileNeedNext(lf, &line, &lineSize);
     }
 lineFileClose(&lf);
 if (count < 1)
     errAbort("No data in %s", fileName);
 return (total + (count>>1))/count;
 }
 
 int calcInitialReduction(char *fileName, int desiredReadCount)
 /* Using file name and size figure out how much to reduce it to get ~2x the subsample we want. */
 {
 size_t initialSize = fileSize(fileName);
 int readSize = averageReadSize(fileName, 100);
 long long estimatedBases;
 if (endsWith(fileName, ".gz") || endsWith(fileName, ".bz2"))
     estimatedBases = initialSize/ZIPPED_BYTES_PER_BASE;
 else
     estimatedBases = initialSize/UNZIPPED_BYTES_PER_BASE;
 long long estimatedReads = estimatedBases/readSize;
 double estimatedReduction = (double)estimatedReads/desiredReadCount;
 double conservativeReduction = estimatedReduction * 0.3;
 if (conservativeReduction < 1)
     conservativeReduction = 1;
 return round(conservativeReduction);
 }
 
 
 /* A bunch of statistics gathering variables set by oneFastqRecord below. */
 #define MAX_READ_SIZE 100000	/* This is fastq, right now only get 160 base reads max. */
 int maxReadBases, minReadBases, readCount;
-double sumReadBases, sumSquaredReadBases;
+long long sumReadBases;
+double sumSquaredReadBases;
 int aCount[MAX_READ_SIZE], cCount[MAX_READ_SIZE], gCount[MAX_READ_SIZE], tCount[MAX_READ_SIZE];
 int nCount[MAX_READ_SIZE];
 double sumQuals[MAX_READ_SIZE], sumSquaredQuals[MAX_READ_SIZE];
 int maxQual, minQual;
 
 double sumDoubleArray(double *array, int arraySize)
 /* Return sum of all items in array */
 {
 double total = 0;
 int i;
 for (i=0; i<arraySize; ++i)
     total += array[i];
 return total;
 }
 
 long long sumIntArray(int *array, int arraySize)
 /* Return sum of all items in array */
 {
 long long total = 0;
 int i;
 for (i=0; i<arraySize; ++i)
     total += array[i];
 return total;
 }
 
+void printAveDoubleArray(FILE *f, char *label, double *a, int *totalAtPos, int aSize)
+/* Print a[i]/counts[i] for all elements in array */
+{
+fprintf(f, "%s ", label);
+int i;
+for (i=0; i<aSize; ++i)
+    fprintf(f, "%g,", a[i]/totalAtPos[i]);
+fprintf(f, "\n");
+}
+
+void printAveIntArray(FILE *f, char *label, int *a, int *totalAtPos, int aSize)
+/* Print a[i]/totalAtPos[i] for all elements in array */
+{
+fprintf(f, "%s ", label);
+int i;
+for (i=0; i<aSize; ++i)
+    fprintf(f, "%g,", ((double)a[i])/totalAtPos[i]);
+fprintf(f, "\n");
+}
+
+
 boolean oneFastqRecord(struct lineFile *lf, FILE *f, boolean copy, boolean firstTime)
 /* Read next fastq record from LF, and optionally copy it to f.  Return FALSE at end of file 
  * Do a _little_ error checking on record while we're at it.  The format has already been
  * validated on the client side fairly thoroughly. */
 {
 char *line;
 int lineSize;
 
 /* Deal with initial line starting with '@' */
 if (!lineFileNext(lf, &line, &lineSize))
     return FALSE;
 if (line[0] != '@')
     errAbort("Expecting line starting with '@' got %s line %d of %s", 
 	line, lf->lineIx, lf->fileName);
 if (copy)
     mustWrite(f, line, lineSize);
 
 /* Deal with line containing sequence. */
 if (!lineFileNext(lf, &line, &lineSize))
     errAbort("%s truncated in middle of record", lf->fileName);
 
 /* Get size and add it to stats */
 int seqSize = lineSize-1;
 if (seqSize > MAX_READ_SIZE)
     errAbort("Sequence size %d too long line %d of %s.  Max is %d", seqSize, 
 	lf->lineIx, lf->fileName, MAX_READ_SIZE);
 if (firstTime)
     {
     maxReadBases = minReadBases = seqSize;
     }
 else
     {
     if (maxReadBases < seqSize)
         maxReadBases = seqSize;
     if (minReadBases > seqSize)
         minReadBases = seqSize;
     }
 sumReadBases += seqSize;
 sumSquaredReadBases += seqSize*seqSize;
 ++readCount;
 
 /* Save up nucleotide stats and abort on bogus nucleotides. */
 int i;
 for (i=0; i<seqSize; ++i)
     {
     char c = tolower(line[i]);
     switch (c)
         {
 	case 'a':
 	    aCount[i] += 1;
 	    break;
 	case 'c':
 	    cCount[i] += 1;
 	    break;
 	case 'g':
 	    gCount[i] += 1;
 	    break;
 	case 't':
 	    tCount[i] += 1;
 	    break;
 	case 'n':
 	case '.':
 	    nCount[i] += 1;
 	    break;
 	default:
 	    errAbort("Unrecognized nucleotide character %c line %d of %s", c,
 		lf->lineIx, lf->fileName);
 	    break;
 	}
     }
 
+
 if (copy)
     mustWrite(f, line, lineSize);
 
 /* Deal with line containing just '+' that separates sequence from quality. */
 nextLineMustMatchChar(lf, '+', TRUE);
 if (copy)
     fprintf(f, "+\n");
 
 /* Deal with quality score line. */
 if (!lineFileNext(lf, &line, &lineSize))
     errAbort("%s truncated in middle of record", lf->fileName);
 int qualSize = lineSize-1;
 
 /* Make sure it is same size */
 if (seqSize != qualSize)
     errAbort("Sequence and quality size differ line %d and %d of %s", 
 	lf->lineIx-2, lf->lineIx, lf->fileName);
 
 if (firstTime)
     {
     minQual = maxQual = line[0];
     }
 
 /* Do stats */
 for (i=0; i<seqSize; ++i)
     {
     int qual = line[i];
     if (maxQual < qual)
         maxQual = qual;
     if (minQual > qual)
         minQual = qual;
     sumQuals[i] += qual;
     sumSquaredQuals[i] += qual*qual;
     }
 
 if (copy)
     mustWrite(f, line, lineSize);
 
 
 return TRUE;
 }
 
 boolean maybeCopyFastqRecord(struct lineFile *lf, FILE *f, boolean copy, int *retSeqSize)
 /* Read next fastq record from LF, and optionally copy it to f.  Return FALSE at end of file 
  * Do a _little_ error checking on record while we're at it.  The format has already been
- * validated on the client side fairly thoroughly. */
+ * validated on the client side fairly thoroughly. Similar to oneFastq record but with
+ * fewer side effects. */
 {
 char *line;
 int lineSize;
 
 /* Deal with initial line starting with '@' */
 if (!lineFileNext(lf, &line, &lineSize))
     return FALSE;
 if (line[0] != '@')
     errAbort("Expecting line starting with '@' got %s line %d of %s", 
 	line, lf->lineIx, lf->fileName);
 if (copy)
     mustWrite(f, line, lineSize);
 
 
 /* Deal with line containing sequence. */
 if (!lineFileNext(lf, &line, &lineSize))
     errAbort("%s truncated in middle of record", lf->fileName);
 if (copy)
     mustWrite(f, line, lineSize);
 int seqSize = lineSize-1;
 
 /* Deal with line containing just '+' that separates sequence from quality. */
 /* Deal with line containing just '+' that separates sequence from quality. */
 nextLineMustMatchChar(lf, '+', TRUE);
 if (copy)
     fprintf(f, "+\n");
 
 /* Deal with quality score line. */
 if (!lineFileNext(lf, &line, &lineSize))
     errAbort("%s truncated in middle of record", lf->fileName);
 if (copy)
     mustWrite(f, line, lineSize);
 int qualSize = lineSize-1;
 
 if (seqSize != qualSize)
     errAbort("Sequence and quality size differ line %d and %d of %s", 
 	lf->lineIx-2, lf->lineIx, lf->fileName);
 
 *retSeqSize = seqSize;
 return TRUE;
 }
 
-void reduceFastqSample(char *source, FILE *f, int oldSize, int newSize)
+long long reduceFastqSample(char *source, FILE *f, int oldSize, int newSize)
 /* Copy newSize samples from source into open output f.  */
 {
+long long basesInSample = 0;
+
 /* Make up an array that tells us which random parts of the source file to use. */
 assert(oldSize >= newSize);
 char *randomizer = needMem(oldSize);
 memset(randomizer, TRUE, newSize);
 shuffleArrayOfChars(randomizer, oldSize);
 
 struct lineFile *lf = lineFileOpen(source, FALSE);
 int i;
 for (i=0; i<oldSize; ++i)
     {
     int seqSize;
     boolean doIt = randomizer[i];
     if (!maybeCopyFastqRecord(lf, f, doIt, &seqSize))
          internalErr();
+    if (doIt)
+	basesInSample += seqSize;
     }
 freez(&randomizer);
 lineFileClose(&lf);
+return basesInSample;
 }
 
 void fastqStatsAndSubsample(char *inFastq, char *outStats, char *outFastq)
 /* fastqStatsAndSubsample - Go through a fastq file doing sanity checks and collecting 
  * statistics,  and also producing a smaller fastq out of a sample of the data. */
 {
 /* Split up outFastq path, so we can make a temp file in the same dir. */
 char outDir[PATH_LEN];
 splitPath(outFastq, outDir, NULL, NULL);
 
 /* Open up temp output file.  This one will be for the initial scaling.  We'll do
  * a second round of scaling as well. */
 char smallFastqName[PATH_LEN];
 safef(smallFastqName, PATH_LEN, "%sfastqSubsampleXXXXXX", outDir);
 int smallFd = mkstemp(smallFastqName);
 FILE *smallF = fdopen(smallFd, "w");
 
 /* Scan through input, collecting stats, validating, and creating a subset file. */
 int downStep = calcInitialReduction(inFastq, sampleSize);
 struct lineFile *lf = lineFileOpen(inFastq, FALSE);
 boolean done = FALSE;
 int readsCopied = 0, totalReads = 0;
+long long basesInSample = 0;
 boolean firstTime = TRUE;
 while (!done)
     {
     int hotPosInCycle = rand()%downStep;
     int cycle;
     for (cycle=0; cycle<downStep; ++cycle)
         {
 	boolean hotPos = (cycle == hotPosInCycle);
 	if (!oneFastqRecord(lf, smallF, hotPos, firstTime))
 	    {
 	    done = TRUE;
 	    break;
 	    }
 	if (hotPos)
 	    ++readsCopied;
 	firstTime = FALSE;
 	++totalReads;
 	}
     }
 lineFileClose(&lf);
 carefulClose(&smallF);
 
 boolean justUseSmall = FALSE;
 if (readsCopied <  sampleSize)
     {
     /* Our sample isn't big enough.  This could have two causes - a bug in
      * our estimation, maybe caused by read sizes growing a bunch in the time
      * since this code was written - or a file that is actually smaller smaller
      * than sampleSize. */
     if (sampleSize <= totalReads)
 	{
 	remove(smallFastqName);
 	errAbort("Internal error: bad estimate %d for downStep on %s", downStep, inFastq);
 	}
     else
         {
 	if (smallOk)
 	    {
 	    justUseSmall = TRUE;
 	    warn("%d reads total in %s, so sample is less than %d", 
 		totalReads, inFastq, sampleSize);
 	    }
 	else
 	    {
 	    remove(smallFastqName);
 	    errAbort("SampleSize is set to %d reads, but there are only %d reads in %s",
 		    sampleSize, totalReads, inFastq);
 	    }
 	}
     }
 
 char *qualType = "solexa";
 int qualZero = 64;
 if (minQual <= 58)
     {
     qualType = "sanger";
     qualZero = 33;
     }
+
+if (justUseSmall)
+    {
+    mustRename(smallFastqName, outFastq);
+    sampleSize = readsCopied;
+    basesInSample= sumReadBases;
+    }
+else
+    {
+    FILE *f = mustOpen(outFastq, "w");
+    basesInSample = reduceFastqSample(smallFastqName, f, readsCopied, sampleSize);
+    carefulClose(&f);
+    remove(smallFastqName);
+    }
+
 FILE *f = mustOpen(outStats, "w");
+int posCount = maxReadBases;
 fprintf(f, "readCount %d\n", totalReads);
-fprintf(f, "baseCount %g\n", sumReadBases);
-fprintf(f, "readSizeMean %g\n", sumReadBases/totalReads);
+fprintf(f, "baseCount %lld\n", sumReadBases);
+fprintf(f, "sampleCount %d\n", sampleSize);
+fprintf(f, "basesInSample %lld\n", basesInSample);
+fprintf(f, "readSizeMean %g\n", (double)sumReadBases/totalReads);
+if (minReadBases != maxReadBases)
 fprintf(f, "readSizeStd %g\n", calcStdFromSums(sumReadBases, sumSquaredReadBases, totalReads));
+else
+    fprintf(f, "readSizeStd 0\n");
 fprintf(f, "readSizeMin %d\n", minReadBases);
 fprintf(f, "readSizeMax %d\n", maxReadBases);
 double qSum = sumDoubleArray(sumQuals, maxReadBases);
 double qSumSquared = sumDoubleArray(sumSquaredQuals, maxReadBases);
 fprintf(f, "qualMean %g\n", qSum/sumReadBases - qualZero);
+if (minQual != maxQual)
 fprintf(f, "qualStd %g\n", calcStdFromSums(qSum, qSumSquared, sumReadBases));
+else
+    fprintf(f, "qualStd 0\n");
 fprintf(f, "qualMin %d\n", minQual - qualZero);
 fprintf(f, "qualMax %d\n", maxQual - qualZero);
 fprintf(f, "qualType %s\n", qualType);
 fprintf(f, "qualZero %d\n", qualZero);
-carefulClose(&f);
 
-if (justUseSmall)
-    mustRename(smallFastqName, outFastq);
-else
-    {
-    f = mustOpen(outFastq, "w");
-    reduceFastqSample(smallFastqName, f, readsCopied, sampleSize);
-    carefulClose(&f);
-    remove(smallFastqName);
-    }
+/* Compute overall total nucleotide stats from count arrays. */
+long long aSum = sumIntArray(aCount, maxReadBases);
+long long cSum = sumIntArray(cCount, maxReadBases);
+long long gSum = sumIntArray(gCount, maxReadBases);
+long long tSum = sumIntArray(tCount, maxReadBases);
+long long nSum = sumIntArray(nCount, maxReadBases);
+fprintf(f, "atRatio %g\n", (double)(aSum + tSum)/(aSum + cSum + gSum + tSum));
+fprintf(f, "aRatio %g\n", (double)aSum/sumReadBases);
+fprintf(f, "cRatio %g\n", (double)cSum/sumReadBases);
+fprintf(f, "gRatio %g\n", (double)gSum/sumReadBases);
+fprintf(f, "tRatio %g\n", (double)tSum/sumReadBases);
+fprintf(f, "nRatio %g\n", (double)nSum/sumReadBases);
+
+/* Now deal with array outputs.   First make up count of all bases we've seen. */
+fprintf(f, "posCount %d\n", posCount);
+int totalAtPos[posCount];
+int pos;
+for (pos=0; pos<posCount; ++pos)
+    totalAtPos[pos] = aCount[pos] + cCount[pos] + gCount[pos] + tCount[pos] + nCount[pos];
+
+/* Offset quality by scale */
+for (pos=0; pos<posCount; ++pos)
+    sumQuals[pos] -= totalAtPos[pos] * qualZero;
+
+printAveDoubleArray(f, "qualPos", sumQuals, totalAtPos, posCount);
+printAveIntArray(f, "aAtPos", aCount, totalAtPos, posCount);
+printAveIntArray(f, "cAtPos", cCount, totalAtPos, posCount);
+printAveIntArray(f, "gAtPos", gCount, totalAtPos, posCount);
+printAveIntArray(f, "tAtPos", tCount, totalAtPos, posCount);
+printAveIntArray(f, "nAtPos", nCount, totalAtPos, posCount);
+
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 4)
     usage();
 sampleSize = optionInt("sampleSize", sampleSize);
 seed = optionInt("seed", seed);
 smallOk = optionExists("smallOk");
 fastqStatsAndSubsample(argv[1], argv[2], argv[3]);
 return 0;
 }