e8934cf354e5696886de36315e7fc6dd602949dd
kent
  Thu Dec 5 10:51:17 2013 -0800
Simplifying initial step down to make it give consistent result across read pairs.
diff --git src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c
index afcda73..0c25689 100644
--- src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c
+++ src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c
@@ -1,130 +1,88 @@
 /* fastqStatsAndSubsample - Go through a fastq file doing sanity checks and collecting 
  * statistics,  and also producing a smaller fastq out of a sample of the data. */
 
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "portable.h"
 #include "obscure.h"
 #include "hmmstats.h"
 
+/* A note on randomness: This program is used on paired end data.  This data is represented
+ * as two separate fastq files where the forward reads are in one file and the reverse in
+ * the other.  The files are in the same order, which is how we know which forward read goes
+ * with which reverse read.   As a result it is very important that this program sample
+ * the same records from files that have the same number of records. 
+ *
+ * This is implemented in two passes - the first pass calculates the statistics and
+ * produces a file with 1/10 the number of reads in it.  The second pass produces the
+ * final output by downsampling the 1/10 size file if it is big enough, or the original
+ * file if not. 
+ *
+ * Earlier versions of this program estimated the amount to reduce in the first pass
+ * and were more efficient, but the estimates were based on the file sizes, and thus
+ * sometimes varied when dealing with compressed input files, and this would break the
+ * correspondence between read pairs, so now the estimate is always 1/10. */
+
 int sampleSize = 100000;
 int seed = 0;
 boolean smallOk = FALSE;
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "fastqStatsAndSubsample - Go through a fastq file doing sanity checks and collecting statistics\n"
   "and also producing a smaller fastq out of a sample of the data.  The fastq input may be\n"
   "compressed with gzip or bzip2.  Unfortunately the fastq input can't be in a pipe\n"
   "usage:\n"
   "   fastqStatsAndSubsample in.fastq out.stats out.fastq\n"
   "options:\n"
   "   -sampleSize=N - default %d\n"
   "   -seed=N - Use given seed for random number generator.  Default %d.\n"
   "   -smallOk - Not an error if less than sampleSize reads.  out.fastq will be entire in.fastq\n"
   , sampleSize, seed
   );
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {"sampleSize", OPTION_INT},
    {"seed", OPTION_INT},
    {"smallOk", OPTION_BOOLEAN},
    {NULL, 0},
 };
 
-/* Estimate base count from file size based on this. */
-#define ZIPPED_BYTES_PER_BASE 0.80
-#define UNZIPPED_BYTES_PER_BASE 2.5
-
 static boolean nextLineMustMatchChar(struct lineFile *lf, char match, boolean noEof)
 /* Get next line and make sure, other than whitespace, it matches 'match'.
  * Return FALSE on EOF, unless noEof is set, in which case abort */
 {
 char *line;
 if (!lineFileNextReal(lf, &line))
     {
     if (noEof)
         errAbort("Expecting %c got end of file in %s", match, lf->fileName);
     else
         return FALSE;
     }
 if (line[0] != match)
     errAbort("Expecting %c got %s line %d of %s", match, line, lf->lineIx, lf->fileName);
 return TRUE;
 }
 
-static int averageReadSize(char *fileName, int maxReads)
-/* Read up to maxReads from fastq file and return average # of reads. */
-{
-struct lineFile *lf = lineFileOpen(fileName, FALSE);
-int i;
-long total = 0;
-int count = 0;
-
-for (i=0; i<maxReads; ++i)
-    {
-    /* Deal with initial line starting with '@' */
-    if (!nextLineMustMatchChar(lf, '@', FALSE))
-	break;
-
-    /* Deal with line containing sequence. */
-    char *line;
-    int lineSize = 0;
-    if (!lineFileNext(lf, &line, &lineSize))
-	errAbort("%s truncated in middle of record", lf->fileName);
-
-    /* Get size and add it to stats */
-    int seqSize = lineSize-1;
-    total += seqSize;
-    count += 1;
-
-    /* Deal with next two lines '+' and quality lines. */
-    nextLineMustMatchChar(lf, '+', TRUE);
-    lineFileNeedNext(lf, &line, &lineSize);
-    }
-lineFileClose(&lf);
-if (count < 1)
-    errAbort("No data in %s", fileName);
-return (total + (count>>1))/count;
-}
-
-int calcInitialReduction(char *fileName, int desiredReadCount)
-/* Using file name and size figure out how much to reduce it to get ~2x the subsample we want. */
-{
-size_t initialSize = fileSize(fileName);
-int readSize = averageReadSize(fileName, 100);
-long long estimatedBases;
-if (endsWith(fileName, ".gz") || endsWith(fileName, ".bz2"))
-    estimatedBases = initialSize/ZIPPED_BYTES_PER_BASE;
-else
-    estimatedBases = initialSize/UNZIPPED_BYTES_PER_BASE;
-long long estimatedReads = estimatedBases/readSize;
-double estimatedReduction = (double)estimatedReads/desiredReadCount;
-double conservativeReduction = estimatedReduction * 0.3;
-if (conservativeReduction < 1)
-    conservativeReduction = 1;
-return round(conservativeReduction);
-}
-
-
 /* A bunch of statistics gathering variables set by oneFastqRecord below. */
 #define MAX_READ_SIZE 100000	/* This is fastq, right now only get 160 base reads max. */
 int maxReadBases, minReadBases, readCount;
 long long sumReadBases;
 double sumSquaredReadBases;
 int aCount[MAX_READ_SIZE], cCount[MAX_READ_SIZE], gCount[MAX_READ_SIZE], tCount[MAX_READ_SIZE];
 int nCount[MAX_READ_SIZE];
 double sumQuals[MAX_READ_SIZE], sumSquaredQuals[MAX_READ_SIZE];
 int maxQual, minQual;
 
 double sumDoubleArray(double *array, int arraySize)
 /* Return sum of all items in array */
 {
 double total = 0;
 int i;
@@ -388,123 +346,111 @@
 void fastqStatsAndSubsample(char *inFastq, char *outStats, char *outFastq)
 /* fastqStatsAndSubsample - Go through a fastq file doing sanity checks and collecting 
  * statistics,  and also producing a smaller fastq out of a sample of the data. */
 {
 /* Temporary file if any */
 FILE *smallF = NULL;
 
 /* Make this work without making input. */
 if (sameString("/dev/null", outFastq))
     outFastq = NULL;
 
 
 /* Open up temp output file.  This one will be for the initial scaling.  We'll do
  * a second round of scaling as well. */
 char smallFastqName[PATH_LEN] = "";
+char *smallishName = smallFastqName;
 if (outFastq != NULL)
     {
     /* Split up outFastq path, so we can make a temp file in the same dir. */
     char outDir[PATH_LEN];
     if (outFastq)
 	splitPath(outFastq, outDir, NULL, NULL);
 
     safef(smallFastqName, PATH_LEN, "%sfastqSubsampleXXXXXX", outDir);
     int smallFd = mkstemp(smallFastqName);
     smallF = fdopen(smallFd, "w");
     }
 
 /* Scan through input, collecting stats, validating, and creating a subset file. */
-int downStep = calcInitialReduction(inFastq, sampleSize);
+int downStep = 10;
 struct lineFile *lf = lineFileOpen(inFastq, FALSE);
 boolean done = FALSE;
 int readsCopied = 0, totalReads = 0;
 long long basesInSample = 0;
 boolean firstTime = TRUE;
 while (!done)
     {
     int hotPosInCycle = rand()%downStep; 
     int cycle;
     for (cycle=0; cycle<downStep; ++cycle)
         {
 	boolean hotPos = (cycle == hotPosInCycle);
 	if (!oneFastqRecord(lf, smallF, hotPos, firstTime))
 	    {
 	    done = TRUE;
 	    break;
 	    }
 	if (hotPos)
+	    {
 	    ++readsCopied;
+	    }
 	firstTime = FALSE;
 	++totalReads;
 	}
     }
 lineFileClose(&lf);
 carefulClose(&smallF);
 
-boolean justUseSmall = FALSE;
 if (outFastq != NULL && readsCopied <  sampleSize)
     {
-    /* Our sample isn't big enough.  This could have two causes - a bug in
-     * our estimation, maybe caused by read sizes growing a bunch in the time
-     * since this code was written - or a file that is actually smaller smaller
-     * than sampleSize. */
-    if (sampleSize <= totalReads)
-	{
-	remove(smallFastqName);
-	errAbort("Internal error: bad estimate %d for downStep on %s", downStep, inFastq);
-	}
-    else
+    /* Our sample isn't big enough.  We'll have to reread the main file.  Good
+     * news at least is that it isn't too big. */
+    smallishName = inFastq;
+    readsCopied = totalReads;
+    if (sampleSize > totalReads)
 	{
 	if (smallOk)
 	    {
-	    justUseSmall = TRUE;
 	    warn("%d reads total in %s, so sample is less than %d", 
 		totalReads, inFastq, sampleSize);
 	    }
 	else
 	    {
 	    remove(smallFastqName);
 	    errAbort("SampleSize is set to %d reads, but there are only %d reads in %s",
 		    sampleSize, totalReads, inFastq);
 	    }
+	sampleSize = totalReads;
 	}
     }
 
 char *qualType = "solexa";
 int qualZero = 64;
 if (minQual <= 58)
     {
     qualType = "sanger";
     qualZero = 33;
     }
 
 if (outFastq != NULL)
     {
-    if (justUseSmall)
-	{
-	mustRename(smallFastqName, outFastq);
-	sampleSize = readsCopied;
-	basesInSample= sumReadBases;
-	}
-    else
-	{
     FILE *f = mustOpen(outFastq, "w");
-	basesInSample = reduceFastqSample(smallFastqName, f, readsCopied, sampleSize);
+    basesInSample = reduceFastqSample(smallishName, f, readsCopied, sampleSize);
     carefulClose(&f);
     remove(smallFastqName);
     }
-    }
 
 FILE *f = mustOpen(outStats, "w");
 int posCount = maxReadBases;
 fprintf(f, "readCount %d\n", totalReads);
 fprintf(f, "baseCount %lld\n", sumReadBases);
 fprintf(f, "sampleCount %d\n", sampleSize);
 fprintf(f, "basesInSample %lld\n", basesInSample);
 fprintf(f, "readSizeMean %g\n", (double)sumReadBases/totalReads);
 if (minReadBases != maxReadBases)
     fprintf(f, "readSizeStd %g\n", calcStdFromSums(sumReadBases, sumSquaredReadBases, totalReads));
 else
     fprintf(f, "readSizeStd 0\n");
 fprintf(f, "readSizeMin %d\n", minReadBases);
 fprintf(f, "readSizeMax %d\n", maxReadBases);
 double qSum = sumDoubleArray(sumQuals, maxReadBases);
@@ -530,32 +476,30 @@
 fprintf(f, "cRatio %g\n", (double)cSum/sumReadBases);
 fprintf(f, "gRatio %g\n", (double)gSum/sumReadBases);
 fprintf(f, "tRatio %g\n", (double)tSum/sumReadBases);
 fprintf(f, "nRatio %g\n", (double)nSum/sumReadBases);
 
 /* Now deal with array outputs.   First make up count of all bases we've seen. */
 fprintf(f, "posCount %d\n", posCount);
 long long totalAtPos[posCount];
 int pos;
 for (pos=0; pos<posCount; ++pos)
     totalAtPos[pos] = aCount[pos] + cCount[pos] + gCount[pos] + tCount[pos] + nCount[pos];
 
 /* Offset quality by scale */
 for (pos=0; pos<posCount; ++pos)
     sumQuals[pos] -= totalAtPos[pos] * qualZero;
-#ifdef SOON
-#endif /* SOON */
 
 printAveDoubleArray(f, "qualPos", sumQuals, totalAtPos, posCount);
 printAveIntArray(f, "aAtPos", aCount, totalAtPos, posCount);
 printAveIntArray(f, "cAtPos", cCount, totalAtPos, posCount);
 printAveIntArray(f, "gAtPos", gCount, totalAtPos, posCount);
 printAveIntArray(f, "tAtPos", tCount, totalAtPos, posCount);
 printAveIntArray(f, "nAtPos", nCount, totalAtPos, posCount);
 
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 4)