1f4756ac08db3d4371556857dcf00ae2f1d0a8f4
kent
  Sat Jul 27 12:20:56 2013 -0700
Adding smallOk option.
diff --git src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c
index f992ce2..1f3e7a7 100644
--- src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c
+++ src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c
@@ -1,49 +1,52 @@
 /* fastqStatsAndSubsample - Go through a fastq file doing sanity checks and collecting 
  * statistics,  and also producing a smaller fastq out of a sample of the data. */
 
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "portable.h"
 #include "obscure.h"
 #include "hmmstats.h"
 
 int sampleSize = 100000;
 int seed = 0;
+boolean smallOk = FALSE;
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "fastqStatsAndSubsample - Go through a fastq file doing sanity checks and collecting statistics\n"
   "and also producing a smaller fastq out of a sample of the data.  The fastq input may be\n"
   "compressed with gzip or bzip2.  Unfortunately the fastq input can't be in a pipe\n"
   "usage:\n"
   "   fastqStatsAndSubsample in.fastq out.stats out.fastq\n"
   "options:\n"
   "   -sampleSize=N - default %d\n"
-  "   -seed=N - use given seed for random number generator.  Default %d\n"
+  "   -seed=N - use given seed for random number generator.  Default %d.\n"
+  "   -smallOk - Not an error if less than sampleSize reads.  out.fastq will be entire in.fastq\n"
   , sampleSize, seed
   );
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {"sampleSize", OPTION_INT},
    {"seed", OPTION_INT},
+   {"smallOk", OPTION_BOOLEAN},
    {NULL, 0},
 };
 
 /* Estimate base count from file size based on this. */
 #define ZIPPED_BYTES_PER_BASE 0.80
 #define UNZIPPED_BYTES_PER_BASE 2.5
 #define READ_SIZE 100
 
 static boolean nextLineMustMatchChar(struct lineFile *lf, char match, boolean noEof)
 /* Get next line and make sure, other than whitespace, it matches 'match'.
  * Return FALSE on EOF, unless noEof is set, in which case abort */
 {
 char *line;
 if (!lineFileNext(lf, &line, NULL))
     {
@@ -189,30 +192,31 @@
     switch (c)
         {
 	case 'a':
 	    aCount[i] += 1;
 	    break;
 	case 'c':
 	    cCount[i] += 1;
 	    break;
 	case 'g':
 	    gCount[i] += 1;
 	    break;
 	case 't':
 	    tCount[i] += 1;
 	    break;
 	case 'n':
+	case '.':
 	    nCount[i] += 1;
 	    break;
 	default:
 	    errAbort("Unrecognized nucleotide character %c line %d of %s", c,
 		lf->lineIx, lf->fileName);
 	    break;
 	}
     }
 
 if (copy)
     mustWrite(f, line, lineSize);
 
 /* Deal with line containing just '+' that separates sequence from quality. */
 nextLineMustMatchChar(lf, '+', TRUE);
 if (copy)
@@ -350,70 +354,91 @@
 	boolean hotPos = (cycle == hotPosInCycle);
 	if (!oneFastqRecord(lf, smallF, hotPos, firstTime))
 	    {
 	    done = TRUE;
 	    break;
 	    }
 	if (hotPos)
 	    ++readsCopied;
 	firstTime = FALSE;
 	++totalReads;
 	}
     }
 lineFileClose(&lf);
 carefulClose(&smallF);
 
+boolean justUseSmall = FALSE;
 if (readsCopied <  sampleSize)
     {
-    remove(smallFastqName);
     /* Our sample isn't big enough.  This could have two causes - a bug in
      * our estimation, maybe caused by read sizes growing a bunch in the time
      * since this code was written - or a file that is actually smaller smaller
      * than sampleSize. */
-    if (sampleSize > totalReads)
+    if (sampleSize <= totalReads)
+	{
+	remove(smallFastqName);
+	errAbort("Internal error: bad estimate %d for downStep on %s", downStep, inFastq);
+	}
+    else
+        {
+	if (smallOk)
+	    {
+	    justUseSmall = TRUE;
+	    warn("%d reads total in %s, so sample is less than %d", 
+		totalReads, inFastq, sampleSize);
+	    }
+	else
+	    {
+	    remove(smallFastqName);
         errAbort("SampleSize is set to %d reads, but there are only %d reads in %s",
 		sampleSize, totalReads, inFastq);
-    else
-        errAbort("Internal error: bad estimate %d for downStep on %s", downStep, inFastq);
+	    }
+	}
     }
 
 char *qualType = "solexa";
 int qualZero = 64;
 if (minQual <= 58)
     {
     qualType = "sanger";
     qualZero = 33;
     }
 FILE *f = mustOpen(outStats, "w");
 fprintf(f, "readCount %d\n", totalReads);
 fprintf(f, "baseCount %g\n", sumReadBases);
 fprintf(f, "readSizeMean %g\n", sumReadBases/totalReads);
 fprintf(f, "readSizeStd %g\n", calcStdFromSums(sumReadBases, sumSquaredReadBases, totalReads));
 fprintf(f, "readSizeMin %d\n", minReadBases);
 fprintf(f, "readSizeMax %d\n", maxReadBases);
 double qSum = sumDoubleArray(sumQuals, maxReadBases);
 double qSumSquared = sumDoubleArray(sumSquaredQuals, maxReadBases);
 fprintf(f, "qualMean %g\n", qSum/sumReadBases - qualZero);
 fprintf(f, "qualStd %g\n", calcStdFromSums(qSum, qSumSquared, sumReadBases));
 fprintf(f, "qualMin %d\n", minQual - qualZero);
 fprintf(f, "qualMax %d\n", maxQual - qualZero);
 fprintf(f, "qualType %s\n", qualType);
 fprintf(f, "qualZero %d\n", qualZero);
 carefulClose(&f);
 
+if (justUseSmall)
+    mustRename(smallFastqName, outFastq);
+else
+    {
 f = mustOpen(outFastq, "w");
 reduceFastqSample(smallFastqName, f, readsCopied, sampleSize);
 carefulClose(&f);
 remove(smallFastqName);
 }
+}
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 4)
     usage();
 sampleSize = optionInt("sampleSize", sampleSize);
 seed = optionInt("seed", seed);
+smallOk = optionExists("smallOk");
 fastqStatsAndSubsample(argv[1], argv[2], argv[3]);
 return 0;
 }