src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c c864e74a423acbe711e6be084573aa11b687c1ce

c864e74a423acbe711e6be084573aa11b687c1ce
kent
  Tue Aug 6 17:47:29 2013 -0700
Making it work ok with /dev/null as fastq output.  Fixing an integer overflow bug.
diff --git src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c
index 4b23fdd..049919e 100644
--- src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c
+++ src/utils/fastqStatsAndSubsample/fastqStatsAndSubsample.c
@@ -131,59 +131,63 @@
 for (i=0; i<arraySize; ++i)
     total += array[i];
 return total;
 }
 
 long long sumIntArray(int *array, int arraySize)
 /* Return sum of all items in array */
 {
 long long total = 0;
 int i;
 for (i=0; i<arraySize; ++i)
     total += array[i];
 return total;
 }
 
-void printAveDoubleArray(FILE *f, char *label, double *a, int *totalAtPos, int aSize)
+void printAveDoubleArray(FILE *f, char *label, double *a, long long *totalAtPos, int aSize)
 /* Print a[i]/counts[i] for all elements in array */
 {
 fprintf(f, "%s ", label);
 int i;
 for (i=0; i<aSize; ++i)
     fprintf(f, "%g,", a[i]/totalAtPos[i]);
 fprintf(f, "\n");
 }
 
-void printAveIntArray(FILE *f, char *label, int *a, int *totalAtPos, int aSize)
+void printAveIntArray(FILE *f, char *label, int *a, long long *totalAtPos, int aSize)
 /* Print a[i]/totalAtPos[i] for all elements in array */
 {
 fprintf(f, "%s ", label);
 int i;
 for (i=0; i<aSize; ++i)
     fprintf(f, "%g,", ((double)a[i])/totalAtPos[i]);
 fprintf(f, "\n");
 }
 
 
 boolean oneFastqRecord(struct lineFile *lf, FILE *f, boolean copy, boolean firstTime)
 /* Read next fastq record from LF, and optionally copy it to f.  Return FALSE at end of file 
  * Do a _little_ error checking on record while we're at it.  The format has already been
  * validated on the client side fairly thoroughly. */
 {
 char *line;
 int lineSize;
 
+/* Treat NULL file same as non-copy, so only have one condition to check on . */
+if (f == NULL)
+    copy = FALSE;
+
 /* Deal with initial line starting with '@' */
 if (!lineFileNext(lf, &line, &lineSize))
     return FALSE;
 if (line[0] != '@')
     errAbort("Expecting line starting with '@' got %s line %d of %s", 
 	line, lf->lineIx, lf->fileName);
 if (copy)
     mustWrite(f, line, lineSize);
 
 /* Deal with line containing sequence. */
 if (!lineFileNext(lf, &line, &lineSize))
     errAbort("%s truncated in middle of record", lf->fileName);
 
 /* Get size and add it to stats */
 int seqSize = lineSize-1;
@@ -344,71 +348,83 @@
     boolean doIt = randomizer[i];
     if (!maybeCopyFastqRecord(lf, f, doIt, &seqSize))
          internalErr();
     if (doIt)
 	basesInSample += seqSize;
     }
 freez(&randomizer);
 lineFileClose(&lf);
 return basesInSample;
 }
 
 void fastqStatsAndSubsample(char *inFastq, char *outStats, char *outFastq)
 /* fastqStatsAndSubsample - Go through a fastq file doing sanity checks and collecting 
  * statistics,  and also producing a smaller fastq out of a sample of the data. */
 {
+/* Temporary file if any */
+FILE *smallF = NULL;
+
+/* Make this work without making input. */
+if (sameString("/dev/null", outFastq))
+    outFastq = NULL;
+
+
+/* Open up temp output file.  This one will be for the initial scaling.  We'll do
+ * a second round of scaling as well. */
+char smallFastqName[PATH_LEN] = "";
+if (outFastq != NULL)
+    {
 /* Split up outFastq path, so we can make a temp file in the same dir. */
 char outDir[PATH_LEN];
+    if (outFastq)
 splitPath(outFastq, outDir, NULL, NULL);
 
-/* Open up temp output file.  This one will be for the initial scaling.  We'll do
- * a second round of scaling as well. */
-char smallFastqName[PATH_LEN];
 safef(smallFastqName, PATH_LEN, "%sfastqSubsampleXXXXXX", outDir);
 int smallFd = mkstemp(smallFastqName);
-FILE *smallF = fdopen(smallFd, "w");
+    smallF = fdopen(smallFd, "w");
+    }
 
 /* Scan through input, collecting stats, validating, and creating a subset file. */
 int downStep = calcInitialReduction(inFastq, sampleSize);
 struct lineFile *lf = lineFileOpen(inFastq, FALSE);
 boolean done = FALSE;
 int readsCopied = 0, totalReads = 0;
 long long basesInSample = 0;
 boolean firstTime = TRUE;
 while (!done)
     {
     int hotPosInCycle = rand()%downStep;
     int cycle;
     for (cycle=0; cycle<downStep; ++cycle)
         {
 	boolean hotPos = (cycle == hotPosInCycle);
 	if (!oneFastqRecord(lf, smallF, hotPos, firstTime))
 	    {
 	    done = TRUE;
 	    break;
 	    }
 	if (hotPos)
 	    ++readsCopied;
 	firstTime = FALSE;
 	++totalReads;
 	}
     }
 lineFileClose(&lf);
 carefulClose(&smallF);
 
 boolean justUseSmall = FALSE;
-if (readsCopied <  sampleSize)
+if (outFastq != NULL && readsCopied <  sampleSize)
     {
     /* Our sample isn't big enough.  This could have two causes - a bug in
      * our estimation, maybe caused by read sizes growing a bunch in the time
      * since this code was written - or a file that is actually smaller smaller
      * than sampleSize. */
     if (sampleSize <= totalReads)
 	{
 	remove(smallFastqName);
 	errAbort("Internal error: bad estimate %d for downStep on %s", downStep, inFastq);
 	}
     else
         {
 	if (smallOk)
 	    {
 	    justUseSmall = TRUE;
@@ -420,43 +436,46 @@
 	    remove(smallFastqName);
 	    errAbort("SampleSize is set to %d reads, but there are only %d reads in %s",
 		    sampleSize, totalReads, inFastq);
 	    }
 	}
     }
 
 char *qualType = "solexa";
 int qualZero = 64;
 if (minQual <= 58)
     {
     qualType = "sanger";
     qualZero = 33;
     }
 
+if (outFastq != NULL)
+    {
 if (justUseSmall)
     {
     mustRename(smallFastqName, outFastq);
     sampleSize = readsCopied;
     basesInSample= sumReadBases;
     }
 else
     {
     FILE *f = mustOpen(outFastq, "w");
     basesInSample = reduceFastqSample(smallFastqName, f, readsCopied, sampleSize);
     carefulClose(&f);
     remove(smallFastqName);
     }
+    }
 
 FILE *f = mustOpen(outStats, "w");
 int posCount = maxReadBases;
 fprintf(f, "readCount %d\n", totalReads);
 fprintf(f, "baseCount %lld\n", sumReadBases);
 fprintf(f, "sampleCount %d\n", sampleSize);
 fprintf(f, "basesInSample %lld\n", basesInSample);
 fprintf(f, "readSizeMean %g\n", (double)sumReadBases/totalReads);
 if (minReadBases != maxReadBases)
     fprintf(f, "readSizeStd %g\n", calcStdFromSums(sumReadBases, sumSquaredReadBases, totalReads));
 else
     fprintf(f, "readSizeStd 0\n");
 fprintf(f, "readSizeMin %d\n", minReadBases);
 fprintf(f, "readSizeMax %d\n", maxReadBases);
 double qSum = sumDoubleArray(sumQuals, maxReadBases);
@@ -474,38 +493,40 @@
 /* Compute overall total nucleotide stats from count arrays. */
 long long aSum = sumIntArray(aCount, maxReadBases);
 long long cSum = sumIntArray(cCount, maxReadBases);
 long long gSum = sumIntArray(gCount, maxReadBases);
 long long tSum = sumIntArray(tCount, maxReadBases);
 long long nSum = sumIntArray(nCount, maxReadBases);
 fprintf(f, "atRatio %g\n", (double)(aSum + tSum)/(aSum + cSum + gSum + tSum));
 fprintf(f, "aRatio %g\n", (double)aSum/sumReadBases);
 fprintf(f, "cRatio %g\n", (double)cSum/sumReadBases);
 fprintf(f, "gRatio %g\n", (double)gSum/sumReadBases);
 fprintf(f, "tRatio %g\n", (double)tSum/sumReadBases);
 fprintf(f, "nRatio %g\n", (double)nSum/sumReadBases);
 
 /* Now deal with array outputs.   First make up count of all bases we've seen. */
 fprintf(f, "posCount %d\n", posCount);
-int totalAtPos[posCount];
+long long totalAtPos[posCount];
 int pos;
 for (pos=0; pos<posCount; ++pos)
     totalAtPos[pos] = aCount[pos] + cCount[pos] + gCount[pos] + tCount[pos] + nCount[pos];
 
 /* Offset quality by scale */
 for (pos=0; pos<posCount; ++pos)
     sumQuals[pos] -= totalAtPos[pos] * qualZero;
+#ifdef SOON
+#endif /* SOON */
 
 printAveDoubleArray(f, "qualPos", sumQuals, totalAtPos, posCount);
 printAveIntArray(f, "aAtPos", aCount, totalAtPos, posCount);
 printAveIntArray(f, "cAtPos", cCount, totalAtPos, posCount);
 printAveIntArray(f, "gAtPos", gCount, totalAtPos, posCount);
 printAveIntArray(f, "tAtPos", tCount, totalAtPos, posCount);
 printAveIntArray(f, "nAtPos", nCount, totalAtPos, posCount);
 
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 4)