src/utils/ave/ave.c 1.9

1.9 2009/11/17 23:39:26 hiram
Add -noQuartiles option
Index: src/utils/ave/ave.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/utils/ave/ave.c,v
retrieving revision 1.8
retrieving revision 1.9
diff -b -B -U 4 -r1.8 -r1.9
--- src/utils/ave/ave.c	31 Mar 2007 19:38:16 -0000	1.8
+++ src/utils/ave/ave.c	17 Nov 2009 23:39:26 -0000	1.9
@@ -2,14 +2,18 @@
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
+#include "sqlNum.h"
+#include "hmmstats.h"
 #include <float.h>
 
 static char const rcsid[] = "$Id$";
 
-int col = 1;
-bool tableOut = FALSE;
+static int col = 1;
+static bool tableOut = FALSE;
+static bool noQuartiles = FALSE;
+
 
 void usage()
 /* Explain usage and exit. */
 {
@@ -19,8 +23,10 @@
   "   ave file\n"
   "options:\n"
   "   -col=N Which column to use.  Default 1\n"
   "   -tableOut - output by columns (default output in rows)\n"
+  "   -noQuartiles - only calculate min,max,mean,standard deviation\n"
+  "                - for large data sets that will not fit in memory."
   );
 }
 
 int cmpDouble(const void *va, const void *vb)
@@ -101,12 +107,58 @@
     printf("standard deviation %f\n", sqrt(totalVar/count));
     }
 }
 
+void aveNoQuartiles(char *fileName)
+/* aveNoQuartiles - Compute only min,max,mean,stdDev no quartiles.  Skips lines
+ * where column 'col' is missing or does not start with '-' or a digit. */
+{
+bits64 count = 0;
+struct lineFile *lf = lineFileOpen(fileName, TRUE);
+char *words[128], *word;
+int wordCount, wordIx = col-1;	/* col counts from 1, words[] from 0 */
+double sumData = 0.0, sumSquares = 0.0;
+double minVal = DBL_MAX, maxVal = -DBL_MAX;
+
+while ((wordCount = lineFileChop(lf, words)) > 0)
+    {
+    word = (wordIx >= 0 && wordIx < wordCount) ? words[wordIx] : "";
+    if (word[0] == '-' || isdigit((unsigned char)word[0]))
+        {
+	double val = sqlDouble(word);
+	if (minVal > val) minVal = val;
+	if (maxVal < val) maxVal = val;
+	sumData += val;
+	sumSquares += val * val;
+	++count;
+	}
+    }
+if (count == 0)
+    errAbort("No numerical data column %d of %s", col, fileName);
+double average = sumData/count;
+double stdDev = calcStdFromSums(sumData, sumSquares, count);
+if (tableOut)
+    {
+    printf("# min max mean N sum stddev\n");
+    printf("%g %g %g %llu %g %g\n",
+	minVal, maxVal, average, count, sumData, stdDev);
+    }
+else
+    {
+    printf("average %f\n", average);
+    printf("min %f\n", minVal);
+    printf("max %f\n", maxVal);
+    printf("count %llu\n", count);
+    printf("total %f\n", sumData);
+    printf("standard deviation %f\n", stdDev);
+    }
+}
+
 void ave(char *fileName)
 /* ave - Compute average and basic stats. */
 {
-int count = 0, alloc = 1024;
+int count = 0;
+size_t alloc = 1024;
 double *array;
 struct lineFile *lf = lineFileOpen(fileName, TRUE);
 char *words[128], *word;
 int wordCount;
@@ -139,7 +191,12 @@
 if (argc != 2)
     usage();
 col = optionInt("col", col);
 tableOut = optionExists("tableOut");
-ave(argv[1]);
+noQuartiles = optionExists("noQuartiles");
+if (noQuartiles)
+    aveNoQuartiles(argv[1]);
+else
+    ave(argv[1]);
+
 return 0;
 }