src/utils/ave/ave.c 1.10
1.10 2009/11/18 20:07:23 hiram
Previous standard deviation was incorrect, was not using N-1
Index: src/utils/ave/ave.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/utils/ave/ave.c,v
retrieving revision 1.9
retrieving revision 1.10
diff -b -B -U 1000000 -r1.9 -r1.10
--- src/utils/ave/ave.c 17 Nov 2009 23:39:26 -0000 1.9
+++ src/utils/ave/ave.c 18 Nov 2009 20:07:23 -0000 1.10
@@ -1,202 +1,206 @@
/* ave - Compute average and basic stats. */
#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "options.h"
#include "sqlNum.h"
#include "hmmstats.h"
#include <float.h>
static char const rcsid[] = "$Id$";
static int col = 1;
static bool tableOut = FALSE;
static bool noQuartiles = FALSE;
void usage()
/* Explain usage and exit. */
{
errAbort(
"ave - Compute average and basic stats\n"
"usage:\n"
" ave file\n"
"options:\n"
" -col=N Which column to use. Default 1\n"
" -tableOut - output by columns (default output in rows)\n"
" -noQuartiles - only calculate min,max,mean,standard deviation\n"
" - for large data sets that will not fit in memory."
);
}
int cmpDouble(const void *va, const void *vb)
/* Compare two slNames. */
{
const double *a = va;
const double *b = vb;
double diff = *a - *b;
if (diff < 0)
return -1;
else if (diff > 0)
return 1;
else
return 0;
}
void showStats(double *array, int count)
/* Compute stats on sorted array */
{
double val, minVal = DBL_MAX, maxVal = -DBL_MAX;
double total = 0, average;
int i;
int q1Index, q3Index; /* quartile positions */
double q1, q3; /* quartile values */
double oneVar, totalVar = 0;
for (i=0; i<count; ++i)
{
val = array[i];
if (minVal > val) minVal = val;
if (maxVal < val) maxVal = val;
total += val;
}
average = total/count;
q1Index = (count+1)/4; /* one fourth, rounded down */
q3Index = (3*(count+1))/4; /* three fourths, rounded down */
if (q1Index < (count-1))
{
double range = array[q1Index+1] - array[q1Index];
q1 = array[q1Index] +
((((double)count+1.0)/4.0)-(double)q1Index)*range;
}
else
q1 = array[q1Index];
if (q3Index < (count-1))
{
double range = array[q3Index+1] - array[q3Index];
q3 = array[q3Index] +
((3.0*((double)count+1.0)/4.0)-(double)q3Index)*range;
}
else
q3 = array[q3Index];
for (i=0; i<count; ++i)
{
val = array[i];
oneVar = (average-val);
totalVar += oneVar*oneVar;
}
+ double var = totalVar;
+ if (count > 1)
+ var /= count-1;
+ double stdDev = sqrt(var);
if (tableOut)
{
printf("# min Q1 median Q3 max mean N sum stddev\n");
printf("%g %g %g %g %g %g %d %g %g\n", minVal, q1, array[count/2],
- q3, maxVal, average, count, total, sqrt(totalVar/count));
+ q3, maxVal, average, count, total, stdDev);
}
else
{
printf("Q1 %f\n", q1);
printf("median %f\n", array[count/2]);
printf("Q3 %f\n", q3);
printf("average %f\n", average);
printf("min %f\n", minVal);
printf("max %f\n", maxVal);
printf("count %d\n", count);
printf("total %f\n", total);
- printf("standard deviation %f\n", sqrt(totalVar/count));
+ printf("standard deviation %f\n", stdDev);
}
}
void aveNoQuartiles(char *fileName)
/* aveNoQuartiles - Compute only min,max,mean,stdDev no quartiles */
{
bits64 count = 0;
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *words[128], *word;
int wordCount;
int wordIx = col-1;
double sumData = 0.0, sumSquares = 0.0;
double minVal = DBL_MAX, maxVal = -DBL_MAX;
while ((wordCount = lineFileChop(lf, words)) > 0)
{
word = words[wordIx];
if (word[0] == '-' || isdigit(word[0]))
{
double val = sqlDouble(word);
if (minVal > val) minVal = val;
if (maxVal < val) maxVal = val;
sumData += val;
sumSquares += val * val;
++count;
}
}
if (count == 0)
errAbort("No numerical data column %d of %s", col, fileName);
double average = sumData/count;
double stdDev = calcStdFromSums(sumData, sumSquares, count);
if (tableOut)
{
printf("# min max mean N sum stddev\n");
printf("%g %g %g %llu %g %g\n",
minVal, maxVal, average, count, sumData, stdDev);
}
else
{
printf("average %f\n", average);
printf("min %f\n", minVal);
printf("max %f\n", maxVal);
printf("count %llu\n", count);
printf("total %f\n", sumData);
printf("standard deviation %f\n", stdDev);
}
}
void ave(char *fileName)
/* ave - Compute average and basic stats. */
{
int count = 0;
size_t alloc = 1024;
double *array;
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *words[128], *word;
int wordCount;
int wordIx = col-1;
AllocArray(array, alloc);
while ((wordCount = lineFileChop(lf, words)) > 0)
{
if (count >= alloc)
{
alloc <<= 1;
ExpandArray(array, count, alloc);
}
word = words[wordIx];
if (word[0] == '-' || isdigit(word[0]))
{
array[count++] = atof(word);
}
}
if (count == 0)
errAbort("No numerical data column %d of %s", col, fileName);
qsort(array, count, sizeof(array[0]), cmpDouble);
showStats(array, count);
}
int main(int argc, char *argv[])
/* Process command line. */
{
optionHash(&argc, argv);
if (argc != 2)
usage();
col = optionInt("col", col);
tableOut = optionExists("tableOut");
noQuartiles = optionExists("noQuartiles");
if (noQuartiles)
aveNoQuartiles(argv[1]);
else
ave(argv[1]);
return 0;
}