src/hg/regulate/regBedStats/regBedStats.c 1.1

1.1 2010/03/10 19:41:55 kent
Little program to calculate mean/std and quartile stats for a bunch of bed files.
Index: src/hg/regulate/regBedStats/regBedStats.c
===================================================================
RCS file: src/hg/regulate/regBedStats/regBedStats.c
diff -N src/hg/regulate/regBedStats/regBedStats.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/hg/regulate/regBedStats/regBedStats.c	10 Mar 2010 19:41:55 -0000	1.1
@@ -0,0 +1,144 @@
+/* regBedStats - Go through bed files and calculate a bunch of statistics on them. */
+#include "common.h"
+#include "linefile.h"
+#include "hash.h"
+#include "options.h"
+#include "hmmstats.h"
+#include "obscure.h"
+#include "sqlNum.h"
+
+static char const rcsid[] = "$Id$";
+
+int chromColIx = 0, startColIx=1, endColIx=2, scoreColIx=4;
+
+void usage()
+/* Explain usage and exit. */
+{
+errAbort(
+  "regBedStats - Go through bed files and calculate a bunch of statistics on them\n"
+  "usage:\n"
+  "   regBedStats fileList output\n"
+  "options:\n"
+  "   -chromColIx=N (default %d)\n"
+  "   -startColIx=N (default %d)\n"
+  "   -endColIx=N (default %d)\n"
+  "   -scoreColIx=N (default %d)\n"
+  , chromColIx, startColIx, endColIx, scoreColIx
+  );
+}
+
+static struct optionSpec options[] = {
+   {"chromColIx", OPTION_INT},
+   {"startColIx", OPTION_INT},
+   {"endColIx", OPTION_INT},
+   {"scoreColIx", OPTION_INT},
+   {NULL, 0},
+};
+
+void doubleBoxWhiskerCalc(int count, double *array, double *retMin, 
+	double *retQ1, double *retMedian, double *retQ3, double *retMax)
+/* Calculate what you need to draw a box and whiskers plot from an array of doubles. */
+{
+double median;
+doubleSort(count, array);
+*retMin = array[0];
+*retQ1 = array[(count+2)/4];
+int halfCount = count>>1;
+if ((count&1) == 1)
+    *retMedian = array[halfCount];
+else
+    {
+    *retMedian = (array[halfCount] + array[halfCount-1]) * 0.5;
+    }
+*retQ3 = array[(3*count+2)/4];
+*retMax = array[count-1];
+}
+
+void slDoubleBoxWhiskerCalc(struct slDouble *list, double *retMin, 
+	double *retQ1, double *retMedian, double *retQ3, double *retMax)
+/* Calculate what you need to draw a box and whiskers plot from a list of slDoubles. */
+{
+int i,count = slCount(list);
+struct slDouble *el;
+double *array, minVal, q1, median, q3, maxVal;
+if (count == 0)
+    errAbort("Can't take do slDoubleBoxWhiskerCalc of empty list");
+AllocArray(array,count);
+for (i=0, el=list; i<count; ++i, el=el->next)
+    array[i] = el->val;
+doubleBoxWhiskerCalc(count, array, retMin, retQ1, retMedian, retQ3, retMax);
+freeMem(array);
+}
+
+void printStats(FILE *f, struct slDouble *list)
+/* Print out stats on list: ave +-std min 1/4 median 3/4 max */
+{
+int count = 0;
+struct slDouble *el;
+double sum=0, sumSquared=0;
+for (el = list; el != NULL; el = el->next)
+    {
+    sum += el->val;
+    sumSquared += el->val * el->val;
+    count += 1;
+    }
+double ave = sum/count;
+double std = calcStdFromSums(sum, sumSquared, count);
+double minVal, q1, median, q3, maxVal;
+slDoubleBoxWhiskerCalc(list, &minVal, &q1, &median, &q3, &maxVal);
+fprintf(f, "\t%g+-%g [%g %g %g %g %g]", ave, std, minVal, q1, median, q3, maxVal);
+}
+
+void bedFileStats(char *bedFile, int colCount, FILE *f)
+/* Collect stats on sizes of things in a bed file, and scores too. */
+{
+struct lineFile *lf = lineFileOpen(bedFile, TRUE);
+struct slDouble *sizeList=NULL, *scoreList=NULL, *el;
+char *row[colCount];
+while (lineFileNextRow(lf, row, colCount))
+    {
+    int size = sqlUnsigned(row[endColIx]) - sqlUnsigned(row[startColIx]);
+    el = slDoubleNew(size);
+    slAddHead(&sizeList, el);
+    double score = sqlDouble(row[scoreColIx]);
+    el = slDoubleNew(score);
+    slAddHead(&scoreList, el);
+    }
+fprintf(f, "%s\t%d\tsize:", bedFile, slCount(scoreList));
+printStats(f, sizeList);
+fprintf(f, "\tscore:");
+printStats(f, scoreList);
+fprintf(f, "\n");
+lineFileClose(&lf);
+}
+
+void regBedStats(char *fileOfFiles, char *output)
+/* regBedStats - Go through bed files and calculate a bunch of statistics on them. */
+{
+struct slName *in, *inList = readAllLines(fileOfFiles);
+FILE *f = mustOpen(output, "w");
+int colCount = max(chromColIx, startColIx);
+colCount = max(colCount, endColIx);
+colCount = max(colCount, scoreColIx);
+colCount += 1;
+for (in = inList; in != NULL; in = in->next)
+    {
+    bedFileStats(in->name, colCount, f);
+    }
+carefulClose(&f);
+}
+
+int main(int argc, char *argv[])
+/* Process command line. */
+{
+optionInit(&argc, argv, options);
+if (argc != 3)
+    usage();
+chromColIx = optionInt("chromColIx", chromColIx);
+startColIx = optionInt("startColIx", startColIx);
+endColIx = optionInt("endColIx", endColIx);
+scoreColIx = optionInt("scoreColIx", scoreColIx);
+
+regBedStats(argv[1], argv[2]);
+return 0;
+}