src/hg/regulate/regBedStats/regBedStats.c 1.1
1.1 2010/03/10 19:41:55 kent
Little program to calculate mean/std and quartile stats for a bunch of bed files.
Index: src/hg/regulate/regBedStats/regBedStats.c
===================================================================
RCS file: src/hg/regulate/regBedStats/regBedStats.c
diff -N src/hg/regulate/regBedStats/regBedStats.c
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ src/hg/regulate/regBedStats/regBedStats.c 10 Mar 2010 19:41:55 -0000 1.1
@@ -0,0 +1,144 @@
+/* regBedStats - Go through bed files and calculate a bunch of statistics on them. */
+#include "common.h"
+#include "linefile.h"
+#include "hash.h"
+#include "options.h"
+#include "hmmstats.h"
+#include "obscure.h"
+#include "sqlNum.h"
+
+static char const rcsid[] = "$Id$";
+
+int chromColIx = 0, startColIx=1, endColIx=2, scoreColIx=4;
+
+void usage()
+/* Explain usage and exit. */
+{
+errAbort(
+ "regBedStats - Go through bed files and calculate a bunch of statistics on them\n"
+ "usage:\n"
+ " regBedStats fileList output\n"
+ "options:\n"
+ " -chromColIx=N (default %d)\n"
+ " -startColIx=N (default %d)\n"
+ " -endColIx=N (default %d)\n"
+ " -scoreColIx=N (default %d)\n"
+ , chromColIx, startColIx, endColIx, scoreColIx
+ );
+}
+
+static struct optionSpec options[] = {
+ {"chromColIx", OPTION_INT},
+ {"startColIx", OPTION_INT},
+ {"endColIx", OPTION_INT},
+ {"scoreColIx", OPTION_INT},
+ {NULL, 0},
+};
+
+void doubleBoxWhiskerCalc(int count, double *array, double *retMin,
+ double *retQ1, double *retMedian, double *retQ3, double *retMax)
+/* Calculate what you need to draw a box and whiskers plot from an array of doubles. */
+{
+double median;
+doubleSort(count, array);
+*retMin = array[0];
+*retQ1 = array[(count+2)/4];
+int halfCount = count>>1;
+if ((count&1) == 1)
+ *retMedian = array[halfCount];
+else
+ {
+ *retMedian = (array[halfCount] + array[halfCount-1]) * 0.5;
+ }
+*retQ3 = array[(3*count+2)/4];
+*retMax = array[count-1];
+}
+
+void slDoubleBoxWhiskerCalc(struct slDouble *list, double *retMin,
+ double *retQ1, double *retMedian, double *retQ3, double *retMax)
+/* Calculate what you need to draw a box and whiskers plot from a list of slDoubles. */
+{
+int i,count = slCount(list);
+struct slDouble *el;
+double *array, minVal, q1, median, q3, maxVal;
+if (count == 0)
+ errAbort("Can't take do slDoubleBoxWhiskerCalc of empty list");
+AllocArray(array,count);
+for (i=0, el=list; i<count; ++i, el=el->next)
+ array[i] = el->val;
+doubleBoxWhiskerCalc(count, array, retMin, retQ1, retMedian, retQ3, retMax);
+freeMem(array);
+}
+
+void printStats(FILE *f, struct slDouble *list)
+/* Print out stats on list: ave +-std min 1/4 median 3/4 max */
+{
+int count = 0;
+struct slDouble *el;
+double sum=0, sumSquared=0;
+for (el = list; el != NULL; el = el->next)
+ {
+ sum += el->val;
+ sumSquared += el->val * el->val;
+ count += 1;
+ }
+double ave = sum/count;
+double std = calcStdFromSums(sum, sumSquared, count);
+double minVal, q1, median, q3, maxVal;
+slDoubleBoxWhiskerCalc(list, &minVal, &q1, &median, &q3, &maxVal);
+fprintf(f, "\t%g+-%g [%g %g %g %g %g]", ave, std, minVal, q1, median, q3, maxVal);
+}
+
+void bedFileStats(char *bedFile, int colCount, FILE *f)
+/* Collect stats on sizes of things in a bed file, and scores too. */
+{
+struct lineFile *lf = lineFileOpen(bedFile, TRUE);
+struct slDouble *sizeList=NULL, *scoreList=NULL, *el;
+char *row[colCount];
+while (lineFileNextRow(lf, row, colCount))
+ {
+ int size = sqlUnsigned(row[endColIx]) - sqlUnsigned(row[startColIx]);
+ el = slDoubleNew(size);
+ slAddHead(&sizeList, el);
+ double score = sqlDouble(row[scoreColIx]);
+ el = slDoubleNew(score);
+ slAddHead(&scoreList, el);
+ }
+fprintf(f, "%s\t%d\tsize:", bedFile, slCount(scoreList));
+printStats(f, sizeList);
+fprintf(f, "\tscore:");
+printStats(f, scoreList);
+fprintf(f, "\n");
+lineFileClose(&lf);
+}
+
+void regBedStats(char *fileOfFiles, char *output)
+/* regBedStats - Go through bed files and calculate a bunch of statistics on them. */
+{
+struct slName *in, *inList = readAllLines(fileOfFiles);
+FILE *f = mustOpen(output, "w");
+int colCount = max(chromColIx, startColIx);
+colCount = max(colCount, endColIx);
+colCount = max(colCount, scoreColIx);
+colCount += 1;
+for (in = inList; in != NULL; in = in->next)
+ {
+ bedFileStats(in->name, colCount, f);
+ }
+carefulClose(&f);
+}
+
+int main(int argc, char *argv[])
+/* Process command line. */
+{
+optionInit(&argc, argv, options);
+if (argc != 3)
+ usage();
+chromColIx = optionInt("chromColIx", chromColIx);
+startColIx = optionInt("startColIx", startColIx);
+endColIx = optionInt("endColIx", endColIx);
+scoreColIx = optionInt("scoreColIx", scoreColIx);
+
+regBedStats(argv[1], argv[2]);
+return 0;
+}