src/hg/oneShot/findToFixBedGraphViewLimits/findToFixBedGraphViewLimits.c 1.1

1.1 2009/11/22 23:21:12 kent
One shot program to help patch viewLimits of bedGraph here.
Index: src/hg/oneShot/findToFixBedGraphViewLimits/findToFixBedGraphViewLimits.c
===================================================================
RCS file: src/hg/oneShot/findToFixBedGraphViewLimits/findToFixBedGraphViewLimits.c
diff -N src/hg/oneShot/findToFixBedGraphViewLimits/findToFixBedGraphViewLimits.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/hg/oneShot/findToFixBedGraphViewLimits/findToFixBedGraphViewLimits.c	22 Nov 2009 23:21:12 -0000	1.1
@@ -0,0 +1,134 @@
+/* findToFixBedGraphViewLimits - Calculate min/max/mean/std and use them to set viewLimits on bedGraphs described in input.ra. */
+#include "common.h"
+#include "linefile.h"
+#include "hash.h"
+#include "options.h"
+#include "ra.h"
+#include "hmmstats.h"
+#include "jksql.h"
+
+static char const rcsid[] = "$Id$";
+
+void usage()
+/* Explain usage and exit. */
+{
+errAbort(
+  "findToFixBedGraphViewLimits - Calculate min/max/mean/std and use them to set viewLimits on bedGraphs described in input.ra\n"
+  "usage:\n"
+  "   findToFixBedGraphViewLimits input.ra output.ra\n"
+  "options:\n"
+  "   -xxx=XXX\n"
+  );
+}
+
+static struct optionSpec options[] = {
+   {NULL, 0},
+};
+
+static char *mustFindVal(struct slPair *list, char *tag, struct lineFile *lf)
+/* Look for tag in list and return value.  If none complain and abort. */
+{
+char *val = slPairFindVal(list, tag);
+if (val == NULL)
+    errAbort("missing required %s tag near line %d of %s", tag, lf->lineIx, lf->fileName);
+return val;
+}
+
+void findToFixBedGraphViewLimits(char *input, char *output)
+/* findToFixBedGraphViewLimits - Calculate min/max/mean/std and use them to set viewLimits on bedGraphs described in input.ra. */
+{
+struct lineFile *lf = lineFileOpen(input, TRUE);
+FILE *f = mustOpen(output, "w");
+struct slPair *el, *list;
+while ((list = raNextRecordAsSlPairList(lf)) != NULL)
+    {
+    /* Find required fields for calcs. */
+    char *db = mustFindVal(list, "db", lf);
+    char *track = mustFindVal(list, "track", lf);
+    char *type = cloneString(mustFindVal(list, "type", lf));
+
+    /* Parse out type value, which should be "bedGraph 4" and put the 4 or whatever other number
+     * in dataFieldIndex. */
+    char *typeWords[3];
+    int typeWordCount = chopLine(type, typeWords);
+    if (typeWordCount != 2 || !sameString(typeWords[0], "bedGraph"))
+           errAbort("Not well formed bedGraph type line %d of %s", lf->lineIx, lf->fileName);
+    int dataFieldIndex = sqlUnsigned(typeWords[1]);
+    freez(&type);
+
+    /* Figure out field corresponding to dataFieldIndex. */
+    struct sqlConnection *conn = sqlConnect(db);
+    struct slName *fieldList = sqlFieldNames(conn, track);
+    struct slName *pastBin = fieldList;
+    if (sameString(pastBin->name, "bin"))
+         pastBin = pastBin->next;
+    struct slName *fieldName = slElementFromIx(pastBin, dataFieldIndex - 1);
+    if (fieldName == NULL)
+         errAbort("%s doesn't have enough fields", track);
+    char *field = fieldName->name;
+    assert(sqlFieldIndex(conn, track, field) >= 0);
+
+    /* Print reassuring status message */
+    verbose(1, "%s.%s has %d elements.  Data field is %s\n", db, track, sqlTableSize(conn, track), field);
+         
+    /* Get min/max dataValues in fields.  Do it ourselves rather than using SQL min/max because sometimes
+     * the data field is a name column.... */
+    char query[512];
+    safef(query, sizeof(query), "select chromStart,chromEnd,%s from %s", field, track);
+    struct sqlResult *sr = sqlGetResult(conn, query);
+    char **row;
+    row = sqlNextRow(sr);
+    assert(row != NULL);
+    int size = sqlUnsigned(row[1]) - sqlUnsigned(row[0]);
+    double val = sqlDouble(row[2]);
+    double minLimit = val, maxLimit = val;
+    double sumData = val*size;
+    double sumSquares = val*val*size;
+    bits64 n = size;
+    while ((row = sqlNextRow(sr)) != NULL)
+        {
+	val = sqlDouble(row[2]);
+	size = sqlUnsigned(row[1]) - sqlUnsigned(row[0]);
+	if (val < minLimit) minLimit = val;
+	if (val > maxLimit) maxLimit = val;
+	sumData += val*size;
+	sumSquares += val*val*size;
+	n += size;
+	}
+    sqlFreeResult(&sr);
+    double mean = sumData/n;
+    double std = calcStdFromSums(sumData, sumSquares, n);
+    verbose(1, "    min=%g max=%g mean=%g std=%g\n",  minLimit, maxLimit, mean, std);
+
+    double viewMax = mean + 5*std;
+    if (viewMax > maxLimit) viewMax = maxLimit;
+    double viewMin = mean - 5*std;
+    if (viewMin < minLimit) viewMin = minLimit;
+
+    /* Output original table plus new minLimit/maxLimit. */
+    for (el = list; el != NULL; el = el->next)
+	fprintf(f, "%s %s\n", el->name, (char *)el->val);
+    fprintf(f, "new_minLimit %g\n", minLimit);
+    fprintf(f, "new_maxLimit %g\n", maxLimit);
+    fprintf(f, "new_mean %g\n", mean);
+    fprintf(f, "new_std %g\n", std);
+    fprintf(f, "new_viewLimits %g:%g\n", viewMin, viewMax);
+    fprintf(f, "\n");
+
+    sqlDisconnect(&conn);
+    slFreeList(&fieldList);
+    slPairFreeValsAndList(&list);
+    }
+lineFileClose(&lf);
+carefulClose(&f);
+}
+
+int main(int argc, char *argv[])
+/* Process command line. */
+{
+optionInit(&argc, argv, options);
+if (argc != 3)
+    usage();
+findToFixBedGraphViewLimits(argv[1], argv[2]);
+return 0;
+}