src/hg/regulate/regClusterMakeTableOfTables/regClusterMakeTableOfTables.c 1.1
1.1 2010/03/08 23:35:08 kent
First cut of some code to combine ENCODE regulatory tracks into a single track done.
Index: src/hg/regulate/regClusterMakeTableOfTables/regClusterMakeTableOfTables.c
===================================================================
RCS file: src/hg/regulate/regClusterMakeTableOfTables/regClusterMakeTableOfTables.c
diff -N src/hg/regulate/regClusterMakeTableOfTables/regClusterMakeTableOfTables.c
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ src/hg/regulate/regClusterMakeTableOfTables/regClusterMakeTableOfTables.c 8 Mar 2010 23:35:08 -0000 1.1
@@ -0,0 +1,162 @@
+/* regClusterMakeTableOfTables - Make up a table of tables for regCluster program. */
+#include "common.h"
+#include "linefile.h"
+#include "hash.h"
+#include "options.h"
+#include "obscure.h"
+#include "sqlNum.h"
+#include "hmmStats.h"
+
+/* Version control id string; "$Id$" is presumably a keyword placeholder that the
+ * revision control system expands on checkout. Not referenced elsewhere in this file. */
+static char const rcsid[] = "$Id$";
+
+/* Set in main() from the -two command line option.  When TRUE each output label
+ * is split into two columns with camelParseTwo() instead of being written whole. */
+boolean clTwo = FALSE;
+
+void usage(void)
+/* Explain usage and exit.  The message is handed to errAbort(), which is expected
+ * not to return.  Declared with (void) so the definition is a real prototype and
+ * the compiler can reject calls that pass arguments. */
+{
+errAbort(
+  "regClusterMakeTableOfTables - Make up a table of tables for regCluster program\n"
+  "usage:\n"
+  " regClusterMakeTableOfTables fileListFile output\n"
+  "options:\n"
+  " -two - assume name is camelCased with two things\n"
+  );
+}
+
+/* Command line option table passed to optionInit() in main().
+ * "two" is a boolean flag; the {NULL, 0} entry presumably marks the end of the table. */
+static struct optionSpec options[] = {
+ {"two", OPTION_BOOLEAN},
+ {NULL, 0},
+};
+
+int commonPrefixSize(struct slName *list)
+/* Return the length of the prefix shared by all names in list.
+ * An empty list yields 0; a list with a single name yields that name's full length. */
+{
+struct slName *prev, *cur;
+int shared;
+
+if (list == NULL)
+    return 0;
+shared = strlen(list->name);
+prev = list;
+cur = list->next;
+while (cur != NULL)
+    {
+    /* Only neighbours are compared: whatever every adjacent pair has in common
+     * at the start is common to the whole list. */
+    int pairSame = countSame(cur->name, prev->name);
+    if (pairSame < shared)
+        shared = pairSame;
+    prev = cur;
+    cur = cur->next;
+    }
+return shared;
+}
+
+int countSameAtEnd(char *a, char *b)
+/* Count number of characters at end of strings that are same in each string.
+ * Comparison runs backwards from the last character of each string and stops at
+ * the first mismatch or when either string is used up.  Returns 0 if either
+ * string is empty. */
+{
+size_t aLeft = strlen(a);
+size_t bLeft = strlen(b);
+int count = 0;
+/* Walk with remaining-length counters rather than decrementing pointers, so no
+ * pointer is ever formed before the first byte of a or b. */
+while (aLeft > 0 && bLeft > 0 && a[aLeft-1] == b[bLeft-1])
+    {
+    --aLeft;
+    --bLeft;
+    ++count;
+    }
+return count;
+}
+
+int commonSuffixSize(struct slName *list)
+/* Return the length of the suffix shared by all names in list.
+ * An empty list yields 0; a list with a single name yields that name's full length. */
+{
+struct slName *prev, *cur;
+int shared;
+
+if (list == NULL)
+    return 0;
+shared = strlen(list->name);
+prev = list;
+cur = list->next;
+while (cur != NULL)
+    {
+    /* Only neighbours are compared: whatever every adjacent pair has in common
+     * at the end is common to the whole list. */
+    int pairSame = countSameAtEnd(cur->name, prev->name);
+    if (pairSame < shared)
+        shared = pairSame;
+    prev = cur;
+    cur = cur->next;
+    }
+return shared;
+}
+
+void camelParseTwo(char *in, char **retA, char **retB)
+/* Parse out CamelCased in into a and b.
+ * The split point is the first upper case letter found after the first character
+ * of in.  *retA receives a copy of everything before the split and *retB a copy of
+ * the rest; both copies come from cloneStringZ()/cloneString(), so the caller is
+ * presumably responsible for freeing them.  Calls errAbort() if no split point
+ * exists, which includes the case where in is empty. */
+{
+char *bStart = NULL;
+char *s;
+/* Scanning starts at the second character so that a capitalized first word is not
+ * taken as the start of the second word.  The in[0] check keeps the scan from
+ * stepping past the terminator of an empty string. */
+if (in[0] != 0)
+    {
+    for (s = in + 1; *s != 0; ++s)
+        {
+        /* isupper() requires a value representable as unsigned char. */
+        if (isupper((unsigned char)*s))
+            {
+            bStart = s;
+            break;
+            }
+        }
+    }
+if (bStart == NULL)
+    errAbort("Couldn't find start of second word in %s", in);
+*retA = cloneStringZ(in, bStart - in);
+*retB = cloneString(bStart);
+}
+
+double calcNormScoreFactor(char *fileName, int scoreCol)
+/* Figure out what to multiply things by to get a nice browser score (0-1000).
+ * Reads the zero-based column scoreCol of every row of fileName and returns
+ * 1000 divided by "highEnd", where highEnd is the mean plus the value returned by
+ * calcStdFromSums() (presumably one standard deviation), capped at the largest
+ * value seen.  Calls errAbort() if the file has no data rows or if highEnd is
+ * zero, since no finite factor exists in those cases. */
+{
+struct lineFile *lf = lineFileOpen(fileName, TRUE);
+char *row[scoreCol+1];
+double sum = 0, sumSquares = 0;
+double maxVal = 0;
+int n = 0;
+while (lineFileRow(lf, row))
+    {
+    double x = sqlDouble(row[scoreCol]);
+    if (n == 0 || x > maxVal)
+        maxVal = x;
+    sum += x;
+    sumSquares += x*x;
+    n += 1;
+    }
+/* This is called once per input file, so release the file here instead of
+ * letting open handles pile up. */
+lineFileClose(&lf);
+if (n == 0)
+    errAbort("No data rows in %s, can't calculate score normalization factor", fileName);
+
+double std = calcStdFromSums(sum, sumSquares, n);
+double mean = sum/n;
+double highEnd = mean + std;
+if (highEnd > maxVal)
+    highEnd = maxVal;
+if (highEnd == 0)
+    errAbort("Scores in column %d of %s give a zero high end, can't calculate score normalization factor",
+        scoreCol, fileName);
+return 1000.0/highEnd;
+}
+
+void regClusterMakeTableOfTables(char *input, char *output)
+/* regClusterMakeTableOfTables - Make up a table of tables for regCluster program.
+ * input is a file with one file name per line.  For each of those files one
+ * tab-separated line is written to output:
+ *    fileName 1 2 3 scoreCol normFactor label
+ * where normFactor comes from calcNormScoreFactor() and label is the file name
+ * with the prefix and suffix common to all listed names removed.  With the -two
+ * option (clTwo) the label is split into two columns by camelParseTwo(). */
+{
+/* Zero-based column the scores are read from; also written to the table.
+ * The 1, 2, 3 written before it are presumably the chrom/start/end columns. */
+const int scoreCol = 7;
+FILE *f = mustOpen(output, "w");
+struct slName *in, *inList = readAllLines(input);
+int commonPrefix = commonPrefixSize(inList);
+int commonSuffix = commonSuffixSize(inList);
+for (in = inList; in != NULL; in = in->next)
+    {
+    char *s = in->name;
+    int len = strlen(s);
+    /* Prefix and suffix can cover the whole name (a one-line list) or overlap
+     * (e.g. "abab" and "ab"), so never pass a negative size to cloneStringZ(). */
+    int midSize = len - commonPrefix - commonSuffix;
+    if (midSize < 0)
+        midSize = 0;
+    fprintf(f, "%s\t1\t2\t3\t%d\t", s, scoreCol);
+    fprintf(f, "%g\t", calcNormScoreFactor(s, scoreCol));
+    char *midString = cloneStringZ(s+commonPrefix, midSize);
+    if (clTwo)
+        {
+        char *a, *b;
+        camelParseTwo(midString, &a, &b);
+        fprintf(f, "%s\t%s\n", a, b);
+        /* camelParseTwo() hands back freshly cloned strings. */
+        freez(&a);
+        freez(&b);
+        }
+    else
+        fprintf(f, "%s\n", midString);
+    freez(&midString);
+    }
+slFreeList(&inList);
+carefulClose(&f);
+}
+
+int main(int argc, char *argv[])
+/* Parse options, check that exactly two positional arguments remain, and run. */
+{
+char *fileListFile, *outputFile;
+
+optionInit(&argc, argv, options);
+if (argc != 3)
+    {
+    usage();
+    }
+clTwo = optionExists("two");
+fileListFile = argv[1];
+outputFile = argv[2];
+regClusterMakeTableOfTables(fileListFile, outputFile);
+return 0;
+}