src/hg/regulate/regCluster/regCluster.c 1.1

1.1 2010/03/08 23:35:07 kent
First cut of some code to combine ENCODE regulatory tracks into a single track done.
Index: src/hg/regulate/regCluster/regCluster.c
===================================================================
RCS file: src/hg/regulate/regCluster/regCluster.c
diff -N src/hg/regulate/regCluster/regCluster.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/hg/regulate/regCluster/regCluster.c	8 Mar 2010 23:35:07 -0000	1.1
@@ -0,0 +1,220 @@
+/* regCluster - Cluster regulator regions. */
+#include "common.h"
+#include "linefile.h"
+#include "hash.h"
+#include "options.h"
+#include "sqlNum.h"
+#include "localmem.h"
+#include "rangeTree.h"
+
+static char const rcsid[] = "$Id$";
+
+int clDims = 1;
+double clScoreScale = 2.0;
+
+void usage()
+/* Explain usage and exit. */
+{
+errAbort(
+"regCluster - Cluster regulator regions\n"
+"usage:\n"
+"   regCluster tableOfTables output.cluster output.bed\n"
+"Where the table-of-tables is space or tab separated in the format:\n"
+"   <fileName> <chrom> <start> <end> <score> <dim1 label> ... <dimN label>\n"
+"where chrom, start, end are the indexes (starting with 0) of the chromosome, start, and end\n"
+"fields in the file.\n"
+"for example\n"
+"   simpleReg.bed 0 1 2 4 A 5 a 6\n"
+"options:\n"
+"   -dims=N - number of dimensions in data.  Would be 2 for cell-line + antibody. Default %d\n"
+"   -scoreScale=0.N - scale score by this factor. Default %g\n"
+, clDims
+, clScoreScale
+);
+}
+
+static struct optionSpec options[] = {
+   {"dims", OPTION_INT},
+   {"scoreScale", OPTION_DOUBLE},
+   {NULL, 0},
+};
+
+struct regDim
+/* A regulatory dimension */
+    {
+    int colIx;		/* Column index in table. */
+    char *label;	/* Label */
+    };
+
+struct regSource 
+/* A source of regulatory information */
+    {
+    struct regSource *next;
+    char *dataSource;		/* File (or table) */
+    int chromColIx;		/* Chromosome column index. */
+    int startColIx;		/* Start coordinate column index. */
+    int endColIx;		/* End ccoordinate column ix. */
+    int scoreColIx;		/* Index for score column. */
+    double normFactor;		/* Multiply this to get browser score. */
+    char **labels;		/* Label for each dimension */
+    int minColCount;		/* Minimum number of columns. */
+    };
+
+struct regItem
+/* An item in a regulatory track */
+    {
+    struct regItem *next;
+    char *chrom;		/* Chromosome. Not allocated here. */
+    int chromStart,chromEnd;	/* Half open coordinates. */
+    double score;		/* Ideally something like -log(p). */
+    struct regSource *source;   /* Source track/file for item. */
+    };
+
+struct regSource *regSourceLoadAll(char *fileName, int dimCount)
+/* Read file, parse it line by line and return list of regSources. */
+{
+struct lineFile *lf = lineFileOpen(fileName, TRUE);
+int rowSize = dimCount + 6;
+char *row[rowSize];
+struct regSource *sourceList = NULL, *source;
+while (lineFileNextRow(lf, row, rowSize))
+    {
+    /* Allocate struct and read in fixed fields. */
+    AllocVar(source);
+    source->dataSource = cloneString(row[0]);
+    source->chromColIx = sqlUnsigned(row[1]);
+    source->startColIx = sqlUnsigned(row[2]);
+    source->endColIx = sqlUnsigned(row[3]);
+    source->scoreColIx = sqlUnsigned(row[4]);
+    source->normFactor = sqlDouble(row[5]);
+
+    /* Read in dimension labels. */
+    AllocArray(source->labels, dimCount);
+    int i;
+    for (i=0; i<dimCount; ++i)
+        source->labels[i] = cloneString(row[i+6]);
+
+    /* Calculate required columns. */
+    int minColCount = max(source->chromColIx, source->startColIx);
+    minColCount = max(minColCount, source->endColIx);
+    minColCount = max(minColCount, source->scoreColIx);
+    source->minColCount = minColCount + 1;
+    slAddHead(&sourceList, source);
+    }
+lineFileClose(&lf);
+slReverse(&sourceList);
+return sourceList;
+}
+
+void clusterSource(struct regSource *source, struct hash *chromHash, 
+	struct rbTreeNode *stack[128])
+/* Read through data source and add items to it to rangeTrees in hash */
+{
+struct lineFile *lf = lineFileOpen(source->dataSource, TRUE);
+struct lm *lm = chromHash->lm;	/* Local memory pool - share with hash */
+char *row[source->minColCount];
+struct regItem *item;
+while (lineFileNextRow(lf, row, source->minColCount))
+    {
+    char *chrom = row[source->chromColIx];
+    struct hashEl *hel = hashLookup(chromHash, chrom);
+    if (hel == NULL)
+        {
+	struct rbTree *tree = rangeTreeNewDetailed(lm, stack);
+	hel = hashAdd(chromHash, chrom, tree);
+	}
+    struct rbTree *tree = hel->val;
+    lmAllocVar(lm, item);
+    item->chrom = hel->name;
+    item->chromStart = sqlUnsigned(row[source->startColIx]);
+    item->chromEnd = sqlUnsigned(row[source->endColIx]);
+    item->score = sqlDouble(row[source->scoreColIx]) * source->normFactor;
+    if (item->score > 1000) item->score = 1000;
+    item->source = source;
+    rangeTreeAddValList(tree, item->chromStart, item->chromEnd, item);
+    }
+
+lineFileClose(&lf);
+}
+
+int cmpChromEls(const void *va, const void *vb)
+/* Compare to sort based on query start. */
+{
+const struct hashEl *a = *((struct hashEl **)va);
+const struct hashEl *b = *((struct hashEl **)vb);
+return cmpWordsWithEmbeddedNumbers(a->name, b->name);
+}
+
+void outputClustersForChrom(char *chrom, struct rbTree *tree, FILE *fCluster, FILE *fBed)
+/* Write out clusters on one chromosome */
+{
+struct range *range, *rangeList = rangeTreeList(tree);
+uglyf("Got %d ranges\n", slCount(rangeList));
+int clusterIx = 0;
+for (range = rangeList; range != NULL; range = range->next)
+    {
+    struct regItem *item, *itemList = range->val;
+    ++clusterIx;
+    double totalScore = 0.0;
+    int start = itemList->chromStart, end = itemList->chromEnd;
+    for (item = itemList; item != NULL; item = item->next)
+        {
+	fprintf(fCluster, "%d\t%s\t", clusterIx, item->chrom);
+	fprintf(fCluster, "%d\t%d\t", item->chromStart, item->chromEnd);
+	fprintf(fCluster, "%d", (int)item->score);
+	int i;
+	for (i=0; i<clDims; ++i)
+	   fprintf(fCluster, "\t%s", item->source->labels[i]);
+	fprintf(fCluster, "\n");
+	start = min(start, item->chromStart);
+	end = max(end, item->chromEnd);
+	totalScore += item->score;
+	}
+    int score = clScoreScale * totalScore;
+    if (score > 1000) score = 1000;
+    fprintf(fBed, "%s\t%d\t%d\t%d\t%d\n", chrom, start, end, slCount(itemList), score);
+    }
+}
+
+void regCluster(char *tableOfTables, char *outCluster, char *outBed)
+/* regCluster - Cluster regulator regions. */
+{
+struct regSource *source, *sourceList = regSourceLoadAll(tableOfTables, clDims);
+verbose(1, "Read %d sources from %s\n", slCount(sourceList), tableOfTables);
+struct hash *chromHash = hashNew(0);
+struct rbTreeNode *stack[128];
+for (source = sourceList; source != NULL; source = source->next)
+    {
+    clusterSource(source, chromHash, stack);
+    }
+uglyf("%d chromosomes in sources\n", chromHash->elCount);
+
+/* Get out list of chromosomes and process one at a time. */
+FILE *fCluster = mustOpen(outCluster, "w");
+FILE *fBed = mustOpen(outBed, "w");
+struct hashEl *chrom, *chromList = hashElListHash(chromHash);
+slSort(&chromList, cmpChromEls);
+int totalClusters = 0;
+for (chrom = chromList; chrom != NULL; chrom = chrom->next)
+    {
+    struct rbTree *tree = chrom->val;
+    totalClusters += tree->n;
+    outputClustersForChrom(chrom->name, tree, fCluster, fBed);
+    }
+uglyf("%d total clusters in %d chromosomes\n", totalClusters, chromHash->elCount);
+carefulClose(&fCluster);
+carefulClose(&fBed);
+
+}
+
+int main(int argc, char *argv[])
+/* Process command line. */
+{
+optionInit(&argc, argv, options);
+if (argc != 4)
+    usage();
+clDims = optionInt("dims", clDims);
+clScoreScale = optionDouble("scoreScale", clScoreScale);
+regCluster(argv[1], argv[2], argv[3]);
+return 0;
+}