src/hg/regulate/regCluster/regCluster.c 1.2
1.2 2010/03/10 19:31:19 kent
Making it so that it will break up clusters that are weakly linked.
Index: src/hg/regulate/regCluster/regCluster.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/regulate/regCluster/regCluster.c,v
retrieving revision 1.1
retrieving revision 1.2
diff -b -B -U 4 -r1.1 -r1.2
--- src/hg/regulate/regCluster/regCluster.c 8 Mar 2010 23:35:07 -0000 1.1
+++ src/hg/regulate/regCluster/regCluster.c 10 Mar 2010 19:31:19 -0000 1.2
@@ -3,15 +3,18 @@
#include "linefile.h"
#include "hash.h"
#include "options.h"
#include "sqlNum.h"
+#include "obscure.h"
#include "localmem.h"
#include "rangeTree.h"
static char const rcsid[] = "$Id$";
int clDims = 1;
-double clScoreScale = 2.0;
+double clScoreScale = 1.0;
+double clForceJoinScore = 2;
+double clWeakLevel = 0.1;
void usage()
/* Explain usage and exit. */
{
@@ -23,20 +26,28 @@
" <fileName> <chrom> <start> <end> <score> <dim1 label> ... <dimN label>\n"
"where chrom, start, end are the indexes (starting with 0) of the chromosome, start, and end\n"
"fields in the file.\n"
"for example\n"
-" simpleReg.bed 0 1 2 4 A 5 a 6\n"
+" simpleReg.bed 0 1 2 4 aCell aFactor\n"
"options:\n"
" -dims=N - number of dimensions in data. Would be 2 for cell-line + antibody. Default %d\n"
" -scoreScale=0.N - scale score by this factor. Default %g\n"
+" -weakLevel=0.N - within a cluster ratio of highest depth of coverage of cluster to lowest\n"
+" Default %g\n"
+" -forceJoinScore=0.N - if combined score of elements joining 2 clusters above this, the\n"
+" clusters will be joined. Default %g\n"
, clDims
, clScoreScale
+, clWeakLevel
+, clForceJoinScore
);
}
static struct optionSpec options[] = {
{"dims", OPTION_INT},
{"scoreScale", OPTION_DOUBLE},
+ {"forceJoinScore", OPTION_DOUBLE},
+ {"weakLevel", OPTION_DOUBLE},
{NULL, 0},
};
struct regDim
@@ -69,8 +80,19 @@
double score; /* Ideally something like -log(p). */
struct regSource *source; /* Source track/file for item. */
};
+struct regCluster
+/* A cluster of items. */
+ {
+ struct regCluster *next;
+ char *chrom; /* Chromosome. Not allocated here. */
+ int chromStart, chromEnd; /* Half open coordinates. */
+ double score; /* Sum of component scores. */
+ double maxSubScore; /* Max of component scores. */
+ struct slRef *itemRefList; /* List of references to component items. */
+ };
+
struct regSource *regSourceLoadAll(char *fileName, int dimCount)
/* Read file, parse it line by line and return list of regSources. */
{
struct lineFile *lf = lineFileOpen(fileName, TRUE);
@@ -144,37 +166,159 @@
const struct hashEl *b = *((struct hashEl **)vb);
return cmpWordsWithEmbeddedNumbers(a->name, b->name);
}
+void addCluster(struct lm *lm, struct regItem *itemList, int start, int end,
+ struct regCluster **pList)
+/* Make cluster of all items that overlap start/end, and put it on list. */
+{
+struct regCluster *cluster;
+lmAllocVar(lm, cluster);
+double score = 0.0;
+double maxSubScore = 0.0;
+struct slRef *refList = NULL, *ref;
+struct regItem *item;
+for (item = itemList; item != NULL; item = item->next)
+ {
+ if (rangeIntersection(start, end, item->chromStart, item->chromEnd) > 0)
+ {
+ lmAllocVar(lm, ref);
+ ref->val = item;
+ slAddHead(&refList, ref);
+ score += item->score;
+ if (item->score > maxSubScore) maxSubScore = item->score;
+ }
+ }
+slReverse(&refList);
+cluster->chrom = itemList->chrom;
+cluster->chromStart = start;
+cluster->chromEnd = end;
+cluster->itemRefList = refList;
+cluster->score = score;
+cluster->maxSubScore = maxSubScore;
+slAddHead(pList, cluster);
+}
+
+struct regCluster *clusterItems(struct lm *lm, struct regItem *itemList,
+ double forceJoinScore, double weakLevel)
+/* Convert a list of items to a list of clusters of items. This may break up clusters that
+ * have weakly linked parts.
+ [ ]
+ AAAAAAAAAAAAAAAAAA
+ BBBBBB DDDDDD
+ CCCC EEEE
+ gets tranformed into
+ [ ] [ ]
+ AAAAAAAAAAAAAAAAAA
+ BBBBBB DDDDDD
+ CCCC EEEE
+ The strategy is to build a rangeTree of coverage, which might look something like so:
+ 123333211123333211
+ then define cluster ends that exceed the minimum limit, which is either 10% of the highest
+ or forceJoinScore if 10% of the highest is more than forceJoinScore. This will go to
+ something like so:
+ [---] [----]
+ Finally the items that are overlapping a cluster are assigned to it. Note that this
+ may mean that an item may be in multiple clusters.
+ [ABC] [ ADE]
+ */
+{
+int easyMax = round(1.0/weakLevel);
+int itemCount = slCount(itemList);
+struct regCluster *clusterList = NULL;
+if (itemCount < easyMax)
+ {
+ addCluster(lm, itemList, 0, BIGNUM, &clusterList);
+ }
+else
+ {
+ /* Make up coverage tree. */
+ struct rbTree *covTree = rangeTreeNew();
+ struct regItem *item;
+ for (item = itemList; item != NULL; item = item->next)
+ rangeTreeAddToCoverageDepth(covTree, item->chromStart, item->chromEnd);
+ struct range *range, *rangeList = rangeTreeList(covTree);
+
+ /* Figure out maximum coverage. */
+ int maxCov = 0;
+ for (range = rangeList; range != NULL; range = range->next)
+ {
+ int cov = ptToInt(range->val);
+ if (cov > maxCov) maxCov = cov;
+ }
+
+ /* Figure coverage threshold. */
+ int threshold = round(maxCov * weakLevel);
+ if (threshold > forceJoinScore-1) threshold = forceJoinScore-1;
+
+ /* Loop through emitting sections over threshold as clusters */
+ boolean inRange = FALSE;
+ boolean start = 0, end = 0;
+ for (range = rangeList; range != NULL; range = range->next)
+ {
+ int cov = ptToInt(range->val);
+ if (cov > threshold)
+ {
+ if (inRange)
+ end = range->end;
+ else
+ {
+ inRange = TRUE;
+ start = range->start;
+ end = range->end;
+ }
+ }
+ else
+ {
+ if (inRange)
+ {
+ addCluster(lm, itemList, start, end, &clusterList);
+ inRange = FALSE;
+ }
+ }
+ }
+ if (inRange)
+ addCluster(lm, itemList, start, end, &clusterList);
+ }
+slReverse(&clusterList);
+return clusterList;
+}
+
+static int clusterIx = 0;
+
void outputClustersForChrom(char *chrom, struct rbTree *tree, FILE *fCluster, FILE *fBed)
/* Write out clusters on one chromosome */
{
struct range *range, *rangeList = rangeTreeList(tree);
-uglyf("Got %d ranges\n", slCount(rangeList));
-int clusterIx = 0;
+verbose(2, "Got %d ranges on %s\n", slCount(rangeList), chrom);
+struct lm *lm = lmInit(0);
for (range = rangeList; range != NULL; range = range->next)
{
- struct regItem *item, *itemList = range->val;
+ struct regCluster *cluster, *clusterList = clusterItems(lm, range->val,
+ clForceJoinScore, clWeakLevel);
+ for (cluster = clusterList; cluster != NULL; cluster = cluster->next)
+ {
+ struct slRef *ref, *refList=cluster->itemRefList;
++clusterIx;
- double totalScore = 0.0;
- int start = itemList->chromStart, end = itemList->chromEnd;
- for (item = itemList; item != NULL; item = item->next)
+ struct regItem *item = refList->val;
+ for (ref = refList; ref != NULL; ref = ref->next)
{
+ item = ref->val;
fprintf(fCluster, "%d\t%s\t", clusterIx, item->chrom);
fprintf(fCluster, "%d\t%d\t", item->chromStart, item->chromEnd);
- fprintf(fCluster, "%d", (int)item->score);
+ fprintf(fCluster, "%g", item->score);
int i;
for (i=0; i<clDims; ++i)
fprintf(fCluster, "\t%s", item->source->labels[i]);
fprintf(fCluster, "\n");
- start = min(start, item->chromStart);
- end = max(end, item->chromEnd);
- totalScore += item->score;
}
- int score = clScoreScale * totalScore;
+ double score = clScoreScale * cluster->maxSubScore;
if (score > 1000) score = 1000;
- fprintf(fBed, "%s\t%d\t%d\t%d\t%d\n", chrom, start, end, slCount(itemList), score);
+ fprintf(fBed, "%s\t%d\t%d\t%d\t%g\n", chrom,
+ cluster->chromStart, cluster->chromEnd, slCount(refList), score);
+ }
}
+lmCleanup(&lm);
}
void regCluster(char *tableOfTables, char *outCluster, char *outBed)
/* regCluster - Cluster regulator regions. */
@@ -186,9 +330,8 @@
for (source = sourceList; source != NULL; source = source->next)
{
clusterSource(source, chromHash, stack);
}
-uglyf("%d chromosomes in sources\n", chromHash->elCount);
/* Get out list of chromosomes and process one at a time. */
FILE *fCluster = mustOpen(outCluster, "w");
FILE *fBed = mustOpen(outBed, "w");
@@ -200,9 +343,10 @@
struct rbTree *tree = chrom->val;
totalClusters += tree->n;
outputClustersForChrom(chrom->name, tree, fCluster, fBed);
}
-uglyf("%d total clusters in %d chromosomes\n", totalClusters, chromHash->elCount);
+verbose(1, "%d singly-linked clusters, %d clusters in %d chromosomes\n",
+ totalClusters, clusterIx, chromHash->elCount);
carefulClose(&fCluster);
carefulClose(&fBed);
}
@@ -214,7 +358,8 @@
if (argc != 4)
usage();
clDims = optionInt("dims", clDims);
clScoreScale = optionDouble("scoreScale", clScoreScale);
+clWeakLevel = optionDouble("weakLevel", clWeakLevel);
regCluster(argv[1], argv[2], argv[3]);
return 0;
}