src/hg/regulate/regCluster/regCluster.c 1.2

1.2 2010/03/10 19:31:19 kent
Making it so that it will break up clusters that are weakly linked.
Index: src/hg/regulate/regCluster/regCluster.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/regulate/regCluster/regCluster.c,v
retrieving revision 1.1
retrieving revision 1.2
diff -b -B -U 4 -r1.1 -r1.2
--- src/hg/regulate/regCluster/regCluster.c	8 Mar 2010 23:35:07 -0000	1.1
+++ src/hg/regulate/regCluster/regCluster.c	10 Mar 2010 19:31:19 -0000	1.2
@@ -3,15 +3,18 @@
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "sqlNum.h"
+#include "obscure.h"
 #include "localmem.h"
 #include "rangeTree.h"
 
 static char const rcsid[] = "$Id$";
 
 int clDims = 1;
-double clScoreScale = 2.0;
+double clScoreScale = 1.0;
+double clForceJoinScore = 2;
+double clWeakLevel = 0.1;
 
 void usage()
 /* Explain usage and exit. */
 {
@@ -23,20 +26,28 @@
 "   <fileName> <chrom> <start> <end> <score> <dim1 label> ... <dimN label>\n"
 "where chrom, start, end are the indexes (starting with 0) of the chromosome, start, and end\n"
 "fields in the file.\n"
 "for example\n"
-"   simpleReg.bed 0 1 2 4 A 5 a 6\n"
+"   simpleReg.bed 0 1 2 4 aCell aFactor\n"
 "options:\n"
 "   -dims=N - number of dimensions in data.  Would be 2 for cell-line + antibody. Default %d\n"
 "   -scoreScale=0.N - scale score by this factor. Default %g\n"
+"   -weakLevel=0.N - within a cluster ratio of highest depth of coverage of cluster to lowest\n"
+"                    Default %g\n"
+"   -forceJoinScore=0.N - if combined score of elements joining 2 clusters above this, the\n"
+"                         clusters will be joined. Default %g\n"
 , clDims
 , clScoreScale
+, clWeakLevel
+, clForceJoinScore
 );
 }
 
 static struct optionSpec options[] = {
    {"dims", OPTION_INT},
    {"scoreScale", OPTION_DOUBLE},
+   {"forceJoinScore", OPTION_DOUBLE},
+   {"weakLevel", OPTION_DOUBLE},
    {NULL, 0},
 };
 
 struct regDim
@@ -69,8 +80,19 @@
     double score;		/* Ideally something like -log(p). */
     struct regSource *source;   /* Source track/file for item. */
     };
 
+struct regCluster
+/* A cluster of items. */
+    {
+    struct regCluster *next;
+    char *chrom;		/* Chromosome.  Not allocated here. */
+    int chromStart, chromEnd;	/* Half open coordinates. */
+    double score;		/* Sum of component scores. */
+    double maxSubScore;		/* Max of component scores. */
+    struct slRef *itemRefList;	/* List of references to component items. */
+    };
+
 struct regSource *regSourceLoadAll(char *fileName, int dimCount)
 /* Read file, parse it line by line and return list of regSources. */
 {
 struct lineFile *lf = lineFileOpen(fileName, TRUE);
@@ -144,37 +166,159 @@
 const struct hashEl *b = *((struct hashEl **)vb);
 return cmpWordsWithEmbeddedNumbers(a->name, b->name);
 }
 
+void addCluster(struct lm *lm, struct regItem *itemList, int start, int end,
+	struct regCluster **pList)
+/* Make cluster of all items that overlap start/end, and put it on list. */
+{
+struct regCluster *cluster;
+lmAllocVar(lm, cluster);
+double score = 0.0;
+double maxSubScore = 0.0;
+struct slRef  *refList = NULL, *ref;
+struct regItem *item;
+for (item = itemList; item != NULL; item = item->next)
+    {
+    if (rangeIntersection(start, end, item->chromStart, item->chromEnd) > 0)
+	{
+	lmAllocVar(lm, ref);
+	ref->val = item;
+	slAddHead(&refList, ref);
+	score += item->score;
+	if (item->score > maxSubScore) maxSubScore = item->score;
+	}
+    }
+slReverse(&refList);
+cluster->chrom = itemList->chrom;
+cluster->chromStart = start;
+cluster->chromEnd = end;
+cluster->itemRefList = refList;
+cluster->score = score;
+cluster->maxSubScore = maxSubScore;
+slAddHead(pList, cluster);
+}
+
+struct regCluster *clusterItems(struct lm *lm, struct regItem *itemList, 
+	double forceJoinScore, double weakLevel)
+/* Convert a list of items to a list of clusters of items.  This may break up clusters that
+ * have weakly linked parts. 
+      [                ]
+      AAAAAAAAAAAAAAAAAA 
+       BBBBBB   DDDDDD
+        CCCC     EEEE
+   gets tranformed into
+       [    ]   [    ]
+      AAAAAAAAAAAAAAAAAA 
+       BBBBBB   DDDDDD
+        CCCC     EEEE
+   The strategy is to build a rangeTree of coverage, which might look something like so:
+      123333211123333211 
+   then define cluster ends that exceed the minimum limit, which is either 10% of the highest
+   or forceJoinScore if 10% of the highest is more than forceJoinScore.  This will go to
+   something like so:
+        [---]   [----]   
+   Finally the items that are overlapping a cluster are assigned to it.  Note that this
+   may mean that an item may be in multiple clusters.
+        [ABC]   [ ADE]
+ */
+{
+int easyMax = round(1.0/weakLevel);
+int itemCount = slCount(itemList);
+struct regCluster *clusterList = NULL;
+if (itemCount < easyMax)
+    {
+    addCluster(lm, itemList, 0, BIGNUM, &clusterList);
+    }
+else
+    {
+    /* Make up coverage tree. */
+    struct rbTree *covTree = rangeTreeNew();
+    struct regItem *item;
+    for (item = itemList; item != NULL; item = item->next)
+	rangeTreeAddToCoverageDepth(covTree, item->chromStart, item->chromEnd);
+    struct range *range, *rangeList = rangeTreeList(covTree);
+
+    /* Figure out maximum coverage. */
+    int maxCov = 0;
+    for (range = rangeList; range != NULL; range = range->next)
+        {
+	int cov = ptToInt(range->val);
+	if (cov > maxCov) maxCov = cov;
+	}
+
+    /* Figure coverage threshold. */
+    int threshold = round(maxCov * weakLevel);
+    if (threshold > forceJoinScore-1) threshold = forceJoinScore-1;
+
+    /* Loop through emitting sections over threshold as clusters */
+    boolean inRange = FALSE;
+    boolean start = 0, end = 0;
+    for (range = rangeList; range != NULL; range = range->next)
+        {
+	int cov = ptToInt(range->val);
+	if (cov > threshold)
+	    {
+	    if (inRange)
+	       end = range->end;
+	    else
+	       {
+	       inRange = TRUE;
+	       start = range->start;
+	       end = range->end;
+	       }
+	    }
+	else
+	    {
+	    if (inRange)
+		{
+		addCluster(lm, itemList, start, end, &clusterList);
+		inRange = FALSE;
+		}
+	    }
+	}
+    if (inRange)
+        addCluster(lm, itemList, start, end, &clusterList);
+    }
+slReverse(&clusterList);
+return clusterList;
+}
+
+static int clusterIx = 0;
+
 void outputClustersForChrom(char *chrom, struct rbTree *tree, FILE *fCluster, FILE *fBed)
 /* Write out clusters on one chromosome */
 {
 struct range *range, *rangeList = rangeTreeList(tree);
-uglyf("Got %d ranges\n", slCount(rangeList));
-int clusterIx = 0;
+verbose(2, "Got %d ranges on %s\n", slCount(rangeList), chrom);
+struct lm *lm = lmInit(0);
 for (range = rangeList; range != NULL; range = range->next)
     {
-    struct regItem *item, *itemList = range->val;
+    struct regCluster *cluster, *clusterList = clusterItems(lm, range->val, 
+    	clForceJoinScore, clWeakLevel);
+    for (cluster = clusterList; cluster != NULL; cluster = cluster->next)
+        {
+	struct slRef *ref, *refList=cluster->itemRefList;
     ++clusterIx;
-    double totalScore = 0.0;
-    int start = itemList->chromStart, end = itemList->chromEnd;
-    for (item = itemList; item != NULL; item = item->next)
+	struct regItem *item = refList->val;
+	for (ref = refList; ref != NULL; ref = ref->next)
         {
+	    item = ref->val;
 	fprintf(fCluster, "%d\t%s\t", clusterIx, item->chrom);
 	fprintf(fCluster, "%d\t%d\t", item->chromStart, item->chromEnd);
-	fprintf(fCluster, "%d", (int)item->score);
+	    fprintf(fCluster, "%g", item->score);
 	int i;
 	for (i=0; i<clDims; ++i)
 	   fprintf(fCluster, "\t%s", item->source->labels[i]);
 	fprintf(fCluster, "\n");
-	start = min(start, item->chromStart);
-	end = max(end, item->chromEnd);
-	totalScore += item->score;
 	}
-    int score = clScoreScale * totalScore;
+	double score = clScoreScale * cluster->maxSubScore;
     if (score > 1000) score = 1000;
-    fprintf(fBed, "%s\t%d\t%d\t%d\t%d\n", chrom, start, end, slCount(itemList), score);
+	fprintf(fBed, "%s\t%d\t%d\t%d\t%g\n", chrom, 
+		cluster->chromStart, cluster->chromEnd, slCount(refList), score);
+	}
     }
+lmCleanup(&lm);
 }
 
 void regCluster(char *tableOfTables, char *outCluster, char *outBed)
 /* regCluster - Cluster regulator regions. */
@@ -186,9 +330,8 @@
 for (source = sourceList; source != NULL; source = source->next)
     {
     clusterSource(source, chromHash, stack);
     }
-uglyf("%d chromosomes in sources\n", chromHash->elCount);
 
 /* Get out list of chromosomes and process one at a time. */
 FILE *fCluster = mustOpen(outCluster, "w");
 FILE *fBed = mustOpen(outBed, "w");
@@ -200,9 +343,10 @@
     struct rbTree *tree = chrom->val;
     totalClusters += tree->n;
     outputClustersForChrom(chrom->name, tree, fCluster, fBed);
     }
-uglyf("%d total clusters in %d chromosomes\n", totalClusters, chromHash->elCount);
+verbose(1, "%d singly-linked clusters, %d clusters in %d chromosomes\n", 
+	totalClusters, clusterIx, chromHash->elCount);
 carefulClose(&fCluster);
 carefulClose(&fBed);
 
 }
@@ -214,7 +358,8 @@
 if (argc != 4)
     usage();
 clDims = optionInt("dims", clDims);
 clScoreScale = optionDouble("scoreScale", clScoreScale);
+clWeakLevel = optionDouble("weakLevel", clWeakLevel);
 regCluster(argv[1], argv[2], argv[3]);
 return 0;
 }