src/hg/regulate/regCluster/regCluster.c 1.4

1.4 2010/05/16 21:41:42 kent
Putting in something so that multiple peaks in the same cluster from the same source don't get counted twice.
Index: src/hg/regulate/regCluster/regCluster.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/regulate/regCluster/regCluster.c,v
retrieving revision 1.3
retrieving revision 1.4
diff -b -B -U 1000000 -r1.3 -r1.4
--- src/hg/regulate/regCluster/regCluster.c	5 May 2010 00:50:37 -0000	1.3
+++ src/hg/regulate/regCluster/regCluster.c	16 May 2010 21:41:42 -0000	1.4
@@ -1,374 +1,377 @@
 /* regCluster - Cluster regulator regions. */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "sqlNum.h"
 #include "obscure.h"
 #include "localmem.h"
 #include "rangeTree.h"
 
 static char const rcsid[] = "$Id$";
 
 int clDims = 1;
 double clScoreScale = 1.0;
 double clForceJoinScore = 2;
 double clWeakLevel = 0.1;
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
 "regCluster - Cluster regulator regions\n"
 "usage:\n"
 "   regCluster tableOfTables output.cluster output.bed\n"
 "Where the table-of-tables is space or tab separated in the format:\n"
 "   <fileName> <chrom> <start> <end> <score> <normScore> <dim1 label> ... <dimN label>\n"
 "where chrom, start, end, score are the indexes (starting with 0) of the chromosome, start, \n"
 "and end fields in the file, normScore is a factor to multiply score by to get it into the\n"
 "0-1000 range, and the dim# labels are the labels in each dimention.\n"
 "for example\n"
 "   simpleReg.bed 0 1 2 4 aCell aFactor\n"
 "options:\n"
 "   -dims=N - number of dimensions in data.  Would be 2 for cell-line + antibody. Default %d\n"
 "   -scoreScale=0.N - scale score by this factor. Default %g\n"
 "   -weakLevel=0.N - within a cluster ratio of highest depth of coverage of cluster to lowest\n"
 "                    Default %g\n"
 "   -forceJoinScore=0.N - if combined score of elements joining 2 clusters above this, the\n"
 "                         clusters will be joined. Default %g\n"
 , clDims
 , clScoreScale
 , clWeakLevel
 , clForceJoinScore
 );
 }
 
 static struct optionSpec options[] = {
    {"dims", OPTION_INT},
    {"scoreScale", OPTION_DOUBLE},
    {"forceJoinScore", OPTION_DOUBLE},
    {"weakLevel", OPTION_DOUBLE},
    {NULL, 0},
 };
 
 struct regDim
 /* A regulatory dimension */
     {
     int colIx;		/* Column index in table. */
     char *label;	/* Label */
     };
 
 struct regSource 
 /* A source of regulatory information */
     {
     struct regSource *next;
     char *dataSource;		/* File (or table) */
     int chromColIx;		/* Chromosome column index. */
     int startColIx;		/* Start coordinate column index. */
     int endColIx;		/* End ccoordinate column ix. */
     int scoreColIx;		/* Index for score column. */
     double normFactor;		/* Multiply this to get browser score. */
     char **labels;		/* Label for each dimension */
     int minColCount;		/* Minimum number of columns. */
     };
 
 struct regItem
 /* An item in a regulatory track */
     {
     struct regItem *next;
     char *chrom;		/* Chromosome. Not allocated here. */
     int chromStart,chromEnd;	/* Half open coordinates. */
     double score;		/* Ideally something like -log(p). */
     struct regSource *source;   /* Source track/file for item. */
     };
 
 struct regCluster
 /* A cluster of items. */
     {
     struct regCluster *next;
     char *chrom;		/* Chromosome.  Not allocated here. */
     int chromStart, chromEnd;	/* Half open coordinates. */
     double score;		/* Sum of component scores. */
     double maxSubScore;		/* Max of component scores. */
     struct slRef *itemRefList;	/* List of references to component items. */
     };
 
 struct regSource *regSourceLoadAll(char *fileName, int dimCount)
 /* Read file, parse it line by line and return list of regSources. */
 {
 struct lineFile *lf = lineFileOpen(fileName, TRUE);
 int rowSize = dimCount + 6;
 char *row[rowSize];
 struct regSource *sourceList = NULL, *source;
 while (lineFileNextRow(lf, row, rowSize))
     {
     /* Allocate struct and read in fixed fields. */
     AllocVar(source);
     source->dataSource = cloneString(row[0]);
     source->chromColIx = sqlUnsigned(row[1]);
     source->startColIx = sqlUnsigned(row[2]);
     source->endColIx = sqlUnsigned(row[3]);
     source->scoreColIx = sqlUnsigned(row[4]);
     source->normFactor = sqlDouble(row[5]);
 
     /* Read in dimension labels. */
     AllocArray(source->labels, dimCount);
     int i;
     for (i=0; i<dimCount; ++i)
         source->labels[i] = cloneString(row[i+6]);
 
     /* Calculate required columns. */
     int minColCount = max(source->chromColIx, source->startColIx);
     minColCount = max(minColCount, source->endColIx);
     minColCount = max(minColCount, source->scoreColIx);
     source->minColCount = minColCount + 1;
     slAddHead(&sourceList, source);
     }
 lineFileClose(&lf);
 slReverse(&sourceList);
 return sourceList;
 }
 
 void clusterSource(struct regSource *source, struct hash *chromHash, 
 	struct rbTreeNode *stack[128])
 /* Read through data source and add items to it to rangeTrees in hash */
 {
 struct lineFile *lf = lineFileOpen(source->dataSource, TRUE);
 struct lm *lm = chromHash->lm;	/* Local memory pool - share with hash */
 char *row[source->minColCount];
 struct regItem *item;
 while (lineFileNextRow(lf, row, source->minColCount))
     {
     char *chrom = row[source->chromColIx];
     struct hashEl *hel = hashLookup(chromHash, chrom);
     if (hel == NULL)
         {
 	struct rbTree *tree = rangeTreeNewDetailed(lm, stack);
 	hel = hashAdd(chromHash, chrom, tree);
 	}
     struct rbTree *tree = hel->val;
     lmAllocVar(lm, item);
     item->chrom = hel->name;
     item->chromStart = sqlUnsigned(row[source->startColIx]);
     item->chromEnd = sqlUnsigned(row[source->endColIx]);
     item->score = sqlDouble(row[source->scoreColIx]) * source->normFactor;
     if (item->score > 1000) item->score = 1000;
     item->source = source;
     rangeTreeAddValList(tree, item->chromStart, item->chromEnd, item);
     }
 
 lineFileClose(&lf);
 }
 
 int cmpChromEls(const void *va, const void *vb)
 /* Compare to sort based on query start. */
 {
 const struct hashEl *a = *((struct hashEl **)va);
 const struct hashEl *b = *((struct hashEl **)vb);
 return cmpWordsWithEmbeddedNumbers(a->name, b->name);
 }
 
 void addCluster(struct lm *lm, struct regItem *itemList, int start, int end,
 	struct regCluster **pList)
 /* Make cluster of all items that overlap start/end, and put it on list. */
 {
 struct regCluster *cluster;
 lmAllocVar(lm, cluster);
 double score = 0.0;
 double maxSubScore = 0.0;
 struct slRef  *refList = NULL, *ref;
 struct regItem *item;
 for (item = itemList; item != NULL; item = item->next)
     {
     if (rangeIntersection(start, end, item->chromStart, item->chromEnd) > 0)
 	{
 	lmAllocVar(lm, ref);
 	ref->val = item;
 	slAddHead(&refList, ref);
 	score += item->score;
 	if (item->score > maxSubScore) maxSubScore = item->score;
 	}
     }
 slReverse(&refList);
 cluster->chrom = itemList->chrom;
 cluster->chromStart = start;
 cluster->chromEnd = end;
 cluster->itemRefList = refList;
 cluster->score = score;
 cluster->maxSubScore = maxSubScore;
 slAddHead(pList, cluster);
 }
 
 struct regCluster *clusterItems(struct lm *lm, struct regItem *itemList, 
 	double forceJoinScore, double weakLevel)
 /* Convert a list of items to a list of clusters of items.  This may break up clusters that
  * have weakly linked parts. 
       [                ]
       AAAAAAAAAAAAAAAAAA 
        BBBBBB   DDDDDD
         CCCC     EEEE
    gets tranformed into
        [    ]   [    ]
       AAAAAAAAAAAAAAAAAA 
        BBBBBB   DDDDDD
         CCCC     EEEE
    The strategy is to build a rangeTree of coverage, which might look something like so:
       123333211123333211 
    then define cluster ends that exceed the minimum limit, which is either 10% of the highest
    or forceJoinScore if 10% of the highest is more than forceJoinScore.  This will go to
    something like so:
         [---]   [----]   
    Finally the items that are overlapping a cluster are assigned to it.  Note that this
    may mean that an item may be in multiple clusters.
         [ABC]   [ ADE]
  */
 {
 int easyMax = round(1.0/weakLevel);
 int itemCount = slCount(itemList);
 struct regCluster *clusterList = NULL;
 if (itemCount < easyMax)
     {
     struct regItem *item = itemList;
     int chromStart = item->chromStart;
     int chromEnd = item->chromEnd;
     for (item = item->next; item != NULL; item = item->next)
         {
 	if (item->chromStart < chromStart) chromStart = item->chromStart;
 	if (item->chromEnd > chromEnd) chromEnd = item->chromEnd;
 	}
     addCluster(lm, itemList, chromStart, chromEnd, &clusterList);
     }
 else
     {
     /* Make up coverage tree. */
     struct rbTree *covTree = rangeTreeNew();
     struct regItem *item;
     for (item = itemList; item != NULL; item = item->next)
 	rangeTreeAddToCoverageDepth(covTree, item->chromStart, item->chromEnd);
     struct range *range, *rangeList = rangeTreeList(covTree);
 
     /* Figure out maximum coverage. */
     int maxCov = 0;
     for (range = rangeList; range != NULL; range = range->next)
         {
 	int cov = ptToInt(range->val);
 	if (cov > maxCov) maxCov = cov;
 	}
 
     /* Figure coverage threshold. */
     int threshold = round(maxCov * weakLevel);
     if (threshold > forceJoinScore-1) threshold = forceJoinScore-1;
 
     /* Loop through emitting sections over threshold as clusters */
     boolean inRange = FALSE;
     boolean start = 0, end = 0;
     for (range = rangeList; range != NULL; range = range->next)
         {
 	int cov = ptToInt(range->val);
 	if (cov > threshold)
 	    {
 	    if (inRange)
 	       end = range->end;
 	    else
 	       {
 	       inRange = TRUE;
 	       start = range->start;
 	       end = range->end;
 	       }
 	    }
 	else
 	    {
 	    if (inRange)
 		{
 		addCluster(lm, itemList, start, end, &clusterList);
 		inRange = FALSE;
 		}
 	    }
 	}
     if (inRange)
         addCluster(lm, itemList, start, end, &clusterList);
     }
 slReverse(&clusterList);
 return clusterList;
 }
 
 static int clusterIx = 0;
 
 void outputClustersForChrom(char *chrom, struct rbTree *tree, FILE *fCluster, FILE *fBed)
 /* Write out clusters on one chromosome */
 {
 struct range *range, *rangeList = rangeTreeList(tree);
 verbose(2, "Got %d ranges on %s\n", slCount(rangeList), chrom);
 struct lm *lm = lmInit(0);
 for (range = rangeList; range != NULL; range = range->next)
     {
     struct regCluster *cluster, *clusterList = clusterItems(lm, range->val, 
     	clForceJoinScore, clWeakLevel);
     for (cluster = clusterList; cluster != NULL; cluster = cluster->next)
         {
 	struct slRef *ref, *refList=cluster->itemRefList;
 	++clusterIx;
 	struct regItem *item = refList->val;
+	struct hash *uniqHash = hashNew(0);
 	for (ref = refList; ref != NULL; ref = ref->next)
 	    {
 	    item = ref->val;
+	    hashStore(uniqHash, item->source->dataSource);
 	    fprintf(fCluster, "%d\t%s\t", clusterIx, item->chrom);
 	    fprintf(fCluster, "%d\t%d\t", item->chromStart, item->chromEnd);
 	    fprintf(fCluster, "%g", item->score);
 	    int i;
 	    for (i=0; i<clDims; ++i)
 	       fprintf(fCluster, "\t%s", item->source->labels[i]);
 	    fprintf(fCluster, "\n");
 	    }
 	double score = clScoreScale * cluster->maxSubScore;
 	if (score > 1000) score = 1000;
 	fprintf(fBed, "%s\t%d\t%d\t%d\t%g\n", chrom, 
-		cluster->chromStart, cluster->chromEnd, slCount(refList), score);
+		cluster->chromStart, cluster->chromEnd, uniqHash->elCount, score);
+	hashFree(&uniqHash);
 	}
     }
 lmCleanup(&lm);
 }
 
 void regCluster(char *tableOfTables, char *outCluster, char *outBed)
 /* regCluster - Cluster regulator regions. */
 {
 struct regSource *source, *sourceList = regSourceLoadAll(tableOfTables, clDims);
 verbose(1, "Read %d sources from %s\n", slCount(sourceList), tableOfTables);
 struct hash *chromHash = hashNew(0);
 struct rbTreeNode *stack[128];
 for (source = sourceList; source != NULL; source = source->next)
     {
     clusterSource(source, chromHash, stack);
     }
 
 /* Get out list of chromosomes and process one at a time. */
 FILE *fCluster = mustOpen(outCluster, "w");
 FILE *fBed = mustOpen(outBed, "w");
 struct hashEl *chrom, *chromList = hashElListHash(chromHash);
 slSort(&chromList, cmpChromEls);
 int totalClusters = 0;
 for (chrom = chromList; chrom != NULL; chrom = chrom->next)
     {
     struct rbTree *tree = chrom->val;
     totalClusters += tree->n;
     outputClustersForChrom(chrom->name, tree, fCluster, fBed);
     }
 verbose(1, "%d singly-linked clusters, %d clusters in %d chromosomes\n", 
 	totalClusters, clusterIx, chromHash->elCount);
 carefulClose(&fCluster);
 carefulClose(&fBed);
 
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 4)
     usage();
 clDims = optionInt("dims", clDims);
 clScoreScale = optionDouble("scoreScale", clScoreScale);
 clWeakLevel = optionDouble("weakLevel", clWeakLevel);
 regCluster(argv[1], argv[2], argv[3]);
 return 0;
 }