8f45bb3988b7548987178934a02fc3e6e3ecea97 kent Mon Jun 14 18:43:58 2010 -0700 Moving clustering to library.
diff --git src/lib/peakCluster.c src/lib/peakCluster.c
new file mode 100644
index 0000000..f5753f2
--- /dev/null
+++ src/lib/peakCluster.c
@@ -0,0 +1,243 @@
+/* peakCluster - cluster peak calls from different sources. */
+
+#include "common.h"
+#include "hash.h"
+#include "linefile.h"
+#include "localmem.h"
+#include "sqlNum.h"
+#include "obscure.h"
+#include "peakCluster.h"
+#include "rangeTree.h"
+
+struct peakSource *peakSourceLoadAll(char *fileName, int dimCount)
+/* Read file, parse it line by line, and return a list of peakSources. */
+{
+struct lineFile *lf = lineFileOpen(fileName, TRUE);
+int rowSize = dimCount + 6;
+char *row[rowSize];
+struct peakSource *sourceList = NULL, *source;
+while (lineFileNextRow(lf, row, rowSize))
+    {
+    /* Allocate struct and read in fixed fields. */
+    AllocVar(source);
+    source->dataSource = cloneString(row[0]);
+    source->chromColIx = sqlUnsigned(row[1]);
+    source->startColIx = sqlUnsigned(row[2]);
+    source->endColIx = sqlUnsigned(row[3]);
+    source->scoreColIx = sqlUnsigned(row[4]);
+    source->normFactor = sqlDouble(row[5]);
+
+    /* Read in dimension labels. */
+    AllocArray(source->labels, dimCount);
+    int i;
+    for (i=0; i<dimCount; ++i)
+        source->labels[i] = cloneString(row[i+6]);
+
+    /* Calculate required columns. */
+    int minColCount = max(source->chromColIx, source->startColIx);
+    minColCount = max(minColCount, source->endColIx);
+    minColCount = max(minColCount, source->scoreColIx);
+    source->minColCount = minColCount + 1;
+    slAddHead(&sourceList, source);
+    }
+lineFileClose(&lf);
+slReverse(&sourceList);
+return sourceList;
+}
+
+void peakClusterMakerAddFromSource(struct peakClusterMaker *maker, struct peakSource *source)
+/* Read through data source and add its items to rangeTrees in maker. */
+{
+struct hash *chromHash = maker->chromHash;
+struct lineFile *lf = lineFileOpen(source->dataSource, TRUE);
+struct lm *lm = chromHash->lm;   /* Local memory pool - share with hash */
+char *row[source->minColCount];
+struct peakItem *item;
+char *line;
+while (lineFileNextReal(lf, &line))
+    {
+    char *asciiLine = lmCloneString(lm, line);
+    int wordCount = chopByWhite(line, row, source->minColCount);
+    lineFileExpectAtLeast(lf, source->minColCount, wordCount);
+    char *chrom = row[source->chromColIx];
+    struct hashEl *hel = hashLookup(chromHash, chrom);
+    if (hel == NULL)
+        {
+        struct rbTree *tree = rangeTreeNewDetailed(lm, maker->stack);
+        hel = hashAdd(chromHash, chrom, tree);
+        }
+    struct rbTree *tree = hel->val;
+    lmAllocVar(lm, item);
+    item->chrom = hel->name;
+    item->chromStart = sqlUnsigned(row[source->startColIx]);
+    item->chromEnd = sqlUnsigned(row[source->endColIx]);
+    item->score = sqlDouble(row[source->scoreColIx]) * source->normFactor;
+    if (item->score > 1000) item->score = 1000;
+    item->source = source;
+    item->asciiLine = asciiLine;
+    rangeTreeAddValList(tree, item->chromStart, item->chromEnd, item);
+    }
+lineFileClose(&lf);
+}
+
+static void addCluster(struct lm *lm, struct peakItem *itemList, int start, int end,
+	struct peakCluster **pList)
+/* Make cluster of all items that overlap start/end, and put it on list. */
+{
+struct peakCluster *cluster;
+lmAllocVar(lm, cluster);
+double score = 0.0;
+double maxSubScore = 0.0;
+struct slRef *refList = NULL, *ref;
+struct peakItem *item;
+for (item = itemList; item != NULL; item = item->next)
+    {
+    if (rangeIntersection(start, end, item->chromStart, item->chromEnd) > 0)
+        {
+        lmAllocVar(lm, ref);
+        ref->val = item;
+        slAddHead(&refList, ref);
+        score += item->score;
+        if (item->score > maxSubScore) maxSubScore = item->score;
+        }
+    }
+slReverse(&refList);
+cluster->chrom = itemList->chrom;
+cluster->chromStart = start;
+cluster->chromEnd = end;
+cluster->itemRefList = refList;
+cluster->score = score;
+cluster->maxSubScore = maxSubScore;
+slAddHead(pList, cluster);
+}
+
+struct peakCluster *peakClusterItems(struct lm *lm, struct peakItem *itemList,
+	double forceJoinScore, double weakLevel)
+/* Convert a list of items to a list of clusters of items.  This may break up clusters that
+ * have weakly linked parts.
+      [                ]
+      AAAAAAAAAAAAAAAAAA
+       BBBBBB   DDDDDD
+        CCCC     EEEE
+   gets transformed into
+       [    ]   [    ]
+      AAAAAAAAAAAAAAAAAA
+       BBBBBB   DDDDDD
+        CCCC     EEEE
+   The strategy is to build a rangeTree of coverage, which might look something like so:
+      123333211123333211
+   then define clusters as the stretches where coverage exceeds a minimum limit, which is
+   weakLevel (10% in this example) of the highest coverage, capped so that it never reaches
+   forceJoinScore.  This will go to something like so:
+       [---]    [----]
+   Finally the items that overlap a cluster are assigned to it.  Note that this
+   may mean that an item may be in multiple clusters.
+       [ABC]    [ ADE]
+ */
+{
+int easyMax = round(1.0/weakLevel);
+int itemCount = slCount(itemList);
+struct peakCluster *clusterList = NULL;
+if (itemCount < easyMax)
+    {
+    struct peakItem *item = itemList;
+    int chromStart = item->chromStart;
+    int chromEnd = item->chromEnd;
+    for (item = item->next; item != NULL; item = item->next)
+        {
+        if (item->chromStart < chromStart) chromStart = item->chromStart;
+        if (item->chromEnd > chromEnd) chromEnd = item->chromEnd;
+        }
+    addCluster(lm, itemList, chromStart, chromEnd, &clusterList);
+    }
+else
+    {
+    /* Make up coverage tree. */
+    struct rbTree *covTree = rangeTreeNew();
+    struct peakItem *item;
+    for (item = itemList; item != NULL; item = item->next)
+        rangeTreeAddToCoverageDepth(covTree, item->chromStart, item->chromEnd);
+    struct range *range, *rangeList = rangeTreeList(covTree);
+
+    /* Figure out maximum coverage. */
+    int maxCov = 0;
+    for (range = rangeList; range != NULL; range = range->next)
+        {
+        int cov = ptToInt(range->val);
+        if (cov > maxCov) maxCov = cov;
+        }
+
+    /* Figure coverage threshold. */
+    int threshold = round(maxCov * weakLevel);
+    if (threshold > forceJoinScore-1) threshold = forceJoinScore-1;
+
+    /* Loop through emitting sections over threshold as clusters. */
+    boolean inRange = FALSE;
+    int start = 0, end = 0;
+    for (range = rangeList; range != NULL; range = range->next)
+        {
+        int cov = ptToInt(range->val);
+        if (cov > threshold)
+            {
+            if (inRange)
+                end = range->end;
+            else
+                {
+                inRange = TRUE;
+                start = range->start;
+                end = range->end;
+                }
+            }
+        else
+            {
+            if (inRange)
+                {
+                addCluster(lm, itemList, start, end, &clusterList);
+                inRange = FALSE;
+                }
+            }
+        }
+    if (inRange)
+        addCluster(lm, itemList, start, end, &clusterList);
+    }
+slReverse(&clusterList);
+return clusterList;
+}
+
+struct peakClusterMaker *peakClusterMakerNew()
+/* Return a new peakClusterMaker. */
+{
+struct peakClusterMaker *maker;
+AllocVar(maker);
+maker->chromHash = hashNew(0);
+return maker;
+}
+
+void peakClusterMakerFree(struct peakClusterMaker **pMaker)
+/* Free up a peakClusterMaker. */
+{
+struct peakClusterMaker *maker = *pMaker;
+if (maker != NULL)
+    {
+    hashFree(&maker->chromHash);
+    freez(pMaker);
+    }
+}
+
+static int cmpChromEls(const void *va, const void *vb)
+/* Compare to sort based on chromosome name, treating embedded numbers numerically. */
+{
+const struct hashEl *a = *((struct hashEl **)va);
+const struct hashEl *b = *((struct hashEl **)vb);
+return cmpWordsWithEmbeddedNumbers(a->name, b->name);
+}
+
+struct hashEl *peakClusterMakerChromList(struct peakClusterMaker *maker)
+/* Return list of chromosomes.  In hashEl format where the hashEl val is
+ * a rangeTree filled with items. */
+{
+struct hashEl *chromList = hashElListHash(maker->chromHash);
+slSort(&chromList, cmpChromEls);
+return chromList;
+}
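
Note on the input format: peakSourceLoadAll() reads a whitespace-separated table with six fixed
columns (data file name, 0-based chrom/start/end/score column indexes into that data file, and a
score normalization factor) followed by dimCount label columns.  A hypothetical two-dimension
source table might look like the sketch below; the file names, column indexes, and labels are
made up for illustration only.

#dataSource                chromCol startCol endCol scoreCol normFactor cellLabel factorLabel
gm12878.ctcf.peaks.tab     0        1        2      6        1.0        GM12878   CTCF
k562.ctcf.peaks.tab        0        1        2      6        0.8        K562      CTCF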
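
As a rough sketch of how a caller might string the new library functions together: the driver
below is hypothetical and not part of this commit.  It assumes the declarations in peakCluster.h
match the definitions above, uses standard kentlib helpers (lmInit, lmCleanup, hashElFreeList,
rangeTreeList), and the forceJoinScore/weakLevel values passed to peakClusterItems are made-up
placeholders.

/* Hypothetical driver, not part of this commit: cluster peaks from all sources
 * and print one line per cluster. */
#include "common.h"
#include "hash.h"
#include "localmem.h"
#include "rangeTree.h"
#include "peakCluster.h"

void clusterAndPrint(char *sourceTable, int dimCount)
/* Load sources listed in sourceTable, cluster their peaks, and print clusters. */
{
struct peakSource *source, *sourceList = peakSourceLoadAll(sourceTable, dimCount);
struct peakClusterMaker *maker = peakClusterMakerNew();
for (source = sourceList; source != NULL; source = source->next)
    peakClusterMakerAddFromSource(maker, source);

struct hashEl *chrom, *chromList = peakClusterMakerChromList(maker);
struct lm *lm = lmInit(0);   /* Holds the peakCluster structures. */
for (chrom = chromList; chrom != NULL; chrom = chrom->next)
    {
    struct rbTree *tree = chrom->val;
    struct range *range, *rangeList = rangeTreeList(tree);
    for (range = rangeList; range != NULL; range = range->next)
        {
        /* range->val is the list of peakItems that piled up in this range. */
        struct peakCluster *cluster, *clusterList =
        	peakClusterItems(lm, range->val, 2.0, 0.1);  /* example parameters */
        for (cluster = clusterList; cluster != NULL; cluster = cluster->next)
            printf("%s\t%d\t%d\t%g\t%g\n", cluster->chrom, cluster->chromStart,
                cluster->chromEnd, cluster->score, cluster->maxSubScore);
        }
    }
hashElFreeList(&chromList);
lmCleanup(&lm);
peakClusterMakerFree(&maker);
}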