fb0b9a237e0c6936a069af0afe7107db15f822d6 angie Wed May 4 15:08:28 2011 -0700 Feature #3711 (center-weighted alpha haplo sorting for vcfTabix):Performance improvement for hacTree: if caller passes in comparison function, then pre-sort the items and pre-cluster adjacent identical items before generating pairs. When many of the inputs are identical, this greatly reduces the number of pairs that the main clustering algorithm starts with. For example, a 1000 Genomes file has genotypes for 1360 people (2720 haplotypes), and starting with all pairs of 2720 haps was impossibly slow for hgTracks. However, in regions of a few tens of thousands of bases and a few tens of variants, in practice there's usually less than 100 distinct haplotypes, which makes it possible to cluster in tenths of seconds instead of timing out. The pre-clustering also makes nice balanced trees; the main clustering step still seems prone to chaining to me, so there's probably still more room for improvement there. diff --git src/lib/hacTree.c src/lib/hacTree.c index 1546fa7..f05dcc0 100644 --- src/lib/hacTree.c +++ src/lib/hacTree.c @@ -1,114 +1,210 @@ /* hacTree - Hierarchical Agglomerative Clustering a list of inputs into a binary tree */ #include "common.h" #include "hacTree.h" static struct hacTree *leafNodesFromItems(const struct slList *itemList, int itemCount, struct lm *localMem) /* Allocate & initialize leaf nodes that contain only items. */ { struct hacTree *leafNodes = lmAlloc(localMem, itemCount * sizeof(struct hacTree)); int i = 0; const struct slList *item = itemList; while (item != NULL && i < itemCount) { // needMem zeroes the memory, so initialize only non-NULL stuff. struct hacTree *node = &(leafNodes[i]); if (i < itemCount-1) node->next = &(leafNodes[i+1]); node->itemOrCluster = (struct slList *)item; i++; item = item->next; } return leafNodes; } -static void initNode(struct hacTree *node, struct hacTree *left, struct hacTree *right, +struct sortWrapper +/* We need to compare nodes' itemOrClusters using cmpF and extraData; + * qsort's comparison function doesn't have a way to pass in extraData, + * so we need to point to it from each qsort element. */ +{ + struct hacTree *node; // contains itemOrCluster to be compared + hacCmpFunction *cmpF; // user-provided itemOrCluster comparison function + void *extraData; // user-provided aux data for cmpF +}; + +static int sortWrapCmp(const void *v1, const void *v2) +/* Unpack sortWrappers and run cmpF on nodes' itemOrClusters with extraData. */ +{ +const struct sortWrapper *w1 = v1, *w2 = v2; +return w1->cmpF(w1->node->itemOrCluster, w2->node->itemOrCluster, w1->extraData); +} + +static struct sortWrapper *makeSortedWraps(struct hacTree *leafNodes, int itemCount, + struct lm *localMem, hacCmpFunction cmpF, + void *extraData) +/* Use cmpF and extraData to sort wrapped leaves so that identical leaves will be adjacent. */ +{ +struct sortWrapper *leafWraps = lmAlloc(localMem, itemCount * sizeof(struct sortWrapper)); +int i; +for (i=0; i < itemCount; i++) + { + leafWraps[i].node = &(leafNodes[i]); + leafWraps[i].cmpF = cmpF; + leafWraps[i].extraData = extraData; + } +qsort(leafWraps, itemCount, sizeof(struct sortWrapper), sortWrapCmp); +return leafWraps; +} + +INLINE void initNode(struct hacTree *node, const struct hacTree *left, const struct hacTree *right, hacDistanceFunction *distF, hacMergeFunction *mergeF, void *extraData) /* Initialize node to have left and right as its children. Leave parent pointers * alone -- they would be unstable during tree construction. */ { -node->left = left; -node->right = right; +node->left = (struct hacTree *)left; +node->right = (struct hacTree *)right; node->childDistance = distF(left->itemOrCluster, right->itemOrCluster, extraData); node->itemOrCluster = mergeF(left->itemOrCluster, right->itemOrCluster, extraData); } -static struct hacTree *pairUpItems(const struct slList *itemList, int itemCount, - struct lm *localMem, +INLINE struct hacTree preClusterNodes(const struct sortWrapper *leafWraps, int i, int runLength, hacDistanceFunction *distF, hacMergeFunction *mergeF, + void *extraData, struct lm *localMem) +/* Caller has allocated a node, and this returns what to store there: + * a recursively constructed cluster of nodes extracted from wrapped + * leafNodes (leafWraps) starting at i, for runLength items. */ +{ +struct hacTree ret; +if (runLength > 2) + { + struct hacTree *newClusters = lmAlloc(localMem, 2 * sizeof(struct hacTree)); + int halfLength = runLength/2; + newClusters[0] = preClusterNodes(leafWraps, i, halfLength, + distF, mergeF, extraData, localMem); + newClusters[1] = preClusterNodes(leafWraps, i+halfLength, runLength-halfLength, + distF, mergeF, extraData, localMem); + initNode(&ret, &(newClusters[0]), &(newClusters[1]), distF, mergeF, extraData); + } +else if (runLength == 2) + { + initNode(&ret, leafWraps[i].node, leafWraps[i+1].node, distF, mergeF, extraData); + } +else + ret = *(leafWraps[i].node); +return ret; +} + +static struct hacTree *sortAndPreCluster(struct hacTree *leafNodes, int *retItemCount, + struct lm *localMem, hacDistanceFunction *distF, + hacMergeFunction *mergeF, hacCmpFunction *cmpF, void *extraData) +/* Use cmpF and extraData to sort wrapped leaf nodes so that identical leaves will be adjacent, + * then replace leaves with clusters of identical leaves where possible. Place new + * (hopefully smaller) item count in retItemCount. */ +{ +int itemCount = *retItemCount; +struct sortWrapper *leafWraps = makeSortedWraps(leafNodes, itemCount, localMem, cmpF, extraData); +struct hacTree *newLeaves = lmAlloc(localMem, itemCount * sizeof(struct hacTree)); +int i=0, newI=0; +while (i < itemCount) + { + int nextRunStart; + for (nextRunStart = i+1; nextRunStart < itemCount; nextRunStart++) + if (distF(leafWraps[i].node->itemOrCluster, leafWraps[nextRunStart].node->itemOrCluster, + extraData) != 0) + break; + int runLength = nextRunStart - i; + newLeaves[newI] = preClusterNodes(leafWraps, i, runLength, distF, mergeF, extraData, localMem); + i = nextRunStart; + newI++; + } +*retItemCount = newI; +return newLeaves; +} + +static struct hacTree *pairUpItems(const struct slList *itemList, int itemCount, + int *retPairCount, struct lm *localMem, + hacDistanceFunction *distF, hacMergeFunction *mergeF, + hacCmpFunction *cmpF, void *extraData) /* Allocate & initialize leaf nodes and all possible pairings of leaf nodes - * which will be our seed clusters. */ + * which will be our seed clusters. If cmpF is given, pre-sort the leaf nodes + * and pre-cluster identical leaves before generating seed clusters. */ { struct hacTree *leafNodes = leafNodesFromItems(itemList, itemCount, localMem); +if (cmpF != NULL) + leafNodes = sortAndPreCluster(leafNodes, &itemCount, localMem, + distF, mergeF, cmpF, extraData); int pairCount = itemCount * (itemCount-1) / 2; struct hacTree *pairHeap = lmAlloc(localMem, pairCount * sizeof(struct hacTree)); int i, j, pairIx; for (i=0, pairIx=0; i < itemCount-1; i++) for (j=i+1; j < itemCount; j++, pairIx++) initNode(&(pairHeap[pairIx]), &(leafNodes[i]), &(leafNodes[j]), distF, mergeF, extraData); +*retPairCount = pairCount; return pairHeap; } struct hacTree *hacTreeFromItems(const struct slList *itemList, struct lm *localMem, hacDistanceFunction *distF, hacMergeFunction *mergeF, - void *extraData) -/* Using distF, mergeF, and binary tree operations, perform a - * hierarchical agglomerative (bottom-up) clustering of items. - * To free the resulting tree, lmCleanup(&localMem). */ + hacCmpFunction *cmpF, void *extraData) +/* Using distF, mergeF, optionally cmpF and binary tree operations, + * perform a hierarchical agglomerative (bottom-up) clustering of + * items. To free the resulting tree, lmCleanup(&localMem). */ { if (itemList == NULL) return NULL; struct hacTree *root = NULL; int itemCount = slCount(itemList); -struct hacTree *leafPairs = pairUpItems(itemList, itemCount, localMem, distF, mergeF, extraData); +int pairCount = 0; +struct hacTree *leafPairs = pairUpItems(itemList, itemCount, &pairCount, localMem, + distF, mergeF, cmpF, extraData); struct hacTree *heapHead = leafPairs; -int heapLength = itemCount * (itemCount-1) / 2; +int heapLength = pairCount; while (heapLength > 0) { // Scan heap for node with lowest childDistance; swap that node w/head int i; int bestIx = 0; double minScore = heapHead[0].childDistance; for (i=1; i < heapLength; i++) if (heapHead[i].childDistance < minScore) { minScore = heapHead[i].childDistance; bestIx = i; } if (bestIx != 0) swapBytes((char *)&(heapHead[0]), (char *)&(heapHead[bestIx]), sizeof(struct hacTree)); // Pop the best (lowest-distance) node from heapHead, make it root (for now). root = heapHead; heapHead = &(heapHead[1]); heapLength--; // Where root->left is found in the heap, replace it with root. // Where root->right is found, drop that node so it doesn't become // a duplicate of the replacement cases. for (i=0; i < heapLength; i++) { struct hacTree *node = &(heapHead[i]); if (node->left == root->left) initNode(node, root, node->right, distF, mergeF, extraData); else if (node->right == root->left) initNode(node, root, node->left, distF, mergeF, extraData); else if (node->left == root->right || node->right == root->right) { if (i < heapLength-1) memcpy(node, &(heapHead[i+1]), (heapLength-i-1)*sizeof(struct hacTree)); heapLength--; i--; } } // root now has a stable address, unlike nodes still in the heap, so set parents here: if (root->left != NULL) root->left->parent = root; if (root->right != NULL) root->right->parent = root; } // This shouldn't be necessary as long as initNode leaves parent pointers alone, // but just in case that changes: root->parent = NULL; return root; }