c564b740205151223285791ca4b238ca4d4dcebb angie Fri May 6 11:18:37 2011 -0700 Code Review #3822: Added long explanatory comment for main clusteringstep based on Jim's suggestions. In the process, I realized that I'm using a pool not a true heap, so I changed variable names and comments accordingly. diff --git src/lib/hacTree.c src/lib/hacTree.c index 3d8d90c..eda012f 100644 --- src/lib/hacTree.c +++ src/lib/hacTree.c @@ -124,87 +124,128 @@ } static struct hacTree *pairUpItems(const struct slList *itemList, int itemCount, int *retPairCount, struct lm *localMem, hacDistanceFunction *distF, hacMergeFunction *mergeF, hacCmpFunction *cmpF, void *extraData) /* Allocate & initialize leaf nodes and all possible pairings of leaf nodes * which will be our seed clusters. If cmpF is given, pre-sort the leaf nodes * and pre-cluster identical leaves before generating seed clusters. */ { struct hacTree *leafNodes = leafNodesFromItems(itemList, itemCount, localMem); if (cmpF != NULL) leafNodes = sortAndPreCluster(leafNodes, &itemCount, localMem, distF, mergeF, cmpF, extraData); int pairCount = itemCount * (itemCount-1) / 2; -struct hacTree *pairHeap = lmAlloc(localMem, pairCount * sizeof(struct hacTree)); +struct hacTree *pairPool = lmAlloc(localMem, pairCount * sizeof(struct hacTree)); int i, j, pairIx; for (i=0, pairIx=0; i < itemCount-1; i++) for (j=i+1; j < itemCount; j++, pairIx++) - initNode(&(pairHeap[pairIx]), &(leafNodes[i]), &(leafNodes[j]), distF, mergeF, extraData); + initNode(&(pairPool[pairIx]), &(leafNodes[i]), &(leafNodes[j]), distF, mergeF, extraData); *retPairCount = pairCount; -return pairHeap; +return pairPool; } struct hacTree *hacTreeFromItems(const struct slList *itemList, struct lm *localMem, hacDistanceFunction *distF, hacMergeFunction *mergeF, hacCmpFunction *cmpF, void *extraData) /* Using distF, mergeF, optionally cmpF and binary tree operations, * perform a hierarchical agglomerative (bottom-up) clustering of * items. To free the resulting tree, lmCleanup(&localMem). */ +// +// Implementation: +// +// Create a pool containing all pairs of items (N*(N-1)/2), and build +// a hierarchical binary tree of items from the bottom up. In each +// iteration, first we find the closest pair and swap it into the head +// of the pool; then we advance the head pointer, so the closest pair +// now has a stable location in memory. Next, for all pairs still in +// the pool, we replace references to the elements of the closest pair +// with the closest pair itself, but delete half of such pairs because +// they would be duplicates. Specifically, we keep pairs that had the +// left element of the closest pair, and delete pairs that had the +// right element of the closest pair. We rescore the pairs that have +// the closest pair swapped in for an element. The code to do all +// this is surprisingly simple -- in the second for loop below. Note +// that with each iteration, the pool will reduce in size, by N-2 the +// first iteration, N-3 the second, and so forth. +// +// An example may help: say we start with items A, B, C and D. Initially +// the pool contains all pairs: +// (A, B) (A, C) (A, D) (B, C) (B, D) (C, D) +// +// If (A, B) is the closest pair, we pop it from the pool and the pool +// becomes +// (A, C) (A, D) (B, C) (B, D) (C, D) +// +// Now we substitute (A, B) for pool pairs containing A, and delete pool +// pairs contining B because they would be duplicates of those containing +// A. [X] shows where a pair was deleted: +// +// ((A, B), C) ((A, B), D) [X] [X] (C, D) +// +// Now say ((A, B), D) is the closest remaining pair, and is popped from +// the head of the pool. We substitute into pairs containing (A, B) and +// delete pairs containing D. After the replacement step, the pool is +// down to a single element: +// +// (((A, B), D), C) [X] { if (itemList == NULL) return NULL; struct hacTree *root = NULL; int itemCount = slCount(itemList); int pairCount = 0; struct hacTree *leafPairs = pairUpItems(itemList, itemCount, &pairCount, localMem, distF, mergeF, cmpF, extraData); -struct hacTree *heapHead = leafPairs; -int heapLength = pairCount; -while (heapLength > 0) +struct hacTree *poolHead = leafPairs; +int poolLength = pairCount; +while (poolLength > 0) { - // Scan heap for node with lowest childDistance; swap that node w/head + // Scan pool for node with lowest childDistance; swap that node w/head int i; int bestIx = 0; - double minScore = heapHead[0].childDistance; - for (i=1; i < heapLength; i++) - if (heapHead[i].childDistance < minScore) + double minScore = poolHead[0].childDistance; + for (i=1; i < poolLength; i++) + if (poolHead[i].childDistance < minScore) { - minScore = heapHead[i].childDistance; + minScore = poolHead[i].childDistance; bestIx = i; } if (bestIx != 0) - swapBytes((char *)&(heapHead[0]), (char *)&(heapHead[bestIx]), sizeof(struct hacTree)); - // Pop the best (lowest-distance) node from heapHead, make it root (for now). - root = heapHead; - heapHead = &(heapHead[1]); - heapLength--; - // Where root->left is found in the heap, replace it with root. + swapBytes((char *)&(poolHead[0]), (char *)&(poolHead[bestIx]), sizeof(struct hacTree)); + // Pop the best (lowest-distance) node from poolHead, make it root (for now). + root = poolHead; + poolHead = &(poolHead[1]); + poolLength--; + // Where root->left is found in the pool, replace it with root. // Where root->right is found, drop that node so it doesn't become // a duplicate of the replacement cases. - for (i=0; i < heapLength; i++) + for (i=0; i < poolLength; i++) { - struct hacTree *node = &(heapHead[i]); + struct hacTree *node = &(poolHead[i]); if (node->left == root->left) + // found root->left; replace node->left with root (merge root with node->right): initNode(node, root, node->right, distF, mergeF, extraData); else if (node->right == root->left) + // found root->left; replace node->right with root (merge root with node->left): initNode(node, root, node->left, distF, mergeF, extraData); else if (node->left == root->right || node->right == root->right) { - if (i < heapLength-1) - memcpy(node, &(heapHead[i+1]), (heapLength-i-1)*sizeof(struct hacTree)); - heapLength--; + // found root->right; drop this node: + if (i < poolLength-1) + memcpy(node, &(poolHead[i+1]), (poolLength-i-1)*sizeof(struct hacTree)); + poolLength--; i--; } } - // root now has a stable address, unlike nodes still in the heap, so set parents here: + // root now has a stable address, unlike nodes still in the pool, so set parents here: if (root->left != NULL) root->left->parent = root; if (root->right != NULL) root->right->parent = root; } // This shouldn't be necessary as long as initNode leaves parent pointers alone, // but just in case that changes: root->parent = NULL; return root; }