e17f555375caa88ec04c1bd824cb776fddd1ea54 kent Sun Aug 24 20:40:53 2014 -0700 Moving an alternative hacTree implementation that minimizes the merge steps to the library. diff --git src/lib/hacTree.c src/lib/hacTree.c index e843b26..6752a70 100644 --- src/lib/hacTree.c +++ src/lib/hacTree.c @@ -1,21 +1,23 @@ /* hacTree - Hierarchical Agglomerative Clustering a list of inputs into a binary tree */ /* Copyright (C) 2011 The Regents of the University of California * See README in this or parent directory for licensing information. */ #include "common.h" +#include "dlist.h" +#include "hash.h" #include "hacTree.h" static struct hacTree *leafNodesFromItems(const struct slList *itemList, int itemCount, struct lm *localMem) /* Allocate & initialize leaf nodes that contain only items. */ { struct hacTree *leafNodes = lmAlloc(localMem, itemCount * sizeof(struct hacTree)); int i = 0; const struct slList *item = itemList; while (item != NULL && i < itemCount) { // needMem zeroes the memory, so initialize only non-NULL stuff. struct hacTree *node = &(leafNodes[i]); if (i < itemCount-1) node->next = &(leafNodes[i+1]); @@ -265,15 +267,116 @@ newPoolLen += blkSize; } poolLength = newPoolLen; } // root now has a stable address, unlike nodes still in the pool, so set parents here: if (root->left != NULL) root->left->parent = root; if (root->right != NULL) root->right->parent = root; } // This shouldn't be necessary as long as initNode leaves parent pointers alone, // but just in case that changes: root->parent = NULL; return root; } + +/** The code from here on down is an alternative implementation that calls the merge + ** function and for that matter the distance function much less than the function + ** above. */ + +static void findClosestPair(struct dlList *list, struct hash *distanceHash, + hacDistanceFunction *distF, void *extraData, struct dlNode **retNodeA, struct dlNode **retNodeB) +/* Loop through list returning closest two nodes */ +{ +struct dlNode *aNode; +double closestDistance = BIGDOUBLE; +struct dlNode *closestA = NULL, *closestB = NULL; +for (aNode = list->head; !dlEnd(aNode); aNode = aNode->next) + { + struct hacTree *aHt = aNode->val; + struct slList *a = aHt->itemOrCluster; + struct dlNode *bNode; + for (bNode = aNode->next; !dlEnd(bNode); bNode = bNode->next) + { + struct hacTree *bHt = bNode->val; + char key[64]; + safef(key, sizeof(key), "%p %p", aHt, bHt); + double *pd = hashFindVal(distanceHash, key); + if (pd == NULL) + { + lmAllocVar(distanceHash->lm, pd); + *pd = distF(a, bHt->itemOrCluster, extraData); + hashAdd(distanceHash, key, pd); + } + double d = *pd; + if (d < closestDistance) + { + closestDistance = d; + closestA = aNode; + closestB = bNode; + } + } + } +*retNodeA = closestA; +*retNodeB = closestB; +} + +static void lmDlAddValTail(struct lm *lm, struct dlList *list, void *val) +/* Allocate new dlNode out of lm, initialize it with val, and add it to end of list */ +{ +struct dlNode *node; +lmAllocVar(lm, node); +node->val = val; +dlAddTail(list, node); +} + +struct hacTree *hacTreeForCostlyMerges(struct slList *itemList, struct lm *localMem, + hacDistanceFunction *distF, hacMergeFunction *mergeF, + void *extraData) +/* Construct hacTree using a method that will minimize the number of calls to + * the distance and merge functions, assuming they are expensive. Do a lmCleanup(localMem) + * to free the returned tree. */ +{ +/* Make up a doubly-linked list in 'remaining' with all items in it */ +struct dlList remaining; +dlListInit(&remaining); +struct slList *item; +int count = 0; +struct hash *distanceHash = hashNew(0); +for (item = itemList; item != NULL; item = item->next) + { + struct hacTree *ht; + lmAllocVar(localMem, ht); + ht->itemOrCluster = item; + lmDlAddValTail(localMem, &remaining, ht); + count += 1; + } + +/* Loop through finding closest and merging until only one node left on remaining. */ +int i; +for (i=1; ileft = aNode->val; + struct hacTree *right = ht->right = bNode->val; + left->parent = right->parent = ht; + ht->itemOrCluster = mergeF(left->itemOrCluster, right->itemOrCluster, extraData); + + /* Put merged item onto remaining list. */ + lmDlAddValTail(localMem, &remaining, ht); + } + +/* Clean up and go home. */ +hashFree(&distanceHash); +struct dlNode *lastNode = dlPopHead(&remaining); +return lastNode->val; +} +