e17f555375caa88ec04c1bd824cb776fddd1ea54
kent
  Sun Aug 24 20:40:53 2014 -0700
Moving an alternative hacTree implementation that minimizes the merge steps to the library.
diff --git src/lib/hacTree.c src/lib/hacTree.c
index e843b26..6752a70 100644
--- src/lib/hacTree.c
+++ src/lib/hacTree.c
@@ -1,21 +1,23 @@
 /* hacTree - Hierarchical Agglomerative Clustering a list of inputs into a binary tree */
 
 /* Copyright (C) 2011 The Regents of the University of California 
  * See README in this or parent directory for licensing information. */
 
 #include "common.h"
+#include "dlist.h"
+#include "hash.h"
 #include "hacTree.h"
 
 static struct hacTree *leafNodesFromItems(const struct slList *itemList, int itemCount,
 					  struct lm *localMem)
 /* Allocate & initialize leaf nodes that contain only items. */
 {
 struct hacTree *leafNodes = lmAlloc(localMem, itemCount * sizeof(struct hacTree));
 int i = 0;
 const struct slList *item = itemList;
 while (item != NULL && i < itemCount)
     {
     // needMem zeroes the memory, so initialize only non-NULL stuff.
     struct hacTree *node = &(leafNodes[i]);
     if (i < itemCount-1)
 	node->next = &(leafNodes[i+1]);
@@ -265,15 +267,116 @@
 	    newPoolLen += blkSize;
 	    }
 	poolLength = newPoolLen;
 	}
     // root now has a stable address, unlike nodes still in the pool, so set parents here:
     if (root->left != NULL)
 	root->left->parent = root;
     if (root->right != NULL)
 	root->right->parent = root;
     }
 // This shouldn't be necessary as long as initNode leaves parent pointers alone,
 // but just in case that changes:
 root->parent = NULL;
 return root;
 }
+
+/** The code from here on down is an alternative implementation that calls the merge
+ ** function and for that matter the distance function much less than the function
+ ** above.  */
+
+static void findClosestPair(struct dlList *list, struct hash *distanceHash, 
+    hacDistanceFunction *distF, void *extraData, struct dlNode **retNodeA, struct dlNode **retNodeB)
+/* Loop through list returning closest two nodes */
+{
+struct dlNode *aNode;
+double closestDistance = BIGDOUBLE;
+struct dlNode *closestA = NULL, *closestB = NULL;
+for (aNode = list->head; !dlEnd(aNode); aNode = aNode->next)
+    {
+    struct hacTree *aHt = aNode->val;
+    struct slList *a = aHt->itemOrCluster;
+    struct dlNode *bNode;
+    for (bNode = aNode->next; !dlEnd(bNode); bNode = bNode->next)
+        {
+	struct hacTree *bHt = bNode->val;
+	char key[64];
+	safef(key, sizeof(key), "%p %p", aHt, bHt);
+	double *pd = hashFindVal(distanceHash, key);
+	if (pd == NULL)
+	     {
+	     lmAllocVar(distanceHash->lm, pd);
+	     *pd = distF(a, bHt->itemOrCluster, extraData);
+	     hashAdd(distanceHash, key, pd);
+	     }
+	double d = *pd;
+	if (d < closestDistance)
+	    {
+	    closestDistance = d;
+	    closestA = aNode;
+	    closestB = bNode;
+	    }
+	}
+    }
+*retNodeA = closestA;
+*retNodeB = closestB;
+}
+
+static void lmDlAddValTail(struct lm *lm, struct dlList *list, void *val)
+/* Allocate new dlNode out of lm, initialize it with val, and add it to end of list */
+{
+struct dlNode *node;
+lmAllocVar(lm, node);
+node->val = val;
+dlAddTail(list, node);
+}
+
+struct hacTree *hacTreeForCostlyMerges(struct slList *itemList, struct lm *localMem,
+				 hacDistanceFunction *distF, hacMergeFunction *mergeF,
+				 void *extraData)
+/* Construct hacTree using a method that will minimize the number of calls to
+ * the distance and merge functions, assuming they are expensive.  Do a lmCleanup(localMem)
+ * to free the returned tree. */
+{
+/* Make up a doubly-linked list in 'remaining' with all items in it */
+struct dlList remaining;
+dlListInit(&remaining);
+struct slList *item;
+int count = 0;
+struct hash *distanceHash = hashNew(0);
+for (item = itemList; item != NULL; item = item->next)
+    {
+    struct hacTree *ht;
+    lmAllocVar(localMem, ht);
+    ht->itemOrCluster = item;
+    lmDlAddValTail(localMem, &remaining, ht);
+    count += 1;
+    }
+
+/* Loop through finding closest and merging until only one node left on remaining. */
+int i;
+for (i=1; i<count; ++i)
+    {
+    /* Find closest pair and take them off of remaining list */
+    struct dlNode *aNode, *bNode;
+    findClosestPair(&remaining, distanceHash, distF, extraData, &aNode, &bNode);
+    dlRemove(aNode);
+    dlRemove(bNode);
+
+    /* Allocated new hacTree item for them and fill it in with a merged value. */
+    struct hacTree *ht;
+    lmAllocVar(localMem, ht);
+    struct hacTree *left = ht->left = aNode->val;
+    struct hacTree *right = ht->right = bNode->val;
+    left->parent = right->parent = ht;
+    ht->itemOrCluster = mergeF(left->itemOrCluster, right->itemOrCluster, extraData);
+
+    /* Put merged item onto remaining list. */
+    lmDlAddValTail(localMem, &remaining, ht);
+    }
+
+/* Clean up and go home. */
+hashFree(&distanceHash);
+struct dlNode *lastNode = dlPopHead(&remaining);
+return lastNode->val;
+}
+