ac5918d51bed4db6f5ac837e6660b322123bbd7c
angie
  Tue Apr 26 16:36:45 2011 -0700
Added new lib module hacTree (Hierarchical Agglomerative Clustering),which takes an slList of items and a couple user-defined functions,
and returns a binary tree of clusters, with the leaf nodes containing
the original input items.  The user-defined functions do the interesting
parts: computing distance between two items and/or clusters, and
merging two items and/or clusters into a new cluster.
This is motivated by work on Feature #3711 (vcfTabix: center-weighted
haplotype sorting for display of phased genotypes), but I'm hoping
it might have other uses.

diff --git src/lib/hacTree.c src/lib/hacTree.c
new file mode 100644
index 0000000..1546fa7
--- /dev/null
+++ src/lib/hacTree.c
@@ -0,0 +1,114 @@
+/* hacTree - Hierarchical Agglomerative Clustering a list of inputs into a binary tree */
+
+#include "common.h"
+#include "hacTree.h"
+
+static struct hacTree *leafNodesFromItems(const struct slList *itemList, int itemCount,
+					  struct lm *localMem)
+/* Allocate & initialize leaf nodes that contain only items. */
+{
+struct hacTree *leafNodes = lmAlloc(localMem, itemCount * sizeof(struct hacTree));
+int i = 0;
+const struct slList *item = itemList;
+while (item != NULL && i < itemCount)
+    {
+    // needMem zeroes the memory, so initialize only non-NULL stuff.
+    struct hacTree *node = &(leafNodes[i]);
+    if (i < itemCount-1)
+	node->next = &(leafNodes[i+1]);
+    node->itemOrCluster = (struct slList *)item;
+    i++;
+    item = item->next;
+    }
+return leafNodes;
+}
+
+static void initNode(struct hacTree *node, struct hacTree *left, struct hacTree *right,
+		     hacDistanceFunction *distF, hacMergeFunction *mergeF, void *extraData)
+/* Initialize node to have left and right as its children.  Leave parent pointers
+ * alone -- they would be unstable during tree construction. */
+{
+node->left = left;
+node->right = right;
+node->childDistance = distF(left->itemOrCluster, right->itemOrCluster, extraData);
+node->itemOrCluster = mergeF(left->itemOrCluster, right->itemOrCluster, extraData);
+}
+
+static struct hacTree *pairUpItems(const struct slList *itemList, int itemCount,
+				   struct lm *localMem,
+				   hacDistanceFunction *distF, hacMergeFunction *mergeF,
+				   void *extraData)
+/* Allocate & initialize leaf nodes and all possible pairings of leaf nodes
+ * which will be our seed clusters. */
+{
+struct hacTree *leafNodes = leafNodesFromItems(itemList, itemCount, localMem);
+int pairCount = itemCount * (itemCount-1) / 2;
+struct hacTree *pairHeap = lmAlloc(localMem, pairCount * sizeof(struct hacTree));
+int i, j, pairIx;
+for (i=0, pairIx=0;  i < itemCount-1;  i++)
+    for (j=i+1;  j < itemCount;  j++, pairIx++)
+	initNode(&(pairHeap[pairIx]), &(leafNodes[i]), &(leafNodes[j]), distF, mergeF, extraData);
+return pairHeap;
+}
+
+struct hacTree *hacTreeFromItems(const struct slList *itemList, struct lm *localMem,
+				 hacDistanceFunction *distF, hacMergeFunction *mergeF,
+				 void *extraData)
+/* Using distF, mergeF, and binary tree operations, perform a
+ * hierarchical agglomerative (bottom-up) clustering of items.
+ * To free the resulting tree, lmCleanup(&localMem). */
+{
+if (itemList == NULL)
+    return NULL;
+struct hacTree *root = NULL;
+int itemCount = slCount(itemList);
+struct hacTree *leafPairs = pairUpItems(itemList, itemCount, localMem, distF, mergeF, extraData);
+struct hacTree *heapHead = leafPairs;
+int heapLength = itemCount * (itemCount-1) / 2;
+while (heapLength > 0)
+    {
+    // Scan heap for node with lowest childDistance; swap that node w/head
+    int i;
+    int bestIx = 0;
+    double minScore = heapHead[0].childDistance;
+    for (i=1;  i < heapLength;  i++)
+	if (heapHead[i].childDistance < minScore)
+	    {
+	    minScore = heapHead[i].childDistance;
+	    bestIx = i;
+	    }
+    if (bestIx != 0)
+	swapBytes((char *)&(heapHead[0]), (char *)&(heapHead[bestIx]), sizeof(struct hacTree));
+    // Pop the best (lowest-distance) node from heapHead, make it root (for now).
+    root = heapHead;
+    heapHead = &(heapHead[1]);
+    heapLength--;
+    // Where root->left is found in the heap, replace it with root.
+    // Where root->right is found, drop that node so it doesn't become
+    // a duplicate of the replacement cases.
+    for (i=0;  i < heapLength;  i++)
+	{
+	struct hacTree *node = &(heapHead[i]);
+	if (node->left == root->left)
+	    initNode(node, root, node->right, distF, mergeF, extraData);
+	else if (node->right == root->left)
+	    initNode(node, root, node->left, distF, mergeF, extraData);
+	else if (node->left == root->right || node->right == root->right)
+	    {
+	    if (i < heapLength-1)
+		memcpy(node, &(heapHead[i+1]), (heapLength-i-1)*sizeof(struct hacTree));
+	    heapLength--;
+	    i--;
+	    }
+	}
+    // root now has a stable address, unlike nodes still in the heap, so set parents here:
+    if (root->left != NULL)
+	root->left->parent = root;
+    if (root->right != NULL)
+	root->right->parent = root;
+    }
+// This shouldn't be necessary as long as initNode leaves parent pointers alone,
+// but just in case that changes:
+root->parent = NULL;
+return root;
+}