ac5918d51bed4db6f5ac837e6660b322123bbd7c
angie
  Tue Apr 26 16:36:45 2011 -0700
Added new lib module hacTree (Hierarchical Agglomerative Clustering),which takes an slList of items and a couple user-defined functions,
and returns a binary tree of clusters, with the leaf nodes containing
the original input items.  The user-defined functions do the interesting
parts: computing distance between two items and/or clusters, and
merging two items and/or clusters into a new cluster.
This is motivated by work on Feature #3711 (vcfTabix: center-weighted
haplotype sorting for display of phased genotypes), but I'm hoping
it might have other uses.

diff --git src/inc/hacTree.h src/inc/hacTree.h
new file mode 100644
index 0000000..41e7a58
--- /dev/null
+++ src/inc/hacTree.h
@@ -0,0 +1,69 @@
+/* hacTree - Hierarchical Agglomerative Clustering a list of inputs into a binary tree */
+/*
+ * This clustering algorithm can be used for problems with high
+ * dimensionality, such as clustering sequences by similarity.  It can
+ * be used for problems with low dimensionality too, such as
+ * clustering a sequence of numbers (1-dimensional: points along the
+ * number line), but there are more efficient algorithms for problems
+ * with low dimensionality.
+ *
+ * In order to use this, you need to provide functions that operate on a
+ * pair of input items (and extra data pointer):
+ * 1. a distance function that takes two input items and returns a number,
+ *    small for short distances/high similarity and large for long
+ *    distances/low similarity
+ * 2. a merging function that takes two input items and returns a new item
+ *    that represents both of them.  Therefore the data structure that
+ *    represents an input item must also be capable of representing a
+ *    cluster of input items.  Usually this is done by combining values
+ *    from input items into a new value that will compare properly with
+ *    other items or clusters in the distance function.
+ *
+ * Then pass in an slList of items, pointers to those two functions, and
+ * a pointer to data used in your distance and merging functions (or NULL).
+ * You get back a pointer to the root node of a binary tree structure
+ * where each leaf node contains one of your items and each node above
+ * that contains a cluster.  The root node contains the cluster of all items.
+ *
+ * kent/src/lib/tests/hacTreeTest.c contains a couple simple examples.
+ *
+ * To get the top k clusters, perform a breadth-first,
+ * lowest-distance-child-first search of the tree to find the top k
+ * nodes/clusters with the smallest distances from the root node/cluster.
+ * [It would be nice for this module to provide that function, if/when a need for it arises.]
+ */
+#ifndef HACTREE_H
+#define HACTREE_H
+
+#include "localmem.h"
+
+/* Caller-provided function to measure distance between two items or clusters */
+/* Note: this must be able to handle identical inputs or NULL inputs */
+/* Note: this should be monotonic and nonnegative, *not* a + or -
+ * difference (as would be used for comparing/ordering/sorting) */
+typedef double hacDistanceFunction(const struct slList *item1, const struct slList *item2, void *extraData);
+
+/* Caller-provided function to merge two items or clusters into a new cluster */
+/* Note: this must be able to handle NULL inputs */
+typedef struct slList *(hacMergeFunction)(const struct slList *item1, const struct slList *item2, void *extraData);
+
+/* Structure of nodes of the binary tree that contains a hierarchical clustering of inputs */
+struct hacTree
+    {
+    struct hacTree *next;       // Can have an unordered list of these
+    struct hacTree *parent;     // Cluster that contains this cluster, NULL for root node
+    struct hacTree *left;       // Left child, NULL for leaf node
+    struct hacTree *right;      // Right child, NULL for leaf node
+    double childDistance;       // Result of distance function on left and right kids
+    struct slList *itemOrCluster;  // If leaf node, one of the items passed in;
+                                // otherwise, result of merging left and right kids' itemOrClusters
+    };
+
+struct hacTree *hacTreeFromItems(const struct slList *itemList, struct lm *localMem,
+				 hacDistanceFunction *distF, hacMergeFunction *mergeF,
+				 void *extraData);
+/* Using distF, mergeF, and binary tree operations, perform a
+ * hierarchical agglomerative (bottom-up) clustering of items.
+ * To free the resulting tree, lmCleanup(&localMem). */
+
+#endif//def HACTREE_H