ac5918d51bed4db6f5ac837e6660b322123bbd7c angie Tue Apr 26 16:36:45 2011 -0700 Added new lib module hacTree (Hierarchical Agglomerative Clustering),which takes an slList of items and a couple user-defined functions, and returns a binary tree of clusters, with the leaf nodes containing the original input items. The user-defined functions do the interesting parts: computing distance between two items and/or clusters, and merging two items and/or clusters into a new cluster. This is motivated by work on Feature #3711 (vcfTabix: center-weighted haplotype sorting for display of phased genotypes), but I'm hoping it might have other uses. diff --git src/lib/tests/hacTreeTest.c src/lib/tests/hacTreeTest.c new file mode 100644 index 0000000..5929100 --- /dev/null +++ src/lib/tests/hacTreeTest.c @@ -0,0 +1,205 @@ +/* hacTreeTest - Read in items from a file and print the resulting clusters. */ +#include "common.h" +#include "linefile.h" +#include "options.h" +#include "sqlNum.h" +#include "hacTree.h" + +void usage() +/* Explain usage and exit. */ +{ +errAbort( + "hacTreeTest - Read in lines from a file and print the resulting clusters by length.\n" + "usage:\n" + " hacTreeTest in.txt out.txt\n" + "Actually, before clustering in.txt, it clusters some numbers because that is easier.\n" + "\n" + ); +} + +static struct optionSpec options[] = { + {NULL, 0}, +}; + +//------------------- numeric example ------------------ + +static void rPrintSlDoubleTree(FILE *f, struct hacTree *tree, int level) +/* Recursively print out cluster as nested-parens with {}'s around leaf nodes. */ +{ +double val = ((struct slDouble *)(tree->itemOrCluster))->val; +int i; +for (i=0; i < level; i++) + fputc(' ', f); +if (tree->left == NULL && tree->right == NULL) + { + fprintf(f, "{%0.3lf}", val); + return; + } +else if (tree->left == NULL || tree->right == NULL) + errAbort("\nHow did we get a node with one NULL kid??"); +fprintf(f, "(%0.3lf:%0.3lf:\n", val, tree->childDistance); +rPrintSlDoubleTree(f, tree->left, level+1); +fputs(",\n", f); +rPrintSlDoubleTree(f, tree->right, level+1); +fputc('\n', f); +for (i=0; i < level; i++) + fputc(' ', f); +fputs(")", f); +} + +void printSlDoubleTree(FILE *f, struct hacTree *tree) +/* Print out cluster as nested-parens with {}'s around leaf nodes. */ +{ +if (tree == NULL) + { + fputs("Empty tree.\n", f); + return; + } +rPrintSlDoubleTree(f, tree, 0); +fputc('\n', f); +} + +double slDoubleDistance(const struct slList *item1, const struct slList *item2, void *extraData) +/* Return the absolute difference between the two kids' values. */ +{ +const struct slDouble *kid1 = (const struct slDouble *)item1; +const struct slDouble *kid2 = (const struct slDouble *)item2; +double diff = kid1->val - kid2->val; +if (diff < 0) + diff = -diff; +return diff; +} + +struct slList *slDoubleMidpoint(const struct slList *item1, const struct slList *item2, + void *unusedExtraData) +/* Make a new slDouble that is the midpoint/average of kids' values. */ +{ +const struct slDouble *kid1 = (const struct slDouble *)item1; +const struct slDouble *kid2 = (const struct slDouble *)item2; +double midpoint = (kid1->val + kid2->val) / 2.0; +return (struct slList *)(slDoubleNew(midpoint)); +} + +void hacTreeTestInts(FILE *f, struct lm *localMem) +/* Cluster a list of integers (use doubles to get more precise cluster points). */ +{ +struct slDouble *sldList = NULL; +slAddHead(&sldList, slDoubleNew(0)); +slAddHead(&sldList, slDoubleNew(5)); +slAddHead(&sldList, slDoubleNew(7)); +slAddHead(&sldList, slDoubleNew(1)); +slAddHead(&sldList, slDoubleNew(-8)); +slAddHead(&sldList, slDoubleNew(12)); +slAddHead(&sldList, slDoubleNew(6)); +slAddHead(&sldList, slDoubleNew(-6)); +slAddHead(&sldList, slDoubleNew(-3)); +slAddHead(&sldList, slDoubleNew(8)); +struct hacTree *clusters = hacTreeFromItems((struct slList *)sldList, localMem, + slDoubleDistance, slDoubleMidpoint, NULL); +fputs("Clustering by numeric value:\n", f); +printSlDoubleTree(f, clusters); +} + + +//------------------- strlen(line) example ------------------ + +static void rPrintSlNameTree(FILE *f, struct hacTree *tree, int level) +/* Recursively print out cluster as nested-parens with {}'s around leaf nodes. */ +{ +char *contents = ((struct slName *)(tree->itemOrCluster))->name; +int i; +for (i=0; i < level; i++) + fputc(' ', f); +if (tree->left == NULL && tree->right == NULL) + { + fprintf(f, "{%s}", contents); + return; + } +else if (tree->left == NULL || tree->right == NULL) + errAbort("\nHow did we get a node with one NULL kid??"); +fprintf(f, "(%s:%d:\n", contents, (int)(tree->childDistance)); +rPrintSlNameTree(f, tree->left, level+1); +fputs(",\n", f); +rPrintSlNameTree(f, tree->right, level+1); +fputc('\n', f); +for (i=0; i < level; i++) + fputc(' ', f); +fputs(")", f); +} + +void printSlNameTree(FILE *f, struct hacTree *tree) +/* Print out cluster as nested-parens with {}'s around leaf nodes. */ +{ +if (tree == NULL) + { + fputs("Empty tree.\n", f); + return; + } +rPrintSlNameTree(f, tree, 0); +fputc('\n', f); +} + +double strlenDistance(const struct slList *item1, const struct slList *item2, void *extraData) +/* Return the absolute difference in length between the two kids. */ +{ +const struct slName *kid1 = (const struct slName *)item1; +const struct slName *kid2 = (const struct slName *)item2; +int diff = strlen(kid1->name) - strlen(kid2->name); +if (diff < 0) + diff = -diff; +return (double)diff; +} + +struct slList *mergeLines(const struct slList *item1, const struct slList *item2, + void *unusedExtraData) +/* Make a string that is the average length of the kids. */ +{ +const struct slName *kid1 = (const struct slName *)item1; +const struct slName *kid2 = (const struct slName *)item2; +int len1 = strlen(kid1->name); +int len2 = strlen(kid2->name); +int avgLen = (len1 + len2 + 1) / 2; +const char *longerName = (len1 >= len2) ? kid1->name : kid2->name; +return (struct slList *)slNameNewN((char *)longerName, avgLen); +} + +void hacTreeTestStrlens(char *inFileName, FILE *f, struct lm *localMem) +/* Read in items from a file and print the resulting clusters. */ +{ +struct slName *sln, *slnList = NULL; +struct lineFile *lf = lineFileOpen(inFileName, TRUE); +char *line; +int size; +while (lineFileNext(lf, &line, &size)) + { + sln = slNameNewN(line, size); + slAddHead(&slnList, sln); + } +lineFileClose(&lf); +slReverse(&slnList); +struct hacTree *clusters = hacTreeFromItems((struct slList *)slnList, localMem, + strlenDistance, mergeLines, NULL); +fputs("Clustering by string length:\n", f); +printSlNameTree(f, clusters); +carefulClose(&f); +} + +void hacTreeTest(char *inFileName, char *outFileName) +/* Read in items from a file and print the resulting clusters. */ +{ +FILE *f = mustOpen(outFileName, "w"); +struct lm *localMem = lmInit(0); +hacTreeTestInts(f, localMem); +hacTreeTestStrlens(inFileName, f, localMem); +lmCleanup(&localMem); +} + +int main(int argc, char *argv[]) +/* Process command line. */ +{ +optionInit(&argc, argv, options); +if (argc != 3) + usage(); +hacTreeTest(argv[1], argv[2]); +return 0; +}