24fb6db9cd5a85659f16acab276c85f0ec86dd84
angie
  Fri Apr 29 16:46:20 2011 -0700
Better hacTree test: instead of using a dumb contrived example, useDavid's clustering example from email.

diff --git src/lib/tests/hacTreeTest.c src/lib/tests/hacTreeTest.c
index 5929100..2375f04 100644
--- src/lib/tests/hacTreeTest.c
+++ src/lib/tests/hacTreeTest.c
@@ -1,29 +1,30 @@
 /* hacTreeTest - Read in items from a file and print the resulting clusters. */
 #include "common.h"
 #include "linefile.h"
 #include "options.h"
 #include "sqlNum.h"
 #include "hacTree.h"
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
-  "hacTreeTest - Read in lines from a file and print the resulting clusters by length.\n"
+  "hacTreeTest - Read in haplotypes from a file and print the resulting clusters.\n"
   "usage:\n"
   "   hacTreeTest in.txt out.txt\n"
+  "All lines of in.txt must have the same length.\n"
   "Actually, before clustering in.txt, it clusters some numbers because that is easier.\n"
   "\n"
   );
 }
 
 static struct optionSpec options[] = {
    {NULL, 0},
 };
 
 //------------------- numeric example ------------------
 
 static void rPrintSlDoubleTree(FILE *f, struct hacTree *tree, int level)
 /* Recursively print out cluster as nested-parens with {}'s around leaf nodes. */
 {
 double val = ((struct slDouble *)(tree->itemOrCluster))->val;
@@ -89,117 +90,163 @@
 slAddHead(&sldList, slDoubleNew(7));
 slAddHead(&sldList, slDoubleNew(1));
 slAddHead(&sldList, slDoubleNew(-8));
 slAddHead(&sldList, slDoubleNew(12));
 slAddHead(&sldList, slDoubleNew(6));
 slAddHead(&sldList, slDoubleNew(-6));
 slAddHead(&sldList, slDoubleNew(-3));
 slAddHead(&sldList, slDoubleNew(8));
 struct hacTree *clusters = hacTreeFromItems((struct slList *)sldList, localMem,
 					    slDoubleDistance, slDoubleMidpoint, NULL);
 fputs("Clustering by numeric value:\n", f);
 printSlDoubleTree(f, clusters);
 }
 
 
-//------------------- strlen(line) example ------------------
+//-------------------------------------------------------------------
+// Center-weighted alpha clustering of haplotypes -- see Redmine #3711, #2823 note 7
 
 static void rPrintSlNameTree(FILE *f, struct hacTree *tree, int level)
 /* Recursively print out cluster as nested-parens with {}'s around leaf nodes. */
 {
 char *contents = ((struct slName *)(tree->itemOrCluster))->name;
 int i;
 for (i=0;  i < level;  i++)
     fputc(' ', f);
 if (tree->left == NULL && tree->right == NULL)
     {
     fprintf(f, "{%s}", contents);
     return;
     }
 else if (tree->left == NULL || tree->right == NULL)
     errAbort("\nHow did we get a node with one NULL kid??");
-fprintf(f, "(%s:%d:\n", contents, (int)(tree->childDistance));
+fprintf(f, "(%s:%f:\n", contents, tree->childDistance);
 rPrintSlNameTree(f, tree->left, level+1);
 fputs(",\n", f);
 rPrintSlNameTree(f, tree->right, level+1);
 fputc('\n', f);
 for (i=0;  i < level;  i++)
     fputc(' ', f);
 fputs(")", f);
 }
 
 void printSlNameTree(FILE *f, struct hacTree *tree)
 /* Print out cluster as nested-parens with {}'s around leaf nodes. */
 {
 if (tree == NULL)
     {
     fputs("Empty tree.\n", f);
     return;
     }
 rPrintSlNameTree(f, tree, 0);
 fputc('\n', f);
 }
 
-double strlenDistance(const struct slList *item1, const struct slList *item2, void *extraData)
-/* Return the absolute difference in length between the two kids. */
+struct cwaExtraData
+/* Helper data for hacTree clustering of haplotypes by center-weighted alpha distance */
+{
+    int center;    // index from which each point's contribution to distance is to be weighted
+    int len;       // total length of haplotype strings
+    double alpha;  // weighting factor for distance from center
+    struct lm *localMem;
+};
+
+static double cwaDistance(const struct slList *item1, const struct slList *item2, void *extraData)
+/* Center-weighted alpha sequence distance function for hacTree clustering of haplotype seqs */
+// This is inner-loop so I am not doing defensive checks.  Caller must ensure:
+// 1. kids's sequences' lengths are both equal to helper->len
+// 2. 0 <= helper->center <= len
+// 3. 0.0 < helper->alpha <= 1.0
 {
 const struct slName *kid1 = (const struct slName *)item1;
 const struct slName *kid2 = (const struct slName *)item2;
-int diff = strlen(kid1->name) - strlen(kid2->name);
-if (diff < 0)
-    diff = -diff;
-return (double)diff;
+const char *hap1 = kid1->name;
+const char *hap2 = kid2->name;
+struct cwaExtraData *helper = extraData;
+double distance = 0;
+double weight = 1; // start at center: alpha to the 0th power
+int i;
+for (i=helper->center;  i >= 0;  i--)
+    {
+    if (hap1[i] != hap2[i])
+	distance += weight;
+    weight *= helper->alpha;
+    }
+weight = helper->alpha; // start at center+1: alpha to the 1st power
+for (i=helper->center+1;  i < helper->len;  i++)
+    {
+    if (hap1[i] != hap2[i])
+	distance += weight;
+    weight *= helper->alpha;
+    }
+return distance;
 }
 
-struct slList *mergeLines(const struct slList *item1, const struct slList *item2,
-			  void *unusedExtraData)
-/* Make a string that is the average length of the kids. */
+static struct slList *cwaMerge(const struct slList *item1, const struct slList *item2,
+			       void *extraData)
+/* Make a consensus haplotype from two input haplotypes, for hacTree clustering by
+ * center-weighted alpha distance. */
+// This is inner-loop so I am not doing defensive checks.  Caller must ensure that
+// kids's sequences' lengths are both equal to helper->len.
 {
 const struct slName *kid1 = (const struct slName *)item1;
 const struct slName *kid2 = (const struct slName *)item2;
-int len1 = strlen(kid1->name);
-int len2 = strlen(kid2->name);
-int avgLen = (len1 + len2 + 1) / 2;
-const char *longerName = (len1 >= len2) ? kid1->name : kid2->name;
-return (struct slList *)slNameNewN((char *)longerName, avgLen);
+const char *hap1 = kid1->name;
+const char *hap2 = kid2->name;
+struct cwaExtraData *helper = extraData;
+struct slName *consensus = lmSlName(helper->localMem, (char *)hap1);
+int i;
+for (i=0; i < helper->len;  i++)
+    if (hap1[i] != hap2[i])
+	consensus->name[i] = 'N';
+return (struct slList *)consensus;
 }
 
-void hacTreeTestStrlens(char *inFileName, FILE *f, struct lm *localMem)
-/* Read in items from a file and print the resulting clusters. */
+void hacTreeTestHaplos(char *inFileName, FILE *f, struct lm *localMem)
+/* Read in haplotypes of same length from a file and print the resulting clusters. */
 {
 struct slName *sln, *slnList = NULL;
 struct lineFile *lf = lineFileOpen(inFileName, TRUE);
+int len = 0;
 char *line;
 int size;
 while (lineFileNext(lf, &line, &size))
     {
+    if (len == 0)
+	len = size-1;
+    else if (size-1 != len)
+	lineFileAbort(lf, "All lines in input file must have same length (got %d vs. %d)",
+		      size-1, len);
     sln = slNameNewN(line, size);
     slAddHead(&slnList, sln);
     }
 lineFileClose(&lf);
 slReverse(&slnList);
+int center = len / 2;
+double alpha = 0.5;
+struct cwaExtraData helper = { center, len, alpha, localMem };
 struct hacTree *clusters = hacTreeFromItems((struct slList *)slnList, localMem,
-					    strlenDistance, mergeLines, NULL);
+					    cwaDistance, cwaMerge, &helper);
 fputs("Clustering by string length:\n", f);
 printSlNameTree(f, clusters);
 carefulClose(&f);
 }
 
 void hacTreeTest(char *inFileName, char *outFileName)
 /* Read in items from a file and print the resulting clusters. */
 {
 FILE *f = mustOpen(outFileName, "w");
 struct lm *localMem = lmInit(0);
 hacTreeTestInts(f, localMem);
-hacTreeTestStrlens(inFileName, f, localMem);
+hacTreeTestHaplos(inFileName, f, localMem);
 lmCleanup(&localMem);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 3)
     usage();
 hacTreeTest(argv[1], argv[2]);
 return 0;
 }