fb0b9a237e0c6936a069af0afe7107db15f822d6
angie
  Wed May 4 15:08:28 2011 -0700
Feature #3711 (center-weighted alpha haplo sorting for vcfTabix):Performance improvement for hacTree: if caller passes in comparison
function, then pre-sort the items and pre-cluster adjacent identical
items before generating pairs.  When many of the inputs are identical,
this greatly reduces the number of pairs that the main clustering
algorithm starts with.  For example, a 1000 Genomes file has
genotypes for 1360 people (2720 haplotypes), and starting with
all pairs of 2720 haps was impossibly slow for hgTracks.  However,
in regions of a few tens of thousands of bases and a few tens of
variants, in practice there's usually less than 100 distinct
haplotypes, which makes it possible to cluster in tenths of seconds
instead of timing out.  The pre-clustering also makes nice balanced
trees; the main clustering step still seems prone to chaining to me,
so there's probably still more room for improvement there.

diff --git src/lib/tests/hacTreeTest.c src/lib/tests/hacTreeTest.c
index 2375f04..536038a 100644
--- src/lib/tests/hacTreeTest.c
+++ src/lib/tests/hacTreeTest.c
@@ -84,31 +84,31 @@
 void hacTreeTestInts(FILE *f, struct lm *localMem)
 /* Cluster a list of integers (use doubles to get more precise cluster points). */
 {
 struct slDouble *sldList = NULL;
 slAddHead(&sldList, slDoubleNew(0));
 slAddHead(&sldList, slDoubleNew(5));
 slAddHead(&sldList, slDoubleNew(7));
 slAddHead(&sldList, slDoubleNew(1));
 slAddHead(&sldList, slDoubleNew(-8));
 slAddHead(&sldList, slDoubleNew(12));
 slAddHead(&sldList, slDoubleNew(6));
 slAddHead(&sldList, slDoubleNew(-6));
 slAddHead(&sldList, slDoubleNew(-3));
 slAddHead(&sldList, slDoubleNew(8));
 struct hacTree *clusters = hacTreeFromItems((struct slList *)sldList, localMem,
-					    slDoubleDistance, slDoubleMidpoint, NULL);
+					    slDoubleDistance, slDoubleMidpoint, NULL, NULL);
 fputs("Clustering by numeric value:\n", f);
 printSlDoubleTree(f, clusters);
 }
 
 
 //-------------------------------------------------------------------
 // Center-weighted alpha clustering of haplotypes -- see Redmine #3711, #2823 note 7
 
 static void rPrintSlNameTree(FILE *f, struct hacTree *tree, int level)
 /* Recursively print out cluster as nested-parens with {}'s around leaf nodes. */
 {
 char *contents = ((struct slName *)(tree->itemOrCluster))->name;
 int i;
 for (i=0;  i < level;  i++)
     fputc(' ', f);
@@ -189,56 +189,64 @@
 // kids's sequences' lengths are both equal to helper->len.
 {
 const struct slName *kid1 = (const struct slName *)item1;
 const struct slName *kid2 = (const struct slName *)item2;
 const char *hap1 = kid1->name;
 const char *hap2 = kid2->name;
 struct cwaExtraData *helper = extraData;
 struct slName *consensus = lmSlName(helper->localMem, (char *)hap1);
 int i;
 for (i=0; i < helper->len;  i++)
     if (hap1[i] != hap2[i])
 	consensus->name[i] = 'N';
 return (struct slList *)consensus;
 }
 
+static int cwaCmp(const struct slList *item1, const struct slList *item2, void *extraData)
+/* Use strcmp on haplotype strings. */
+{
+const struct slName *kid1 = (const struct slName *)item1;
+const struct slName *kid2 = (const struct slName *)item2;
+return strcmp(kid1->name, kid2->name);
+}
+
 void hacTreeTestHaplos(char *inFileName, FILE *f, struct lm *localMem)
 /* Read in haplotypes of same length from a file and print the resulting clusters. */
 {
 struct slName *sln, *slnList = NULL;
 struct lineFile *lf = lineFileOpen(inFileName, TRUE);
 int len = 0;
 char *line;
 int size;
 while (lineFileNext(lf, &line, &size))
     {
     if (len == 0)
 	len = size-1;
     else if (size-1 != len)
 	lineFileAbort(lf, "All lines in input file must have same length (got %d vs. %d)",
 		      size-1, len);
     sln = slNameNewN(line, size);
     slAddHead(&slnList, sln);
     }
 lineFileClose(&lf);
 slReverse(&slnList);
 int center = len / 2;
 double alpha = 0.5;
 struct cwaExtraData helper = { center, len, alpha, localMem };
 struct hacTree *clusters = hacTreeFromItems((struct slList *)slnList, localMem,
-					    cwaDistance, cwaMerge, &helper);
-fputs("Clustering by string length:\n", f);
+					    cwaDistance, cwaMerge, cwaCmp, &helper);
+fputs("Clustering by haplotype similarity:\n", f);
 printSlNameTree(f, clusters);
 carefulClose(&f);
 }
 
 void hacTreeTest(char *inFileName, char *outFileName)
 /* Read in items from a file and print the resulting clusters. */
 {
 FILE *f = mustOpen(outFileName, "w");
 struct lm *localMem = lmInit(0);
 hacTreeTestInts(f, localMem);
 hacTreeTestHaplos(inFileName, f, localMem);
 lmCleanup(&localMem);
 }
 
 int main(int argc, char *argv[])