fb0b9a237e0c6936a069af0afe7107db15f822d6 angie Wed May 4 15:08:28 2011 -0700 Feature #3711 (center-weighted alpha haplo sorting for vcfTabix):Performance improvement for hacTree: if caller passes in comparison function, then pre-sort the items and pre-cluster adjacent identical items before generating pairs. When many of the inputs are identical, this greatly reduces the number of pairs that the main clustering algorithm starts with. For example, a 1000 Genomes file has genotypes for 1360 people (2720 haplotypes), and starting with all pairs of 2720 haps was impossibly slow for hgTracks. However, in regions of a few tens of thousands of bases and a few tens of variants, in practice there's usually less than 100 distinct haplotypes, which makes it possible to cluster in tenths of seconds instead of timing out. The pre-clustering also makes nice balanced trees; the main clustering step still seems prone to chaining to me, so there's probably still more room for improvement there. diff --git src/lib/tests/hacTreeTest.c src/lib/tests/hacTreeTest.c index 2375f04..536038a 100644 --- src/lib/tests/hacTreeTest.c +++ src/lib/tests/hacTreeTest.c @@ -84,31 +84,31 @@ void hacTreeTestInts(FILE *f, struct lm *localMem) /* Cluster a list of integers (use doubles to get more precise cluster points). */ { struct slDouble *sldList = NULL; slAddHead(&sldList, slDoubleNew(0)); slAddHead(&sldList, slDoubleNew(5)); slAddHead(&sldList, slDoubleNew(7)); slAddHead(&sldList, slDoubleNew(1)); slAddHead(&sldList, slDoubleNew(-8)); slAddHead(&sldList, slDoubleNew(12)); slAddHead(&sldList, slDoubleNew(6)); slAddHead(&sldList, slDoubleNew(-6)); slAddHead(&sldList, slDoubleNew(-3)); slAddHead(&sldList, slDoubleNew(8)); struct hacTree *clusters = hacTreeFromItems((struct slList *)sldList, localMem, - slDoubleDistance, slDoubleMidpoint, NULL); + slDoubleDistance, slDoubleMidpoint, NULL, NULL); fputs("Clustering by numeric value:\n", f); printSlDoubleTree(f, clusters); } //------------------------------------------------------------------- // Center-weighted alpha clustering of haplotypes -- see Redmine #3711, #2823 note 7 static void rPrintSlNameTree(FILE *f, struct hacTree *tree, int level) /* Recursively print out cluster as nested-parens with {}'s around leaf nodes. */ { char *contents = ((struct slName *)(tree->itemOrCluster))->name; int i; for (i=0; i < level; i++) fputc(' ', f); @@ -189,56 +189,64 @@ // kids's sequences' lengths are both equal to helper->len. { const struct slName *kid1 = (const struct slName *)item1; const struct slName *kid2 = (const struct slName *)item2; const char *hap1 = kid1->name; const char *hap2 = kid2->name; struct cwaExtraData *helper = extraData; struct slName *consensus = lmSlName(helper->localMem, (char *)hap1); int i; for (i=0; i < helper->len; i++) if (hap1[i] != hap2[i]) consensus->name[i] = 'N'; return (struct slList *)consensus; } +static int cwaCmp(const struct slList *item1, const struct slList *item2, void *extraData) +/* Use strcmp on haplotype strings. */ +{ +const struct slName *kid1 = (const struct slName *)item1; +const struct slName *kid2 = (const struct slName *)item2; +return strcmp(kid1->name, kid2->name); +} + void hacTreeTestHaplos(char *inFileName, FILE *f, struct lm *localMem) /* Read in haplotypes of same length from a file and print the resulting clusters. */ { struct slName *sln, *slnList = NULL; struct lineFile *lf = lineFileOpen(inFileName, TRUE); int len = 0; char *line; int size; while (lineFileNext(lf, &line, &size)) { if (len == 0) len = size-1; else if (size-1 != len) lineFileAbort(lf, "All lines in input file must have same length (got %d vs. %d)", size-1, len); sln = slNameNewN(line, size); slAddHead(&slnList, sln); } lineFileClose(&lf); slReverse(&slnList); int center = len / 2; double alpha = 0.5; struct cwaExtraData helper = { center, len, alpha, localMem }; struct hacTree *clusters = hacTreeFromItems((struct slList *)slnList, localMem, - cwaDistance, cwaMerge, &helper); -fputs("Clustering by string length:\n", f); + cwaDistance, cwaMerge, cwaCmp, &helper); +fputs("Clustering by haplotype similarity:\n", f); printSlNameTree(f, clusters); carefulClose(&f); } void hacTreeTest(char *inFileName, char *outFileName) /* Read in items from a file and print the resulting clusters. */ { FILE *f = mustOpen(outFileName, "w"); struct lm *localMem = lmInit(0); hacTreeTestInts(f, localMem); hacTreeTestHaplos(inFileName, f, localMem); lmCleanup(&localMem); } int main(int argc, char *argv[])