fb0b9a237e0c6936a069af0afe7107db15f822d6 angie Wed May 4 15:08:28 2011 -0700 Feature #3711 (center-weighted alpha haplo sorting for vcfTabix):Performance improvement for hacTree: if caller passes in comparison function, then pre-sort the items and pre-cluster adjacent identical items before generating pairs. When many of the inputs are identical, this greatly reduces the number of pairs that the main clustering algorithm starts with. For example, a 1000 Genomes file has genotypes for 1360 people (2720 haplotypes), and starting with all pairs of 2720 haps was impossibly slow for hgTracks. However, in regions of a few tens of thousands of bases and a few tens of variants, in practice there's usually less than 100 distinct haplotypes, which makes it possible to cluster in tenths of seconds instead of timing out. The pre-clustering also makes nice balanced trees; the main clustering step still seems prone to chaining to me, so there's probably still more room for improvement there. diff --git src/hg/hgTracks/vcfTrack.c src/hg/hgTracks/vcfTrack.c index c79ad82..9218d87 100644 --- src/hg/hgTracks/vcfTrack.c +++ src/hg/hgTracks/vcfTrack.c @@ -8,30 +8,31 @@ #include "hdb.h" #include "hgTracks.h" #include "pgSnp.h" #include "trashDir.h" #include "vcf.h" #if (defined USE_TABIX && defined KNETFILE_HOOKS) #include "knetUdc.h" #include "udc.h" #endif//def USE_TABIX && KNETFILE_HOOKS #ifdef USE_TABIX //#*** TODO: use trackDb/cart setting or something static boolean boringBed = FALSE; static boolean doHapClusterDisplay = TRUE; +static boolean colorHapByRefAlt = TRUE; static struct bed4 *vcfFileToBed4(struct vcfFile *vcff) /* Convert vcff's records to bed4; don't free vcff until you're done with bed4 * because bed4 contains pointers into vcff's records' chrom and name. */ { struct bed4 *bedList = NULL; struct vcfRecord *rec; for (rec = vcff->records; rec != NULL; rec = rec->next) { struct bed4 *bed; AllocVar(bed); bed->chrom = rec->chrom; bed->chromStart = rec->chromStart; bed->chromEnd = rec->chromEnd; bed->name = rec->name; @@ -128,60 +129,30 @@ { float qual = rec->infoElements[i].values[0].datFloat; dyStringPrintf(dy, "%.1f", qual); int j; for (j = 0; j < altCount; j++) dyStringPrintf(dy, ",%.1f", qual); break; } pgs->alleleScores = cloneStringZ(dy->string, dy->stringSize+1); slAddHead(&pgsList, pgs); } slReverse(&pgsList); return pgsList; } -INLINE char *hapIxToAllele(int hapIx, char *refAllele, char *altAlleles[]) -/* Look up allele by index into reference allele and alternate allele(s). */ -{ -return (hapIx == 0) ? refAllele : altAlleles[hapIx-1]; -} - -INLINE Color colorFromGt(struct vcfGenotype *gt, int ploidIx, char *refAllele, - char *altAlleles[], int altCount, boolean grayUnphasedHet) -/* Color allele by base. */ -{ -int hapIx = ploidIx ? gt->hapIxB : gt->hapIxA; -char *allele = hapIxToAllele(hapIx, refAllele, altAlleles); -if (gt->isHaploid && hapIx > 0) - return shadesOfGray[5]; -if (grayUnphasedHet && !gt->isPhased && gt->hapIxA != gt->hapIxB) - return shadesOfGray[5]; -// Copying pgSnp color scheme here, using first base of allele which is not ideal for multibase -// but allows us to simplify it to 5 colors: -else if (allele[0] == 'A') - return MG_RED; -else if (allele[0] == 'C') - return MG_BLUE; -else if (allele[0] == 'G') - return darkGreenColor; -else if (allele[0] == 'T') - return MG_MAGENTA; -else - return shadesOfGray[5]; -} - // Center-weighted alpha clustering of haplotypes -- see Redmine #3711, #2823 note 7 // It might be nice to use an allele-frequency representation here instead of [ACGTN] strings // with "N" for missing info or differences, but keep it simple. struct cwaExtraData /* Helper data for hacTree clustering of haplotypes by center-weighted alpha distance */ { int center; // index from which each point's contribution to distance is to be weighted int len; // total length of haplotype strings double alpha; // weighting factor for distance from center struct lm *localMem; }; // This is the representation of a cluster of up to 65,535 haplotypes of equal length, @@ -254,40 +225,92 @@ const struct hapCluster *kid1 = (const struct hapCluster *)item1; const struct hapCluster *kid2 = (const struct hapCluster *)item2; struct cwaExtraData *helper = extraData; struct hapCluster *consensus = lmHapCluster(helper); consensus->leafCount = kid1->leafCount + kid2->leafCount; consensus->gtHapIx = kid1->gtHapIx; int i; for (i=0; i < helper->len; i++) { consensus->refCounts[i] = kid1->refCounts[i] + kid2->refCounts[i]; consensus->unkCounts[i] = kid1->unkCounts[i] + kid2->unkCounts[i]; } return (struct slList *)consensus; } -static struct hacTree *clusterChroms(const struct vcfFile *vcff) +INLINE void hapClusterToString(const struct hapCluster *c, struct dyString *dy, int len) +/* Write a text representation of hapCluster's alleles into dy. */ +{ +dyStringClear(dy); +int i; +for (i=0; i < len; i++) + dyStringAppendC(dy, (isRef(c, i) ? '0': '1')); +} + +static int cwaCmp(const struct slList *item1, const struct slList *item2, void *extraData) +/* Convert hapCluster to allele strings for easy comparison by strcmp. */ +{ +const struct hapCluster *c1 = (const struct hapCluster *)item1; +const struct hapCluster *c2 = (const struct hapCluster *)item2; +struct cwaExtraData *helper = extraData; +static struct dyString *dy1 = NULL, *dy2 = NULL; +if (dy1 == NULL) + { + dy1 = dyStringNew(0); + dy2 = dyStringNew(0); + } +hapClusterToString(c1, dy1, helper->len); +hapClusterToString(c2, dy2, helper->len); +return strcmp(dy1->string, dy2->string); +} + +void rSetGtHapOrder(struct hacTree *ht, unsigned short *gtHapOrder, unsigned short *retGtHapEnd) +/* Traverse hacTree and build an ordered array of genotype + haplotype indices. */ +{ +if (ht->left == NULL && ht->right == NULL) + { + struct hapCluster *c = (struct hapCluster *)ht->itemOrCluster; + gtHapOrder[(*retGtHapEnd)++] = c->gtHapIx; + } +else + { + struct hapCluster *cL = (struct hapCluster *)ht->left->itemOrCluster; + struct hapCluster *cR = (struct hapCluster *)ht->right->itemOrCluster; + if (cL->leafCount >= cR->leafCount) + { + rSetGtHapOrder(ht->left, gtHapOrder, retGtHapEnd); + rSetGtHapOrder(ht->right, gtHapOrder, retGtHapEnd); + } + else + { + rSetGtHapOrder(ht->right, gtHapOrder, retGtHapEnd); + rSetGtHapOrder(ht->left, gtHapOrder, retGtHapEnd); + } + } +} + +static unsigned short *clusterChroms(const struct vcfFile *vcff, unsigned short *retGtHapEnd) /* Given a bunch of VCF records with phased genotypes, build up one haplotype string * per chromosome that is the sequence of alleles in all variants (simplified to one base * per variant). Each individual/sample will have two haplotype strings (unless haploid * like Y or male X). Independently cluster the haplotype strings using hacTree with the - * center-weighted alpha functions above. */ + * center-weighted alpha functions above. Return an array of genotype+haplotype indices + * in the order determined by the hacTree, and set retGtHapEnd to its length/end. */ { int len = slCount(vcff->records); -// TODO: make this settable by user right-click or some other means of choosing a specific -// variant or position: +// Use the median variant in the window as the center; would be even nicer to allow +// the user to choose a variant (or position) to use as center: int center = len / 2; // Should alpha depend on len? Should the penalty drop off with distance? Seems like // straight-up exponential will cause the signal to drop to nothing pretty quickly... double alpha = 0.5; struct lm *lm = lmInit(0); struct cwaExtraData helper = { center, len, alpha, lm }; int ploidy = 2; // Assuming diploid genomes here, no XXY, tetraploid etc. int gtCount = vcff->genotypeCount; // Make an slList of hapClusters, but allocate in a big block so I can use // array indexing. struct hapCluster **hapArray = lmAlloc(lm, sizeof(struct hapCluster *) * gtCount * ploidy); int i; for (i=0; i < ploidy * gtCount; i++) { hapArray[i] = lmHapCluster(&helper); @@ -333,121 +356,98 @@ } } if (haveHaploid) { // Some array items will have an empty cluster for missing hap2 -- // trim those from the linked list. struct hapCluster *c = hapArray[0]; while (c != NULL && c->next != NULL) { if (c->next->leafCount == 0) c->next = c->next->next; c = c->next; } } } -return hacTreeFromItems((struct slList *)(hapArray[0]), lm, cwaDistance, cwaMerge, &helper); +struct hacTree *ht = hacTreeFromItems((struct slList *)(hapArray[0]), lm, + cwaDistance, cwaMerge, cwaCmp, &helper); +unsigned short *gtHapOrder = needMem(vcff->genotypeCount * 2 * sizeof(unsigned short)); +rSetGtHapOrder(ht, gtHapOrder, retGtHapEnd); +return gtHapOrder; } -INLINE int drawOneHap(struct vcfGenotype *gt, int hapIx, - char *ref, char *altAlleles[], int altCount, - struct hvGfx *hvg, int x1, int y, int w, int itemHeight, int lineHeight) -/* Draw a base-colored box for genotype[hapIx]. Return the new y offset. */ +INLINE char *hapIxToAllele(int hapIx, char *refAllele, char *altAlleles[]) +/* Look up allele by index into reference allele and alternate allele(s). */ { -Color color = colorFromGt(gt, hapIx, ref, altAlleles, altCount, TRUE); -hvGfxBox(hvg, x1, y, w, itemHeight+1, color); -y += itemHeight+1; -return y; +return (hapIx == 0) ? refAllele : altAlleles[hapIx-1]; } - -#ifdef DEBUG -static void rPrintHapClusterTree(FILE *f, struct hacTree *tree, int len, int level) -/* Recursively print out cluster as nested-parens with {}'s around leaf nodes. */ -{ -static struct dyString *dy = NULL; -if (dy == NULL) - dy = dyStringNew(0); -dyStringClear(dy); -struct hapCluster *c = (struct hapCluster *)(tree->itemOrCluster); -int i; -for (i=0; i < len; i++) - dyStringAppendC(dy, isRef(c, i) ? '0' : '1'); -for (i=0; i < level; i++) - fputc('0'+i, f); -if (tree->left == NULL && tree->right == NULL) +INLINE Color colorFromGt(struct vcfGenotype *gt, int ploidIx, char *refAllele, + char *altAlleles[], int altCount, boolean grayUnphasedHet) +/* Color allele by base. */ { - fprintf(f, "{%s}", dy->string); - return; - } -else if (tree->left == NULL || tree->right == NULL) - errAbort("\nHow did we get a node with one NULL kid??"); -fprintf(f, "(%s:%d:\n", dy->string, (int)(tree->childDistance)); -rPrintHapClusterTree(f, tree->left, len, level+1); -fputs(",\n", f); -rPrintHapClusterTree(f, tree->right, len, level+1); -fputc('\n', f); -for (i=0; i < level; i++) - fputc('0'+i, f); -fputs(")", f); +int hapIx = ploidIx ? gt->hapIxB : gt->hapIxA; +char *allele = hapIxToAllele(hapIx, refAllele, altAlleles); +if (gt->isHaploid && hapIx > 0) + return shadesOfGray[5]; +if (grayUnphasedHet && !gt->isPhased && gt->hapIxA != gt->hapIxB) + return shadesOfGray[5]; +// Copying pgSnp color scheme here, using first base of allele which is not ideal for multibase +// but allows us to simplify it to 5 colors: +else if (allele[0] == 'A') + return MG_RED; +else if (allele[0] == 'C') + return MG_BLUE; +else if (allele[0] == 'G') + return darkGreenColor; +else if (allele[0] == 'T') + return MG_MAGENTA; +else + return shadesOfGray[5]; } -void printHapClusterTree(FILE *f, struct hacTree *tree, int len) -/* Print out cluster as nested-parens with {}'s around leaf nodes. */ +INLINE Color colorFromRefAlt(struct vcfGenotype *gt, int hapIx, boolean grayUnphasedHet) +/* Color allele red for alternate allele, blue for reference allele. */ { -if (tree == NULL) - { - fputs("Empty tree.\n", f); - return; - } -rPrintHapClusterTree(f, tree, len, 0); -fputc('\n', f); +if (grayUnphasedHet && !gt->isPhased && gt->hapIxA != gt->hapIxB) + return shadesOfGray[5]; +int alIx = hapIx ? gt->hapIxB : gt->hapIxA; +return alIx ? MG_RED : MG_BLUE; } -#endif//def DEBUG -void rSetGtHapOrder(struct hacTree *ht, unsigned short *gtHapOrder, unsigned short *retGtHapEnd) -/* Traverse hacTree and build an ordered array of genotype + haplotype indices. */ -{ -if (ht->left == NULL && ht->right == NULL) - { - struct hapCluster *c = (struct hapCluster *)ht->itemOrCluster; - gtHapOrder[(*retGtHapEnd)++] = c->gtHapIx; - } -else + +INLINE int drawOneHap(struct vcfGenotype *gt, int hapIx, + char *ref, char *altAlleles[], int altCount, + struct hvGfx *hvg, int x1, int y, int w, int itemHeight, int lineHeight) +/* Draw a base-colored box for genotype[hapIx]. Return the new y offset. */ { - rSetGtHapOrder(ht->left, gtHapOrder, retGtHapEnd); - rSetGtHapOrder(ht->right, gtHapOrder, retGtHapEnd); - } +Color color = colorHapByRefAlt ? colorFromRefAlt(gt, hapIx, TRUE) : + colorFromGt(gt, hapIx, ref, altAlleles, altCount, TRUE); +hvGfxBox(hvg, x1, y, w, itemHeight+1, color); +y += itemHeight+1; +return y; } static void vcfHapClusterDraw(struct track *tg, int seqStart, int seqEnd, struct hvGfx *hvg, int xOff, int yOff, int width, MgFont *font, Color color, enum trackVisibility vis) /* Split samples' chromosomes (haplotypes), cluster them by center-weighted * alpha similarity, and draw in the order determined by clustering. */ { const struct vcfFile *vcff = tg->extraUiData; -struct hacTree *ht = clusterChroms(vcff); -#ifdef DEBUG -puts("
"); -printHapClusterTree(stdout, ht, slCount(vcff->records)); -puts(""); -#endif//def DEBUG -// the following 4 lines should probably be moved into clusterChroms. -unsigned short *gtHapOrder = needMem(vcff->genotypeCount * 2 * sizeof(unsigned short)); unsigned short gtHapEnd = 0; -rSetGtHapOrder(ht, gtHapOrder, >HapEnd); +unsigned short *gtHapOrder = clusterChroms(vcff, >HapEnd); struct dyString *tmp = dyStringNew(0); struct vcfRecord *rec; const int lineHeight = tg->lineHeight; const int itemHeight = tg->heightPer; const double scale = scaleForPixels(width); for (rec = vcff->records; rec != NULL; rec = rec->next) { dyStringClear(tmp); dyStringAppend(tmp, rec->alt); char *altAlleles[256]; int altCount = chopCommas(tmp->string, altAlleles); int x1 = round((double)(rec->chromStart-winStart)*scale) + xOff; int x2 = round((double)(rec->chromEnd-winStart)*scale) + xOff; int w = x2-x1; if (w < 1) @@ -520,31 +520,31 @@ vcff = vcfTabixFileMayOpen(fileOrUrl, chromName, winStart, winEnd, vcfMaxErr); } errCatchEnd(errCatch); if (errCatch->gotError) { if (isNotEmpty(errCatch->message->string)) tg->networkErrMsg = cloneString(errCatch->message->string); tg->drawItems = bigDrawWarning; tg->totalHeight = bigWarnTotalHeight; } errCatchFree(&errCatch); if (vcff != NULL) { if (boringBed) tg->items = vcfFileToBed4(vcff); - else if (doHapClusterDisplay && vcff->genotypeCount > 0 && vcff->genotypeCount < 100 && + else if (doHapClusterDisplay && vcff->genotypeCount > 0 && vcff->genotypeCount < 3000 && (tg->visibility == tvPack || tg->visibility == tvSquish)) vcfHapClusterOverloadMethods(tg, vcff); else { tg->items = vcfFileToPgSnp(vcff); /* base coloring/display decision on count of items */ tg->customInt = slCount(tg->items); } // Don't vcfFileFree here -- we are using its string pointers! } } void vcfTabixMethods(struct track *track) /* Methods for VCF + tabix files. */ {