b4a8c2460a293048399a03b8f35ddfc588cd9f21 angie Wed Aug 10 15:46:02 2011 -0700 Feature #3711 (vcfTabix haplotype sorting display): Draw hierarchical tree in left label area (as much as can fit there). diff --git src/hg/hgTracks/vcfTrack.c src/hg/hgTracks/vcfTrack.c index 2e73653..0a06af2 100644 --- src/hg/hgTracks/vcfTrack.c +++ src/hg/hgTracks/vcfTrack.c @@ -206,31 +206,31 @@ struct hapCluster *cR = (struct hapCluster *)ht->right->itemOrCluster; if (cL->leafCount >= cR->leafCount) { rSetGtHapOrder(ht->left, gtHapOrder, retGtHapEnd); rSetGtHapOrder(ht->right, gtHapOrder, retGtHapEnd); } else { rSetGtHapOrder(ht->right, gtHapOrder, retGtHapEnd); rSetGtHapOrder(ht->left, gtHapOrder, retGtHapEnd); } } } static unsigned short *clusterChroms(const struct vcfFile *vcff, int centerIx, - unsigned short *retGtHapEnd) + unsigned short *retGtHapEnd, struct hacTree **retTree) /* Given a bunch of VCF records with phased genotypes, build up one haplotype string * per chromosome that is the sequence of alleles in all variants (simplified to one base * per variant). Each individual/sample will have two haplotype strings (unless haploid * like Y or male X). Independently cluster the haplotype strings using hacTree with the * center-weighted alpha functions above. Return an array of genotype+haplotype indices * in the order determined by the hacTree, and set retGtHapEnd to its length/end. */ { int len = slCount(vcff->records); // Should alpha depend on len? Should the penalty drop off with distance? Seems like // straight-up exponential will cause the signal to drop to nothing pretty quickly... double alpha = 0.5; struct lm *lm = lmInit(0); struct cwaExtraData helper = { centerIx, len, alpha, lm }; int ploidy = 2; // Assuming diploid genomes here, no XXY, tetraploid etc. int gtCount = vcff->genotypeCount; @@ -287,30 +287,31 @@ // Some array items will have an empty cluster for missing hap2 -- // trim those from the linked list. struct hapCluster *c = hapArray[0]; while (c != NULL && c->next != NULL) { if (c->next->leafCount == 0) c->next = c->next->next; c = c->next; } } } struct hacTree *ht = hacTreeFromItems((struct slList *)(hapArray[0]), lm, cwaDistance, cwaMerge, cwaCmp, &helper); unsigned short *gtHapOrder = needMem(vcff->genotypeCount * 2 * sizeof(unsigned short)); rSetGtHapOrder(ht, gtHapOrder, retGtHapEnd); +*retTree = ht; return gtHapOrder; } INLINE Color pgSnpColor(char *allele) /* Color allele by first base according to pgSnp palette. */ { if (allele[0] == 'A') return revCmplDisp ? MG_MAGENTA : MG_RED; else if (allele[0] == 'C') return revCmplDisp ? darkGreenColor : MG_BLUE; else if (allele[0] == 'G') return revCmplDisp ? MG_BLUE : darkGreenColor; else if (allele[0] == 'T') return revCmplDisp ? MG_RED : MG_MAGENTA; else @@ -354,45 +355,45 @@ rec->alleles[1], rec->alleles[1], gtAltAltCount); if (gtOtherCount > 0) dyStringPrintf(dy, " other:%d", gtOtherCount); // Restore original values of pooled strings. if (revCmplDisp) { for (i=0; i < rec->alleleCount; i++) reverseComplement(rec->alleles[i], strlen(rec->alleles[i])); } return dy->string; } // This is initialized when we start drawing: static Color purple = 0; -static void drawOneRec(struct vcfRecord *rec, unsigned short *gtHapOrder, int gtHapEnd, +static void drawOneRec(struct vcfRecord *rec, unsigned short *gtHapOrder, unsigned short gtHapCount, struct track *tg, struct hvGfx *hvg, int xOff, int yOff, int width, boolean isCenter, boolean colorByRefAlt) /* Draw a stack of genotype bars for this record */ { const double scale = scaleForPixels(width); int x1 = round((double)(rec->chromStart-winStart)*scale) + xOff; int x2 = round((double)(rec->chromEnd-winStart)*scale) + xOff; int w = x2-x1; if (w <= 1) { x1--; w = 3; } -double hapsPerPix = (2 * (double)rec->file->genotypeCount / tg->height); +double hapsPerPix = (double)gtHapCount / (tg->height-1); int pixIx; for (pixIx = 0; pixIx < tg->height; pixIx++) { int gtHapOrderIxStart = round(hapsPerPix * pixIx); int gtHapOrderIxEnd = round(hapsPerPix * (pixIx + 1)); if (gtHapOrderIxEnd == gtHapOrderIxStart) gtHapOrderIxEnd++; int unks = 0, refs = 0, alts = 0; int gtHapOrderIx; for (gtHapOrderIx = gtHapOrderIxStart; gtHapOrderIx < gtHapOrderIxEnd; gtHapOrderIx++) { int gtHapIx = gtHapOrder[gtHapOrderIx]; int hapIx = gtHapIx & 1; int gtIx = gtHapIx >>1; struct vcfGenotype *gt = &(rec->genotypes[gtIx]); @@ -464,72 +465,228 @@ int centerPos = cartInt(cart, cartVar); int winSize = seqEnd - seqStart; if (centerPos > (seqStart - winSize) && centerPos < (seqEnd + winSize)) { int i; struct vcfRecord *rec; for (rec = records, i = 0; rec != NULL; rec = rec->next, i++) if (rec->chromStart >= centerPos) return i; return i-1; } } return defaultIx; } +/* Pixel y offset return type for recursive tree-drawing: */ +enum yRetType + { + yrtMidPoint, + yrtStart, + yrtEnd, + }; + +/* Callback for calculating y (in pixels) for a cluster node: */ +typedef int yFromNodeFunc(const struct slList *itemOrCluster, void *extraData, + enum yRetType yType); + +static int rDrawTreeInLabelArea(struct hacTree *ht, struct hvGfx *hvg, enum yRetType yType, int x, + yFromNodeFunc *yFromNode, void *extraData) +/* Recursively draw the haplotype clustering tree in the left label area. + * Returns pixel height for use at non-leaf levels of tree. */ +{ +const int branchW = 4; +int labelEnd = leftLabelX + leftLabelWidth; +if (yType == yrtStart || yType == yrtEnd) + { + // We're just getting vertical span of a leaf cluster, not drawing any lines. + int yLeft, yRight; + if (ht->left) + yLeft = rDrawTreeInLabelArea(ht->left, hvg, yType, x, yFromNode, extraData); + else + yLeft = yFromNode(ht->itemOrCluster, extraData, yType); + if (ht->right) + yRight = rDrawTreeInLabelArea(ht->right, hvg, yType, x, yFromNode, extraData); + else + yRight = yFromNode(ht->itemOrCluster, extraData, yType); + if (yType == yrtStart) + return min(yLeft, yRight); + else + return max(yLeft, yRight); + } +// Otherwise yType is yrtMidPoint. If we have 2 children, we'll be drawing some lines: +if (ht->left != NULL && ht->right != NULL) + { + int midY; + if (ht->childDistance == 0 || x+(2*branchW) > labelEnd) + { + // Treat this as a leaf cluster. + // Recursing twice is wasteful. Could be avoided if this, and yFromNode, + // returned both yStart and yEnd. However, the time to draw a tree of + // 2188 hap's (1kG phase1 interim) is in the noise, so I consider it + // not worth the effort of refactoring to save a sub-millisecond here. + int yStartLeft = rDrawTreeInLabelArea(ht->left, hvg, yrtStart, x+branchW, + yFromNode, extraData); + int yEndLeft = rDrawTreeInLabelArea(ht->left, hvg, yrtEnd, x+branchW, + yFromNode, extraData); + int yStartRight = rDrawTreeInLabelArea(ht->right, hvg, yrtStart, x+branchW, + yFromNode, extraData); + int yEndRight = rDrawTreeInLabelArea(ht->right, hvg, yrtEnd, x+branchW, + yFromNode, extraData); + int yStart = min(yStartLeft, yStartRight); + int yEnd = max(yEndLeft, yEndRight); + midY = (yStart + yEnd) / 2; + hvGfxLine(hvg, x+branchW-1, yStart, x+branchW-1, yEnd-1, MG_BLACK); + hvGfxLine(hvg, x+branchW, yStart, labelEnd, yStart, MG_BLACK); + hvGfxLine(hvg, x+branchW, yEnd-1, labelEnd, yEnd-1, MG_BLACK); + } + else + { + int leftMid = rDrawTreeInLabelArea(ht->left, hvg, yrtMidPoint, x+branchW, + yFromNode, extraData); + int rightMid = rDrawTreeInLabelArea(ht->right, hvg, yrtMidPoint, x+branchW, + yFromNode, extraData); + midY = (leftMid + rightMid) / 2; + hvGfxLine(hvg, x+branchW-1, leftMid, x+branchW-1, rightMid, MG_BLACK); + } + hvGfxLine(hvg, x, midY, x+branchW-1, midY, MG_BLACK); + return midY; + } +else if (ht->left != NULL) + return rDrawTreeInLabelArea(ht->left, hvg, yType, x, yFromNode, extraData); +else if (ht->right != NULL) + return rDrawTreeInLabelArea(ht->right, hvg, yType, x, yFromNode, extraData); +// Leaf node -- return pixel height. Draw a line if yType is midpoint. +int y = yFromNode(ht->itemOrCluster, extraData, yType); +if (yType == yrtMidPoint && x < labelEnd) + hvGfxLine(hvg, x, y, labelEnd, y, MG_BLACK); +return y; +} + +struct yFromNodeHelper +/* Pre-computed mapping from cluster nodes' gtHapIx to pixel heights. */ + { + unsigned short gtHapCount; + unsigned short *gtHapIxToPxStart; + unsigned short *gtHapIxToPxEnd; + }; + +void initYFromNodeHelper(struct yFromNodeHelper *helper, int yOff, int height, + unsigned short gtHapCount, unsigned short *gtHapOrder) +/* Build a mapping of genotype and haplotype to pixel y coords. */ +{ +helper->gtHapCount = gtHapCount; +helper->gtHapIxToPxStart = needMem(gtHapCount * sizeof(unsigned short)); +helper->gtHapIxToPxEnd = needMem(gtHapCount * sizeof(unsigned short)); +double pxPerHap = (double)height / gtHapCount; +int i; +for (i = 0; i < gtHapCount; i++) + { + int yStart = round(i * pxPerHap); + int yEnd = round((i+1) * pxPerHap); + if (yEnd == yStart) + yEnd++; + int gtHapIx = gtHapOrder[i]; + helper->gtHapIxToPxStart[gtHapIx] = yOff + yStart; + helper->gtHapIxToPxEnd[gtHapIx] = yOff + yEnd; + } +} + +static int yFromHapNode(const struct slList *itemOrCluster, void *extraData, + enum yRetType yType) +/* Extract the gtHapIx from hapCluster (hacTree node item), find out its relative order + * and translate that to a pixel height. */ +{ +unsigned short gtHapIx = ((const struct hapCluster *)itemOrCluster)->gtHapIx; +struct yFromNodeHelper *helper = extraData; +if (gtHapIx >= helper->gtHapCount) + errAbort("vcfTrack.c: gtHapIx %d out of range [0,%d).", gtHapIx, helper->gtHapCount); +int y; +if (yType == yrtStart) + y = helper->gtHapIxToPxStart[gtHapIx]; +else if (yType == yrtEnd) + y = helper->gtHapIxToPxEnd[gtHapIx]; +else + y = (helper->gtHapIxToPxStart[gtHapIx] + helper->gtHapIxToPxEnd[gtHapIx]) / 2; +return y; +} + +static void drawTreeInLabelArea(struct hacTree *ht, struct hvGfx *hvg, int yOff, int height, + unsigned short gtHapCount, unsigned short *gtHapOrder) +/* Draw the haplotype clustering in the left label area (as much as fits there). */ +{ +// Figure out which hvg to use, save current clipping, and clip to left label coords: +struct hvGfx *hvgLL = (hvgSide != NULL) ? hvgSide : hvg; +int clipXBak, clipYBak, clipWidthBak, clipHeightBak; +hvGfxGetClip(hvgLL, &clipXBak, &clipYBak, &clipWidthBak, &clipHeightBak); +hvGfxUnclip(hvgLL); +hvGfxSetClip(hvgLL, leftLabelX, yOff, leftLabelWidth, height); +// Draw the tree: +int x = leftLabelX; +struct yFromNodeHelper helper = {0, NULL, NULL}; +initYFromNodeHelper(&helper, yOff, height-1, gtHapCount, gtHapOrder); +(void)rDrawTreeInLabelArea(ht, hvgLL, yrtMidPoint, x, yFromHapNode, &helper); +// Restore the prior clipping: +hvGfxUnclip(hvgLL); +hvGfxSetClip(hvgLL, clipXBak, clipYBak, clipWidthBak, clipHeightBak); +} + static void vcfHapClusterDraw(struct track *tg, int seqStart, int seqEnd, struct hvGfx *hvg, int xOff, int yOff, int width, MgFont *font, Color color, enum trackVisibility vis) /* Split samples' chromosomes (haplotypes), cluster them by center-weighted * alpha similarity, and draw in the order determined by clustering. */ { const struct vcfFile *vcff = tg->extraUiData; if (vcff->records == NULL) return; purple = hvGfxFindColorIx(hvg, 0x99, 0x00, 0xcc); boolean compositeLevel = isNameAtCompositeLevel(tg->tdb, tg->tdb->track); char *colorBy = cartUsualStringClosestToHome(cart, tg->tdb, compositeLevel, VCF_HAP_COLORBY_VAR, VCF_HAP_COLORBY_REFALT); boolean colorByRefAlt = sameString(colorBy, VCF_HAP_COLORBY_REFALT); -unsigned short gtHapEnd = 0; +unsigned short gtHapCount = 0; int ix, centerIx = getCenterVariantIx(tg, seqStart, seqEnd, vcff->records); -unsigned short *gtHapOrder = clusterChroms(vcff, centerIx, >HapEnd); +struct hacTree *ht = NULL; +unsigned short *gtHapOrder = clusterChroms(vcff, centerIx, >HapCount, &ht); struct vcfRecord *rec, *centerRec = NULL; for (rec = vcff->records, ix=0; rec != NULL; rec = rec->next, ix++) { if (ix == centerIx) centerRec = rec; else - drawOneRec(rec, gtHapOrder, gtHapEnd, tg, hvg, xOff, yOff, width, FALSE, colorByRefAlt); + drawOneRec(rec, gtHapOrder, gtHapCount, tg, hvg, xOff, yOff, width, FALSE, colorByRefAlt); } // Draw the center rec on top, outlined with black lines, to make sure it is very visible: -drawOneRec(centerRec, gtHapOrder, gtHapEnd, tg, hvg, xOff, yOff, width, TRUE, colorByRefAlt); +drawOneRec(centerRec, gtHapOrder, gtHapCount, tg, hvg, xOff, yOff, width, TRUE, colorByRefAlt); +// Draw as much of the tree as can fit in the left label area: +drawTreeInLabelArea(ht, hvg, yOff, tg->height, gtHapCount, gtHapOrder); } static int vcfHapClusterTotalHeight(struct track *tg, enum trackVisibility vis) /* Return height of haplotype graph (2 * #samples * lineHeight); * 2 because we're assuming diploid genomes here, no XXY, tetraploid etc. */ { // Should we make it single-height when on chrY? const struct vcfFile *vcff = tg->extraUiData; if (vcff->records == NULL) return 0; int ploidy = sameString(chromName, "chrY") ? 1 : 2; int simpleHeight = ploidy * vcff->genotypeCount * tg->lineHeight; int defaultHeight = min(simpleHeight, VCF_DEFAULT_HAP_HEIGHT); int cartHeight = cartOrTdbInt(cart, tg->tdb, VCF_HAP_HEIGHT_VAR, defaultHeight); -tg->height = min(cartHeight, maximumTrackHeight(tg)); +tg->height = min(cartHeight+1, maximumTrackHeight(tg)); return tg->height; } static char *vcfHapClusterTrackName(struct track *tg, void *item) /* If someone asks for itemName/mapItemName, just send name of track like wiggle. */ { return tg->track; } static void vcfHapClusterOverloadMethods(struct track *tg, struct vcfFile *vcff) /* If we confirm at load time that we can draw a haplotype graph, use * this to overwrite the methods for the rest of execution: */ { tg->heightPer = (tg->visibility == tvSquish) ? (tl.fontHeight/4) : (tl.fontHeight / 2); tg->lineHeight = tg->heightPer + 1;