0a9372ad1d6c6498fd3818a91edb65d932d890d9 angie Thu Apr 2 11:37:11 2020 -0700 VCF: new option to display haplotypes ordered by a trackDb-specified Newick tree file. Tree is drawn in left label area. refs #25278 diff --git src/hg/hgTracks/vcfTrack.c src/hg/hgTracks/vcfTrack.c index 09192cd..21be9fc 100644 --- src/hg/hgTracks/vcfTrack.c +++ src/hg/hgTracks/vcfTrack.c @@ -1,29 +1,30 @@ /* vcfTrack -- handlers for Variant Call Format data. */ /* Copyright (C) 2014 The Regents of the University of California * See README in this or parent directory for licensing information. */ #include "common.h" #include "bigWarn.h" #include "dystring.h" #include "errCatch.h" #include "hacTree.h" #include "hdb.h" #include "hgColors.h" #include "hgTracks.h" #include "pgSnp.h" +#include "phyloTree.h" #include "trashDir.h" #include "vcf.h" #include "vcfUi.h" #include "knetUdc.h" #include "udc.h" static boolean getMinQual(struct trackDb *tdb, double *retMinQual) /* Return TRUE and set retMinQual if cart contains minimum QUAL filter */ { if (cartOrTdbBoolean(cart, tdb, VCF_APPLY_MIN_QUAL_VAR, VCF_DEFAULT_APPLY_MIN_QUAL)) { if (retMinQual != NULL) *retMinQual = cartOrTdbDouble(cart, tdb, VCF_MIN_QUAL_VAR, VCF_DEFAULT_MIN_QUAL); return TRUE; @@ -1019,59 +1020,71 @@ static enum hapColorMode getColorMode(struct trackDb *tdb) /* Get the hap-cluster coloring mode from cart & tdb. */ { enum hapColorMode colorMode = altOnlyMode; char *colorBy = cartOrTdbString(cart, tdb, VCF_HAP_COLORBY_VAR, VCF_DEFAULT_HAP_COLORBY); if (sameString(colorBy, VCF_HAP_COLORBY_ALTONLY)) colorMode = altOnlyMode; else if (sameString(colorBy, VCF_HAP_COLORBY_REFALT)) colorMode = refAltMode; else if (sameString(colorBy, VCF_HAP_COLORBY_BASE)) colorMode = baseMode; return colorMode; } +static boolean vcfHapClusterDrawInit(struct track *tg, struct vcfFile *vcff, struct hvGfx *hvg, + enum hapColorMode *retHapColorMode) +/* Parse vcff's genotypes and get ready to draw haplotypes. Return FALSE if nothing to draw. */ +{ +if (vcff->records == NULL) + return FALSE; +undefYellow = hvGfxFindRgb(hvg, &undefinedYellowColor); +if (retHapColorMode) + *retHapColorMode = getColorMode(tg->tdb); +pushWarnHandler(ignoreEm); +struct vcfRecord *rec; +for (rec = vcff->records; rec != NULL; rec = rec->next) + vcfParseGenotypes(rec); +popWarnHandler(); +return TRUE; +} + static void vcfHapClusterDraw(struct track *tg, int seqStart, int seqEnd, struct hvGfx *hvg, int xOff, int yOff, int width, MgFont *font, Color color, enum trackVisibility vis) /* Split samples' chromosomes (haplotypes), cluster them by center-weighted * alpha similarity, and draw in the order determined by clustering. */ { struct vcfFile *vcff = tg->extraUiData; -if (vcff->records == NULL) +enum hapColorMode colorMode; +if (!vcfHapClusterDrawInit(tg, vcff, hvg, &colorMode)) return; purple = hvGfxFindColorIx(hvg, 0x99, 0x00, 0xcc); -undefYellow = hvGfxFindRgb(hvg, &undefinedYellowColor); -enum hapColorMode colorMode = getColorMode(tg->tdb); -pushWarnHandler(ignoreEm); -struct vcfRecord *rec; -for (rec = vcff->records; rec != NULL; rec = rec->next) - vcfParseGenotypes(rec); -popWarnHandler(); unsigned short gtHapCount = 0; int nRecords = slCount(vcff->records); int centerIx = getCenterVariantIx(tg, seqStart, seqEnd, vcff->records); // Limit the number of variants that we compare, to keep from timing out: // (really what we should limit is the number of distinct haplo's passed to hacTree!) // In the meantime, this should at least be a cart var... int maxVariantsPerSide = 50; int startIx = max(0, centerIx - maxVariantsPerSide); int endIx = min(nRecords, centerIx+1 + maxVariantsPerSide); struct hacTree *ht = NULL; unsigned short *gtHapOrder = clusterHaps(vcff, centerIx, startIx, endIx, >HapCount, &ht); struct vcfRecord *centerRec = NULL; +struct vcfRecord *rec; int ix; // Unlike drawing order (last drawn is on top), the first mapBox gets priority, // so map center variant before drawing & mapping other variants! for (rec = vcff->records, ix=0; rec != NULL; rec = rec->next, ix++) { if (ix == centerIx) { centerRec = rec; mapBoxForCenterVariant(rec, hvg, tg, xOff, yOff, width); break; } } for (rec = vcff->records, ix=0; rec != NULL; rec = rec->next, ix++) { boolean isClustered = (ix >= startIx && ix < endIx); @@ -1153,91 +1166,333 @@ int gtIx = (gtHapOrder[gtHapIx] >> 1); char *sample = vcff->genotypeIds[gtIx]; if (!lastSample || differentString(sample, lastSample)) { if (isNotEmpty(dy->string)) dyStringAppend(dy, ", "); dyStringAppend(dy, sample); lastSample = sample; } } imgTrackAddMapItem(curImgTrack, TITLE_BUT_NO_LINK, dy->string, leftLabelX, y+yStart, labelEnd, y+yStart+1, track); } } -static void vcfGtHapFileOrderDraw(struct track *tg, int seqStart, int seqEnd, - struct hvGfx *hvg, int xOff, int yOff, int width, - MgFont *font, Color color, enum trackVisibility vis) -/* Draw rows in the same fashion as vcfHapClusterDraw, but instead of clustering, use the - * order in which samples appear in the VCF file. */ +static unsigned short *gtHapOrderFromGtOrder(struct vcfFile *vcff, + boolean *retIsAllDiploid, int *retGtHapCount) { -struct vcfFile *vcff = tg->extraUiData; -if (vcff->records == NULL) - return; -undefYellow = hvGfxFindRgb(hvg, &undefinedYellowColor); -enum hapColorMode colorMode = getColorMode(tg->tdb); -pushWarnHandler(ignoreEm); -struct vcfRecord *rec; -for (rec = vcff->records; rec != NULL; rec = rec->next) - vcfParseGenotypes(rec); -popWarnHandler(); int ploidy = 2; // Assuming diploid genomes here, no XXY, tetraploid etc. int gtCount = vcff->genotypeCount; boolean isAllDiploid = TRUE; unsigned short *gtHapOrder = needMem(gtCount * ploidy * sizeof(unsigned short)); int orderIx = 0; int gtIx; // Determine the number of chromosome rows; for chrX, can be mix of diploid and haploid. for (gtIx=0; gtIx < gtCount; gtIx++) { int gtHapIx = (gtIx << 1); gtHapOrder[orderIx] = gtHapIx; orderIx++; + struct vcfRecord *rec; for (rec = vcff->records; rec != NULL; rec = rec->next) { struct vcfGenotype *gt = &(rec->genotypes[gtIx]); if (!gt->isHaploid) { gtHapOrder[orderIx] = gtHapIx+1; orderIx++; break; } else isAllDiploid = FALSE; } } -int gtHapCount = orderIx; +if (retIsAllDiploid) + *retIsAllDiploid = isAllDiploid; +if (retGtHapCount) + *retGtHapCount = orderIx; +return gtHapOrder; +} + +static void vcfGtHapFileOrderDraw(struct track *tg, int seqStart, int seqEnd, + struct hvGfx *hvg, int xOff, int yOff, int width, + MgFont *font, Color color, enum trackVisibility vis) +/* Draw rows in the same fashion as vcfHapClusterDraw, but instead of clustering, use the + * order in which samples appear in the VCF file. */ +{ +struct vcfFile *vcff = tg->extraUiData; +enum hapColorMode colorMode; +if (!vcfHapClusterDrawInit(tg, vcff, hvg, &colorMode)) + return; +boolean isAllDiploid; +int gtHapCount; +unsigned short *gtHapOrder = gtHapOrderFromGtOrder(vcff, &isAllDiploid, >HapCount); +struct vcfRecord *rec; for (rec = vcff->records; rec != NULL; rec = rec->next) drawOneRec(rec, gtHapOrder, gtHapCount, tg, hvg, xOff, yOff, width, FALSE, FALSE, colorMode); // If height is sufficient, draw sample names as left labels; otherwise make mouseover titles // with sample names for each pixel y offset. int extraPixel = (colorMode == altOnlyMode) ? 1 : 0; int hapHeight = tg->height - CLIP_PAD - 2*extraPixel; int minHeightForLabels; if (isAllDiploid) minHeightForLabels = vcff->genotypeCount * (tl.fontHeight + 1); else minHeightForLabels = gtHapCount * (tl.fontHeight + 1); if (hapHeight >= minHeightForLabels) drawSampleLabels(vcff, isAllDiploid, yOff+extraPixel, hapHeight, gtHapOrder, gtHapCount, font, color, tg->track); else drawSampleTitles(vcff, yOff+extraPixel, hapHeight, gtHapOrder, gtHapCount, tg->track); } +static struct phyloTree *getTreeFromFile(struct trackDb *tdb) +/* Get the filename that follows trackDb setting 'hapClusterMethod treeFile ' and read it in + * as a phyloTree. */ +{ +char *hapMethod = cloneString(trackDbSetting(tdb, VCF_HAP_METHOD_VAR)); +if (! startsWithWord(VCF_HAP_METHOD_TREE_FILE, nextWord(&hapMethod))) + errAbort("getTreeFromFile: expected trackDb setting " VCF_HAP_METHOD_VAR "to start with '" + VCF_HAP_METHOD_TREE_FILE "' followed by a file name, but got '%s'", hapMethod); +char *fileName = nextWord(&hapMethod); +return phyloOpenTree(fileName); +} + +struct hash *makeSampleToIx(struct vcfFile *vcff) +/* Alloc & return a hash of sample names to genotype indices in vcff. */ +{ +struct hash *sampleToIx = hashNew(0); +int gtIx; +for (gtIx = 0; gtIx < vcff->genotypeCount; gtIx++) + hashAddInt(sampleToIx, vcff->genotypeIds[gtIx], gtIx); +return sampleToIx; +} + +static int gtIxFromSample(struct hash *sampleToIx, char *sample, char *vcfFileName) +/* Look up sample's genotype index in sampleToIx, die complaining about vcfFileName if not found. */ +{ +int gtIx = hashIntValDefault(sampleToIx, sample, -1); +if (gtIx < 0) + errAbort("gtIxFromSample: sample '%s' not found in VCF file %s", sample, vcfFileName); +return gtIx; +} + +static void rSetGtHapOrderFromTree(struct phyloTree *node, struct vcfFile *vcff, + struct hash *sampleToIx, + unsigned short *gtHapOrder, int *pGtHapCount, + unsigned short *leafOrderToHapOrderStart, + unsigned short *leafOrderToHapOrderEnd, int *pLeafCount) +/* Set gtHapOrder to sample gt & hap indices in the order we encounter the samples in tree. */ +{ +if (node->numEdges > 0) + { + int ix; + for (ix = 0; ix < node->numEdges; ix++) + rSetGtHapOrderFromTree(node->edges[ix], vcff, sampleToIx, gtHapOrder, pGtHapCount, + leafOrderToHapOrderStart, leafOrderToHapOrderEnd, pLeafCount); + } +else + { + int gtIx = gtIxFromSample(sampleToIx, node->ident->name, vcff->fileOrUrl); + int gtHapIx = (gtIx << 1); + gtHapOrder[*pGtHapCount] = gtHapIx; + leafOrderToHapOrderStart[*pLeafCount] = leafOrderToHapOrderEnd[*pLeafCount] = *pGtHapCount; + *pGtHapCount += 1; + struct vcfRecord *rec; + for (rec = vcff->records; rec != NULL; rec = rec->next) + { + struct vcfGenotype *gt = &(rec->genotypes[gtIx]); + if (!gt->isHaploid) + { + gtHapOrder[*pGtHapCount] = gtHapIx+1; + leafOrderToHapOrderEnd[*pLeafCount] = *pGtHapCount; + *pGtHapCount += 1; + break; + } + } + *pLeafCount += 1; + } +} + +static unsigned short *gtHapOrderFromTree(struct vcfFile *vcff, struct phyloTree *tree, + unsigned short **retLeafOrderToHapOrderStart, + unsigned short **retLeafOrderToHapOrderEnd, + int *retGtHapCount) +/* Alloc & return gtHapOrder, set to samples in the order we encounter them in tree. + * Also build up maps of leaf order to low and high gtHapIx, for drawing the tree later. */ +{ +struct hash *sampleToIx = makeSampleToIx(vcff); +int ploidy = 2; // Assuming diploid genomes here, no XXY, tetraploid etc. +int gtCount = vcff->genotypeCount; +unsigned short *gtHapOrder = needMem(gtCount * ploidy * sizeof(unsigned short)); +*retLeafOrderToHapOrderStart = needMem(gtCount * sizeof(unsigned short)); +*retLeafOrderToHapOrderEnd = needMem(gtCount * sizeof(unsigned short)); +*retGtHapCount = 0; +int leafCount = 0; +rSetGtHapOrderFromTree(tree, vcff, sampleToIx, gtHapOrder, retGtHapCount, + *retLeafOrderToHapOrderStart, *retLeafOrderToHapOrderEnd, &leafCount); +if (leafCount != vcff->genotypeCount) + errAbort("gtHapOrderFromTree: tree has %d leaves, but VCF file %s has %d genotype columns", + leafCount, vcff->fileOrUrl, vcff->genotypeCount); +return gtHapOrder; +} + + +// Relative coordinates for tree layout, to be transformed into pixel coords later. +struct nodeCoords + { + double rank; // Centerpoint of children's rank in terms of hap order (0 = top haplotype) + int depth; // Maximum child depth + 1 + }; + +static int phyloTreeAddNodeCoords(struct phyloTree *node, + unsigned short *leafOrderToHapOrderStart, + unsigned short *leafOrderToHapOrderEnd, + int leafIx) +/* Recursively annotate node and descendants with nodeCoords to prepare for drawing the tree. */ +{ +struct nodeCoords *coords; +AllocVar(coords); +node->priv = coords; +if (node->numEdges > 0) + { + double minRank = -1, maxRank = 0; + int maxDepth = 0; + int ix; + for (ix = 0; ix < node->numEdges; ix++) + { + struct phyloTree *child = node->edges[ix]; + leafIx = phyloTreeAddNodeCoords(child, leafOrderToHapOrderStart, leafOrderToHapOrderEnd, + leafIx); + struct nodeCoords *childCoords = child->priv; + if (minRank < 0 || childCoords->rank < minRank) + minRank = childCoords->rank; + if (childCoords->rank > maxRank) + maxRank = childCoords->rank; + if (childCoords->depth > maxDepth) + maxDepth = childCoords->depth; + } + coords->rank = (minRank + maxRank) / 2.0; + coords->depth = maxDepth + 1; + } +else + { + // Leaf (sample) + double rankStart = leafOrderToHapOrderStart[leafIx]; + double rankEnd = leafOrderToHapOrderStart[leafIx]; + coords->rank = (rankStart + rankEnd) / 2.0; + leafIx++; + coords->depth = 0; + } +return leafIx; +} + +static void rDrawPhyloTreeInLabelArea(struct phyloTree *node, struct hvGfx *hvg, int x, + int yOff, double pxPerHap, Color color, MgFont *font, + boolean drawRectangle) +/* Recursively draw the tree in the left label area. + * Returns pixel height for use at non-leaf levels of tree. */ +{ +const int branchW = 4; +int labelEnd = leftLabelX + leftLabelWidth; + + +if (node->numEdges > 0) + { + // Draw each child and a horizontal line to child + int minY = -1, maxY = 0; + int ix; + for (ix = 0; ix < node->numEdges; ix++) + { + struct phyloTree *child = node->edges[ix]; + rDrawPhyloTreeInLabelArea(child, hvg, x+branchW, yOff, pxPerHap, color, font, drawRectangle); + struct nodeCoords *childCoords = child->priv; + int childY = yOff + ((0.5 + childCoords->rank) * pxPerHap); + hvGfxLine(hvg, x, childY, x+branchW, childY, MG_BLACK); + if (minY < 0 || childY < minY) + minY = childY; + if (childY > maxY) + maxY = childY; + } + // Draw a vertical line to connect the children + hvGfxLine(hvg, x, minY, x, maxY, MG_BLACK); + } +else + { + // Leaf node -- draw a horizontal line + struct nodeCoords *coords = node->priv; + int y = yOff + ((0.5 + coords->rank) * pxPerHap); + hvGfxLine(hvg, x, y, x+branchW, y, color); + int textX = x + branchW + 3; + if (pxPerHap >= tl.fontHeight+1 && textX < labelEnd) + hvGfxText(hvg, textX, y, color, font, node->ident->name); + } +} + +static void drawPhyloTreeInLabelArea(struct phyloTree *tree, struct hvGfx *hvg, int yOff, + int clipHeight, int gtHapCount, + Color color, MgFont *font, boolean drawRectangle, + unsigned short *leafOrderToHapOrderStart, + unsigned short *leafOrderToHapOrderEnd) +{ +struct hvGfx *hvgLL = (hvgSide != NULL) ? hvgSide : hvg; +int clipXBak, clipYBak, clipWidthBak, clipHeightBak; +hvGfxGetClip(hvgLL, &clipXBak, &clipYBak, &clipWidthBak, &clipHeightBak); +hvGfxUnclip(hvgLL); +hvGfxSetClip(hvgLL, leftLabelX, yOff, leftLabelWidth, clipHeight); +// Figure out rank (vertical position) and depth (horizontal position) of every node in tree: +phyloTreeAddNodeCoords(tree, leafOrderToHapOrderStart, leafOrderToHapOrderEnd, 0); +// Draw the tree: +int x = leftLabelX; +double pxPerHap = (double)clipHeight / gtHapCount; +rDrawPhyloTreeInLabelArea(tree, hvgLL, x, yOff, pxPerHap, color, font, drawRectangle); +// Restore the prior clipping: +hvGfxUnclip(hvgLL); +hvGfxSetClip(hvgLL, clipXBak, clipYBak, clipWidthBak, clipHeightBak); +} + +static void vcfGtHapTreeFileDraw(struct track *tg, int seqStart, int seqEnd, + struct hvGfx *hvg, int xOff, int yOff, int width, + MgFont *font, Color color, enum trackVisibility vis) +/* Draw rows in the same fashion as vcfHapClusterDraw, but instead of clustering, use the + * order in which samples appear in the VCF file. */ +{ +struct vcfFile *vcff = tg->extraUiData; +enum hapColorMode colorMode; +if (!vcfHapClusterDrawInit(tg, vcff, hvg, &colorMode)) + return; +struct phyloTree *tree = getTreeFromFile(tg->tdb); +int gtHapCount; +unsigned short *leafOrderToHapOrderStart, *leafOrderToHapOrderEnd; +unsigned short *gtHapOrder = gtHapOrderFromTree(vcff, tree, + &leafOrderToHapOrderStart, &leafOrderToHapOrderEnd, + >HapCount); +struct vcfRecord *rec; +for (rec = vcff->records; rec != NULL; rec = rec->next) + drawOneRec(rec, gtHapOrder, gtHapCount, tg, hvg, xOff, yOff, width, FALSE, FALSE, + colorMode); +int extraPixel = (colorMode == altOnlyMode) ? 1 : 0; +int hapHeight = tg->height - CLIP_PAD - 2*extraPixel; +char *treeAngle = cartOrTdbString(cart, tg->tdb, VCF_HAP_TREEANGLE_VAR, VCF_DEFAULT_HAP_TREEANGLE); +boolean drawRectangle = sameString(treeAngle, VCF_HAP_TREEANGLE_RECTANGLE); +drawPhyloTreeInLabelArea(tree, hvg, yOff+extraPixel, hapHeight, gtHapCount, color, font, + drawRectangle, leafOrderToHapOrderStart, leafOrderToHapOrderEnd); +drawSampleTitles(vcff, yOff+extraPixel, hapHeight, gtHapOrder, gtHapCount, tg->track); +} + static int vcfHapClusterTotalHeight(struct track *tg, enum trackVisibility vis) /* Return height of haplotype graph (2 * #samples * lineHeight); * 2 because we're assuming diploid genomes here, no XXY, tetraploid etc. */ { const struct vcfFile *vcff = tg->extraUiData; if (vcff->records == NULL) return 0; int ploidy = sameString(chromName, "chrY") ? 1 : 2; int simpleHeight = ploidy * vcff->genotypeCount * tg->lineHeight; int defaultHeight = min(simpleHeight, VCF_DEFAULT_HAP_HEIGHT); char *tdbHeight = trackDbSettingOrDefault(tg->tdb, VCF_HAP_HEIGHT_VAR, NULL); if (isNotEmpty(tdbHeight)) defaultHeight = atoi(tdbHeight); int cartHeight = cartOrTdbInt(cart, tg->tdb, VCF_HAP_HEIGHT_VAR, defaultHeight); if (tg->visibility == tvSquish) @@ -1251,30 +1506,32 @@ static char *vcfHapClusterTrackName(struct track *tg, void *item) /* If someone asks for itemName/mapItemName, just send name of track like wiggle. */ { return tg->track; } static void vcfHapClusterOverloadMethods(struct track *tg, struct vcfFile *vcff) /* If we confirm at load time that we can draw a haplotype graph, use * this to overwrite the methods for the rest of execution: */ { tg->heightPer = (tg->visibility == tvSquish) ? (tl.fontHeight/4) : (tl.fontHeight / 2); tg->lineHeight = tg->heightPer + 1; char *hapMethod = cartOrTdbString(cart, tg->tdb, VCF_HAP_METHOD_VAR, VCF_DEFAULT_HAP_METHOD); if (sameString(hapMethod, VCF_HAP_METHOD_FILE_ORDER)) tg->drawItems = vcfGtHapFileOrderDraw; +else if (startsWithWord(VCF_HAP_METHOD_TREE_FILE, hapMethod)) + tg->drawItems = vcfGtHapTreeFileDraw; else tg->drawItems = vcfHapClusterDraw; tg->totalHeight = vcfHapClusterTotalHeight; tg->itemHeight = tgFixedItemHeight; tg->itemName = vcfHapClusterTrackName; tg->mapItemName = vcfHapClusterTrackName; tg->itemStart = tgItemNoStart; tg->itemEnd = tgItemNoEnd; tg->mapsSelf = TRUE; tg->extraUiData = vcff; } static void indelTweakMapItem(struct track *tg, struct hvGfx *hvg, void *item, char *itemName, char *mapItemName, int start, int end, int x, int y, int width, int height) /* Pass the original vcf chromStart to pgSnpMapItem, so if we have trimmed an identical