8d9367da0e57755641dbbf6498093f7e44f14f12 angie Sat Jun 20 13:27:08 2020 -0700 VCF: support up to 64k genotypes, including haplotype+tree display. sorta refs #25278 The VCF parser currently has a hardcoded upper limit on the number of genotype columns. I increased that from 16k to 64k to support Rob Lanfear's SARS-CoV-2 tree w/40k nodes. It will keep growing, so I will probably want to revisit so that we allocate the array size needed instead of a hardcoded huge size when most VCF tracks don't have anywhere near that many. In vcfTracks.c, using 'unsigned short' for the gtHapOrder array limits the number of samples to 32k because the index into gtHapOrder is gtIx<<1+hapIx. Changed all the shorts to ints because we now have VCF with more than 32k genotype columns. diff --git src/hg/hgTracks/vcfTrack.c src/hg/hgTracks/vcfTrack.c index cf11ade..a6d3b67 100644 --- src/hg/hgTracks/vcfTrack.c +++ src/hg/hgTracks/vcfTrack.c @@ -287,41 +287,41 @@ { int center; // index from which each point's contribution to distance is to be weighted int len; // total length of haplotype strings double alpha; // weighting factor for distance from center struct lm *localMem; }; // This is the representation of a cluster of up to 65,535 haplotypes of equal length, // where each variant's alleles are specified as 0 (reference) or 1 (alternate) // [or possibly 2 for second alternate, but those are rare so I'll ignore them]. // When an individual is heterozygous and unphased for some variant, we need to // account for missing data. struct hapCluster { struct hapCluster *next; // hacTree wants slList of items - unsigned short *refCounts; // per-variant count of reference alleles observed - unsigned short *unkCounts; // per-variant count of unknown (or unphased het) alleles - unsigned short leafCount; // number of leaves under this node (or 1 if leaf) - unsigned short gtHapIx; // if leaf, (genotype index << 1) + hap (0 or 1 for diploid) + unsigned int *refCounts; // per-variant count of reference alleles observed + unsigned int *unkCounts; // per-variant count of unknown (or unphased het) alleles + unsigned int leafCount; // number of leaves under this node (or 1 if leaf) + unsigned int gtHapIx; // if leaf, (genotype index << 1) + hap (0 or 1 for diploid) }; INLINE boolean isRef(const struct hapCluster *c, int varIx) // Return TRUE if the leaves of cluster have at least as many reference alleles // as alternate alleles for variant varIx. { -unsigned short altCount = c->leafCount - c->refCounts[varIx] - c->unkCounts[varIx]; +unsigned int altCount = c->leafCount - c->refCounts[varIx] - c->unkCounts[varIx]; return (c->refCounts[varIx] >= altCount); } static double cwaDistance(const struct slList *item1, const struct slList *item2, void *extraData) /* Center-weighted alpha sequence distance function for hacTree clustering of haplotype seqs */ // This is inner-loop so I am not doing defensive checks. Caller must ensure: // 1. kids's sequences' lengths are both equal to helper->len // 2. 0 <= helper->center <= len // 3. 0.0 < helper->alpha <= 1.0 { const struct hapCluster *kid1 = (const struct hapCluster *)item1; const struct hapCluster *kid2 = (const struct hapCluster *)item2; struct cwaExtraData *helper = extraData; double distance = 0; double weight = 1; // start at center: alpha to the 0th power @@ -334,32 +334,32 @@ } weight = helper->alpha; // start at center+1: alpha to the 1st power for (i=helper->center+1; i < helper->len; i++) { if (isRef(kid1, i) != isRef(kid2, i)) distance += weight; weight *= helper->alpha; } return distance; } static struct hapCluster *lmHapCluster(struct cwaExtraData *helper) /* Use localMem to allocate a new cluster of the given len. */ { struct hapCluster *c = lmAlloc(helper->localMem, sizeof(struct hapCluster)); -c->refCounts = lmAlloc(helper->localMem, helper->len * sizeof(unsigned short)); -c->unkCounts = lmAlloc(helper->localMem, helper->len * sizeof(unsigned short)); +c->refCounts = lmAlloc(helper->localMem, helper->len * sizeof(unsigned int)); +c->unkCounts = lmAlloc(helper->localMem, helper->len * sizeof(unsigned int)); return c; } static struct slList *cwaMerge(const struct slList *item1, const struct slList *item2, void *extraData) /* Make a consensus haplotype from two input haplotypes, for hacTree clustering by * center-weighted alpha distance. */ // This is inner-loop so I am not doing defensive checks. Caller must ensure that // kids's sequences' lengths are both equal to helper->len. { const struct hapCluster *kid1 = (const struct hapCluster *)item1; const struct hapCluster *kid2 = (const struct hapCluster *)item2; struct cwaExtraData *helper = extraData; struct hapCluster *consensus = lmHapCluster(helper); consensus->leafCount = kid1->leafCount + kid2->leafCount; @@ -383,62 +383,62 @@ } static int cwaCmp(const struct slList *item1, const struct slList *item2, void *extraData) /* Convert hapCluster to allele strings for easy comparison by strcmp. */ { const struct hapCluster *c1 = (const struct hapCluster *)item1; const struct hapCluster *c2 = (const struct hapCluster *)item2; struct cwaExtraData *helper = extraData; char s1[helper->len+1]; char s2[helper->len+1]; hapClusterToString(c1, s1, helper->len); hapClusterToString(c2, s2, helper->len); return strcmp(s1, s2); } -void rSetGtHapOrder(struct hacTree *ht, unsigned short *gtHapOrder, unsigned short *retGtHapEnd) +void rSetGtHapOrder(struct hacTree *ht, unsigned int *gtHapOrder, unsigned int *retGtHapEnd) /* Traverse hacTree and build an ordered array of genotype + haplotype indices. */ { if (ht->left == NULL && ht->right == NULL) { struct hapCluster *c = (struct hapCluster *)ht->itemOrCluster; gtHapOrder[(*retGtHapEnd)++] = c->gtHapIx; } else if (ht->left == NULL) rSetGtHapOrder(ht->right, gtHapOrder, retGtHapEnd); else if (ht->right == NULL) rSetGtHapOrder(ht->left, gtHapOrder, retGtHapEnd); else { struct hapCluster *cL = (struct hapCluster *)ht->left->itemOrCluster; struct hapCluster *cR = (struct hapCluster *)ht->right->itemOrCluster; if (cL->leafCount >= cR->leafCount) { rSetGtHapOrder(ht->left, gtHapOrder, retGtHapEnd); rSetGtHapOrder(ht->right, gtHapOrder, retGtHapEnd); } else { rSetGtHapOrder(ht->right, gtHapOrder, retGtHapEnd); rSetGtHapOrder(ht->left, gtHapOrder, retGtHapEnd); } } } -static unsigned short *clusterHaps(const struct vcfFile *vcff, int centerIx, +static unsigned int *clusterHaps(const struct vcfFile *vcff, int centerIx, int startIx, int endIx, - unsigned short *retGtHapEnd, struct hacTree **retTree) + unsigned int *retGtHapEnd, struct hacTree **retTree) /* Given a bunch of VCF records with phased genotypes, build up one haplotype string * per chromosome that is the sequence of alleles in all variants (simplified to one base * per variant). Each individual/sample will have two haplotype strings (unless haploid * like Y or male X). Independently cluster the haplotype strings using hacTree with the * center-weighted alpha functions above. Return an array of genotype+haplotype indices * in the order determined by the hacTree, and set retGtHapEnd to its length/end. */ { double alpha = 0.5; struct lm *lm = lmInit(0); struct cwaExtraData helper = { centerIx-startIx, endIx-startIx, alpha, lm }; int ploidy = 2; // Assuming diploid genomes here, no XXY, tetraploid etc. int gtCount = vcff->genotypeCount; // Make an slList of hapClusters, but allocate in a big block so I can use // array indexing. struct hapCluster **hapArray = lmAlloc(lm, sizeof(struct hapCluster *) * gtCount * ploidy); @@ -497,31 +497,31 @@ { // Some array items will have an empty cluster for missing hap2 -- // trim those from the linked list. struct hapCluster *c = hapArray[0]; while (c != NULL && c->next != NULL) { if (c->next->leafCount == 0) c->next = c->next->next; else c = c->next; } } } struct hacTree *ht = hacTreeFromItems((struct slList *)(hapArray[0]), lm, cwaDistance, cwaMerge, cwaCmp, &helper); -unsigned short *gtHapOrder = needMem(vcff->genotypeCount * ploidy * sizeof(unsigned short)); +unsigned int *gtHapOrder = needMem(vcff->genotypeCount * ploidy * sizeof(unsigned int)); rSetGtHapOrder(ht, gtHapOrder, retGtHapEnd); *retTree = ht; return gtHapOrder; } INLINE Color pgSnpColor(char *allele) /* Color allele by first base according to pgSnp palette. */ { if (allele[0] == 'A') return revCmplDisp ? MG_MAGENTA : MG_RED; else if (allele[0] == 'C') return revCmplDisp ? darkGreenColor : MG_BLUE; else if (allele[0] == 'G') return revCmplDisp ? MG_BLUE : darkGreenColor; else if (allele[0] == 'T') @@ -670,31 +670,31 @@ /* Color gray for unknown or mixed, otherwise pgSnpColor of predominant allele. */ { const int fudgeFactor = 4; // Threshold for calling for one color or the other when mixed if (unks > (refs + alts)) return undefYellow; if (alts > fudgeFactor * refs) return pgSnpColor(altAl); if (refs > fudgeFactor * alts) return pgSnpColor(refAl); return shadesOfGray[5]; } // tg->height needs an extra pixel at the bottom; it's eaten by the clipping rectangle: #define CLIP_PAD 1 -static void drawOneRec(struct vcfRecord *rec, unsigned short *gtHapOrder, unsigned short gtHapCount, +static void drawOneRec(struct vcfRecord *rec, unsigned int *gtHapOrder, unsigned int gtHapCount, struct track *tg, struct hvGfx *hvg, int xOff, int yOff, int width, boolean isClustered, boolean isCenter, enum hapColorMode colorMode) /* Draw a stack of genotype bars for this record */ { unsigned int chromStartMap = vcfRecordTrimIndelLeftBase(rec); unsigned int chromEndMap = vcfRecordTrimAllelesRight(rec); const double scale = scaleForPixels(width); int x1 = round((double)(rec->chromStart-winStart)*scale) + xOff; int x2 = round((double)(rec->chromEnd-winStart)*scale) + xOff; int w = x2-x1; if (w <= 1) { x1--; w = 3; } @@ -946,63 +946,63 @@ else if (ht->right != NULL) return rDrawTreeInLabelArea(ht->right, hvg, yType, x, yFromNode, yh, th, drawRectangle); // Leaf node -- return pixel height. Draw a line if yType is midpoint. int y = yFromNode(ht->itemOrCluster, yh, yType); if (yType == yrtMidPoint && x < labelEnd) { hvGfxLine(hvg, x, y, labelEnd, y, purple); addClusterMapItem(ht, x, y, labelEnd, y+1, th); } return y; } struct yFromNodeHelper /* Pre-computed mapping from cluster nodes' gtHapIx to pixel heights. */ { - unsigned short gtHapCount; - unsigned short *gtHapIxToPxStart; - unsigned short *gtHapIxToPxEnd; + unsigned int gtHapCount; + unsigned int *gtHapIxToPxStart; + unsigned int *gtHapIxToPxEnd; }; void initYFromNodeHelper(struct yFromNodeHelper *helper, int yOff, int height, - unsigned short gtHapCount, unsigned short *gtHapOrder, + unsigned int gtHapCount, unsigned int *gtHapOrder, int genotypeCount) /* Build a mapping of genotype and haplotype to pixel y coords. */ { helper->gtHapCount = gtHapCount; -helper->gtHapIxToPxStart = needMem(genotypeCount * 2 * sizeof(unsigned short)); -helper->gtHapIxToPxEnd = needMem(genotypeCount * 2 * sizeof(unsigned short)); +helper->gtHapIxToPxStart = needMem(genotypeCount * 2 * sizeof(unsigned int)); +helper->gtHapIxToPxEnd = needMem(genotypeCount * 2 * sizeof(unsigned int)); double pxPerHap = (double)height / gtHapCount; int i; for (i = 0; i < gtHapCount; i++) { int yStart = round(i * pxPerHap); int yEnd = round((i+1) * pxPerHap); if (yEnd == yStart) yEnd++; int gtHapIx = gtHapOrder[i]; helper->gtHapIxToPxStart[gtHapIx] = yOff + yStart; helper->gtHapIxToPxEnd[gtHapIx] = yOff + yEnd; } } static int yFromHapNode(const struct slList *itemOrCluster, void *extraData, enum yRetType yType) /* Extract the gtHapIx from hapCluster (hacTree node item), find out its relative order * and translate that to a pixel height. */ { -unsigned short gtHapIx = ((const struct hapCluster *)itemOrCluster)->gtHapIx; +unsigned int gtHapIx = ((const struct hapCluster *)itemOrCluster)->gtHapIx; struct yFromNodeHelper *helper = extraData; int y; if (yType == yrtStart) y = helper->gtHapIxToPxStart[gtHapIx]; else if (yType == yrtEnd) y = helper->gtHapIxToPxEnd[gtHapIx]; else y = (helper->gtHapIxToPxStart[gtHapIx] + helper->gtHapIxToPxEnd[gtHapIx]) / 2; return y; } void initTitleHelper(struct titleHelper *th, char *track, int startIx, int centerIx, int endIx, int nRecords, struct vcfFile *vcff) /* Set up info including arrays of ref & alt alleles for cluster mouseover. */ { @@ -1086,41 +1086,41 @@ popWarnHandler(); return TRUE; } static void vcfHapClusterDraw(struct track *tg, int seqStart, int seqEnd, struct hvGfx *hvg, int xOff, int yOff, int width, MgFont *font, Color color, enum trackVisibility vis) /* Split samples' chromosomes (haplotypes), cluster them by center-weighted * alpha similarity, and draw in the order determined by clustering. */ { struct vcfFile *vcff = tg->extraUiData; enum hapColorMode colorMode; if (!vcfHapClusterDrawInit(tg, vcff, hvg, &colorMode)) return; purple = hvGfxFindColorIx(hvg, 0x99, 0x00, 0xcc); -unsigned short gtHapCount = 0; +unsigned int gtHapCount = 0; int nRecords = slCount(vcff->records); int centerIx = getCenterVariantIx(tg, seqStart, seqEnd, vcff->records); // Limit the number of variants that we compare, to keep from timing out: // (really what we should limit is the number of distinct haplo's passed to hacTree!) // In the meantime, this should at least be a cart var... int maxVariantsPerSide = 50; int startIx = max(0, centerIx - maxVariantsPerSide); int endIx = min(nRecords, centerIx+1 + maxVariantsPerSide); struct hacTree *ht = NULL; -unsigned short *gtHapOrder = clusterHaps(vcff, centerIx, startIx, endIx, >HapCount, &ht); +unsigned int *gtHapOrder = clusterHaps(vcff, centerIx, startIx, endIx, >HapCount, &ht); struct vcfRecord *centerRec = NULL; struct vcfRecord *rec; int ix; // Unlike drawing order (last drawn is on top), the first mapBox gets priority, // so map center variant before drawing & mapping other variants! for (rec = vcff->records, ix=0; rec != NULL; rec = rec->next, ix++) { if (ix == centerIx) { centerRec = rec; mapBoxForCenterVariant(rec, hvg, tg, xOff, yOff, width); break; } } for (rec = vcff->records, ix=0; rec != NULL; rec = rec->next, ix++) @@ -1137,31 +1137,31 @@ int extraPixel = (colorMode == altOnlyMode) ? 1 : 0; int hapHeight = tg->height- CLIP_PAD - 2*extraPixel; struct yFromNodeHelper yHelper = {0, NULL, NULL}; initYFromNodeHelper(&yHelper, yOff+extraPixel, hapHeight, gtHapCount, gtHapOrder, vcff->genotypeCount); struct titleHelper titleHelper = { NULL, 0, 0, 0, 0, NULL, NULL }; initTitleHelper(&titleHelper, tg->track, startIx, centerIx, endIx, nRecords, vcff); char *treeAngle = cartOrTdbString(cart, tg->tdb, VCF_HAP_TREEANGLE_VAR, VCF_DEFAULT_HAP_TREEANGLE); boolean drawRectangle = sameString(treeAngle, VCF_HAP_TREEANGLE_RECTANGLE); drawTreeInLabelArea(ht, hvg, yOff+extraPixel, hapHeight+CLIP_PAD, &yHelper, &titleHelper, drawRectangle); } static void drawSampleLabels(struct vcfFile *vcff, struct hvGfx *hvg, boolean isAllDiploid, int yStart, int height, - unsigned short *gtHapOrder, int gtHapCount, MgFont *font, Color color, + unsigned int *gtHapOrder, int gtHapCount, MgFont *font, Color color, char *track) /* Draw sample names as left labels. */ { // Figure out which hvg to use, save current clipping, and clip to left label coords: struct hvGfx *hvgLL = (hvgSide != NULL) ? hvgSide : hvg; int clipXBak, clipYBak, clipWidthBak, clipHeightBak; hvGfxGetClip(hvgLL, &clipXBak, &clipYBak, &clipWidthBak, &clipHeightBak); hvGfxUnclip(hvgLL); hvGfxSetClip(hvgLL, leftLabelX, yStart, leftLabelWidth, height); if (isAllDiploid) { double pxPerGt = (double)height / vcff->genotypeCount; if (pxPerGt < tl.fontHeight + 1) warn("track %s: drawSampleLabels called with insufficient height", track); int gtIx; @@ -1181,31 +1181,31 @@ for (orderIx = 0; orderIx < gtHapCount; orderIx++) { int gtHapIx = gtHapOrder[orderIx]; int gtIx = (gtHapIx >> 1); int y = gtIx * pxPerHt; hvGfxTextRight(hvgLL, leftLabelX, y+yStart, leftLabelWidth-1, (int)pxPerHt, color, font, vcff->genotypeIds[gtIx]); } } // Restore the prior clipping: hvGfxUnclip(hvgLL); hvGfxSetClip(hvgLL, clipXBak, clipYBak, clipWidthBak, clipHeightBak); } static void drawSampleTitles(struct vcfFile *vcff, int yStart, int height, - unsigned short *gtHapOrder, int gtHapCount, char *track) + unsigned int *gtHapOrder, int gtHapCount, char *track) /* Draw mouseover labels / titles with the samples that are drawn at each pixel y offset */ { double hapPerPx = (double)gtHapCount / height; int labelEnd = leftLabelX + leftLabelWidth; struct dyString *dy = dyStringNew(0); int y; for (y = 0; y < height; y++) { dyStringClear(dy); int gtHapStart = y * hapPerPx; int gtHapEnd = (y + 1) * hapPerPx; if (gtHapEnd == gtHapStart) gtHapEnd++; char *lastSample = NULL; int gtHapIx; @@ -1214,37 +1214,37 @@ int gtIx = (gtHapOrder[gtHapIx] >> 1); char *sample = vcff->genotypeIds[gtIx]; if (!lastSample || differentString(sample, lastSample)) { if (isNotEmpty(dy->string)) dyStringAppend(dy, ", "); dyStringAppend(dy, sample); lastSample = sample; } } imgTrackAddMapItem(curImgTrack, TITLE_BUT_NO_LINK, dy->string, leftLabelX, y+yStart, labelEnd, y+yStart+1, track); } } -static unsigned short *gtHapOrderFromGtOrder(struct vcfFile *vcff, +static unsigned int *gtHapOrderFromGtOrder(struct vcfFile *vcff, boolean *retIsAllDiploid, int *retGtHapCount) { int ploidy = 2; // Assuming diploid genomes here, no XXY, tetraploid etc. int gtCount = vcff->genotypeCount; boolean isAllDiploid = TRUE; -unsigned short *gtHapOrder = needMem(gtCount * ploidy * sizeof(unsigned short)); +unsigned int *gtHapOrder = needMem(gtCount * ploidy * sizeof(unsigned int)); int orderIx = 0; int gtIx; // Determine the number of chromosome rows; for chrX, can be mix of diploid and haploid. for (gtIx=0; gtIx < gtCount; gtIx++) { int gtHapIx = (gtIx << 1); gtHapOrder[orderIx] = gtHapIx; orderIx++; struct vcfRecord *rec; for (rec = vcff->records; rec != NULL; rec = rec->next) { struct vcfGenotype *gt = &(rec->genotypes[gtIx]); if (!gt->isHaploid) { gtHapOrder[orderIx] = gtHapIx+1; @@ -1262,31 +1262,31 @@ return gtHapOrder; } static void vcfGtHapFileOrderDraw(struct track *tg, int seqStart, int seqEnd, struct hvGfx *hvg, int xOff, int yOff, int width, MgFont *font, Color color, enum trackVisibility vis) /* Draw rows in the same fashion as vcfHapClusterDraw, but instead of clustering, use the * order in which samples appear in the VCF file. */ { struct vcfFile *vcff = tg->extraUiData; enum hapColorMode colorMode; if (!vcfHapClusterDrawInit(tg, vcff, hvg, &colorMode)) return; boolean isAllDiploid; int gtHapCount; -unsigned short *gtHapOrder = gtHapOrderFromGtOrder(vcff, &isAllDiploid, >HapCount); +unsigned int *gtHapOrder = gtHapOrderFromGtOrder(vcff, &isAllDiploid, >HapCount); struct vcfRecord *rec; for (rec = vcff->records; rec != NULL; rec = rec->next) drawOneRec(rec, gtHapOrder, gtHapCount, tg, hvg, xOff, yOff, width, FALSE, FALSE, colorMode); // If height is sufficient, draw sample names as left labels; otherwise make mouseover titles // with sample names for each pixel y offset. int extraPixel = (colorMode == altOnlyMode) ? 1 : 0; int hapHeight = tg->height - CLIP_PAD - 2*extraPixel; int minHeightForLabels; if (isAllDiploid) minHeightForLabels = vcff->genotypeCount * (tl.fontHeight + 1); else minHeightForLabels = gtHapCount * (tl.fontHeight + 1); if (hapHeight >= minHeightForLabels) drawSampleLabels(vcff, hvg, isAllDiploid, yOff+extraPixel, hapHeight, gtHapOrder, gtHapCount, @@ -1316,33 +1316,33 @@ hashAddInt(sampleToIx, vcff->genotypeIds[gtIx], gtIx); return sampleToIx; } static int gtIxFromSample(struct hash *sampleToIx, char *sample, char *vcfFileName) /* Look up sample's genotype index in sampleToIx, die complaining about vcfFileName if not found. */ { int gtIx = hashIntValDefault(sampleToIx, sample, -1); if (gtIx < 0) errAbort("gtIxFromSample: sample '%s' not found in VCF file %s", sample, vcfFileName); return gtIx; } static void rSetGtHapOrderFromTree(struct phyloTree *node, struct vcfFile *vcff, struct hash *sampleToIx, - unsigned short *gtHapOrder, int *pGtHapCount, - unsigned short *leafOrderToHapOrderStart, - unsigned short *leafOrderToHapOrderEnd, int *pLeafCount) + unsigned int *gtHapOrder, int *pGtHapCount, + unsigned int *leafOrderToHapOrderStart, + unsigned int *leafOrderToHapOrderEnd, int *pLeafCount) /* Set gtHapOrder to sample gt & hap indices in the order we encounter the samples in tree. */ { if (node->numEdges > 0) { int ix; for (ix = 0; ix < node->numEdges; ix++) rSetGtHapOrderFromTree(node->edges[ix], vcff, sampleToIx, gtHapOrder, pGtHapCount, leafOrderToHapOrderStart, leafOrderToHapOrderEnd, pLeafCount); } else { int gtIx = gtIxFromSample(sampleToIx, node->ident->name, vcff->fileOrUrl); int gtHapIx = (gtIx << 1); gtHapOrder[*pGtHapCount] = gtHapIx; leafOrderToHapOrderStart[*pLeafCount] = leafOrderToHapOrderEnd[*pLeafCount] = *pGtHapCount; @@ -1351,64 +1351,64 @@ for (rec = vcff->records; rec != NULL; rec = rec->next) { struct vcfGenotype *gt = &(rec->genotypes[gtIx]); if (!gt->isHaploid) { gtHapOrder[*pGtHapCount] = gtHapIx+1; leafOrderToHapOrderEnd[*pLeafCount] = *pGtHapCount; *pGtHapCount += 1; break; } } *pLeafCount += 1; } } -static unsigned short *gtHapOrderFromTree(struct vcfFile *vcff, struct phyloTree *tree, - unsigned short **retLeafOrderToHapOrderStart, - unsigned short **retLeafOrderToHapOrderEnd, +static unsigned int *gtHapOrderFromTree(struct vcfFile *vcff, struct phyloTree *tree, + unsigned int **retLeafOrderToHapOrderStart, + unsigned int **retLeafOrderToHapOrderEnd, int *retGtHapCount) /* Alloc & return gtHapOrder, set to samples in the order we encounter them in tree. * Also build up maps of leaf order to low and high gtHapIx, for drawing the tree later. */ { struct hash *sampleToIx = makeSampleToIx(vcff); int ploidy = 2; // Assuming diploid genomes here, no XXY, tetraploid etc. int gtCount = vcff->genotypeCount; -unsigned short *gtHapOrder = needMem(gtCount * ploidy * sizeof(unsigned short)); -*retLeafOrderToHapOrderStart = needMem(gtCount * sizeof(unsigned short)); -*retLeafOrderToHapOrderEnd = needMem(gtCount * sizeof(unsigned short)); +unsigned int *gtHapOrder = needMem(gtCount * ploidy * sizeof(unsigned int)); +*retLeafOrderToHapOrderStart = needMem(gtCount * sizeof(unsigned int)); +*retLeafOrderToHapOrderEnd = needMem(gtCount * sizeof(unsigned int)); *retGtHapCount = 0; int leafCount = 0; rSetGtHapOrderFromTree(tree, vcff, sampleToIx, gtHapOrder, retGtHapCount, *retLeafOrderToHapOrderStart, *retLeafOrderToHapOrderEnd, &leafCount); if (leafCount != vcff->genotypeCount) errAbort("gtHapOrderFromTree: tree has %d leaves, but VCF file %s has %d genotype columns", leafCount, vcff->fileOrUrl, vcff->genotypeCount); return gtHapOrder; } // Relative coordinates for tree layout, to be transformed into pixel coords later. struct nodeCoords { double rank; // Centerpoint of children's rank in terms of hap order (0 = top haplotype) int depth; // Maximum child depth + 1 }; static int phyloTreeAddNodeCoords(struct phyloTree *node, - unsigned short *leafOrderToHapOrderStart, - unsigned short *leafOrderToHapOrderEnd, + unsigned int *leafOrderToHapOrderStart, + unsigned int *leafOrderToHapOrderEnd, int leafIx) /* Recursively annotate node and descendants with nodeCoords to prepare for drawing the tree. */ { struct nodeCoords *coords; AllocVar(coords); node->priv = coords; if (node->numEdges > 0) { double minRank = -1, maxRank = 0; int maxDepth = 0; int ix; for (ix = 0; ix < node->numEdges; ix++) { struct phyloTree *child = node->edges[ix]; leafIx = phyloTreeAddNodeCoords(child, leafOrderToHapOrderStart, leafOrderToHapOrderEnd, @@ -1471,63 +1471,63 @@ else { // Leaf node -- draw a horizontal line, and label if there is space to right of tree struct nodeCoords *coords = node->priv; int y = yOff + ((0.5 + coords->rank) * pxPerHap); hvGfxLine(hvg, x, y, x+branchW, y, color); int textX = x + branchW + 3; if (pxPerHap >= tl.fontHeight+1 && textX < labelEnd) hvGfxText(hvg, textX, y, MG_BLACK, font, node->ident->name); } } static void drawPhyloTreeInLabelArea(struct phyloTree *tree, struct hvGfx *hvg, int yOff, int clipHeight, int gtHapCount, MgFont *font, boolean drawRectangle, - unsigned short *leafOrderToHapOrderStart, - unsigned short *leafOrderToHapOrderEnd) + unsigned int *leafOrderToHapOrderStart, + unsigned int *leafOrderToHapOrderEnd) { struct hvGfx *hvgLL = (hvgSide != NULL) ? hvgSide : hvg; int clipXBak, clipYBak, clipWidthBak, clipHeightBak; hvGfxGetClip(hvgLL, &clipXBak, &clipYBak, &clipWidthBak, &clipHeightBak); hvGfxUnclip(hvgLL); hvGfxSetClip(hvgLL, leftLabelX, yOff, leftLabelWidth, clipHeight); // Figure out rank (vertical position) and depth (horizontal position) of every node in tree: phyloTreeAddNodeCoords(tree, leafOrderToHapOrderStart, leafOrderToHapOrderEnd, 0); // Draw the tree: int x = leftLabelX; double pxPerHap = (double)clipHeight / gtHapCount; rDrawPhyloTreeInLabelArea(tree, hvgLL, x, yOff, pxPerHap, font, drawRectangle); // Restore the prior clipping: hvGfxUnclip(hvgLL); hvGfxSetClip(hvgLL, clipXBak, clipYBak, clipWidthBak, clipHeightBak); } static void vcfGtHapTreeFileDraw(struct track *tg, int seqStart, int seqEnd, struct hvGfx *hvg, int xOff, int yOff, int width, MgFont *font, Color color, enum trackVisibility vis) /* Draw rows in the same fashion as vcfHapClusterDraw, but instead of clustering, use the * order in which samples appear in the VCF file. */ { struct vcfFile *vcff = tg->extraUiData; enum hapColorMode colorMode; if (!vcfHapClusterDrawInit(tg, vcff, hvg, &colorMode)) return; struct phyloTree *tree = getTreeFromFile(tg->tdb); int gtHapCount; -unsigned short *leafOrderToHapOrderStart, *leafOrderToHapOrderEnd; -unsigned short *gtHapOrder = gtHapOrderFromTree(vcff, tree, +unsigned int *leafOrderToHapOrderStart, *leafOrderToHapOrderEnd; +unsigned int *gtHapOrder = gtHapOrderFromTree(vcff, tree, &leafOrderToHapOrderStart, &leafOrderToHapOrderEnd, >HapCount); struct vcfRecord *rec; for (rec = vcff->records; rec != NULL; rec = rec->next) drawOneRec(rec, gtHapOrder, gtHapCount, tg, hvg, xOff, yOff, width, FALSE, FALSE, colorMode); int extraPixel = (colorMode == altOnlyMode) ? 1 : 0; int hapHeight = tg->height - CLIP_PAD - 2*extraPixel; char *treeAngle = cartOrTdbString(cart, tg->tdb, VCF_HAP_TREEANGLE_VAR, VCF_DEFAULT_HAP_TREEANGLE); boolean drawRectangle = sameString(treeAngle, VCF_HAP_TREEANGLE_RECTANGLE); drawPhyloTreeInLabelArea(tree, hvg, yOff+extraPixel, hapHeight, gtHapCount, font, drawRectangle, leafOrderToHapOrderStart, leafOrderToHapOrderEnd); drawSampleTitles(vcff, yOff+extraPixel, hapHeight, gtHapOrder, gtHapCount, tg->track); } @@ -1656,42 +1656,42 @@ errCatchFree(&errCatch); } static void vcfPhasedAddMapBox(struct hvGfx *hvg, struct vcfRecord *rec, struct pgSnpVcfStartEnd *psvs, char *text, int x, int y, int width, int height, struct track *track) // Add mapbox for a tick in the vcfPhased track { mapBoxHgcOrHgGene(hvg, psvs->vcfStart, psvs->vcfEnd, x, y, width, height, track->track, rec->name, text, NULL, TRUE, NULL); } struct hapDistanceMatrixCell { /* The pair of alleles and how distant they are */ struct hapDistanceMatrixCell *next; char *sampleId; // name of this sample char *otherId; // name of other sample - short alleleIx; // index into vcfRecord->genotypes - short otherAlleleIx; // index into vcfRecord->genotypes for other sample + int alleleIx; // index into vcfRecord->genotypes + int otherAlleleIx; // index into vcfRecord->genotypes for other sample double dist; // distance between this sample/allele and another sample/allele }; struct hapDistanceMatrix { /* A row of hapDistanceMatrixCells, which store the distance computed by cwaDistance() * between two different alleles */ struct hapDistanceMatrix *next; // next row in matrix char *sampleId; // name of this sample - short alleleIx; // allele id of this sample, 0 or 1 + int alleleIx; // allele id of this sample, 0 or 1 struct hapDistanceMatrixCell *row; // a row of cwaDistance scores }; static int hapDistanceMatrixCellCmp(const void *item1, const void *item2) { const struct hapDistanceMatrixCell *cell1 = *(struct hapDistanceMatrixCell **)item1; const struct hapDistanceMatrixCell *cell2 = *(struct hapDistanceMatrixCell **)item2; return cell1->dist > cell2->dist; } static struct hapDistanceMatrixCell *findClosestChildAlleleToParent(char *parent, char *child, struct hapDistanceMatrix *matrix) /* Get all cells belonging to a certain parent onto a list and sort by distance */ { struct hapDistanceMatrix *vec = NULL; @@ -1705,74 +1705,74 @@ struct hapDistanceMatrixCell *tmp = needMem(sizeof(struct hapDistanceMatrixCell)); tmp->next = NULL; tmp->sampleId = cloneString(cell->sampleId); tmp->alleleIx = cell->alleleIx; tmp->otherId = cloneString(vec->sampleId); tmp->otherAlleleIx = vec->alleleIx; tmp->dist = cell->dist; slAddHead(&cellList, tmp); } } } slSort(&cellList, hapDistanceMatrixCellCmp); return cellList; } -static unsigned short toggleShort(unsigned short s) +static unsigned int toggleInt(unsigned int s) /* Add or subtract one */ { return s & 1 ? s - 1 : s + 1; } -static void fillOutHapOrder(unsigned short *hapOrder, unsigned short hapCount, struct hapDistanceMatrixCell *c1, struct hapDistanceMatrixCell *c2) +static void fillOutHapOrder(unsigned int *hapOrder, unsigned int hapCount, struct hapDistanceMatrixCell *c1, struct hapDistanceMatrixCell *c2) /* Assign indices to hapOrder in the order we should draw the alleles. Allows for the second parent * cell to be NULL */ { if (c1 == NULL) errAbort("Error: fillOutHapOrder passed NULL parent"); int numSamplesToDraw = hapCount / 2; int i; for (i = 0; i < numSamplesToDraw; i++) { - short hapIx = 2*i; + int hapIx = 2*i; if (i == 0) // top set of lines { - hapOrder[hapIx] = toggleShort(c1->alleleIx); + hapOrder[hapIx] = toggleInt(c1->alleleIx); hapOrder[hapIx+1] = c1->alleleIx; } else if (i + 1 == numSamplesToDraw) // last set of lines { if (c2) { hapOrder[hapIx] = c2->alleleIx; - hapOrder[hapIx+1] = toggleShort(c2->alleleIx); + hapOrder[hapIx+1] = toggleInt(c2->alleleIx); } else { hapOrder[hapIx] = c1->otherAlleleIx; - hapOrder[hapIx+1] = toggleShort(c1->otherAlleleIx); + hapOrder[hapIx+1] = toggleInt(c1->otherAlleleIx); } } else // orient based on above and below { hapOrder[hapIx] = c1->otherAlleleIx; hapOrder[hapIx+1] = c2->otherAlleleIx; } } } -static void setHapOrderFromMatrix(unsigned short *hapOrder, unsigned short hapCount, +static void setHapOrderFromMatrix(unsigned int *hapOrder, unsigned int hapCount, struct hapDistanceMatrix *matrix, struct hapCluster **hapArray, struct vcfFile *vcff, char *childName, char **sampleDrawOrder) /* Given a matrix where each row is an allele of the child, and each column * is the distance from the child allele to a parent allele, fill out the order * in which the haplotypes will be drawn. The order are the indexes into rec->genotypes. */ { char *parentNames[(hapCount/2) - 1]; struct hapDistanceMatrixCell *c1 = NULL; struct hapDistanceMatrixCell *c2 = NULL; int i, ix = 0; if (hapCount > 2) // is there a trio or at least part of one? { for (i = 0; i < hapCount/2 ; i++) if (!sameString(sampleDrawOrder[i], childName)) parentNames[ix++] = sampleDrawOrder[i]; @@ -1797,82 +1797,82 @@ { while (c1->dist < c2->dist && c1->otherAlleleIx == c2->otherAlleleIx) c2 = c2->next; } } } fillOutHapOrder(hapOrder, hapCount, c1, c2); } else { hapOrder[0] = matrix->alleleIx; hapOrder[1] = matrix->next->alleleIx; } } -static void assignHapArrayIx(short *ret, struct hapCluster **hapArray, struct vcfFile *vcff, char *sample, boolean doChild) +static void assignHapArrayIx(int *ret, struct hapCluster **hapArray, struct vcfFile *vcff, char *sample, boolean doChild) { int i; struct vcfRecord *rec = vcff->records; int ix = 0; // index into ret for (i = 0; i < slCount(hapArray); i++) { struct hapCluster *hc = hapArray[i]; struct vcfGenotype *gt = &(rec->genotypes[hc->gtHapIx >> 1]); if (!doChild && !sameString(gt->id, sample)) ret[ix++] = i; else if (doChild && sameString(gt->id, sample)) ret[ix++] = i; } } static struct hapDistanceMatrix *fillOutDistanceMatrix(struct hapCluster **hapArray, struct vcfFile *vcff, char *sample, struct cwaExtraData *helper, int gtCount) /* Allocates and fill out a struct hapDistanceMatrix, one row per child allele, and a * hapDistanceMatrixCell per parent allele */ { -short parGtCount = (gtCount - 1) * 2; +int parGtCount = (gtCount - 1) * 2; int i,j; struct vcfRecord *rec = vcff->records; struct hapDistanceMatrix *matrix = NULL; -short childHapArrayIndices[2]; -short parentHapArrayIndices[parGtCount]; +int childHapArrayIndices[2]; +int parentHapArrayIndices[parGtCount]; assignHapArrayIx(childHapArrayIndices, hapArray, vcff, sample, TRUE); assignHapArrayIx(parentHapArrayIndices, hapArray, vcff, sample, FALSE); for (i = 0; i < 2; i++) { struct hapDistanceMatrix *row = needMem(sizeof(struct hapDistanceMatrix)); struct hapCluster *hcChild = hapArray[childHapArrayIndices[i]]; struct vcfGenotype *gt = &(rec->genotypes[hcChild->gtHapIx >> 1]); row->row = NULL; row->sampleId = cloneString(gt->id); row->alleleIx = hcChild->gtHapIx; for (j = 0; j < parGtCount; j++) { struct hapDistanceMatrixCell *cell = needMem(sizeof(struct hapDistanceMatrixCell)); struct hapCluster *hcParent = hapArray[parentHapArrayIndices[j]]; struct vcfGenotype *parGt = &(rec->genotypes[hcParent->gtHapIx >> 1]); cell->sampleId = cloneString(parGt->id); cell->alleleIx = hcParent->gtHapIx; cell->dist = cwaDistance((struct slList *)hcChild, (struct slList *)hcParent, helper); slAddHead(&(row->row), cell); } slAddHead(&matrix, row); } return matrix; } -unsigned short *computeHapDist(struct vcfFile *vcff, int centerIx, int startIx, int endIx, char *sample, int gtCount, char **sampleDrawOrder) +unsigned int *computeHapDist(struct vcfFile *vcff, int centerIx, int startIx, int endIx, char *sample, int gtCount, char **sampleDrawOrder) // similar to clusterHaps(), but instead of making a hacTree at the end, call cwaDistance // on each of the pairs in hapArray to make a distance matrix, then compute a hapOrder from that { double alpha = 0.5; struct lm *lm = lmInit(0); struct cwaExtraData helper = { centerIx-startIx, endIx-startIx, alpha, lm }; struct hapCluster **hapArray = lmAlloc(lm, sizeof(struct hapCluster *) * gtCount * 2); int i; for (i=0; i < 2 * gtCount; i++) { hapArray[i] = lmHapCluster(&helper); if (i > 0) hapArray[i-1]->next = hapArray[i]; } struct vcfRecord *rec; @@ -1927,41 +1927,41 @@ // Some array items will have an empty cluster for missing hap2 -- // trim those from the linked list. struct hapCluster *c = hapArray[0]; while (c != NULL && c->next != NULL) { if (c->next->leafCount == 0) c->next = c->next->next; else c = c->next; } } } // now fill out a distance matrix based on the cwaDistance between all the pairs in hapArray struct hapDistanceMatrix *hapDistMatrix = fillOutDistanceMatrix(hapArray, vcff, sample, &helper, gtCount); -unsigned short hapCount = 2 * gtCount; -unsigned short *hapOrder = needMem(sizeof(unsigned short) * hapCount); +unsigned int hapCount = 2 * gtCount; +unsigned int *hapOrder = needMem(sizeof(unsigned int) * hapCount); setHapOrderFromMatrix(hapOrder, hapCount, hapDistMatrix, hapArray, vcff, sample, sampleDrawOrder); return hapOrder; } #define VCFPHASED_UNPHASED_COLOR MG_RED void vcfPhasedDrawOneRecord(struct track *track, struct hvGfx *hvg, struct vcfRecord *rec, void *item, - unsigned short *gtHapOrder, int gtHapCount, int xOff, int yOffsets[], + unsigned int *gtHapOrder, int gtHapCount, int xOff, int yOffsets[], char *sampleOrder[], double scale) // Draw a record's haplotypes on the appropriate lines { int i; int x1 = round((double)(rec->chromStart-winStart)*scale) + xOff; int x2 = round((double)(rec->chromEnd-winStart)*scale) + xOff; struct pgSnpVcfStartEnd *psvs = (struct pgSnpVcfStartEnd *)item; int w = x2-x1; if (w < 1) w = 1; int tickHeight = track->itemHeight(track, track->items); for (i = 0; i < gtHapCount ; i++) { struct vcfGenotype *gt = &(rec->genotypes[gtHapOrder[i] >> 1]); int nameIx = stringArrayIx(gt->id, sampleOrder, track->customInt); @@ -2069,31 +2069,31 @@ char *sampleOrder[gtCount]; // order of sampleName lines int i; for (pair = sampleNames, i = 0; pair != NULL && i < gtCount; pair = pair->next, i++) sampleOrder[i] = pair->name; // child sample is required to be in the middle/bottom of the trio char *childSample = gtCount > 2 ? sampleOrder[1] : sampleOrder[0]; // set up the "haplotype" lines and the transparent yellow box to delineate samples vcfPhasedSetupHaplotypesLines(track, hvg, xOff, yOff, width, yOffsets, sampleNames, font); // sort the variants by haplotype then draw ticks int nRecords = slCount(vcff->records); int centerIx = getCenterVariantIx(track, seqStart, seqEnd, vcff->records); int startIx = 0; int endIx = nRecords; -unsigned short *hapOrder = computeHapDist(vcff, centerIx, startIx, endIx, childSample, gtCount, sampleOrder); +unsigned int *hapOrder = computeHapDist(vcff, centerIx, startIx, endIx, childSample, gtCount, sampleOrder); struct vcfRecord *rec = NULL; struct slList *item = NULL; for (rec = vcff->records, item = track->items; rec != NULL && item != NULL; rec = rec->next, item = item->next) { vcfPhasedDrawOneRecord(track, hvg, rec, item, hapOrder, gtCount * 2, xOff, yOffsets, sampleOrder, scale); } } static int vcfPhasedItemHeight(struct track *tg, void *item) { return tg->heightPer; } int vcfPhasedTrackHeight(struct track *tg, enum trackVisibility vis) {