22beccc99aff75a791a4e11b06d9fa6a0d1636b3 angie Fri Aug 12 16:52:36 2011 -0700 Feature #3711 (vcfTabix haplotype-sorting display): Limit the numberof variants close to the center variant that we use for clustering, to prevent the clustering from timing out in large regions with large genotype counts (e.g. 1094 genotypes from 1000 Genomes phase 1 interim). diff --git src/hg/hgTracks/vcfTrack.c src/hg/hgTracks/vcfTrack.c index 0a06af2..102c23a 100644 --- src/hg/hgTracks/vcfTrack.c +++ src/hg/hgTracks/vcfTrack.c @@ -215,83 +215,87 @@ rSetGtHapOrder(ht->left, gtHapOrder, retGtHapEnd); } } } static unsigned short *clusterChroms(const struct vcfFile *vcff, int centerIx, unsigned short *retGtHapEnd, struct hacTree **retTree) /* Given a bunch of VCF records with phased genotypes, build up one haplotype string * per chromosome that is the sequence of alleles in all variants (simplified to one base * per variant). Each individual/sample will have two haplotype strings (unless haploid * like Y or male X). Independently cluster the haplotype strings using hacTree with the * center-weighted alpha functions above. Return an array of genotype+haplotype indices * in the order determined by the hacTree, and set retGtHapEnd to its length/end. */ { int len = slCount(vcff->records); -// Should alpha depend on len? Should the penalty drop off with distance? Seems like -// straight-up exponential will cause the signal to drop to nothing pretty quickly... +// Limit the number of variants that we compare, to keep from timing out: +const int maxVariantsPerSide = 50; +int startIx = max(0, centerIx - maxVariantsPerSide); +int endIx = min(len, centerIx+1 + maxVariantsPerSide); double alpha = 0.5; struct lm *lm = lmInit(0); -struct cwaExtraData helper = { centerIx, len, alpha, lm }; +struct cwaExtraData helper = { centerIx-startIx, endIx-startIx, alpha, lm }; int ploidy = 2; // Assuming diploid genomes here, no XXY, tetraploid etc. int gtCount = vcff->genotypeCount; // Make an slList of hapClusters, but allocate in a big block so I can use // array indexing. struct hapCluster **hapArray = lmAlloc(lm, sizeof(struct hapCluster *) * gtCount * ploidy); int i; for (i=0; i < ploidy * gtCount; i++) { hapArray[i] = lmHapCluster(&helper); if (i > 0) hapArray[i-1]->next = hapArray[i]; } boolean haveHaploid = FALSE; int varIx; struct vcfRecord *rec; -for (varIx = 0, rec = vcff->records; rec != NULL; varIx++, rec = rec->next) +for (varIx = 0, rec = vcff->records; rec != NULL && varIx < endIx; varIx++, rec = rec->next) { - vcfParseGenotypes(rec); + if (varIx < startIx) + continue; + int countIx = varIx - startIx; int gtIx; for (gtIx=0; gtIx < gtCount; gtIx++) { struct vcfGenotype *gt = &(rec->genotypes[gtIx]); struct hapCluster *c1 = hapArray[gtIx]; struct hapCluster *c2 = hapArray[gtCount + gtIx]; // hardwired ploidy=2 if (gt->isPhased || gt->isHaploid || (gt->hapIxA == gt->hapIxB)) { // first chromosome: c1->leafCount = 1; c1->gtHapIx = gtIx << 1; if (gt->hapIxA == 0) - c1->refCounts[varIx] = 1; + c1->refCounts[countIx] = 1; if (gt->isHaploid) haveHaploid = TRUE; else { c2->leafCount = 1; c2->gtHapIx = (gtIx << 1) | 1; if (gt->hapIxB == 0) - c2->refCounts[varIx] = 1; + c2->refCounts[countIx] = 1; } } else { // Unphased heterozygote, don't use haplotype info for clustering c1->leafCount = c2->leafCount = 1; c1->gtHapIx = gtIx << 1; c2->gtHapIx = (gtIx << 1) | 1; - c1->unkCounts[varIx] = c2->unkCounts[varIx] = 1; + c1->unkCounts[countIx] = c2->unkCounts[countIx] = 1; } } if (haveHaploid) { // Some array items will have an empty cluster for missing hap2 -- // trim those from the linked list. struct hapCluster *c = hapArray[0]; while (c != NULL && c->next != NULL) { if (c->next->leafCount == 0) c->next = c->next->next; c = c->next; } } } @@ -632,35 +636,38 @@ static void vcfHapClusterDraw(struct track *tg, int seqStart, int seqEnd, struct hvGfx *hvg, int xOff, int yOff, int width, MgFont *font, Color color, enum trackVisibility vis) /* Split samples' chromosomes (haplotypes), cluster them by center-weighted * alpha similarity, and draw in the order determined by clustering. */ { const struct vcfFile *vcff = tg->extraUiData; if (vcff->records == NULL) return; purple = hvGfxFindColorIx(hvg, 0x99, 0x00, 0xcc); boolean compositeLevel = isNameAtCompositeLevel(tg->tdb, tg->tdb->track); char *colorBy = cartUsualStringClosestToHome(cart, tg->tdb, compositeLevel, VCF_HAP_COLORBY_VAR, VCF_HAP_COLORBY_REFALT); boolean colorByRefAlt = sameString(colorBy, VCF_HAP_COLORBY_REFALT); +struct vcfRecord *rec; +for (rec = vcff->records; rec != NULL; rec = rec->next) + vcfParseGenotypes(rec); unsigned short gtHapCount = 0; int ix, centerIx = getCenterVariantIx(tg, seqStart, seqEnd, vcff->records); struct hacTree *ht = NULL; unsigned short *gtHapOrder = clusterChroms(vcff, centerIx, >HapCount, &ht); -struct vcfRecord *rec, *centerRec = NULL; +struct vcfRecord *centerRec = NULL; for (rec = vcff->records, ix=0; rec != NULL; rec = rec->next, ix++) { if (ix == centerIx) centerRec = rec; else drawOneRec(rec, gtHapOrder, gtHapCount, tg, hvg, xOff, yOff, width, FALSE, colorByRefAlt); } // Draw the center rec on top, outlined with black lines, to make sure it is very visible: drawOneRec(centerRec, gtHapOrder, gtHapCount, tg, hvg, xOff, yOff, width, TRUE, colorByRefAlt); // Draw as much of the tree as can fit in the left label area: drawTreeInLabelArea(ht, hvg, yOff, tg->height, gtHapCount, gtHapOrder); } static int vcfHapClusterTotalHeight(struct track *tg, enum trackVisibility vis) /* Return height of haplotype graph (2 * #samples * lineHeight);