9ead4a885ce52795137d26e290c397f165d163da angie Wed Aug 17 16:50:12 2011 -0700 Feature #3711 (VCF haplotype sorting display): For now, limit the numberof variants on either side of the center variant to 20 instead of 50 -- I happened to land in a small region that had many variants that seemed to have no linkage whatsoever in 1000 Genomes phase 1 interim VCF, e.g. chr1:1,225,165-1,225,869 would take 42 seconds to cluster and draw! Limiting to 20 reduces that ~0.5s (also paired w/performance improvements, but it's mostly the number of distinct haplotypes that we pass into hacTreeFromItems). Also, a rounding overflow (transforming pixels to gtHapOrder index) was sporadically causing apache-only SEGVs; check that range! :) diff --git src/hg/hgTracks/vcfTrack.c src/hg/hgTracks/vcfTrack.c index 102c23a..f09ae03 100644 --- src/hg/hgTracks/vcfTrack.c +++ src/hg/hgTracks/vcfTrack.c @@ -216,31 +216,32 @@ } } } static unsigned short *clusterChroms(const struct vcfFile *vcff, int centerIx, unsigned short *retGtHapEnd, struct hacTree **retTree) /* Given a bunch of VCF records with phased genotypes, build up one haplotype string * per chromosome that is the sequence of alleles in all variants (simplified to one base * per variant). Each individual/sample will have two haplotype strings (unless haploid * like Y or male X). Independently cluster the haplotype strings using hacTree with the * center-weighted alpha functions above. Return an array of genotype+haplotype indices * in the order determined by the hacTree, and set retGtHapEnd to its length/end. */ { int len = slCount(vcff->records); // Limit the number of variants that we compare, to keep from timing out: -const int maxVariantsPerSide = 50; +// (really what we should limit is the number of distinct haplo's passed to hacTree!) +const int maxVariantsPerSide = 20; int startIx = max(0, centerIx - maxVariantsPerSide); int endIx = min(len, centerIx+1 + maxVariantsPerSide); double alpha = 0.5; struct lm *lm = lmInit(0); struct cwaExtraData helper = { centerIx-startIx, endIx-startIx, alpha, lm }; int ploidy = 2; // Assuming diploid genomes here, no XXY, tetraploid etc. int gtCount = vcff->genotypeCount; // Make an slList of hapClusters, but allocate in a big block so I can use // array indexing. struct hapCluster **hapArray = lmAlloc(lm, sizeof(struct hapCluster *) * gtCount * ploidy); int i; for (i=0; i < ploidy * gtCount; i++) { hapArray[i] = lmHapCluster(&helper); if (i > 0) @@ -378,30 +379,33 @@ { const double scale = scaleForPixels(width); int x1 = round((double)(rec->chromStart-winStart)*scale) + xOff; int x2 = round((double)(rec->chromEnd-winStart)*scale) + xOff; int w = x2-x1; if (w <= 1) { x1--; w = 3; } double hapsPerPix = (double)gtHapCount / (tg->height-1); int pixIx; for (pixIx = 0; pixIx < tg->height; pixIx++) { int gtHapOrderIxStart = round(hapsPerPix * pixIx); + // Watch out for overflow: + if (gtHapOrderIxStart >= gtHapCount) + break; int gtHapOrderIxEnd = round(hapsPerPix * (pixIx + 1)); if (gtHapOrderIxEnd == gtHapOrderIxStart) gtHapOrderIxEnd++; int unks = 0, refs = 0, alts = 0; int gtHapOrderIx; for (gtHapOrderIx = gtHapOrderIxStart; gtHapOrderIx < gtHapOrderIxEnd; gtHapOrderIx++) { int gtHapIx = gtHapOrder[gtHapOrderIx]; int hapIx = gtHapIx & 1; int gtIx = gtHapIx >>1; struct vcfGenotype *gt = &(rec->genotypes[gtIx]); if (!gt->isPhased && gt->hapIxA != gt->hapIxB) unks++; else {