0a032dff5e4b6c84a6ae73b7cbb0b01172d2cd65 angie Fri Aug 3 14:31:48 2012 -0700 Fix to haplotype clustering in VCF display: was still giving too much weight to missing data. diff --git src/hg/hgTracks/vcfTrack.c src/hg/hgTracks/vcfTrack.c index 1e7c199..1af8b37 100644 --- src/hg/hgTracks/vcfTrack.c +++ src/hg/hgTracks/vcfTrack.c @@ -234,63 +234,53 @@ struct hapCluster *next; // hacTree wants slList of items unsigned short *refCounts; // per-variant count of reference alleles observed unsigned short *unkCounts; // per-variant count of unknown (or unphased het) alleles unsigned short leafCount; // number of leaves under this node (or 1 if leaf) unsigned short gtHapIx; // if leaf, (genotype index << 1) + hap (0 or 1 for diploid) }; INLINE boolean isRef(const struct hapCluster *c, int varIx) // Return TRUE if the leaves of cluster have at least as many reference alleles // as alternate alleles for variant varIx. { unsigned short altCount = c->leafCount - c->refCounts[varIx] - c->unkCounts[varIx]; return (c->refCounts[varIx] >= altCount); } -INLINE boolean hasUnk(const struct hapCluster *c, int varIx) -// Return TRUE if at least one haplotype in this cluster has an unknown/unphased value at varIx. -{ -return (c->unkCounts[varIx] > 0); -} - static double cwaDistance(const struct slList *item1, const struct slList *item2, void *extraData) /* Center-weighted alpha sequence distance function for hacTree clustering of haplotype seqs */ // This is inner-loop so I am not doing defensive checks. Caller must ensure: // 1. kids's sequences' lengths are both equal to helper->len // 2. 0 <= helper->center <= len // 3. 0.0 < helper->alpha <= 1.0 { const struct hapCluster *kid1 = (const struct hapCluster *)item1; const struct hapCluster *kid2 = (const struct hapCluster *)item2; struct cwaExtraData *helper = extraData; double distance = 0; double weight = 1; // start at center: alpha to the 0th power int i; for (i=helper->center; i >= 0; i--) { - if (hasUnk(kid1, i) != hasUnk(kid2, i)) - ; // no penalty! - else if (isRef(kid1, i) != isRef(kid2, i)) + if (isRef(kid1, i) != isRef(kid2, i)) distance += weight; weight *= helper->alpha; } weight = helper->alpha; // start at center+1: alpha to the 1st power for (i=helper->center+1; i < helper->len; i++) { - if (hasUnk(kid1, i) != hasUnk(kid2, i)) - ; // no penalty! - else if (isRef(kid1, i) != isRef(kid2, i)) + if (isRef(kid1, i) != isRef(kid2, i)) distance += weight; weight *= helper->alpha; } return distance; } static struct hapCluster *lmHapCluster(struct cwaExtraData *helper) /* Use localMem to allocate a new cluster of the given len. */ { struct hapCluster *c = lmAlloc(helper->localMem, sizeof(struct hapCluster)); c->refCounts = lmAlloc(helper->localMem, helper->len * sizeof(unsigned short)); c->unkCounts = lmAlloc(helper->localMem, helper->len * sizeof(unsigned short)); return c; } @@ -758,33 +748,32 @@ { struct dyString *dy = dyStringNew(0); struct hapCluster *c = (struct hapCluster *)ht->itemOrCluster; dyStringPrintf(dy, "N=%d ", c->leafCount); while (dyStringLen(dy) < 7) dyStringAppendC(dy, ' '); if (helper->startIx > 0) dyStringAppend(dy, "..."); int i, nHapsForClustering = helper->endIx - helper->startIx; for (i=0; i < nHapsForClustering; i++) { boolean isCenter = (helper->startIx+i == helper->centerIx); char *allele = isRef(c, i) ? helper->refs[i] : helper->alts[i]; if (isCenter) dyStringAppendC(dy, '['); - if (hasUnk(c, i)) - dyStringAppendC(dy, '?'); - else if (c->refCounts[i] > 0 && c->refCounts[i] < c->leafCount) + int altCount = c->leafCount - c->refCounts[i] - c->unkCounts[i]; + if (c->refCounts[i] > 0 && altCount > 0) dyStringAppendC(dy, '*'); else if (strlen(allele) == 1) dyStringAppendC(dy, allele[0]); else dyStringPrintf(dy, "(%s)", allele); if (isCenter) dyStringAppendC(dy, ']'); } if (helper->endIx < helper->nRecords) dyStringAppend(dy, "..."); imgTrackAddMapItem(curImgTrack, TITLE_BUT_NO_LINK, dy->string, x1, y1, x2, y2, helper->track); } }