0a032dff5e4b6c84a6ae73b7cbb0b01172d2cd65
angie
  Fri Aug 3 14:31:48 2012 -0700
Fix to haplotype clustering in VCF display: was still giving too much weight to missing data.
diff --git src/hg/hgTracks/vcfTrack.c src/hg/hgTracks/vcfTrack.c
index 1e7c199..1af8b37 100644
--- src/hg/hgTracks/vcfTrack.c
+++ src/hg/hgTracks/vcfTrack.c
@@ -234,63 +234,53 @@
     struct hapCluster *next;   // hacTree wants slList of items
     unsigned short *refCounts; // per-variant count of reference alleles observed
     unsigned short *unkCounts; // per-variant count of unknown (or unphased het) alleles
     unsigned short leafCount;  // number of leaves under this node (or 1 if leaf)
     unsigned short gtHapIx;    // if leaf, (genotype index << 1) + hap (0 or 1 for diploid)
 };
 
 INLINE boolean isRef(const struct hapCluster *c, int varIx)
 // Return TRUE if the leaves of cluster have at least as many reference alleles
 // as alternate alleles for variant varIx.
 {
 unsigned short altCount = c->leafCount - c->refCounts[varIx] - c->unkCounts[varIx];
 return (c->refCounts[varIx] >= altCount);
 }
 
-INLINE boolean hasUnk(const struct hapCluster *c, int varIx)
-// Return TRUE if at least one haplotype in this cluster has an unknown/unphased value at varIx.
-{
-return (c->unkCounts[varIx] > 0);
-}
-
 static double cwaDistance(const struct slList *item1, const struct slList *item2, void *extraData)
 /* Center-weighted alpha sequence distance function for hacTree clustering of haplotype seqs */
 // This is inner-loop so I am not doing defensive checks.  Caller must ensure:
 // 1. kids's sequences' lengths are both equal to helper->len
 // 2. 0 <= helper->center <= len
 // 3. 0.0 < helper->alpha <= 1.0
 {
 const struct hapCluster *kid1 = (const struct hapCluster *)item1;
 const struct hapCluster *kid2 = (const struct hapCluster *)item2;
 struct cwaExtraData *helper = extraData;
 double distance = 0;
 double weight = 1; // start at center: alpha to the 0th power
 int i;
 for (i=helper->center;  i >= 0;  i--)
     {
-    if (hasUnk(kid1, i) != hasUnk(kid2, i))
-	; // no penalty!
-    else if (isRef(kid1, i) != isRef(kid2, i))
+    if (isRef(kid1, i) != isRef(kid2, i))
 	distance += weight;
     weight *= helper->alpha;
     }
 weight = helper->alpha; // start at center+1: alpha to the 1st power
 for (i=helper->center+1;  i < helper->len;  i++)
     {
-    if (hasUnk(kid1, i) != hasUnk(kid2, i))
-	; // no penalty!
-    else if (isRef(kid1, i) != isRef(kid2, i))
+    if (isRef(kid1, i) != isRef(kid2, i))
 	distance += weight;
     weight *= helper->alpha;
     }
 return distance;
 }
 
 static struct hapCluster *lmHapCluster(struct cwaExtraData *helper)
 /* Use localMem to allocate a new cluster of the given len. */
 {
 struct hapCluster *c = lmAlloc(helper->localMem, sizeof(struct hapCluster));
 c->refCounts = lmAlloc(helper->localMem, helper->len * sizeof(unsigned short));
 c->unkCounts = lmAlloc(helper->localMem, helper->len * sizeof(unsigned short));
 return c;
 }
 
@@ -758,33 +748,32 @@
     {
     struct dyString *dy = dyStringNew(0);
     struct hapCluster *c = (struct hapCluster *)ht->itemOrCluster;
     dyStringPrintf(dy, "N=%d ", c->leafCount);
     while (dyStringLen(dy) < 7)
 	dyStringAppendC(dy, ' ');
     if (helper->startIx > 0)
 	dyStringAppend(dy, "...");
     int i, nHapsForClustering = helper->endIx - helper->startIx;
     for (i=0;  i < nHapsForClustering;  i++)
 	{
 	boolean isCenter = (helper->startIx+i == helper->centerIx);
 	char *allele = isRef(c, i) ? helper->refs[i] : helper->alts[i];
 	if (isCenter)
 	    dyStringAppendC(dy, '[');
-	if (hasUnk(c, i))
-	    dyStringAppendC(dy, '?');
-	else if (c->refCounts[i] > 0 && c->refCounts[i] < c->leafCount)
+	int altCount = c->leafCount - c->refCounts[i] - c->unkCounts[i];
+	if (c->refCounts[i] > 0 && altCount > 0)
 	    dyStringAppendC(dy, '*');
 	else if (strlen(allele) == 1)
 	    dyStringAppendC(dy, allele[0]);
 	else
 	    dyStringPrintf(dy, "(%s)", allele);
 	if (isCenter)
 	    dyStringAppendC(dy, ']');
 	}
     if (helper->endIx < helper->nRecords)
 	dyStringAppend(dy, "...");
     imgTrackAddMapItem(curImgTrack, TITLE_BUT_NO_LINK, dy->string,
 		       x1, y1, x2, y2, helper->track);
     }
 }