c843f30cccc3610e26a26142ad0622f29402254b
angie
  Wed Jan 11 09:54:24 2012 -0800
Bug #6529 (VCF internal rep doesn't do missing data): add flags for missingdata values to vcfInfoElement, so we know the difference between "0" and ".".
This applies to both INFO column values and genotype (FORMAT) values.
Also, vcfGenotype's hapIxA and hapIxB are now signed, with negative values
indicating missing data.
User reported bug in MLQ #6462.

diff --git src/hg/hgTracks/vcfTrack.c src/hg/hgTracks/vcfTrack.c
index 7851f89..8b8bdbe 100644
--- src/hg/hgTracks/vcfTrack.c
+++ src/hg/hgTracks/vcfTrack.c
@@ -92,50 +92,55 @@
  * tiny minor frequencies). */
 {
 struct vcfFile *vcff = record->file;
 boolean gotInfo = FALSE;
 double refFreq = 1.0;
 double maxAltFreq = 0.0;
 int i;
 const struct vcfInfoElement *afEl = vcfRecordFindInfo(record, "AF");
 const struct vcfInfoDef *afDef = vcfInfoDefForKey(vcff, "AF");
 if (afEl != NULL && afDef != NULL && afDef->type == vcfInfoFloat)
     {
     // If INFO includes alt allele freqs, use them directly.
     gotInfo = TRUE;
     for (i = 0;  i < afEl->count;  i++)
 	{
+	if (afEl->missingData[i])
+	    continue;
 	double altFreq = afEl->values[i].datFloat;
 	refFreq -= altFreq;
 	if (altFreq > maxAltFreq)
 	    maxAltFreq = altFreq;
 	}
     }
 else
     {
     // Calculate alternate allele freqs from AC and AN:
     const struct vcfInfoElement *acEl = vcfRecordFindInfo(record, "AC");
     const struct vcfInfoDef *acDef = vcfInfoDefForKey(vcff, "AC");
     const struct vcfInfoElement *anEl = vcfRecordFindInfo(record, "AN");
     const struct vcfInfoDef *anDef = vcfInfoDefForKey(vcff, "AN");
     if (acEl != NULL && acDef != NULL && acDef->type == vcfInfoInteger &&
-	anEl != NULL && anDef != NULL && anDef->type == vcfInfoInteger && anEl->count == 1)
+	anEl != NULL && anDef != NULL && anDef->type == vcfInfoInteger && anEl->count == 1 &&
+	anEl->missingData[0] == FALSE)
 	{
 	gotInfo = TRUE;
 	int totalCount = anEl->values[0].datFloat;
 	for (i = 0;  i < acEl->count;  i++)
 	    {
+	    if (acEl->missingData[i])
+		continue;
 	    int altCount = acEl->values[i].datFloat;
 	    double altFreq = (double)altCount / totalCount;
 	    refFreq -= altFreq;
 	    if (altFreq < maxAltFreq)
 		maxAltFreq = altFreq;
 	    }
 	}
     }
 if (gotInfo)
     {
     double majorAlFreq = max(refFreq, maxAltFreq);
     if (majorAlFreq > (1.0 - minFreq))
 	return TRUE;
     }
 return FALSE;
@@ -218,31 +223,31 @@
     double alpha;  // weighting factor for distance from center
     struct lm *localMem;
     };
 
 // This is the representation of a cluster of up to 65,535 haplotypes of equal length,
 // where each variant's alleles are specified as 0 (reference) or 1 (alternate)
 // [or possibly 2 for second alternate, but those are rare so I'll ignore them].
 // When an individual is heterozygous and unphased for some variant, we need to
 // account for missing data.
 struct hapCluster
 {
     struct hapCluster *next;   // hacTree wants slList of items
     unsigned short *refCounts; // per-variant count of reference alleles observed
     unsigned short *unkCounts; // per-variant count of unknown (or unphased het) alleles
     unsigned short leafCount;  // number of leaves under this node (or 1 if leaf)
-    unsigned short gtHapIx;    // if leaf, (genotype index << 1) + hapIx (0 or 1 for diploid)
+    unsigned short gtHapIx;    // if leaf, (genotype index << 1) + hap (0 or 1 for diploid)
 };
 
 INLINE boolean isRef(const struct hapCluster *c, int varIx)
 // Return TRUE if the leaves of cluster have at least as many reference alleles
 // as alternate alleles for variant varIx.
 {
 unsigned short altCount = c->leafCount - c->refCounts[varIx] - c->unkCounts[varIx];
 return (c->refCounts[varIx] >= altCount);
 }
 
 INLINE boolean hasUnk(const struct hapCluster *c, int varIx)
 // Return TRUE if at least one haplotype in this cluster has an unknown/unphased value at varIx.
 {
 return (c->unkCounts[varIx] > 0);
 }
@@ -392,45 +397,49 @@
     {
     if (varIx < startIx)
 	continue;
     int countIx = varIx - startIx;
     int gtIx;
     for (gtIx=0;  gtIx < gtCount;  gtIx++)
 	{
 	struct vcfGenotype *gt = &(rec->genotypes[gtIx]);
 	struct hapCluster *c1 = hapArray[gtIx];
 	struct hapCluster *c2 = hapArray[gtCount + gtIx]; // hardwired ploidy=2
 	if (gt->isPhased || gt->isHaploid || (gt->hapIxA == gt->hapIxB))
 	    {
 	    // first chromosome:
 	    c1->leafCount = 1;
 	    c1->gtHapIx = gtIx << 1;
-	    if (gt->hapIxA == 0)
+	    if (gt->hapIxA < 0)
+		c1->unkCounts[countIx] = 1;
+	    else if (gt->hapIxA == 0)
 		c1->refCounts[countIx] = 1;
 	    if (gt->isHaploid)
 		haveHaploid = TRUE;
 	    else
 		{
 		c2->leafCount = 1;
 		c2->gtHapIx = (gtIx << 1) | 1;
-		if (gt->hapIxB == 0)
+		if (gt->hapIxB < 0)
+		    c2->unkCounts[countIx] = 1;
+		else if (gt->hapIxB == 0)
 		    c2->refCounts[countIx] = 1;
 		}
 	    }
 	else
 	    {
-	    // Unphased heterozygote, don't use haplotype info for clustering
+	    // Missing data or unphased heterozygote, don't use haplotype info for clustering
 	    c1->leafCount = c2->leafCount = 1;
 	    c1->gtHapIx = gtIx << 1;
 	    c2->gtHapIx = (gtIx << 1) | 1;
 	    c1->unkCounts[countIx] = c2->unkCounts[countIx] = 1;
 	    }
 	}
     if (haveHaploid)
 	{
 	// Some array items will have an empty cluster for missing hap2 --
 	// trim those from the linked list.
 	struct hapCluster *c = hapArray[0];
 	while (c != NULL && c->next != NULL)
 	    {
 	    if (c->next->leafCount == 0)
 		c->next = c->next->next;
@@ -615,31 +624,33 @@
     if (gtHapOrderIxEnd == gtHapOrderIxStart)
 	gtHapOrderIxEnd++;
     int unks = 0, refs = 0, alts = 0;
     int gtHapOrderIx;
     for (gtHapOrderIx = gtHapOrderIxStart;  gtHapOrderIx < gtHapOrderIxEnd;  gtHapOrderIx++)
 	{
 	int gtHapIx = gtHapOrder[gtHapOrderIx];
 	int hapIx = gtHapIx & 1;
 	int gtIx = gtHapIx >>1;
 	struct vcfGenotype *gt = &(rec->genotypes[gtIx]);
 	if (!gt->isPhased && gt->hapIxA != gt->hapIxB)
 	    unks++;
 	else
 	    {
 	    int alIx = hapIx ? gt->hapIxB : gt->hapIxA;
-	    if (alIx)
+	    if (alIx < 0)
+		unks++;
+	    else if (alIx > 0)
 		alts++;
 	    else
 		refs++;
 	    }
 	}
     int y = yOff + extraPixel + pixIx;
     Color col;
     if (colorMode == baseMode)
 	col = colorByBase(refs, alts, unks, rec->alleles[0], rec->alleles[1]);
     else if (colorMode == refAltMode)
 	col = colorByRefAlt(refs, alts, unks);
     else
 	col = colorByAltOnly(refs, alts, unks);
     if (col != MG_WHITE)
 	hvGfxLine(hvg, x1, y, x2, y, col);