c843f30cccc3610e26a26142ad0622f29402254b
angie
  Wed Jan 11 09:54:24 2012 -0800
Bug #6529 (VCF internal rep doesn't do missing data): add flags for missing-data values to vcfInfoElement, so we know the difference between "0" and ".".
This applies to both INFO column values and genotype (FORMAT) values.
Also, vcfGenotype's hapIxA and hapIxB are now signed, with negative values
indicating missing data.
User reported bug in MLQ #6462.

diff --git src/hg/lib/pgSnp.c src/hg/lib/pgSnp.c
index 994aa27..a58bc24 100644
--- src/hg/lib/pgSnp.c
+++ src/hg/lib/pgSnp.c
@@ -703,44 +703,48 @@
 }
 
 #define VCF_MAX_ALLELE_LEN 80
 
 static char *alleleCountsFromVcfRecord(struct vcfRecord *rec, int alDescCount)
 /* Build up comma-sep list of per-allele counts, if available, up to alDescCount
  * which may be less than rec->alleleCount: */
 {
 struct dyString *dy = dyStringNew(0);
 int alCounts[VCF_MAX_ALLELE_LEN];
 boolean gotTotalCount = FALSE, gotAltCounts = FALSE;
 int i;
 for (i = 0;  i < rec->infoCount;  i++)
     if (sameString(rec->infoElements[i].key, "AN"))
 	{
+	if (rec->infoElements[i].missingData[0])
+	    break;
 	gotTotalCount = TRUE;
 	// Set ref allele to total count, subtract alt counts below.
 	alCounts[0] = rec->infoElements[i].values[0].datInt;
 	break;
 	}
 for (i = 0;  i < rec->infoCount;  i++)
     if (sameString(rec->infoElements[i].key, "AC"))
 	{
 	if (rec->infoElements[i].count > 0)
 	    {
 	    gotAltCounts = TRUE;
 	    int j;
 	    for (j = 0;  j < rec->infoElements[i].count && j < alDescCount-1;  j++)
 		{
+		if (rec->infoElements[i].missingData[j])
+		    continue;
 		int ac = rec->infoElements[i].values[j].datInt;
 		alCounts[1+j] = ac;
 		if (gotTotalCount)
 		    alCounts[0] -= ac;
 		}
 	    while (j++ < alDescCount-1)
 		alCounts[1+j] = -1;
 	    if (gotTotalCount)
 		dyStringPrintf(dy, "%d", alCounts[0]);
 	    else
 		dyStringAppend(dy, "-1");
 	    for (j = 1;  j < alDescCount;  j++)
 		if (alCounts[j] >= 0)
 		    dyStringPrintf(dy, ",%d", alCounts[j]);
 		else
@@ -748,33 +752,34 @@
 	    }
 	break;
 	}
 if (gotTotalCount && !gotAltCounts)
     dyStringPrintf(dy, "%d", alCounts[0]);
 else if (!gotTotalCount && !gotAltCounts && rec->file->genotypeCount > 0)
     {
     vcfParseGenotypes(rec);
     for (i = 0;  i < alDescCount;  i++)
 	alCounts[i] = 0;
     for (i = 0;  i < rec->file->genotypeCount;  i++)
 	{
 	struct vcfGenotype *gt = &(rec->genotypes[i]);
 	if (gt == NULL)
 	    uglyf("i=%d gt=NULL wtf?\n", i);
-	alCounts[gt->hapIxA]++;
-	if (! gt->isHaploid)
-	    alCounts[gt->hapIxB]++;
+	if (gt->hapIxA >= 0)
+	    alCounts[(unsigned char)gt->hapIxA]++;
+	if (!gt->isHaploid && gt->hapIxB >= 0)
+	    alCounts[(unsigned char)gt->hapIxB]++;
 	}
     dyStringPrintf(dy, "%d", alCounts[0]);
     for (i = 1;  i < alDescCount;  i++)
 	dyStringPrintf(dy, ",%d", alCounts[i]);
     }
 return dyStringCannibalize(&dy);
 }
 
 struct pgSnp *pgSnpFromVcfRecord(struct vcfRecord *rec)
 /* Convert VCF rec to pgSnp; don't free rec->file (vcfFile) until
  * you're done with pgSnp because pgSnp points to rec->chrom. */
 {
 struct dyString *dy = dyStringNew(0);
 struct pgSnp *pgs;
 AllocVar(pgs);
@@ -794,28 +799,28 @@
 	{
 	if ((dy->stringSize + 1 + strlen(rec->alleles[i])) > VCF_MAX_ALLELE_LEN)
 	    break;
 	dyStringPrintf(dy, "/%s", rec->alleles[i]);
 	}
     if (i < rec->alleleCount)
 	alCount = i;
     }
 pgs->name = cloneStringZ(dy->string, dy->stringSize+1);
 pgs->alleleCount = alCount;
 pgs->alleleFreq = alleleCountsFromVcfRecord(rec, alCount);
 // Build up comma-sep list... supposed to be per-allele quality scores but I think
 // the VCF spec only gives us one BQ... for the reference position?  should ask.
 dyStringClear(dy);
 for (i = 0;  i < rec->infoCount;  i++)
-    if (sameString(rec->infoElements[i].key, "BQ"))
+    if (sameString(rec->infoElements[i].key, "BQ") && !rec->infoElements[i].missingData[0])
 	{
 	float qual = rec->infoElements[i].values[0].datFloat;
 	dyStringPrintf(dy, "%.1f", qual);
 	int j;
 	for (j = 1;  j < rec->alleleCount;  j++)
 	    dyStringPrintf(dy, ",%.1f", qual);
 	break;
 	}
 pgs->alleleScores = dyStringCannibalize(&dy);
 return pgs;
 }