c843f30cccc3610e26a26142ad0622f29402254b
angie
  Wed Jan 11 09:54:24 2012 -0800
Bug #6529 (VCF internal rep doesn't do missing data): add missing-data flags to vcfInfoElement, so we know the difference between "0" and ".".
This applies to both INFO column values and genotype (FORMAT) values.
Also, vcfGenotype's hapIxA and hapIxB are now signed, with negative values
indicating missing data.
User-reported bug in MLQ #6462.

diff --git src/lib/vcf.c src/lib/vcf.c
index 7b22ff1..bece59d 100644
--- src/lib/vcf.c
+++ src/lib/vcf.c
@@ -523,65 +523,71 @@
 static enum vcfInfoType typeForInfoKey(struct vcfFile *vcff, const char *key)
 /* Look up the type of INFO component key, in the definitions from the header,
  * and failing that, from the keys reserved in the spec. */
 {
 struct vcfInfoDef *def = vcfInfoDefForKey(vcff, key);
 if (def == NULL)
     {
     vcfFileErr(vcff, "There is no INFO header defining \"%s\"", key);
     // default to string so we can display value as-is:
     return vcfInfoString;
     }
 return def->type;
 }
 
 static int parseInfoValue(struct vcfRecord *record, char *infoKey, enum vcfInfoType type,
-			  char *valStr, union vcfDatum **pData)
+			  char *valStr, union vcfDatum **pData, bool **pMissingData)
 /* Parse a comma-separated list of values into array of union vcfInfoDatum and return count. */
 {
 char *valWords[VCF_MAX_INFO];
 int count = chopCommas(valStr, valWords);
 struct vcfFile *vcff = record->file;
 union vcfDatum *data = vcfFileAlloc(vcff, count * sizeof(union vcfDatum));
+bool *missingData = vcfFileAlloc(vcff, count * sizeof(*missingData));
 int j;
 for (j = 0;  j < count;  j++)
+    {
+    if (type != vcfInfoString && type != vcfInfoCharacter && sameString(valWords[j], "."))
+	missingData[j] = TRUE;
     switch (type)
 	{
 	case vcfInfoInteger:
 	    data[j].datInt = atoi(valWords[j]);
 	    break;
 	case vcfInfoFloat:
 	    data[j].datFloat = atof(valWords[j]);
 	    break;
 	case vcfInfoFlag:
 	    // Flag key might have a value in older VCFs e.g. 3.2's DB=0, DB=1
 	    data[j].datString = vcfFilePooledStr(vcff, valWords[j]);
 	    break;
 	case vcfInfoCharacter:
 	    data[j].datChar = valWords[j][0];
 	    break;
 	case vcfInfoString:
 	    data[j].datString = vcfFilePooledStr(vcff, valWords[j]);
 	    break;
 	default:
 	    errAbort("invalid vcfInfoType (uninitialized?) %d", type);
 	    break;
 	}
+    }
 // If END is given, use it as chromEnd:
 if (sameString(infoKey, vcfInfoEnd))
     record->chromEnd = data[0].datInt;
 *pData = data;
+*pMissingData = missingData;
 return count;
 }
 
 static void parseInfoColumn(struct vcfFile *vcff, struct vcfRecord *record, char *string)
 /* Translate string into array of vcfInfoElement. */
 {
 if (sameString(string, "."))
     {
     record->infoCount = 0;
     return;
     }
 char *elWords[VCF_MAX_INFO];
 record->infoCount = chopByChar(string, ';', elWords, ArraySize(elWords));
 if (record->infoCount >= VCF_MAX_INFO)
     vcfFileErr(vcff, "INFO column contains at least %d elements; "
@@ -602,31 +608,31 @@
 	    {
 	    vcfFileErr(vcff, "Missing = after key in INFO element: \"%s\" (type=%d)",
 		       elStr, type);
 	    if (type == vcfInfoString)
 		{
 		el->values = vcfFileAlloc(vcff, sizeof(union vcfDatum));
 		el->values[0].datString = emptyString;
 		}
 	    }
 	continue;
 	}
     *eq = '\0';
     el->key = vcfFilePooledStr(vcff, elStr);
     enum vcfInfoType type = typeForInfoKey(vcff, el->key);
     char *valStr = eq+1;
-    el->count = parseInfoValue(record, el->key, type, valStr, &(el->values));
+    el->count = parseInfoValue(record, el->key, type, valStr, &(el->values), &(el->missingData));
     if (el->count >= VCF_MAX_INFO)
 	vcfFileErr(vcff, "A single element of the INFO column has at least %d values; "
 	       "VCF_MAX_INFO may need to be increased in vcf.c!", VCF_MAX_INFO);
     }
 }
 
 static void vcfParseData(struct vcfFile *vcff, int maxRecords)
 /* Given a vcfFile into which the header has been parsed, and whose lineFile is positioned
  * at the beginning of a data row, parse and store all data rows from lineFile. */
 {
 if (vcff == NULL)
     return;
 int recCount = 0, expected = 8;
 if (vcff->genotypeCount > 0)
     expected = 9 + vcff->genotypeCount;
@@ -846,40 +852,45 @@
     gt->id = vcff->genotypeIds[i];
     gt->infoCount = gtWordCount;
     gt->infoElements = vcfFileAlloc(vcff, gtWordCount * sizeof(struct vcfInfoElement));
     int j;
     for (j = 0;  j < gtWordCount;  j++)
 	{
 	// Special parsing of genotype:
 	if (sameString(formatWords[j], vcfGtGenotype))
 	    {
 	    char *genotype = gtWords[j];
 	    char *sep = strchr(genotype, '|');
 	    if (sep != NULL)
 		gt->isPhased = TRUE;
 	    else
 		sep = strchr(genotype, '/');
+	    if (genotype[0] == '.')
+		gt->hapIxA = -1;
+	    else
 	    gt->hapIxA = atoi(genotype);
 	    if (sep == NULL)
 		gt->isHaploid = TRUE;
+	    else if (sep[1] == '.')
+		gt->hapIxB = -1;
 	    else
 		gt->hapIxB = atoi(sep+1);
 	    }
 	struct vcfInfoElement *el = &(gt->infoElements[j]);
 	el->key = formatWords[j];
 	el->count = parseInfoValue(record, formatWords[j], formatTypes[j], gtWords[j],
-				   &(el->values));
+				   &(el->values), &(el->missingData));
 	if (el->count >= VCF_MAX_INFO)
 	    vcfFileErr(vcff, "A single element of the genotype column for \"%s\" "
 		       "has at least %d values; "
 		       "VCF_MAX_INFO may need to be increased in vcf.c!",
 		       gt->id, VCF_MAX_INFO);
 	}
     }
 record->genotypeUnparsedStrings = NULL;
 }
 
 const struct vcfGenotype *vcfRecordFindGenotype(struct vcfRecord *record, char *sampleId)
 /* Find the genotype and associated info for the individual, or return NULL.
  * This calls vcfParseGenotypes if it has not already been called. */
 {
 struct vcfFile *vcff = record->file;