519e0946826199d1d9792fa8df5972843fce021c
angie
  Tue Aug 9 14:39:38 2011 -0700
Feature #2821 (VCF parser): improved representation of alleles: parse ref and comma-sep'd alt allele string into count and array
inside record, so callers don't all have to parse the comma-sep'd
alternate allele string.

diff --git src/lib/vcf.c src/lib/vcf.c
index b93295f..a5432fd 100644
--- src/lib/vcf.c
+++ src/lib/vcf.c
@@ -414,30 +414,46 @@
 if (vcff->majorVersion == 0)
     vcfFileErr(vcff, "missing ##fileformat= header line?  Assuming 4.1.");
 if ((vcff->majorVersion != 4 || (vcff->minorVersion != 0 && vcff->minorVersion != 1)) &&
     (vcff->majorVersion != 3))
     vcfFileErr(vcff, "VCFv%d.%d not supported -- only v3.*, v4.0 or v4.1",
 	       vcff->majorVersion, vcff->minorVersion);
 // Next, one header line beginning with single "#" that names the columns:
 if (line == NULL)
     // EOF after metadata
     return vcff;
 parseColumnHeaderRow(vcff, line);
 return vcff;
 }
 
 
+#define VCF_MAX_INFO 512
+
+static void parseRefAndAlt(struct vcfFile *vcff, struct vcfRecord *record, char *ref, char *alt)
+/* Make an array of alleles, ref first, from the REF and comma-sep'd ALT columns.
+ * Note: this trashes the alt argument, since this is expected to be its last use. */
+{
+char *altAlleles[VCF_MAX_INFO];  // scratch pointers to the comma-separated words of alt
+int altCount = chopCommas(alt, altAlleles);  // chops alt (see note above); result used as word count
+record->alleleCount = 1 + altCount;  // one slot for ref plus one per alt allele
+record->alleles = vcfFileAlloc(vcff, record->alleleCount * sizeof(record->alleles[0]));
+record->alleles[0] = vcfFilePooledStr(vcff, ref);  // ref allele always occupies index 0
+int i;
+for (i = 0;  i < altCount;  i++)
+    record->alleles[1+i] = vcfFilePooledStr(vcff, altAlleles[i]);  // alts follow ref, in ALT column order
+}
+
 static void parseFilterColumn(struct vcfFile *vcff, struct vcfRecord *record, char *filterStr)
 /* Transform ;-separated filter codes into count + string array. */
 {
 // We don't want to modify something allocated with vcfFilePooledStr because that uses
 // hash element names for storage!  So don't make a vcfFilePooledStr copy of filterStr and
 // chop that; instead, chop a temp string and pool the words separately.
 static struct dyString *tmp = NULL;
 if (tmp == NULL)
     tmp = dyStringNew(0);
 dyStringClear(tmp);
 dyStringAppend(tmp, filterStr);
 record->filterCount = countChars(filterStr, ';') + 1;
 record->filters = vcfFileAlloc(vcff, record->filterCount * sizeof(char **));
 (void)chopByChar(tmp->string, ';', record->filters, record->filterCount);
 int i;
@@ -466,34 +482,32 @@
 
 static enum vcfInfoType typeForInfoKey(struct vcfFile *vcff, const char *key)
 /* Look up the type of INFO component key, in the definitions from the header,
  * and failing that, from the keys reserved in the spec. */
 {
 struct vcfInfoDef *def = vcfInfoDefForKey(vcff, key);
 if (def == NULL)
     {
     vcfFileErr(vcff, "There is no INFO header defining \"%s\"", key);
     // default to string so we can display value as-is:
     return vcfInfoString;
     }
 return def->type;
 }
 
-#define VCF_MAX_INFO 512
-
-int parseInfoValue(struct vcfRecord *record, char *infoKey, enum vcfInfoType type, char *valStr,
-		   union vcfDatum **pData)
+static int parseInfoValue(struct vcfRecord *record, char *infoKey, enum vcfInfoType type,
+			  char *valStr, union vcfDatum **pData)
 /* Parse a comma-separated list of values into array of union vcfInfoDatum and return count. */
 {
 char *valWords[VCF_MAX_INFO];
 int count = chopCommas(valStr, valWords);
 struct vcfFile *vcff = record->file;
 union vcfDatum *data = vcfFileAlloc(vcff, count * sizeof(union vcfDatum));
 int j;
 for (j = 0;  j < count;  j++)
     switch (type)
 	{
 	case vcfInfoInteger:
 	    data[j].datInt = atoi(valWords[j]);
 	    break;
 	case vcfInfoFloat:
 	    data[j].datFloat = atof(valWords[j]);
@@ -576,32 +590,31 @@
 if (vcff->genotypeCount > 0)
     expected = 9 + vcff->genotypeCount;
 char *words[VCF_MAX_COLUMNS];
 int wordCount;
 while ((wordCount = lineFileChop(vcff->lf, words)) > 0)
     {
     lineFileExpectWords(vcff->lf, expected, wordCount);
     struct vcfRecord *record;
     AllocVar(record);
     record->file = vcff;
     record->chrom = vcfFilePooledStr(vcff, words[0]);
     record->chromStart = lineFileNeedNum(vcff->lf, words, 1) - 1;
     // chromEnd may be modified by parseInfoColumn, if INFO column includes END.
     record->chromEnd = record->chromStart + 1;
     record->name = vcfFilePooledStr(vcff, words[2]);
-    record->ref = vcfFilePooledStr(vcff, words[3]);
-    record->alt = vcfFilePooledStr(vcff, words[4]);
+    parseRefAndAlt(vcff, record, words[3], words[4]);
     record->qual = vcfFilePooledStr(vcff, words[5]);
     parseFilterColumn(vcff, record, words[6]);
     parseInfoColumn(vcff, record, words[7]);
     if (vcff->genotypeCount > 0)
 	{
 	record->format = vcfFilePooledStr(vcff, words[8]);
 	record->genotypeUnparsedStrings = vcfFileAlloc(vcff,
 						       vcff->genotypeCount * sizeof(char *));
 	int i;
 	// Don't bother actually parsing all these until & unless we need the info:
 	for (i = 0;  i < vcff->genotypeCount;  i++)
 	    record->genotypeUnparsedStrings[i] = vcfFileCloneStr(vcff, words[9+i]);
 	}
     slAddHead(&(vcff->records), record);
     }