1927e2636f3cd7e36bd28be1ce5aa6b2db274257
angie
  Tue Nov 29 15:28:19 2011 -0800
MLQ #6136: adding support for VCF 3.3 header line formats.
diff --git src/lib/vcf.c src/lib/vcf.c
index bc29dad..e53a6e9 100644
--- src/lib/vcf.c
+++ src/lib/vcf.c
@@ -239,30 +239,41 @@
 return vcfInfoNoType;
 }
 
 // Regular expressions to check format and extract information from header lines:
 static const char *fileformatRegex = "^##(file)?format=VCFv([0-9]+)(\\.([0-9]+))?$";
 static const char *infoOrFormatRegex =
     "^##(INFO|FORMAT)="
     "<ID=([A-Za-z0-9_:-]+),"
     "Number=(\\.|A|G|[0-9-]+),"
     "Type=([A-Za-z]+),"
     "Description=\"?(.*)\"?>$";
 static const char *filterOrAltRegex =
     "^##(FILTER|ALT)="
     "<ID=([^,]+),"
     "(Description|Type)=\"?(.*)\"?>$";
+// VCF 3.3 header lines list bare comma-separated values (no <ID=...> tags), so use separate regexes:
+static const char *infoOrFormatRegex3_3 =
+    "^##(INFO|FORMAT)="
+    "([A-Za-z0-9_:-]+),"
+    "(\\.|A|G|[0-9-]+),"
+    "([A-Za-z]+),"
+    "\"?(.*)\"?$";
+static const char *filterRegex3_3 =
+    "^##(FILTER)="
+    "([^,]+),"
+    "()\"?(.*)\"?$";
 
 INLINE void nonAsciiWorkaround(char *line)
 // Workaround for annoying 3-byte quote marks included in some 1000 Genomes files:
 {
 (void)strSwapStrs(line, strlen(line)+1, "\342\200\234", "\"");
 (void)strSwapStrs(line, strlen(line)+1, "\342\200\235", "\"");
 }
 
 static void parseMetadataLine(struct vcfFile *vcff, char *line)
 /* Parse a VCF header line beginning with "##" that defines a metadata. */
 {
 char *ptr = line;
 if (ptr == NULL && !startsWith(ptr, "##"))
     errAbort("Bad line passed to parseMetadataLine");
 ptr += 2;
@@ -280,68 +291,76 @@
 	{
 	// substrs[2] is major version #, substrs[3] is set only if there is a minor version,
 	// and substrs[4] is the minor version #.
 	vcff->majorVersion = atoi(line + substrs[2].rm_so);
 	if (substrs[3].rm_so != -1)
 	    vcff->minorVersion = atoi(line + substrs[4].rm_so);
 	}
     else
 	vcfFileErr(vcff, "##fileformat line does not match expected pattern /%s/: \"%s\"",
 		   fileformatRegex, line);
     }
 else if (startsWith("##INFO=", line) || startsWith("##FORMAT=", line))
     {
     boolean isInfo = startsWith("##INFO=", line);
     nonAsciiWorkaround(line);
-    if (regexMatchSubstr(line, infoOrFormatRegex, substrs, ArraySize(substrs)))
+    if (regexMatchSubstr(line, infoOrFormatRegex, substrs, ArraySize(substrs)) ||
+	regexMatchSubstr(line, infoOrFormatRegex3_3, substrs, ArraySize(substrs)))
 	// substrs[2] is ID/key, substrs[3] is Number, [4] is Type and [5] is Description.
 	{
 	struct vcfInfoDef *def = vcfFileAlloc(vcff, sizeof(struct vcfInfoDef));
 	def->key = vcfFileCloneSubstr(vcff, line, substrs[2]);
 	char *number = vcfFileCloneSubstr(vcff, line, substrs[3]);
 	if (sameString(number, ".") || sameString(number, "A") || sameString(number, "G"))
 	    // A is #alts which varies line-to-line; "G" is #genotypes which we haven't
 	    // yet seen.  Why is there a G here -- shouldn't such attributes go in the
 	    // genotype columns?
 	    def->fieldCount = -1;
 	else
 	    def->fieldCount = atoi(number);
 	def->type = vcfInfoTypeFromSubstr(vcff, line, substrs[4]);
 	// greedy regex pulls in end quote, trim if found:
 	if (line[substrs[5].rm_eo-1] == '"')
 	    line[substrs[5].rm_eo-1] = '\0';
 	def->description = vcfFileCloneSubstr(vcff, line, substrs[5]);
 	slAddHead((isInfo ? &(vcff->infoDefs) : &(vcff->gtFormatDefs)), def);
 	}
     else
-	vcfFileErr(vcff, "##%s line does not match expected pattern /%s/: \"%s\"",
-		   (isInfo ? "INFO" : "FORMAT"), infoOrFormatRegex, line);
+	vcfFileErr(vcff, "##%s line does not match expected pattern /%s/ or /%s/: \"%s\"",
+		   (isInfo ? "INFO" : "FORMAT"), infoOrFormatRegex, infoOrFormatRegex3_3, line);
     }
 else if (startsWith("##FILTER=", line) || startsWith("##ALT=", line))
     {
     boolean isFilter = startsWith("##FILTER", line);
-    if (regexMatchSubstr(line, filterOrAltRegex, substrs, ArraySize(substrs)))
+    if (regexMatchSubstr(line, filterOrAltRegex, substrs, ArraySize(substrs)) ||
+	regexMatchSubstr(line, filterRegex3_3, substrs, ArraySize(substrs)))
 	{
 	// substrs[2] is ID/key, substrs[4] is Description.
 	struct vcfInfoDef *def = vcfFileAlloc(vcff, sizeof(struct vcfInfoDef));
 	def->key = vcfFileCloneSubstr(vcff, line, substrs[2]);
 	def->description = vcfFileCloneSubstr(vcff, line, substrs[4]);
 	slAddHead((isFilter ? &(vcff->filterDefs) : &(vcff->altDefs)), def);
 	}
     else
-	vcfFileErr(vcff, "##%s line does not match expected pattern /%s/: \"%s\"",
-		   (isFilter ? "FILTER" : "ALT"), filterOrAltRegex, line);
+	{
+	if (isFilter)
+	    vcfFileErr(vcff, "##FILTER line does not match expected pattern /%s/ or /%s/: \"%s\"",
+		       filterOrAltRegex, filterRegex3_3, line);
+	else
+	    vcfFileErr(vcff, "##ALT line does not match expected pattern /%s/: \"%s\"",
+		       filterOrAltRegex, line);
+	}
     }
 }
 
 static void expectColumnName2(struct vcfFile *vcff, char *exp1, char *exp2, char *words[], int ix)
 /* Every file must include a header naming the columns, though most column names are
  * fixed; make sure the names of fixed columns are as expected. */
 {
 if (! sameString(exp1, words[ix]))
     {
     if (exp2 == NULL)
 	vcfFileErr(vcff, "Expected column %d's name in header to be \"%s\" but got \"%s\"",
 		   ix+1, exp1, words[ix]);
     else if (! sameString(exp2, words[ix]))
 	vcfFileErr(vcff, "Expected column %d's name in header to be \"%s\"  or \"%s\" "
 		   "but got \"%s\"", ix+1, exp1, exp2, words[ix]);