6b905d759504a15d9c580b0c20d8eebb31f10dd6
angie
  Wed Oct 5 13:44:43 2011 -0700
Loosened regex and handled new non-numeric values of ##FORMAT's "Number" introduced in VCF 4.1.

diff --git src/lib/vcf.c src/lib/vcf.c
index 26ad918..b5202e1 100644
--- src/lib/vcf.c
+++ src/lib/vcf.c
@@ -232,37 +232,37 @@
 if (sameString("Flag", typeWord))
     return vcfInfoFlag;
 if (sameString("Character", typeWord))
     return vcfInfoCharacter;
 if (sameString("String", typeWord))
     return vcfInfoString;
 vcfFileErr(vcff, "Unrecognized type word \"%s\" in metadata line \"%s\"", typeWord, line);
 return vcfInfoNoType;
 }
 
 // Regular expressions to check format and extract information from header lines:
 static const char *fileformatRegex = "^##(file)?format=VCFv([0-9]+)(\\.([0-9]+))?$";
 static const char *infoOrFormatRegex =
     "^##(INFO|FORMAT)="
     "<ID=([A-Za-z0-9_:-]+),"
-    "Number=(\\.|[0-9-]+),"
+    "Number=(\\.|A|G|[0-9-]+),"
     "Type=([A-Za-z]+),"
-    "Description=\"?([^\"]+)\"?>$";
+    "Description=\"?(.*)\"?>$";
 static const char *filterOrAltRegex =
     "^##(FILTER|ALT)="
     "<ID=([^,]+),"
-    "(Description|Type)=\"([^\"]+)\">$";
+    "(Description|Type)=\"?(.*)\"?>$";
 
 INLINE void nonAsciiWorkaround(char *line)
 // Workaround for annoying 3-byte quote marks included in some 1000 Genomes files:
 {
 (void)strSwapStrs(line, strlen(line)+1, "\342\200\234", "\"");
 (void)strSwapStrs(line, strlen(line)+1, "\342\200\235", "\"");
 }
 
 static void parseMetadataLine(struct vcfFile *vcff, char *line)
 /* Parse a VCF header line beginning with "##" that defines a metadata. */
 {
 char *ptr = line;
 if (ptr == NULL && !startsWith(ptr, "##"))
     errAbort("Bad line passed to parseMetadataLine");
 ptr += 2;
@@ -285,34 +285,38 @@
 	    vcff->minorVersion = atoi(line + substrs[4].rm_so);
 	}
     else
 	vcfFileErr(vcff, "##fileformat line does not match expected pattern /%s/: \"%s\"",
 		   fileformatRegex, line);
     }
 else if (startsWith("##INFO=", line) || startsWith("##FORMAT=", line))
     {
     boolean isInfo = startsWith("##INFO=", line);
     nonAsciiWorkaround(line);
     if (regexMatchSubstr(line, infoOrFormatRegex, substrs, ArraySize(substrs)))
 	// substrs[2] is ID/key, substrs[3] is Number, [4] is Type and [5] is Description.
 	{
 	struct vcfInfoDef *def = vcfFileAlloc(vcff, sizeof(struct vcfInfoDef));
 	def->key = vcfFileCloneSubstr(vcff, line, substrs[2]);
-	if (sameString(def->key, "."))
+	char *number = vcfFileCloneSubstr(vcff, line, substrs[3]);
+	if (sameString(number, ".") || sameString(number, "A") || sameString(number, "G"))
+	    // A is #alts which varies line-to-line; "G" is #genotypes which we haven't
+	    // yet seen.  Why is there a G here -- shouldn't such attributes go in the
+	    // genotype columns?
 	    def->fieldCount = -1;
 	else
-	    def->fieldCount = atoi(line + substrs[3].rm_so);
+	    def->fieldCount = atoi(number);
 	def->type = vcfInfoTypeFromSubstr(vcff, line, substrs[4]);
 	def->description = vcfFileCloneSubstr(vcff, line, substrs[5]);
 	slAddHead((isInfo ? &(vcff->infoDefs) : &(vcff->gtFormatDefs)), def);
 	}
     else
 	vcfFileErr(vcff, "##%s line does not match expected pattern /%s/: \"%s\"",
 		   (isInfo ? "INFO" : "FORMAT"), infoOrFormatRegex, line);
     }
 else if (startsWith("##FILTER=", line) || startsWith("##ALT=", line))
     {
     boolean isFilter = startsWith("##FILTER", line);
     if (regexMatchSubstr(line, filterOrAltRegex, substrs, ArraySize(substrs)))
 	{
 	// substrs[2] is ID/key, substrs[4] is Description.
 	struct vcfInfoDef *def = vcfFileAlloc(vcff, sizeof(struct vcfInfoDef));