42268aaa2203e616a143d1c032070320a830c467
angie
  Fri Mar 11 15:37:43 2011 -0800
Feature #2821 (VCF parser): adding workaround for annoying non-ASCII characters in the header of a file included in 1000 Genomes' pilot
release, found when adding basic test case.

diff --git src/lib/vcf.c src/lib/vcf.c
index ff763dc..419d81d 100644
--- src/lib/vcf.c
+++ src/lib/vcf.c
@@ -242,30 +242,37 @@
 }
 
 // Regular expressions to check format and extract information from header lines:
 static const char *fileformatRegex = "^##fileformat=VCFv([0-9]+)(\\.([0-9]+))?$";
 static const char *infoOrFormatRegex =
     "^##(INFO|FORMAT)="
     "<ID=([A-Za-z0-9]+),"
     "Number=(\\.|[0-9]+),"
     "Type=([A-Za-z]+),"
     "Description=\"([^\"]+)\">$";
 static const char *filterOrAltRegex =
     "^##(FILTER|ALT)="
     "<ID=([A-Za-z0-9]+),"
     "Description=\"([^\"]+)\">$";
 
+INLINE void nonAsciiWorkaround(char *line)
+// Workaround for annoying 3-byte (UTF-8 curly) quote marks included in some 1000 Genomes files; presumably strSwapStrs rewrites line in place with ASCII '"' -- TODO confirm:
+{
+(void)strSwapStrs(line, strlen(line)+1, "\342\200\234", "\"");  // bytes E2 80 9C: UTF-8 encoding of U+201C, left double quotation mark
+(void)strSwapStrs(line, strlen(line)+1, "\342\200\235", "\"");  // bytes E2 80 9D: UTF-8 encoding of U+201D, right double quotation mark
+}
+
 static void parseMetadataLine(struct vcfFile *vcff, char *line)
 /* Parse a VCF header line beginning with "##" that defines a metadata. */
 {
 char *ptr = line;
 if (ptr == NULL && !startsWith(ptr, "##"))
     errAbort("Bad line passed to parseMetadataLine");
 ptr += 2;
 char *firstEq = strchr(ptr, '=');
 if (firstEq == NULL)
     {
     vcfFileErr(vcff, "Metadata line lacks '=': \"%s\"", line);
     return;
     }
 // Every metadata line is saved here:
 hashAddN(vcff->metaDataHash, ptr, (firstEq - ptr), vcfFileCloneStr(vcff, firstEq+1));
@@ -276,30 +283,31 @@
     if (regexMatchSubstr(line, fileformatRegex, substrs, ArraySize(substrs)))
 	{
 	// substrs[1] is major version #, substrs[2] is set only if there is a minor version,
 	// and substrs[3] is the minor version #.
 	vcff->majorVersion = atoi(line + substrs[1].rm_so);
 	if (substrs[2].rm_so != -1)
 	    vcff->minorVersion = atoi(line + substrs[3].rm_so);
 	}
     else
 	vcfFileErr(vcff, "##fileformat line does not match expected pattern /%s/: \"%s\"",
 		   fileformatRegex, line);
     }
 else if (startsWith("##INFO=", line) || startsWith("##FORMAT=", line))
     {
     boolean isInfo = startsWith("##INFO=", line);
+    nonAsciiWorkaround(line);
     if (regexMatchSubstr(line, infoOrFormatRegex, substrs, ArraySize(substrs)))
 	// substrs[2] is ID/key, substrs[3] is Number, [4] is Type and [5] is Description.
 	{
 	struct vcfInfoDef *def = vcfFileAlloc(vcff, sizeof(struct vcfInfoDef));
 	def->key = vcfFileCloneSubstr(vcff, line, substrs[2]);
 	if (sameString(def->key, "."))
 	    def->fieldCount = -1;
 	else
 	    def->fieldCount = atoi(line + substrs[3].rm_so);
 	def->type = vcfInfoTypeFromSubstr(vcff, line, substrs[4]);
 	def->description = vcfFileCloneSubstr(vcff, line, substrs[5]);
 	slAddHead((isInfo ? &(vcff->infoDefs) : &(vcff->gtFormatDefs)), def);
 	}
     else
 	vcfFileErr(vcff, "##%s line does not match expected pattern /%s/: \"%s\"",