42268aaa2203e616a143d1c032070320a830c467 angie Fri Mar 11 15:37:43 2011 -0800 Feature #2821 (VCF parser): adding workaround for annoying non-ASCII characters in the header of a file included in 1000 Genomes' pilot release, found when adding basic test case. diff --git src/lib/vcf.c src/lib/vcf.c index ff763dc..419d81d 100644 --- src/lib/vcf.c +++ src/lib/vcf.c @@ -242,30 +242,37 @@ } // Regular expressions to check format and extract information from header lines: static const char *fileformatRegex = "^##fileformat=VCFv([0-9]+)(\\.([0-9]+))?$"; static const char *infoOrFormatRegex = "^##(INFO|FORMAT)=" "<ID=([A-Za-z0-9]+)," "Number=(\\.|[0-9]+)," "Type=([A-Za-z]+)," "Description=\"([^\"]+)\">$"; static const char *filterOrAltRegex = "^##(FILTER|ALT)=" "<ID=([A-Za-z0-9]+)," "Description=\"([^\"]+)\">$"; +INLINE void nonAsciiWorkaround(char *line) +// Workaround for annoying 3-byte quote marks included in some 1000 Genomes files: +{ +(void)strSwapStrs(line, strlen(line)+1, "\342\200\234", "\""); +(void)strSwapStrs(line, strlen(line)+1, "\342\200\235", "\""); +} + static void parseMetadataLine(struct vcfFile *vcff, char *line) /* Parse a VCF header line beginning with "##" that defines a metadata. */ { char *ptr = line; if (ptr == NULL && !startsWith(ptr, "##")) errAbort("Bad line passed to parseMetadataLine"); ptr += 2; char *firstEq = strchr(ptr, '='); if (firstEq == NULL) { vcfFileErr(vcff, "Metadata line lacks '=': \"%s\"", line); return; } // Every metadata line is saved here: hashAddN(vcff->metaDataHash, ptr, (firstEq - ptr), vcfFileCloneStr(vcff, firstEq+1)); @@ -276,30 +283,31 @@ if (regexMatchSubstr(line, fileformatRegex, substrs, ArraySize(substrs))) { // substrs[1] is major version #, substrs[2] is set only if there is a minor version, // and substrs[3] is the minor version #. 
vcff->majorVersion = atoi(line + substrs[1].rm_so); if (substrs[2].rm_so != -1) vcff->minorVersion = atoi(line + substrs[3].rm_so); } else vcfFileErr(vcff, "##fileformat line does not match expected pattern /%s/: \"%s\"", fileformatRegex, line); } else if (startsWith("##INFO=", line) || startsWith("##FORMAT=", line)) { boolean isInfo = startsWith("##INFO=", line); + nonAsciiWorkaround(line); if (regexMatchSubstr(line, infoOrFormatRegex, substrs, ArraySize(substrs))) // substrs[2] is ID/key, substrs[3] is Number, [4] is Type and [5] is Description. { struct vcfInfoDef *def = vcfFileAlloc(vcff, sizeof(struct vcfInfoDef)); def->key = vcfFileCloneSubstr(vcff, line, substrs[2]); if (sameString(def->key, ".")) def->fieldCount = -1; else def->fieldCount = atoi(line + substrs[3].rm_so); def->type = vcfInfoTypeFromSubstr(vcff, line, substrs[4]); def->description = vcfFileCloneSubstr(vcff, line, substrs[5]); slAddHead((isInfo ? &(vcff->infoDefs) : &(vcff->gtFormatDefs)), def); } else vcfFileErr(vcff, "##%s line does not match expected pattern /%s/: \"%s\"",