6b905d759504a15d9c580b0c20d8eebb31f10dd6 angie Wed Oct 5 13:44:43 2011 -0700 Loosened regex and handle new non-numeric values of ##FORMAT's "Number" introduced in VCF 4.1. diff --git src/lib/vcf.c src/lib/vcf.c index 26ad918..b5202e1 100644 --- src/lib/vcf.c +++ src/lib/vcf.c @@ -232,37 +232,37 @@ if (sameString("Flag", typeWord)) return vcfInfoFlag; if (sameString("Character", typeWord)) return vcfInfoCharacter; if (sameString("String", typeWord)) return vcfInfoString; vcfFileErr(vcff, "Unrecognized type word \"%s\" in metadata line \"%s\"", typeWord, line); return vcfInfoNoType; } // Regular expressions to check format and extract information from header lines: static const char *fileformatRegex = "^##(file)?format=VCFv([0-9]+)(\\.([0-9]+))?$"; static const char *infoOrFormatRegex = "^##(INFO|FORMAT)=" "<ID=([A-Za-z0-9_:-]+)," - "Number=(\\.|[0-9-]+)," + "Number=(\\.|A|G|[0-9-]+)," "Type=([A-Za-z]+)," - "Description=\"?([^\"]+)\"?>$"; + "Description=\"?(.*)\"?>$"; static const char *filterOrAltRegex = "^##(FILTER|ALT)=" "<ID=([^,]+)," - "(Description|Type)=\"([^\"]+)\">$"; + "(Description|Type)=\"?(.*)\"?>$"; INLINE void nonAsciiWorkaround(char *line) // Workaround for annoying 3-byte quote marks included in some 1000 Genomes files: { (void)strSwapStrs(line, strlen(line)+1, "\342\200\234", "\""); (void)strSwapStrs(line, strlen(line)+1, "\342\200\235", "\""); } static void parseMetadataLine(struct vcfFile *vcff, char *line) /* Parse a VCF header line beginning with "##" that defines a metadata. 
*/ { char *ptr = line; if (ptr == NULL && !startsWith(ptr, "##")) errAbort("Bad line passed to parseMetadataLine"); ptr += 2; @@ -285,34 +285,38 @@ vcff->minorVersion = atoi(line + substrs[4].rm_so); } else vcfFileErr(vcff, "##fileformat line does not match expected pattern /%s/: \"%s\"", fileformatRegex, line); } else if (startsWith("##INFO=", line) || startsWith("##FORMAT=", line)) { boolean isInfo = startsWith("##INFO=", line); nonAsciiWorkaround(line); if (regexMatchSubstr(line, infoOrFormatRegex, substrs, ArraySize(substrs))) // substrs[2] is ID/key, substrs[3] is Number, [4] is Type and [5] is Description. { struct vcfInfoDef *def = vcfFileAlloc(vcff, sizeof(struct vcfInfoDef)); def->key = vcfFileCloneSubstr(vcff, line, substrs[2]); - if (sameString(def->key, ".")) + char *number = vcfFileCloneSubstr(vcff, line, substrs[3]); + if (sameString(number, ".") || sameString(number, "A") || sameString(number, "G")) + // A is #alts which varies line-to-line; "G" is #genotypes which we haven't + // yet seen. Why is there a G here -- shouldn't such attributes go in the + // genotype columns? def->fieldCount = -1; else - def->fieldCount = atoi(line + substrs[3].rm_so); + def->fieldCount = atoi(number); def->type = vcfInfoTypeFromSubstr(vcff, line, substrs[4]); def->description = vcfFileCloneSubstr(vcff, line, substrs[5]); slAddHead((isInfo ? &(vcff->infoDefs) : &(vcff->gtFormatDefs)), def); } else vcfFileErr(vcff, "##%s line does not match expected pattern /%s/: \"%s\"", (isInfo ? "INFO" : "FORMAT"), infoOrFormatRegex, line); } else if (startsWith("##FILTER=", line) || startsWith("##ALT=", line)) { boolean isFilter = startsWith("##FILTER", line); if (regexMatchSubstr(line, filterOrAltRegex, substrs, ArraySize(substrs))) { // substrs[2] is ID/key, substrs[4] is Description. struct vcfInfoDef *def = vcfFileAlloc(vcff, sizeof(struct vcfInfoDef));