1927e2636f3cd7e36bd28be1ce5aa6b2db274257 angie Tue Nov 29 15:28:19 2011 -0800
MLQ #6136: adding support for VCF 3.3 header line formats.
diff --git src/lib/vcf.c src/lib/vcf.c
index bc29dad..e53a6e9 100644
--- src/lib/vcf.c
+++ src/lib/vcf.c
@@ -239,30 +239,41 @@
 return vcfInfoNoType;
 }
 
 // Regular expressions to check format and extract information from header lines:
 static const char *fileformatRegex = "^##(file)?format=VCFv([0-9]+)(\\.([0-9]+))?$";
 static const char *infoOrFormatRegex =
     "^##(INFO|FORMAT)="
     "<ID=([A-Za-z0-9_:-]+),"
     "Number=(\\.|A|G|[0-9-]+),"
     "Type=([A-Za-z]+),"
     "Description=\"?(.*)\"?>$";
 static const char *filterOrAltRegex =
     "^##(FILTER|ALT)="
     "<ID=([^,]+),"
     "(Description|Type)=\"?(.*)\"?>$";
+// VCF version 3.3 was different enough to warrant separate regexes:
+static const char *infoOrFormatRegex3_3 =
+    "^##(INFO|FORMAT)="
+    "([A-Za-z0-9_:-]+),"
+    "(\\.|A|G|[0-9-]+),"
+    "([A-Za-z]+),"
+    "\"?(.*)\"?$";
+static const char *filterRegex3_3 =
+    "^##(FILTER)="
+    "([^,]+),"
+    "()\"?(.*)\"?$";
 
 INLINE void nonAsciiWorkaround(char *line)
 // Workaround for annoying 3-byte quote marks included in some 1000 Genomes files:
 {
 (void)strSwapStrs(line, strlen(line)+1, "\342\200\234", "\"");
 (void)strSwapStrs(line, strlen(line)+1, "\342\200\235", "\"");
 }
 
 static void parseMetadataLine(struct vcfFile *vcff, char *line)
 /* Parse a VCF header line beginning with "##" that defines a metadata. */
 {
 char *ptr = line;
 if (ptr == NULL && !startsWith(ptr, "##"))
     errAbort("Bad line passed to parseMetadataLine");
 ptr += 2;
@@ -280,68 +291,76 @@
         {
         // substrs[2] is major version #, substrs[3] is set only if there is a minor version,
         // and substrs[4] is the minor version #.
         vcff->majorVersion = atoi(line + substrs[2].rm_so);
         if (substrs[3].rm_so != -1)
             vcff->minorVersion = atoi(line + substrs[4].rm_so);
         }
     else
         vcfFileErr(vcff, "##fileformat line does not match expected pattern /%s/: \"%s\"",
                    fileformatRegex, line);
     }
 else if (startsWith("##INFO=", line) || startsWith("##FORMAT=", line))
     {
     boolean isInfo = startsWith("##INFO=", line);
     nonAsciiWorkaround(line);
-    if (regexMatchSubstr(line, infoOrFormatRegex, substrs, ArraySize(substrs)))
+    if (regexMatchSubstr(line, infoOrFormatRegex, substrs, ArraySize(substrs)) ||
+        regexMatchSubstr(line, infoOrFormatRegex3_3, substrs, ArraySize(substrs)))
     // substrs[2] is ID/key, substrs[3] is Number, [4] is Type and [5] is Description.
         {
         struct vcfInfoDef *def = vcfFileAlloc(vcff, sizeof(struct vcfInfoDef));
         def->key = vcfFileCloneSubstr(vcff, line, substrs[2]);
         char *number = vcfFileCloneSubstr(vcff, line, substrs[3]);
         if (sameString(number, ".") || sameString(number, "A") || sameString(number, "G"))
             // A is #alts which varies line-to-line; "G" is #genotypes which we haven't
             // yet seen. Why is there a G here -- shouldn't such attributes go in the
             // genotype columns?
             def->fieldCount = -1;
         else
             def->fieldCount = atoi(number);
         def->type = vcfInfoTypeFromSubstr(vcff, line, substrs[4]);
         // greedy regex pulls in end quote, trim if found:
         if (line[substrs[5].rm_eo-1] == '"')
             line[substrs[5].rm_eo-1] = '\0';
         def->description = vcfFileCloneSubstr(vcff, line, substrs[5]);
         slAddHead((isInfo ? &(vcff->infoDefs) : &(vcff->gtFormatDefs)), def);
         }
     else
-        vcfFileErr(vcff, "##%s line does not match expected pattern /%s/: \"%s\"",
-                   (isInfo ? "INFO" : "FORMAT"), infoOrFormatRegex, line);
+        vcfFileErr(vcff, "##%s line does not match expected pattern /%s/ or /%s/: \"%s\"",
+                   (isInfo ? "INFO" : "FORMAT"), infoOrFormatRegex, infoOrFormatRegex3_3, line);
     }
 else if (startsWith("##FILTER=", line) || startsWith("##ALT=", line))
     {
     boolean isFilter = startsWith("##FILTER", line);
-    if (regexMatchSubstr(line, filterOrAltRegex, substrs, ArraySize(substrs)))
+    if (regexMatchSubstr(line, filterOrAltRegex, substrs, ArraySize(substrs)) ||
+        regexMatchSubstr(line, filterRegex3_3, substrs, ArraySize(substrs)))
         {
         // substrs[2] is ID/key, substrs[4] is Description.
         struct vcfInfoDef *def = vcfFileAlloc(vcff, sizeof(struct vcfInfoDef));
         def->key = vcfFileCloneSubstr(vcff, line, substrs[2]);
         def->description = vcfFileCloneSubstr(vcff, line, substrs[4]);
         slAddHead((isFilter ? &(vcff->filterDefs) : &(vcff->altDefs)), def);
         }
     else
-        vcfFileErr(vcff, "##%s line does not match expected pattern /%s/: \"%s\"",
-                   (isFilter ? "FILTER" : "ALT"), filterOrAltRegex, line);
+        {
+        if (isFilter)
+            vcfFileErr(vcff, "##FILTER line does not match expected pattern /%s/ or /%s/: \"%s\"",
+                       filterOrAltRegex, filterRegex3_3, line);
+        else
+            vcfFileErr(vcff, "##ALT line does not match expected pattern /%s/: \"%s\"",
+                       filterOrAltRegex, line);
+        }
     }
 }
 
 static void expectColumnName2(struct vcfFile *vcff, char *exp1, char *exp2, char *words[], int ix)
 /* Every file must include a header naming the columns, though most column names are
  * fixed; make sure the names of fixed columns are as expected. */
 {
 if (! sameString(exp1, words[ix]))
     {
     if (exp2 == NULL)
         vcfFileErr(vcff, "Expected column %d's name in header to be \"%s\" but got \"%s\"",
                    ix+1, exp1, words[ix]);
     else if (! sameString(exp2, words[ix]))
         vcfFileErr(vcff, "Expected column %d's name in header to be \"%s\" or \"%s\" "
                    "but got \"%s\"", ix+1, exp1, exp2, words[ix]);