e0d0e327c82dd1fd6acb00b23aa041219b7eb59f angie Fri Jul 1 16:22:30 2011 -0700 #986 (VCF): Loosening regexes for parsing VCF header to accommodate 1000 Genomes pilot release trio indel files. diff --git src/lib/vcf.c src/lib/vcf.c index ea175e5..65bc467 100644 --- src/lib/vcf.c +++ src/lib/vcf.c @@ -226,38 +226,38 @@ return vcfInfoFloat; if (sameString("Flag", typeWord)) return vcfInfoFlag; if (sameString("Character", typeWord)) return vcfInfoCharacter; if (sameString("String", typeWord)) return vcfInfoString; vcfFileErr(vcff, "Unrecognized type word \"%s\" in metadata line \"%s\"", typeWord, line); return vcfInfoNoType; } // Regular expressions to check format and extract information from header lines: static const char *fileformatRegex = "^##(file)?format=VCFv([0-9]+)(\\.([0-9]+))?$"; static const char *infoOrFormatRegex = "^##(INFO|FORMAT)=" - "<ID=([A-Za-z0-9]+)," - "Number=(\\.|[0-9]+)," + "<ID=([A-Za-z0-9_:-]+)," + "Number=(\\.|[0-9-]+)," "Type=([A-Za-z]+)," - "Description=\"([^\"]+)\">$"; + "Description=\"?([^\"]+)\"?>$"; static const char *filterOrAltRegex = "^##(FILTER|ALT)=" - "<ID=([A-Za-z0-9]+)," - "Description=\"([^\"]+)\">$"; + "<ID=([A-Za-z0-9_:-]+)," + "(Description|Type)=\"([^\"]+)\">$"; INLINE void nonAsciiWorkaround(char *line) // Workaround for annoying 3-byte quote marks included in some 1000 Genomes files: { (void)strSwapStrs(line, strlen(line)+1, "\342\200\234", "\""); (void)strSwapStrs(line, strlen(line)+1, "\342\200\235", "\""); } static void parseMetadataLine(struct vcfFile *vcff, char *line) /* Parse a VCF header line beginning with "##" that defines a metadata. */ { char *ptr = line; if (ptr == NULL && !startsWith(ptr, "##")) errAbort("Bad line passed to parseMetadataLine"); ptr += 2;