7a2b8c93090dbf4365464458023ef9875488185b kent Wed Feb 4 08:53:15 2015 -0800 Making it so that there can be warning messages as well as errors for VCF files. Changing message 'missing ##fileformat= header line? Assuming 4.1.' to a warning and indeed making it assume 4.1. Also having a missing #CHROM POS type header also just a warning. diff --git src/lib/vcf.c src/lib/vcf.c index cde6810..b4e308b 100644 --- src/lib/vcf.c +++ src/lib/vcf.c @@ -156,45 +156,61 @@ "Expected alternate allele count (typically used in association analyses)"); } static bool vcfFileStopDueToErrors(struct vcfFile *vcff) /* determine if we should stop due to the number of errors */ { return vcff->errCnt > vcff->maxErr; } static void vcfFileErr(struct vcfFile *vcff, char *format, ...) #if defined(__GNUC__) __attribute__((format(printf, 2, 3))) #endif ; +static void vaVcfWarn(struct vcfFile *vcff, char *format, va_list args) +/* Add a little bit of info like file position to warning */ +{ +if (vcff->lf != NULL) + { + char formatPlus[1024]; + safef(formatPlus, sizeof(formatPlus), + "%s:%d: %s", vcff->lf->fileName, vcff->lf->lineIx, format); + vaWarn(formatPlus, args); + } +else + vaWarn(format, args); +} + +static void vcfFileWarn(struct vcfFile *vcff, char *format, ...) +/* Send warning message to errabort stack's warn handler; does not count as an error or abort */ +{ +va_list args; +va_start(args, format); +vaVcfWarn(vcff, format, args); +} + static void vcfFileErr(struct vcfFile *vcff, char *format, ...) 
/* Send error message to errabort stack's warn handler and abort */ { vcff->errCnt++; if (vcff->maxErr == VCF_IGNORE_ERRS) return; va_list args; va_start(args, format); -char formatPlus[1024]; -if (vcff->lf != NULL) - sprintf(formatPlus, "%s:%d: %s", vcff->lf->fileName, vcff->lf->lineIx, format); -else - strcpy(formatPlus, format); -vaWarn(formatPlus, args); -va_end(args); +vaVcfWarn(vcff, format, args); if (vcfFileStopDueToErrors(vcff)) errAbort("VCF: %d parser errors, quitting", vcff->errCnt); } static void *vcfFileAlloc(struct vcfFile *vcff, size_t size) /* Use vcff's local mem to allocate memory. */ { return lmAlloc( vcfFileLm(vcff), size); } INLINE char *vcfFileCloneStrZ(struct vcfFile *vcff, char *str, size_t size) /* Use vcff's local mem to allocate memory for a string and copy it. */ { return lmCloneStringZ( vcfFileLm(vcff), str, size); } @@ -370,40 +386,36 @@ { if (exp2 == NULL) vcfFileErr(vcff, "Expected column %d's name in header to be \"%s\" but got \"%s\"", ix+1, exp1, words[ix]); else if (! sameString(exp2, words[ix])) vcfFileErr(vcff, "Expected column %d's name in header to be \"%s\" or \"%s\" " "but got \"%s\"", ix+1, exp1, exp2, words[ix]); } } #define expectColumnName(vcff, exp, words, ix) expectColumnName2(vcff, exp, NULL, words, ix) // There might be a whole lot of genotype columns... #define VCF_MAX_COLUMNS 16 * 1024 +char *vcfDefaultHeader = "#CHROM POS ID REF ALT QUAL FILTER INFO"; +/* Default header if we have none. */ + static void parseColumnHeaderRow(struct vcfFile *vcff, char *line) /* Make sure column names are as we expect, and store genotype sample IDs if any are given. 
*/ { -if (line[0] != '#') - { - vcfFileErr(vcff, "Expected to find # followed by column names (\"#CHROM POS ...\"), " - "not \"%s\"", line); - lineFileReuse(vcff->lf); - return; - } char *words[VCF_MAX_COLUMNS]; int wordCount = chopLine(line+1, words); if (wordCount >= VCF_MAX_COLUMNS) vcfFileErr(vcff, "header contains at least %d columns; " "VCF_MAX_COLUMNS may need to be increased in vcf.c!", VCF_MAX_COLUMNS); expectColumnName(vcff, "CHROM", words, 0); expectColumnName(vcff, "POS", words, 1); expectColumnName(vcff, "ID", words, 2); expectColumnName(vcff, "REF", words, 3); expectColumnName(vcff, "ALT", words, 4); expectColumnName2(vcff, "QUAL", "PROB", words, 5); expectColumnName(vcff, "FILTER", words, 6); expectColumnName(vcff, "INFO", words, 7); if (wordCount > 8) { @@ -469,39 +481,52 @@ struct dyString *dyHeader = dyStringNew(1024); char *line = NULL; // First, metadata lines beginning with "##": while (lineFileNext(lf, &line, NULL) && startsWith("##", line)) { dyStringAppend(dyHeader, line); dyStringAppendC(dyHeader, '\n'); parseMetadataLine(vcff, line); } slReverse(&(vcff->infoDefs)); slReverse(&(vcff->filterDefs)); slReverse(&(vcff->gtFormatDefs)); // Did we get the bare minimum VCF header with supported version? if (vcff->majorVersion == 0) - vcfFileErr(vcff, "missing ##fileformat= header line? Assuming 4.1."); + { + vcfFileWarn(vcff, "missing ##fileformat= header line? 
Assuming 4.1."); + vcff->majorVersion = 4; + vcff->minorVersion = 1; + } if ((vcff->majorVersion != 4 || (vcff->minorVersion != 0 && vcff->minorVersion != 1)) && (vcff->majorVersion != 3)) vcfFileErr(vcff, "VCFv%d.%d not supported -- only v3.*, v4.0 or v4.1", vcff->majorVersion, vcff->minorVersion); // Next, one header line beginning with single "#" that names the columns: if (line == NULL) // EOF after metadata return vcff; +char headerLineBuf[256]; +if (line[0] != '#') + { + lineFileReuse(lf); + vcfFileWarn(vcff, "Expected to find # followed by column names (\"#CHROM POS ...\"), " + "assuming default VCF 4.1 columns"); + safef(headerLineBuf, sizeof(headerLineBuf), "%s", vcfDefaultHeader); + line = headerLineBuf; + } dyStringAppend(dyHeader, line); dyStringAppendC(dyHeader, '\n'); parseColumnHeaderRow(vcff, line); vcff->headerString = dyStringCannibalize(&dyHeader); return vcff; } #define VCF_MAX_INFO 512 static void parseRefAndAlt(struct vcfFile *vcff, struct vcfRecord *record, char *ref, char *alt) /* Make an array of alleles, ref first, from the REF and comma-sep'd ALT columns. * Use the length of the reference sequence to set record->chromEnd. * Note: this trashes the alt argument, since this is expected to be its last use. */ {