4344e168885850e053dc245826ab42f2b445c98f angie Wed Aug 24 13:11:01 2011 -0700 Feature #2821 (VCF parser): Fun with flawed input. This file's rows have 6 keywords in the format column, but most genotype columns have only 5 pieces of data: ftp://ftp-trace.ncbi.nlm.nih.gov/1000genomes/ftp/release/20100804/supporting/AFR.BI_withr2.20100804.genotypes.vcf.gz This causes many vcfFileErr calls in vcfParseGenotypes. Turned out that vcfFileErr was not stopping after the specified # of errors; and that the warnings are so verbose that I don't think we really need to display them in hgTracks and hgc. So they are now ignored, and we don't segv on missing genotype info. diff --git src/lib/vcf.c src/lib/vcf.c index 35e834a..0b8bc81 100644 --- src/lib/vcf.c +++ src/lib/vcf.c @@ -159,33 +159,37 @@ return vcff->errCnt > vcff->maxErr; } static void vcfFileErr(struct vcfFile *vcff, char *format, ...) #if defined(__GNUC__) __attribute__((format(printf, 2, 3))) #endif ; static void vcfFileErr(struct vcfFile *vcff, char *format, ...) 
/* Send error message to errabort stack's warn handler and abort */ { va_list args; va_start(args, format); char formatPlus[1024]; +if (vcff->lf != NULL) sprintf(formatPlus, "%s:%d: %s", vcff->lf->fileName, vcff->lf->lineIx, format); +else + strcpy(formatPlus, format); vaWarn(formatPlus, args); va_end(args); +vcff->errCnt++; if (vcfFileStopDueToErrors(vcff)) errAbort("VCF: %d parser errors, quitting", vcff->errCnt); } static void *vcfFileAlloc(struct vcfFile *vcff, size_t size) /* allocate memory from the memory pool */ { return lmAlloc(vcff->pool->lm, size); } static char *vcfFileCloneStrZ(struct vcfFile *vcff, char *str, size_t size) /* allocate memory for a string and copy it */ { return lmCloneStringZ(vcff->pool->lm, str, size); } @@ -233,31 +237,31 @@ return vcfInfoString; vcfFileErr(vcff, "Unrecognized type word \"%s\" in metadata line \"%s\"", typeWord, line); return vcfInfoNoType; } // Regular expressions to check format and extract information from header lines: static const char *fileformatRegex = "^##(file)?format=VCFv([0-9]+)(\\.([0-9]+))?$"; static const char *infoOrFormatRegex = "^##(INFO|FORMAT)=" "<ID=([A-Za-z0-9_:-]+)," "Number=(\\.|[0-9-]+)," "Type=([A-Za-z]+)," "Description=\"?([^\"]+)\"?>$"; static const char *filterOrAltRegex = "^##(FILTER|ALT)=" - "<ID=([A-Za-z0-9_:-]+)," + "<ID=([^,]+)," "(Description|Type)=\"([^\"]+)\">$"; INLINE void nonAsciiWorkaround(char *line) // Workaround for annoying 3-byte quote marks included in some 1000 Genomes files: { (void)strSwapStrs(line, strlen(line)+1, "\342\200\234", "\""); (void)strSwapStrs(line, strlen(line)+1, "\342\200\235", "\""); } static void parseMetadataLine(struct vcfFile *vcff, char *line) /* Parse a VCF header line beginning with "##" that defines a metadata. 
*/ { char *ptr = line; if (ptr == NULL && !startsWith(ptr, "##")) errAbort("Bad line passed to parseMetadataLine"); @@ -829,31 +833,30 @@ if (sep == NULL) gt->isHaploid = TRUE; else gt->hapIxB = atoi(sep+1); } struct vcfInfoElement *el = &(gt->infoElements[j]); el->key = formatWords[j]; el->count = parseInfoValue(record, formatWords[j], formatTypes[j], gtWords[j], &(el->values)); if (el->count >= VCF_MAX_INFO) vcfFileErr(vcff, "A single element of the genotype column for \"%s\" " "has at least %d values; " "VCF_MAX_INFO may need to be increased in vcf.c!", gt->id, VCF_MAX_INFO); } - } record->genotypeUnparsedStrings = NULL; } const struct vcfGenotype *vcfRecordFindGenotype(struct vcfRecord *record, char *sampleId) /* Find the genotype and associated info for the individual, or return NULL. * This calls vcfParseGenotypes if it has not already been called. */ { struct vcfFile *vcff = record->file; if (sampleId == NULL || vcff->genotypeCount == 0) return NULL; vcfParseGenotypes(record); int ix = stringArrayIx(sampleId, vcff->genotypeIds, vcff->genotypeCount); if (ix >= 0) return &(record->genotypes[ix]);