491395482b817cfe82cbdef4dd7f8ee0723c1a4a angie Wed Mar 23 21:01:49 2011 -0700 Feature #2822, #2823 (VCF customFactory + track handler):Added a new track type, vcfTabix, with handlers in hgTracks and hgc and a customFactory. It is a new bigDataUrl type of track; the remote VCF file must be compressed and indexed by tabix, so like BAM a separate index file is required. If the VCF file has genotypes, then each sample's two haplotypes are graphed in a line, with one line per sample. Otherwise, alleles and counts (if available) are drawn using Belinda's pgSnp methods. The source code has to be compiled with USE_TABIX=1 (which is automatically set for us by common.mk when it finds the local installation) in order for the CGIs to recognize the track type. diff --git src/lib/vcf.c src/lib/vcf.c index b583131..ea175e5 100644 --- src/lib/vcf.c +++ src/lib/vcf.c @@ -1,21 +1,22 @@ /* VCF: Variant Call Format, version 4.0 / 4.1 * http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-40 * http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-41 */ #include "common.h" +#include "errabort.h" #include <limits.h> #include "localmem.h" #include "net.h" #include "regexHelper.h" #include "vcf.h" /* Reserved but optional INFO keys: */ const char *vcfInfoAncestralAllele = "AA"; const char *vcfInfoPerAlleleGtCount = "AC"; // allele count in genotypes, for each ALT allele, // in the same order as listed const char *vcfInfoAlleleFrequency = "AF"; // allele frequency for each ALT allele in the same // order as listed: use this when estimated from // primary data, not called genotypes const char *vcfInfoNumAlleles = "AN"; // total number of alleles in called genotypes const char *vcfInfoBaseQuality = "BQ"; // RMS base quality at this position @@ -151,49 +152,41 @@ "Expected alternate allele count (typically used in association analyses)"); } static bool vcfFileStopDueToErrors(struct vcfFile *vcff) /* determine if we should stop due to the number of errors */ { return vcff->errCnt > vcff->maxErr; } static void vcfFileErr(struct vcfFile *vcff, char *format, ...) #if defined(__GNUC__) __attribute__((format(printf, 2, 3))) #endif ; -static void vaVcfFileErr(struct vcfFile *vcff, char *format, va_list args) -/* Print error message to error file, abort if max errors have been reached */ -{ -if (vcff->lf != NULL) - fprintf(vcff->errFh, "%s:%d: ", vcff->lf->fileName, vcff->lf->lineIx); -vfprintf(vcff->errFh, format, args); -fprintf(vcff->errFh, "\n"); -vcff->errCnt++; -if (vcfFileStopDueToErrors(vcff)) - errAbort("VCF: %d parser errors", vcff->errCnt); -} - static void vcfFileErr(struct vcfFile *vcff, char *format, ...) -/* Print error message and abort */ +/* Send error message to errabort stack's warn handler and abort */ { va_list args; va_start(args, format); -vaVcfFileErr(vcff, format, args); +char formatPlus[1024]; +sprintf(formatPlus, "%s:%d: %s", vcff->lf->fileName, vcff->lf->lineIx, format); +vaWarn(formatPlus, args); va_end(args); +if (vcfFileStopDueToErrors(vcff)) + errAbort("VCF: %d parser errors, quitting", vcff->errCnt); } static void *vcfFileAlloc(struct vcfFile *vcff, size_t size) /* allocate memory from the memory pool */ { return lmAlloc(vcff->pool->lm, size); } static char *vcfFileCloneStrZ(struct vcfFile *vcff, char *str, size_t size) /* allocate memory for a string and copy it */ { return lmCloneStringZ(vcff->pool->lm, str, size); } static char *vcfFileCloneStr(struct vcfFile *vcff, char *str) @@ -230,31 +223,31 @@ if (sameString("Integer", typeWord)) return vcfInfoInteger; if (sameString("Float", typeWord)) return vcfInfoFloat; if (sameString("Flag", typeWord)) return vcfInfoFlag; if (sameString("Character", typeWord)) return vcfInfoCharacter; if (sameString("String", typeWord)) return vcfInfoString; vcfFileErr(vcff, "Unrecognized type word \"%s\" in metadata line \"%s\"", typeWord, line); return vcfInfoNoType; } // Regular expressions to check format and extract information from header lines: -static const char *fileformatRegex = "^##fileformat=VCFv([0-9]+)(\\.([0-9]+))?$"; +static const char *fileformatRegex = "^##(file)?format=VCFv([0-9]+)(\\.([0-9]+))?$"; static const char *infoOrFormatRegex = "^##(INFO|FORMAT)=" "<ID=([A-Za-z0-9]+)," "Number=(\\.|[0-9]+)," "Type=([A-Za-z]+)," "Description=\"([^\"]+)\">$"; static const char *filterOrAltRegex = "^##(FILTER|ALT)=" "<ID=([A-Za-z0-9]+)," "Description=\"([^\"]+)\">$"; INLINE void nonAsciiWorkaround(char *line) // Workaround for annoying 3-byte quote marks included in some 1000 Genomes files: { (void)strSwapStrs(line, strlen(line)+1, "\342\200\234", "\""); @@ -266,39 +259,39 @@ { char *ptr = line; if (ptr == NULL && !startsWith(ptr, "##")) errAbort("Bad line passed to parseMetadataLine"); ptr += 2; char *firstEq = strchr(ptr, '='); if (firstEq == NULL) { vcfFileErr(vcff, "Metadata line lacks '=': \"%s\"", line); return; } // Every metadata line is saved here: hashAddN(vcff->metaDataHash, ptr, (firstEq - ptr), vcfFileCloneStr(vcff, firstEq+1)); regmatch_t substrs[8]; // Some of the metadata lines are crucial for parsing the rest of the file: -if (startsWith("##fileformat=", line)) +if (startsWith("##fileformat=", line) || startsWith("##format", line)) { if (regexMatchSubstr(line, fileformatRegex, substrs, ArraySize(substrs))) { - // substrs[1] is major version #, substrs[2] is set only if there is a minor version, - // and substrs[3] is the minor version #. - vcff->majorVersion = atoi(line + substrs[1].rm_so); - if (substrs[2].rm_so != -1) - vcff->minorVersion = atoi(line + substrs[3].rm_so); + // substrs[2] is major version #, substrs[3] is set only if there is a minor version, + // and substrs[4] is the minor version #. + vcff->majorVersion = atoi(line + substrs[2].rm_so); + if (substrs[3].rm_so != -1) + vcff->minorVersion = atoi(line + substrs[4].rm_so); } else vcfFileErr(vcff, "##fileformat line does not match expected pattern /%s/: \"%s\"", fileformatRegex, line); } else if (startsWith("##INFO=", line) || startsWith("##FORMAT=", line)) { boolean isInfo = startsWith("##INFO=", line); nonAsciiWorkaround(line); if (regexMatchSubstr(line, infoOrFormatRegex, substrs, ArraySize(substrs))) // substrs[2] is ID/key, substrs[3] is Number, [4] is Type and [5] is Description. { struct vcfInfoDef *def = vcfFileAlloc(vcff, sizeof(struct vcfInfoDef)); def->key = vcfFileCloneSubstr(vcff, line, substrs[2]); if (sameString(def->key, ".")) @@ -318,170 +311,189 @@ boolean isFilter = startsWith("##FILTER", line); if (regexMatchSubstr(line, filterOrAltRegex, substrs, ArraySize(substrs))) { // substrs[2] is ID/key, substrs[3] is Description. struct vcfInfoDef *def = vcfFileAlloc(vcff, sizeof(struct vcfInfoDef)); def->key = vcfFileCloneSubstr(vcff, line, substrs[2]); def->description = vcfFileCloneSubstr(vcff, line, substrs[3]); slAddHead((isFilter ? &(vcff->filterDefs) : &(vcff->altDefs)), def); } else vcfFileErr(vcff, "##%s line does not match expected pattern /%s/: \"%s\"", (isFilter ? "FILTER" : "ALT"), filterOrAltRegex, line); } } -static void expectColumnName(struct vcfFile *vcff, char *expected, char *words[], int ix) +static void expectColumnName2(struct vcfFile *vcff, char *exp1, char *exp2, char *words[], int ix) /* Every file must include a header naming the columns, though most column names are * fixed; make sure the names of fixed columns are as expected. */ { -if (! sameString(expected, words[ix])) +if (! sameString(exp1, words[ix])) + { + if (exp2 == NULL) vcfFileErr(vcff, "Expected column %d's name in header to be \"%s\" but got \"%s\"", - ix+1, expected, words[ix]); + ix+1, exp1, words[ix]); + else if (! sameString(exp2, words[ix])) + vcfFileErr(vcff, "Expected column %d's name in header to be \"%s\" or \"%s\" " + "but got \"%s\"", ix+1, exp1, exp2, words[ix]); + } } +#define expectColumnName(vcff, exp, words, ix) expectColumnName2(vcff, exp, NULL, words, ix) + // There might be a whole lot of genotype columns... #define VCF_MAX_COLUMNS 16 * 1024 static void parseColumnHeaderRow(struct vcfFile *vcff, char *line) /* Make sure column names are as we expect, and store genotype sample IDs if any are given. */ { if (line[0] != '#') { vcfFileErr(vcff, "Expected to find # followed by column names (\"#CHROM POS ...\"), " "not \"%s\"", line); lineFileReuse(vcff->lf); return; } char *words[VCF_MAX_COLUMNS]; int wordCount = chopLine(line+1, words); if (wordCount >= VCF_MAX_COLUMNS) vcfFileErr(vcff, "header contains at least %d columns; " "VCF_MAX_COLUMNS may need to be increased in vcf.c!", VCF_MAX_COLUMNS); expectColumnName(vcff, "CHROM", words, 0); expectColumnName(vcff, "POS", words, 1); expectColumnName(vcff, "ID", words, 2); expectColumnName(vcff, "REF", words, 3); expectColumnName(vcff, "ALT", words, 4); -expectColumnName(vcff, "QUAL", words, 5); +expectColumnName2(vcff, "QUAL", "PROB", words, 5); expectColumnName(vcff, "FILTER", words, 6); expectColumnName(vcff, "INFO", words, 7); if (wordCount > 8) { expectColumnName(vcff, "FORMAT", words, 8); if (wordCount < 10) vcfFileErr(vcff, "FORMAT column is given, but no sample IDs for genotype columns...?"); vcff->genotypeCount = (wordCount - 9); vcff->genotypeIds = vcfFileAlloc(vcff, vcff->genotypeCount * sizeof(char *)); int i; for (i = 9; i < wordCount; i++) vcff->genotypeIds[i-9] = vcfFileCloneStr(vcff, words[i]); } } static struct vcfFile *vcfFileNew() /* Return a new, empty vcfFile object. */ { struct vcfFile *vcff = NULL; AllocVar(vcff); vcff->metaDataHash = hashNew(0); vcff->pool = hashNew(0); return vcff; } -static struct vcfFile *vcfFileHeaderFromLineFile(struct lineFile *lf, int maxErr, FILE *errFh) +static struct vcfFile *vcfFileHeaderFromLineFile(struct lineFile *lf, int maxErr) /* Parse a VCF file into a vcfFile object. If maxErr not zero, then * continue to parse until this number of error have been reached. A maxErr - * less than zero does not stop and reports all errors. Write errors to errFh, - * if NULL, use stderr. */ + * less than zero does not stop and reports all errors. */ { initVcfSpecInfoDefs(); initVcfSpecGtFormatDefs(); if (lf == NULL) return NULL; struct vcfFile *vcff = vcfFileNew(); vcff->lf = lf; vcff->fileOrUrl = vcfFileCloneStr(vcff, lf->fileName); -vcff->errFh = (errFh != NULL) ? errFh : stderr; vcff->maxErr = (maxErr < 0) ? INT_MAX : maxErr; char *line = NULL; // First, metadata lines beginning with "##": while (lineFileNext(lf, &line, NULL) && startsWith("##", line)) parseMetadataLine(vcff, line); slReverse(&(vcff->infoDefs)); slReverse(&(vcff->filterDefs)); slReverse(&(vcff->gtFormatDefs)); // Did we get the bare minimum VCF header with supported version? if (vcff->majorVersion == 0) vcfFileErr(vcff, "missing ##fileformat= header line? Assuming 4.1."); -if (vcff->majorVersion != 4 || (vcff->minorVersion != 0 && vcff->minorVersion != 1)) - vcfFileErr(vcff, "VCFv%d.%d not supported -- only 4.0 or 4.1", +if ((vcff->majorVersion != 4 || (vcff->minorVersion != 0 && vcff->minorVersion != 1)) && + (vcff->majorVersion != 3)) + vcfFileErr(vcff, "VCFv%d.%d not supported -- only v3.*, v4.0 or v4.1", vcff->majorVersion, vcff->minorVersion); // Next, one header line beginning with single "#" that names the columns: if (line == NULL) // EOF after metadata return vcff; parseColumnHeaderRow(vcff, line); return vcff; } -static enum vcfInfoType typeForInfoKey(struct vcfFile *vcff, char *key) -/* Look up the type of INFO component key, in the definitions from the header, - * and failing that, from the keys reserved in the spec. */ +struct vcfInfoDef *vcfInfoDefForKey(struct vcfFile *vcff, const char *key) +/* Return infoDef for key, or NULL if it wasn't specified in the header or VCF spec. */ { struct vcfInfoDef *def; // I expect there to be fairly few definitions (less than a dozen) so // I'm just doing a linear search not hash: for (def = vcff->infoDefs; def != NULL; def = def->next) { if (sameString(key, def->key)) - return def->type; + return def; } for (def = vcfSpecInfoDefs; def != NULL; def = def->next) { if (sameString(key, def->key)) - return def->type; + return def; } +return NULL; +} + +static enum vcfInfoType typeForInfoKey(struct vcfFile *vcff, const char *key) +/* Look up the type of INFO component key, in the definitions from the header, + * and failing that, from the keys reserved in the spec. */ +{ +struct vcfInfoDef *def = vcfInfoDefForKey(vcff, key); +if (def == NULL) + { vcfFileErr(vcff, "There is no INFO header defining \"%s\"", key); // default to string so we can display value as-is: return vcfInfoString; } +return def->type; +} #define VCF_MAX_INFO 512 int parseInfoValue(struct vcfRecord *record, char *infoKey, enum vcfInfoType type, char *valStr, union vcfDatum **pData) /* Parse a comma-separated list of values into array of union vcfInfoDatum and return count. */ { char *valWords[VCF_MAX_INFO]; int count = chopCommas(valStr, valWords); struct vcfFile *vcff = record->file; union vcfDatum *data = vcfFileAlloc(vcff, count * sizeof(union vcfDatum)); int j; for (j = 0; j < count; j++) switch (type) { case vcfInfoInteger: data[j].datInt = atoi(valWords[j]); break; case vcfInfoFloat: data[j].datFloat = atof(valWords[j]); break; case vcfInfoFlag: - // No data value + // Flag key might have a value in older VCFs e.g. 3.2's DB=0, DB=1 + data[j].datString = vcfFilePooledStr(vcff, valWords[j]); break; case vcfInfoCharacter: data[j].datChar = valWords[j][0]; break; case vcfInfoString: data[j].datString = vcfFilePooledStr(vcff, valWords[j]); break; default: errAbort("invalid vcfInfoType (uninitialized?) %d", type); break; } // If END is given, use it as chromEnd: if (sameString(infoKey, vcfInfoEnd)) record->chromEnd = data[0].datInt; *pData = data; @@ -558,64 +570,63 @@ { record->format = vcfFilePooledStr(vcff, words[8]); record->genotypeUnparsedStrings = vcfFileAlloc(vcff, vcff->genotypeCount * sizeof(char *)); int i; // Don't bother actually parsing all these until & unless we need the info: for (i = 0; i < vcff->genotypeCount; i++) record->genotypeUnparsedStrings[i] = vcfFileCloneStr(vcff, words[9+i]); } slAddHead(&(vcff->records), record); } slReverse(&(vcff->records)); lineFileClose(&(vcff->lf)); } -struct vcfFile *vcfFileMayOpen(char *fileOrUrl, int maxErr, FILE *errFh) +struct vcfFile *vcfFileMayOpen(char *fileOrUrl, int maxErr) /* Parse a VCF file into a vcfFile object. If maxErr not zero, then * continue to parse until this number of error have been reached. A maxErr - * less than zero does not stop and reports all errors. Write errors to errFh, - * if NULL, use stderr. */ + * less than zero does not stop and reports all errors. */ { struct lineFile *lf = NULL; if (startsWith("http://", fileOrUrl) || startsWith("ftp://", fileOrUrl) || startsWith("https://", fileOrUrl)) lf = netLineFileOpen(fileOrUrl); else lf = lineFileMayOpen(fileOrUrl, TRUE); -struct vcfFile *vcff = vcfFileHeaderFromLineFile(lf, maxErr, errFh); +struct vcfFile *vcff = vcfFileHeaderFromLineFile(lf, maxErr); vcfParseData(vcff); return vcff; } struct vcfFile *vcfTabixFileMayOpen(char *fileOrUrl, char *chrom, int start, int end, - int maxErr, FILE *errFh) + int maxErr) /* Parse header and rows within the given position range from a VCF file that has been * compressed and indexed by tabix into a vcfFile object; return NULL if or if file has * no items in range. * If maxErr not zero, then continue to parse until this number of error have been reached. - * A maxErr less than zero does not stop and reports all errors. Write errors to errFh, - * if NULL, use stderr. */ + * A maxErr less than zero does not stop and reports all errors. */ { struct lineFile *lf = lineFileTabixMayOpen(fileOrUrl, TRUE); -struct vcfFile *vcff = vcfFileHeaderFromLineFile(lf, maxErr, errFh); +struct vcfFile *vcff = vcfFileHeaderFromLineFile(lf, maxErr); if (vcff == NULL) return NULL; -if (! lineFileSetTabixRegion(lf, chrom, start, end)) - // No items in region - return vcff; +if (isNotEmpty(chrom) && start != end) + { + if (lineFileSetTabixRegion(lf, chrom, start, end)) vcfParseData(vcff); + } return vcff; } void vcfFileFree(struct vcfFile **pVcff) /* Free a vcfFile object. */ { if (pVcff == NULL || *pVcff == NULL) return; struct vcfFile *vcff = *pVcff; hashFree(&(vcff->pool)); hashFree(&(vcff->byName)); hashFree(&(vcff->metaDataHash)); lineFileClose(&(vcff->lf)); freez(pVcff); } @@ -658,59 +669,73 @@ return varList; } const struct vcfInfoElement *vcfRecordFindInfo(const struct vcfRecord *record, char *key) /* Find an INFO element, or NULL. */ { int i; for (i = 0; i < record->infoCount; i++) { if (sameString(key, record->infoElements[i].key)) return &(record->infoElements[i]); } return NULL; } -static enum vcfInfoType typeForGtFormat(struct vcfFile *vcff, char *key) -/* Look up the type of FORMAT component key, in the definitions from the header, +struct vcfInfoDef *vcfInfoDefForGtKey(struct vcfFile *vcff, const char *key) +/* Look up the type of genotype FORMAT component key, in the definitions from the header, * and failing that, from the keys reserved in the spec. */ { struct vcfInfoDef *def; // I expect there to be fairly few definitions (less than a dozen) so // I'm just doing a linear search not hash: for (def = vcff->gtFormatDefs; def != NULL; def = def->next) { if (sameString(key, def->key)) - return def->type; + return def; } for (def = vcfSpecGtFormatDefs; def != NULL; def = def->next) { if (sameString(key, def->key)) - return def->type; + return def; + } +return NULL; } + +static enum vcfInfoType typeForGtFormat(struct vcfFile *vcff, const char *key) +/* Look up the type of FORMAT component key, in the definitions from the header, + * and failing that, from the keys reserved in the spec. */ +{ +struct vcfInfoDef *def = vcfInfoDefForGtKey(vcff, key); +if (def == NULL) + { vcfFileErr(vcff, "There is no FORMAT header defining \"%s\"", key); // default to string so we can display value as-is: return vcfInfoString; } +return def->type; +} #define VCF_MAX_FORMAT VCF_MAX_INFO #define VCF_MAX_FORMAT_LEN (VCF_MAX_FORMAT * 4) -static void parseGenotypes(struct vcfRecord *record) +void vcfParseGenotypes(struct vcfRecord *record) /* Translate record->genotypesUnparsedStrings[] into proper struct vcfGenotype[]. * This destroys genotypesUnparsedStrings. */ { +if (record->genotypeUnparsedStrings == NULL) + return; struct vcfFile *vcff = record->file; record->genotypes = vcfFileAlloc(vcff, vcff->genotypeCount * sizeof(struct vcfGenotype)); char format[VCF_MAX_FORMAT_LEN]; safecpy(format, sizeof(format), record->format); char *formatWords[VCF_MAX_FORMAT]; int formatWordCount = chopByChar(format, ':', formatWords, ArraySize(formatWords)); if (formatWordCount >= VCF_MAX_FORMAT) vcfFileErr(vcff, "The FORMAT column has at least %d words; " "VCF_MAX_FORMAT may need to be increased in vcf.c!", VCF_MAX_FORMAT); if (differentString(formatWords[0], vcfGtGenotype)) vcfFileErr(vcff, "FORMAT column should begin with \"%s\" but begins with \"%s\"", vcfGtGenotype, formatWords[0]); int i; for (i = 0; i < vcff->genotypeCount; i++) { @@ -734,43 +759,43 @@ if (sameString(formatWords[j], vcfGtGenotype)) { char *genotype = gtWords[j]; char *sep = strchr(genotype, '|'); if (sep != NULL) gt->isPhased = TRUE; else sep = strchr(genotype, '/'); gt->hapIxA = atoi(genotype); if (sep == NULL) gt->isHaploid = TRUE; else gt->hapIxB = atoi(sep+1); } struct vcfInfoElement *el = &(gt->infoElements[j]); - el->key = formatWords[j]; + el->key = vcfFilePooledStr(vcff, formatWords[j]); enum vcfInfoType type = typeForGtFormat(vcff, formatWords[j]); el->count = parseInfoValue(record, formatWords[j], type, gtWords[j], &(el->values)); if (el->count >= VCF_MAX_INFO) vcfFileErr(vcff, "A single element of the genotype column for \"%s\" " "has at least %d values; " "VCF_MAX_INFO may need to be increased in vcf.c!", gt->id, VCF_MAX_INFO); } } record->genotypeUnparsedStrings = NULL; } const struct vcfGenotype *vcfRecordFindGenotype(struct vcfRecord *record, char *sampleId) -/* Find the genotype and associated info for the individual, or return NULL. */ +/* Find the genotype and associated info for the individual, or return NULL. + * This calls vcfParseGenotypes if it has not already been called. */ { -if (sampleId == NULL) - return NULL; struct vcfFile *vcff = record->file; -if (record->genotypeUnparsedStrings != NULL && record->genotypes == NULL) - parseGenotypes(record); +if (sampleId == NULL || vcff->genotypeCount == 0) + return NULL; +vcfParseGenotypes(record); int ix = stringArrayIx(sampleId, vcff->genotypeIds, vcff->genotypeCount); if (ix >= 0) return &(record->genotypes[ix]); return NULL; }