519e0946826199d1d9792fa8df5972843fce021c angie Tue Aug 9 14:39:38 2011 -0700 Feature #2821 (VCF parser): improved representation of alleles:parse ref and comma-sep'd alt allele string into count and array inside record, so callers don't all have to parse the comma-sep'd alternate allele string. diff --git src/lib/vcf.c src/lib/vcf.c index b93295f..a5432fd 100644 --- src/lib/vcf.c +++ src/lib/vcf.c @@ -414,30 +414,46 @@ if (vcff->majorVersion == 0) vcfFileErr(vcff, "missing ##fileformat= header line? Assuming 4.1."); if ((vcff->majorVersion != 4 || (vcff->minorVersion != 0 && vcff->minorVersion != 1)) && (vcff->majorVersion != 3)) vcfFileErr(vcff, "VCFv%d.%d not supported -- only v3.*, v4.0 or v4.1", vcff->majorVersion, vcff->minorVersion); // Next, one header line beginning with single "#" that names the columns: if (line == NULL) // EOF after metadata return vcff; parseColumnHeaderRow(vcff, line); return vcff; } +#define VCF_MAX_INFO 512 + +static void parseRefAndAlt(struct vcfFile *vcff, struct vcfRecord *record, char *ref, char *alt) +/* Make an array of alleles, ref first, from the REF and comma-sep'd ALT columns. + * Note: this trashes the alt argument, since this is expected to be its last use. */ +{ +char *altAlleles[VCF_MAX_INFO]; +int altCount = chopCommas(alt, altAlleles); +record->alleleCount = 1 + altCount; +record->alleles = vcfFileAlloc(vcff, record->alleleCount * sizeof(record->alleles[0])); +record->alleles[0] = vcfFilePooledStr(vcff, ref); +int i; +for (i = 0; i < altCount; i++) + record->alleles[1+i] = vcfFilePooledStr(vcff, altAlleles[i]); +} + static void parseFilterColumn(struct vcfFile *vcff, struct vcfRecord *record, char *filterStr) /* Transform ;-separated filter codes into count + string array. */ { // We don't want to modify something allocated with vcfFilePooledStr because that uses // hash element names for storage! So don't make a vcfFilePooledStr copy of filterStr and // chop that; instead, chop a temp string and pool the words separately. static struct dyString *tmp = NULL; if (tmp == NULL) tmp = dyStringNew(0); dyStringClear(tmp); dyStringAppend(tmp, filterStr); record->filterCount = countChars(filterStr, ';') + 1; record->filters = vcfFileAlloc(vcff, record->filterCount * sizeof(char **)); (void)chopByChar(tmp->string, ';', record->filters, record->filterCount); int i; @@ -466,34 +482,32 @@ static enum vcfInfoType typeForInfoKey(struct vcfFile *vcff, const char *key) /* Look up the type of INFO component key, in the definitions from the header, * and failing that, from the keys reserved in the spec. */ { struct vcfInfoDef *def = vcfInfoDefForKey(vcff, key); if (def == NULL) { vcfFileErr(vcff, "There is no INFO header defining \"%s\"", key); // default to string so we can display value as-is: return vcfInfoString; } return def->type; } -#define VCF_MAX_INFO 512 - -int parseInfoValue(struct vcfRecord *record, char *infoKey, enum vcfInfoType type, char *valStr, - union vcfDatum **pData) +static int parseInfoValue(struct vcfRecord *record, char *infoKey, enum vcfInfoType type, + char *valStr, union vcfDatum **pData) /* Parse a comma-separated list of values into array of union vcfInfoDatum and return count. */ { char *valWords[VCF_MAX_INFO]; int count = chopCommas(valStr, valWords); struct vcfFile *vcff = record->file; union vcfDatum *data = vcfFileAlloc(vcff, count * sizeof(union vcfDatum)); int j; for (j = 0; j < count; j++) switch (type) { case vcfInfoInteger: data[j].datInt = atoi(valWords[j]); break; case vcfInfoFloat: data[j].datFloat = atof(valWords[j]); @@ -576,32 +590,31 @@ if (vcff->genotypeCount > 0) expected = 9 + vcff->genotypeCount; char *words[VCF_MAX_COLUMNS]; int wordCount; while ((wordCount = lineFileChop(vcff->lf, words)) > 0) { lineFileExpectWords(vcff->lf, expected, wordCount); struct vcfRecord *record; AllocVar(record); record->file = vcff; record->chrom = vcfFilePooledStr(vcff, words[0]); record->chromStart = lineFileNeedNum(vcff->lf, words, 1) - 1; // chromEnd may be modified by parseInfoColumn, if INFO column includes END. record->chromEnd = record->chromStart + 1; record->name = vcfFilePooledStr(vcff, words[2]); - record->ref = vcfFilePooledStr(vcff, words[3]); - record->alt = vcfFilePooledStr(vcff, words[4]); + parseRefAndAlt(vcff, record, words[3], words[4]); record->qual = vcfFilePooledStr(vcff, words[5]); parseFilterColumn(vcff, record, words[6]); parseInfoColumn(vcff, record, words[7]); if (vcff->genotypeCount > 0) { record->format = vcfFilePooledStr(vcff, words[8]); record->genotypeUnparsedStrings = vcfFileAlloc(vcff, vcff->genotypeCount * sizeof(char *)); int i; // Don't bother actually parsing all these until & unless we need the info: for (i = 0; i < vcff->genotypeCount; i++) record->genotypeUnparsedStrings[i] = vcfFileCloneStr(vcff, words[9+i]); } slAddHead(&(vcff->records), record); }