3f6e034cd0844d8e99a69e56af72e8c4cce98a9c chmalee Tue Jan 30 15:55:18 2024 -0800 Hash INFO keys during header parsing for faster lookups later when loading records diff --git src/lib/vcf.c src/lib/vcf.c index 6578ff7..e34a580 100644 --- src/lib/vcf.c +++ src/lib/vcf.c @@ -341,30 +341,31 @@ // yet seen. Why is there a G here -- shouldn't such attributes go in the // genotype columns? def->fieldCount = -1; else def->fieldCount = atoi(number); def->type = vcfInfoTypeFromSubstr(vcff, line, substrs[4]); // greedy regex pulls in end quote, trim if found: if (line[substrs[5].rm_eo-1] == '"') line[substrs[5].rm_eo-1] = '\0'; def->description = vcfFileCloneSubstr(vcff, line, substrs[5]); char *p = NULL; if ((p = strstr(def->description, "\",Source=\"")) || (p = strstr(def->description, "\",Version=\""))) *p = '\0'; slAddHead((isInfo ? &(vcff->infoDefs) : &(vcff->gtFormatDefs)), def); + if (isInfo) hashAdd(vcff->infoDefHash, def->key, def); } else vcfFileErr(vcff, "##%s line does not match expected pattern /%s/ or /%s/: \"%s\"", (isInfo ? "INFO" : "FORMAT"), infoOrFormatRegex, infoOrFormatRegex3_3, line); } else if (startsWith("##FILTER=", line) || startsWith("##ALT=", line)) { boolean isFilter = startsWith("##FILTER", line); if (regexMatchSubstr(line, filterOrAltRegex, substrs, ArraySize(substrs)) || regexMatchSubstr(line, filterRegex3_3, substrs, ArraySize(substrs))) { // substrs[2] is ID/key, substrs[4] is Description. struct vcfInfoDef *def = vcfFileAlloc(vcff, sizeof(struct vcfInfoDef)); def->key = vcfFileCloneSubstr(vcff, line, substrs[2]); // greedy regex pulls in end quote, trim if found: @@ -473,30 +474,33 @@ static struct vcfFile *vcfFileHeaderFromLineFile(struct lineFile *lf, int maxErr) /* Parse a VCF file into a vcfFile object. If maxErr not zero, then * continue to parse until this number of error have been reached. A maxErr * less than zero does not stop and reports all errors. * Set maxErr to VCF_IGNORE_ERRS for silence */ { initVcfSpecInfoDefs(); initVcfSpecGtFormatDefs(); if (lf == NULL) return NULL; struct vcfFile *vcff = vcfFileNew(); vcff->lf = lf; vcff->fileOrUrl = vcfFileCloneStr(vcff, lf->fileName); vcff->maxErr = (maxErr < 0) ? INT_MAX : maxErr; +// keep a hash of the INFO keys +vcff->infoDefHash = hashNew(0); + struct dyString *dyHeader = dyStringNew(1024); char *line = NULL; // First, metadata lines beginning with "##": while (lineFileNext(lf, &line, NULL) && startsWith("##", line)) { dyStringAppend(dyHeader, line); dyStringAppendC(dyHeader, '\n'); parseMetadataLine(vcff, line); } slReverse(&(vcff->infoDefs)); slReverse(&(vcff->filterDefs)); slReverse(&(vcff->gtFormatDefs)); // Did we get the bare minimum VCF header with supported version? if (vcff->majorVersion == 0) { @@ -569,45 +573,33 @@ if (tmp == NULL) tmp = dyStringNew(0); dyStringClear(tmp); dyStringAppend(tmp, filterStr); record->filterCount = countChars(filterStr, ';') + 1; record->filters = vcfFileAlloc(vcff, record->filterCount * sizeof(char **)); (void)chopByChar(tmp->string, ';', record->filters, record->filterCount); int i; for (i = 0; i < record->filterCount; i++) record->filters[i] = vcfFilePooledStr(vcff, record->filters[i]); } struct vcfInfoDef *vcfInfoDefForKey(struct vcfFile *vcff, const char *key) /* Return infoDef for key, or NULL if it wasn't specified in the header or VCF spec. */ { -struct vcfInfoDef *def; -// I expect there to be fairly few definitions (less than a dozen) so -// I'm just doing a linear search not hash: -for (def = vcff->infoDefs; def != NULL; def = def->next) - { - if (sameString(key, def->key)) - return def; - } -for (def = vcfSpecInfoDefs; def != NULL; def = def->next) - { - if (sameString(key, def->key)) +struct vcfInfoDef *def = hashFindVal(vcff->infoDefHash, (char *)key); return def; } -return NULL; -} static enum vcfInfoType typeForInfoKey(struct vcfFile *vcff, const char *key) /* Look up the type of INFO component key, in the definitions from the header, * and failing that, from the keys reserved in the spec. */ { struct vcfInfoDef *def = vcfInfoDefForKey(vcff, key); return def ? def->type : vcfInfoString; } static int parseInfoValue(struct vcfRecord *record, char *infoKey, enum vcfInfoType type, char *valStr, union vcfDatum **pData, bool **pMissingData) /* Parse a comma-separated list of values into array of union vcfInfoDatum and return count. */ { char *valWords[VCF_MAX_INFO]; int count = chopCommas(valStr, valWords);