3f6e034cd0844d8e99a69e56af72e8c4cce98a9c
chmalee
  Tue Jan 30 15:55:18 2024 -0800
Hash INFO keys during header parsing for faster lookups later when loading records

diff --git src/lib/vcf.c src/lib/vcf.c
index 6578ff7..e34a580 100644
--- src/lib/vcf.c
+++ src/lib/vcf.c
@@ -341,30 +341,31 @@
 	    // yet seen.  Why is there a G here -- shouldn't such attributes go in the
 	    // genotype columns?
 	    def->fieldCount = -1;
 	else
 	    def->fieldCount = atoi(number);
 	def->type = vcfInfoTypeFromSubstr(vcff, line, substrs[4]);
 	// greedy regex pulls in end quote, trim if found:
 	if (line[substrs[5].rm_eo-1] == '"')
 	    line[substrs[5].rm_eo-1] = '\0';
 	def->description = vcfFileCloneSubstr(vcff, line, substrs[5]);
         char *p = NULL;
         if ((p = strstr(def->description, "\",Source=\"")) ||
             (p = strstr(def->description, "\",Version=\"")))
             *p = '\0';
 	slAddHead((isInfo ? &(vcff->infoDefs) : &(vcff->gtFormatDefs)), def);
+        if (isInfo) hashAdd(vcff->infoDefHash, def->key, def);
 	}
     else
 	vcfFileErr(vcff, "##%s line does not match expected pattern /%s/ or /%s/: \"%s\"",
 		   (isInfo ? "INFO" : "FORMAT"), infoOrFormatRegex, infoOrFormatRegex3_3, line);
     }
 else if (startsWith("##FILTER=", line) || startsWith("##ALT=", line))
     {
     boolean isFilter = startsWith("##FILTER", line);
     if (regexMatchSubstr(line, filterOrAltRegex, substrs, ArraySize(substrs)) ||
 	regexMatchSubstr(line, filterRegex3_3, substrs, ArraySize(substrs)))
 	{
 	// substrs[2] is ID/key, substrs[4] is Description.
 	struct vcfInfoDef *def = vcfFileAlloc(vcff, sizeof(struct vcfInfoDef));
 	def->key = vcfFileCloneSubstr(vcff, line, substrs[2]);
 	// greedy regex pulls in end quote, trim if found:
@@ -473,30 +474,33 @@
 static struct vcfFile *vcfFileHeaderFromLineFile(struct lineFile *lf, int maxErr)
 /* Parse a VCF file into a vcfFile object.  If maxErr not zero, then
  * continue to parse until this number of error have been reached.  A maxErr
  * less than zero does not stop and reports all errors.
  * Set maxErr to VCF_IGNORE_ERRS for silence */
 {
 initVcfSpecInfoDefs();
 initVcfSpecGtFormatDefs();
 if (lf == NULL)
     return NULL;
 struct vcfFile *vcff = vcfFileNew();
 vcff->lf = lf;
 vcff->fileOrUrl = vcfFileCloneStr(vcff, lf->fileName);
 vcff->maxErr = (maxErr < 0) ? INT_MAX : maxErr;
 
+// keep a hash of the INFO keys
+vcff->infoDefHash = hashNew(0);
+
 struct dyString *dyHeader = dyStringNew(1024);
 char *line = NULL;
 // First, metadata lines beginning with "##":
 while (lineFileNext(lf, &line, NULL) && startsWith("##", line))
     {
     dyStringAppend(dyHeader, line);
     dyStringAppendC(dyHeader, '\n');
     parseMetadataLine(vcff, line);
     }
 slReverse(&(vcff->infoDefs));
 slReverse(&(vcff->filterDefs));
 slReverse(&(vcff->gtFormatDefs));
 // Did we get the bare minimum VCF header with supported version?
 if (vcff->majorVersion == 0)
     {
@@ -569,45 +573,45 @@
 if (tmp == NULL)
     tmp = dyStringNew(0);
 dyStringClear(tmp);
 dyStringAppend(tmp, filterStr);
 record->filterCount = countChars(filterStr, ';') + 1;
 record->filters = vcfFileAlloc(vcff, record->filterCount * sizeof(char **));
 (void)chopByChar(tmp->string, ';', record->filters, record->filterCount);
 int i;
 for (i = 0;  i < record->filterCount;  i++)
     record->filters[i] = vcfFilePooledStr(vcff, record->filters[i]);
 }
 
 struct vcfInfoDef *vcfInfoDefForKey(struct vcfFile *vcff, const char *key)
 /* Return infoDef for key, or NULL if it wasn't specified in the header or VCF spec. */
 {
-struct vcfInfoDef *def;
-// I expect there to be fairly few definitions (less than a dozen) so
-// I'm just doing a linear search not hash:
-for (def = vcff->infoDefs;  def != NULL;  def = def->next)
-    {
-    if (sameString(key, def->key))
-	return def;
-    }
+struct vcfInfoDef *def = NULL;
+// INFO keys declared in the header are hashed while the header is parsed.
+// Guard against a vcfFile whose hash was never created (hashFindVal needs a hash):
+if (vcff->infoDefHash != NULL)
+    def = hashFindVal(vcff->infoDefHash, (char *)key);
+if (def != NULL)
+    return def;
+// Not declared in the header, so fall back on the keys reserved by the VCF spec:
 for (def = vcfSpecInfoDefs;  def != NULL;  def = def->next)
     {
     if (sameString(key, def->key))
 	return def;
     }
 return NULL;
 }
 
 static enum vcfInfoType typeForInfoKey(struct vcfFile *vcff, const char *key)
 /* Look up the type of INFO component key, in the definitions from the header,
  * and failing that, from the keys reserved in the spec. */
 {
 struct vcfInfoDef *def = vcfInfoDefForKey(vcff, key);
 return def ? def->type : vcfInfoString;
 }
 
 static int parseInfoValue(struct vcfRecord *record, char *infoKey, enum vcfInfoType type,
 			  char *valStr, union vcfDatum **pData, bool **pMissingData)
 /* Parse a comma-separated list of values into array of union vcfInfoDatum and return count. */
 {
 char *valWords[VCF_MAX_INFO];
 int count = chopCommas(valStr, valWords);