3f6e034cd0844d8e99a69e56af72e8c4cce98a9c chmalee Tue Jan 30 15:55:18 2024 -0800
Hash INFO keys during header parsing for faster lookups later when loading records

diff --git src/inc/vcf.h src/inc/vcf.h
index 2e5eb5d..e5f210e 100644
--- src/inc/vcf.h
+++ src/inc/vcf.h
@@ -87,30 +87,31 @@
     struct vcfFile *file;  // Pointer back to parent vcfFile
 };
 
 struct vcfFile
 /* Info extracted from a VCF file. Manages all memory for contents.
  * Clearly borrowing structure from MarkD's gff3File. :) */
 {
     char *fileOrUrl;                  // VCF local file path or URL
     char *headerString;               // Complete original header including newlines.
     int majorVersion;                 // 4 etc.
     int minorVersion;                 // 0, 1 etc.
     struct vcfInfoDef *infoDefs;      // Header's definitions of INFO column components
     struct vcfInfoDef *filterDefs;    // Header's definitions of FILTER column failure codes
     struct vcfInfoDef *altDefs;       // Header's defs of symbolic alternate alleles (e.g. DEL, INS)
     struct vcfInfoDef *gtFormatDefs;  // Header's defs of GENOTYPE compnts. listed in FORMAT col.
+    struct hash *infoDefHash;         // Hash of all INFO keys, as there can be hundreds of them
     bool allPhased;                   // True if all record->genotypes have been phased
     int genotypeCount;                // Number of optional genotype columns described in header
     char **genotypeIds;               // Array of optional genotype column names described in header
     struct vcfRecord *records;        // VCF data rows, sorted by position
     struct hash *byName;              // Hash records by name -- not populated until needed.
     struct hash *pool;                // Used to allocate string values that tend to
                                       // be repeated in the files. hash's localMem is also used to
                                       // allocated memory for all other objects (if recordPool null)
     struct lm *reusePool;             // If created with vcfFileMakeReusePool, non-shared record data is
                                       // allocated from this pool. Useful when walking through huge files.
     struct lineFile *lf;              // Used only during parsing
     int maxErr;                       // Maximum number of errors before aborting
     int errCnt;                       // Error count
 };
 