39b3ad00ec43efa3dbd5a8b0836d7a6a00776528 angie Wed Jul 29 13:28:23 2020 -0700 Added vcfParseGenotypesGtOnly to speed up parsing when we have tens of thousands of genotype columns and just want the genotypes, not any associated details. refs #25943 diff --git src/inc/vcf.h src/inc/vcf.h index cb6f07a..7450ad6 100644 --- src/inc/vcf.h +++ src/inc/vcf.h @@ -287,30 +287,35 @@ /* Free a vcfFile object. */ const struct vcfRecord *vcfFileFindVariant(struct vcfFile *vcff, char *variantId); /* Return all records with name=variantId, or NULL if not found. */ const struct vcfInfoElement *vcfRecordFindInfo(const struct vcfRecord *record, char *key); /* Find an INFO element, or NULL. */ struct vcfInfoDef *vcfInfoDefForKey(struct vcfFile *vcff, const char *key); /* Return infoDef for key, or NULL if it wasn't specified in the header or VCF spec. */ void vcfParseGenotypes(struct vcfRecord *record); /* Translate record->genotypesUnparsedStrings[] into proper struct vcfGenotype[]. * This destroys genotypesUnparsedStrings. */ +void vcfParseGenotypesGtOnly(struct vcfRecord *record); +/* Translate record->genotypesUnparsedStrings[] into proper struct vcfGenotype[], but ignore + * genotype info elements, IDs, etc; parse only the actual genotypes (e.g. for quick display + * in hgTracks). This destroys genotypesUnparsedStrings. */ + const struct vcfGenotype *vcfRecordFindGenotype(struct vcfRecord *record, char *sampleId); /* Find the genotype and associated info for the individual, or return NULL. * This calls vcfParseGenotypes if it has not already been called. */ struct vcfInfoDef *vcfInfoDefForGtKey(struct vcfFile *vcff, const char *key); /* Look up the type of genotype FORMAT component key, in the definitions from the header, * and failing that, from the keys reserved in the spec. */ const struct vcfInfoElement *vcfGenotypeFindInfo(const struct vcfGenotype *gt, char *key); /* Find the genotype infoElement for key, or return NULL. 
*/ int vcfGenotypeIndex(int h0Ix, int h1Ix); /* Return the index in a linear array of distinct genotypes, given two 0-based allele indexes. * This follows the following convention used by GnomAD (GATK?), that has the advantage that * gt indexes of small numbers don't change as the number of alleles increases, and also matches