39b3ad00ec43efa3dbd5a8b0836d7a6a00776528 angie Wed Jul 29 13:28:23 2020 -0700 Added vcfParseGenotypesGtOnly to speed up parsing when we have tens of thousands of genotype columns and just want the genotypes not any associated details. refs #25943 diff --git src/lib/vcf.c src/lib/vcf.c index 2e1c746..b38cbc6 100644 --- src/lib/vcf.c +++ src/lib/vcf.c @@ -1446,30 +1446,61 @@ gt->id, VCF_MAX_INFO); } if (!foundGT) //try SGT { parseSgtAsGt(record, gt); if (gt->hapIxA != -1) foundGT = TRUE; } if (i == 0 && !foundGT) vcfFileErr(vcff, "Genotype FORMAT column includes neither GT nor PL; unable to parse genotypes."); } record->genotypeUnparsedStrings = NULL; } +void vcfParseGenotypesGtOnly(struct vcfRecord *record) +/* Translate record->genotypesUnparsedStrings[] into proper struct vcfGenotype[], but ignore + * genotype info elements, IDs, etc; parse only the genotypes (e.g. for quick display in hgTracks). + * This destroys genotypesUnparsedStrings. */ +{ +if (record->genotypeUnparsedStrings == NULL) + return; +struct vcfFile *vcff = record->file; +record->genotypes = vcfFileAlloc(vcff, vcff->genotypeCount * sizeof(struct vcfGenotype)); +int i; +for (i = 0; i < vcff->genotypeCount; i++) + { + // Parse (.|[0-9])([/|](.|[0-9]))? in first one to four characters + char *string = record->genotypeUnparsedStrings[i]; + struct vcfGenotype *gt = &(record->genotypes[i]); + gt->hapIxA = gt->hapIxB = -1; + if (string[0] != '.' && isdigit(string[0])) + gt->hapIxA = atoi(string); + if (string[1] == '/' || string[1] == '|') + { + if (isdigit(string[2])) + gt->hapIxB = atoi(string+2); + if (string[1] == '|') + gt->isPhased = TRUE; + } + else + gt->isHaploid = TRUE; + } +record->genotypeUnparsedStrings = NULL; +} + const struct vcfGenotype *vcfRecordFindGenotype(struct vcfRecord *record, char *sampleId) /* Find the genotype and associated info for the individual, or return NULL. * This calls vcfParseGenotypes if it has not already been called. */ { struct vcfFile *vcff = record->file; if (sampleId == NULL || vcff->genotypeCount == 0) return NULL; vcfParseGenotypes(record); int ix = stringArrayIx(sampleId, vcff->genotypeIds, vcff->genotypeCount); if (ix >= 0) return &(record->genotypes[ix]); return NULL; } const struct vcfInfoElement *vcfGenotypeFindInfo(const struct vcfGenotype *gt, char *key)