5c8f8925eb335bb375595e839368c3f7594ea7e4 angie Mon Mar 16 12:24:10 2020 -0700 Libify code that counts genotypes correctly from vcfClick.c and use it in vcfTrack.c. refs #24623, fixes #25165 diff --git src/lib/vcf.c src/lib/vcf.c index 0213055..2fb4688 100644 --- src/lib/vcf.c +++ src/lib/vcf.c @@ -1463,30 +1463,98 @@ if (ix >= 0) return &(record->genotypes[ix]); return NULL; } const struct vcfInfoElement *vcfGenotypeFindInfo(const struct vcfGenotype *gt, char *key) /* Find the genotype infoElement for key, or return NULL. */ { int i; for (i = 0; i < gt->infoCount; i++) if (sameString(key, gt->infoElements[i].key)) return >->infoElements[i]; return NULL; } +int vcfGenotypeIndex(int h0Ix, int h1Ix) +/* Return the index in a linear array of distinct genotypes, given two 0-based allele indexes. + * This follows the following convention used by GnomAD (GATK?), that has the advantage that + * gt indexes of small numbers don't change as the number of alleles increases, and also matches + * the ref/ref, ref/alt, alt/alt convention for biallelic variants: + * 0/0, + * 0/1, 1/1, + * 0/2, 1/2, 2/2, + * 0/3, 1/3, 2/3, 3/3, + * ... */ +{ +// Note that in that scheme, h0Ix <= h1Ix; if h0Ix > h1Ix, swap them. +if (h0Ix > h1Ix) + { + int tmp = h0Ix; + h0Ix = h1Ix; + h1Ix = tmp; + } +return (h1Ix * (h1Ix+1) / 2) + h0Ix; +} + +static int genotypeArraySize(int alleleCount) +/* Return the number of distinct genotypes given the number of alleles. That is the same as the + * array index of the first genotype that would follow for alleleCount+1. */ +{ +return vcfGenotypeIndex(0, alleleCount + 1); +} + +void vcfCountGenotypes(struct vcfRecord *rec, int **retGtCounts, int **retAlleleCounts, + int *retPhasedCount, int *retDiploidCount) +/* Tally genotypes and alleles for summary, adding 1 to rec->alleleCount to represent missing data */ +{ +vcfParseGenotypes(rec); +int *alCounts; +AllocArray(alCounts, rec->alleleCount + 1); +int *gtCounts; +AllocArray(gtCounts, genotypeArraySize(rec->alleleCount + 1)); +int phasedGts = 0, diploidCount = 0; +int i; +for (i = 0; i < rec->file->genotypeCount; i++) + { + struct vcfGenotype *gt = &(rec->genotypes[i]); + if (gt->isPhased) + phasedGts++; + int hapIxA = gt->hapIxA; + if (hapIxA < 0) + hapIxA = rec->alleleCount; + alCounts[hapIxA]++; + if (!gt->isHaploid) + { + diploidCount++; + int hapIxB = gt->hapIxB; + if (hapIxB < 0) + hapIxB = rec->alleleCount; + alCounts[hapIxB]++; + gtCounts[vcfGenotypeIndex(hapIxA, hapIxB)]++; + } + } +if (retGtCounts) + *retGtCounts = gtCounts; +if (retAlleleCounts) + *retAlleleCounts = alCounts; +if (retPhasedCount) + *retPhasedCount = phasedGts; +if (retDiploidCount) + *retDiploidCount = diploidCount; +} + static char *vcfDataLineAutoSqlString = "table vcfDataLine" "\"The fields of a Variant Call Format data line\"" " (" " string chrom; \"An identifier from the reference genome\"" " uint pos; \"The reference position, with the 1st base having position 1\"" " string id; \"Semi-colon separated list of unique identifiers where available\"" " string ref; \"Reference base(s)\"" " string alt; \"Comma separated list of alternate non-reference alleles " "called on at least one of the samples\"" " string qual; \"Phred-scaled quality score for the assertion made in ALT. i.e. " "give -10log_10 prob(call in ALT is wrong)\"" " string filter; \"PASS if this position has passed all filters. Otherwise, a " "semicolon-separated list of codes for filters that fail\"" " string info; \"Additional information encoded as a semicolon-separated series "