d68c0ac5daa333dcaa7a9f49e0b554bfd91f05f5
angie
Thu Dec 12 14:52:56 2019 -0800
Report genotype allele counts and genotype counts correctly when there are more than two alleles. refs #24623
diff --git src/hg/hgc/vcfClick.c src/hg/hgc/vcfClick.c
index f9e7eea..7f85d50 100644
--- src/hg/hgc/vcfClick.c
+++ src/hg/hgc/vcfClick.c
@@ -312,116 +312,153 @@
}
printf("");
}
puts("");
}
hTableEnd();
jsEndCollapsibleSection();
}
static void ignoreEm(char *format, va_list args)
/* Ignore warnings from genotype parsing -- when there's one, there
* are usually hundreds more just like it. */
{
}
+static int genotypeIndex(int h0Ix, int h1Ix)
+/* Return the index in a linear array of distinct genotypes, given two 0-based allele indexes.
+ * This follows the following convention used by GnomAD (GATK?), that has the advantage that
+ * gt indexes of small numbers don't change as the number of alleles increases, and also matches
+ * the ref/ref, ref/alt, alt/alt convention for biallelic variants:
+ * 0/0,
+ * 0/1, 1/1,
+ * 0/2, 1/2, 2/2,
+ * 0/3, 1/3, 2/3, 3/3,
+ * ... */
+{
+// Note that in that scheme, h0Ix <= h1Ix; if h0Ix > h1Ix, swap them.
+if (h0Ix > h1Ix)
+ {
+ int tmp = h0Ix;
+ h0Ix = h1Ix;
+ h1Ix = tmp;
+ }
+return (h1Ix * (h1Ix+1) / 2) + h0Ix;
+}
+
+static int genotypeArraySize(int alleleCount)
+/* Return the number of distinct genotypes given the number of alleles. That is the same as the
+ * array index of the first genotype that would follow for alleleCount+1. */
+{
+return genotypeIndex(0, alleleCount + 1);
+}
+
static void vcfGenotypesDetails(struct vcfRecord *rec, struct trackDb *tdb, char **displayAls)
/* Print summary of allele and genotype frequency, plus collapsible section
* with table of genotype details. */
{
struct vcfFile *vcff = rec->file;
if (vcff->genotypeCount == 0)
return;
// Wrapper table for collapsible section:
puts("
");
pushWarnHandler(ignoreEm);
vcfParseGenotypes(rec);
popWarnHandler();
-// Tally genotypes and alleles for summary:
-int refs = 0, alts = 0, unks = 0;
-int refRefs = 0, refAlts = 0, altAlts = 0, gtUnk = 0, gtOther = 0, phasedGts = 0;
+// Tally genotypes and alleles for summary, adding 1 to rec->alleleCount to represent missing data:
+int alCounts[rec->alleleCount + 1];
+zeroBytes(alCounts, sizeof alCounts);
+int numGenotypes = genotypeArraySize(rec->alleleCount + 1);
+int gtCounts[numGenotypes];
+zeroBytes(gtCounts, sizeof gtCounts);
+int phasedGts = 0, diploidCount = 0;
int i;
for (i = 0; i < vcff->genotypeCount; i++)
{
struct vcfGenotype *gt = &(rec->genotypes[i]);
if (gt->isPhased)
phasedGts++;
- if (gt->hapIxA == 0)
- refs++;
- else if (gt->hapIxA > 0)
- alts++;
- else
- unks++;
+ int hapIxA = gt->hapIxA;
+ if (hapIxA < 0)
+ hapIxA = rec->alleleCount;
+ alCounts[hapIxA]++;
if (!gt->isHaploid)
{
- if (gt->hapIxB == 0)
- refs++;
- else if (gt->hapIxB > 0)
- alts++;
- else
- unks++;
- if (gt->hapIxA == 0 && gt->hapIxB == 0)
- refRefs++;
- else if (gt->hapIxA == 1 && gt->hapIxB == 1)
- altAlts++;
- else if ((gt->hapIxA == 1 && gt->hapIxB == 0) ||
- (gt->hapIxA == 0 && gt->hapIxB == 1))
- refAlts++;
- else if (gt->hapIxA < 0 || gt->hapIxB < 0)
- gtUnk++;
- else
- gtOther++;
+ diploidCount++;
+ int hapIxB = gt->hapIxB;
+ if (hapIxB < 0)
+ hapIxB = rec->alleleCount;
+ alCounts[hapIxB]++;
+ gtCounts[genotypeIndex(hapIxA, hapIxB)]++;
}
}
printf("Genotype count: %d", vcff->genotypeCount);
-if (differentString(seqName, "chrY"))
- printf(" (%d phased)", phasedGts);
-else
+if (sameString(seqName, "chrY"))
printf(" (haploid)");
+else if (sameString(seqName, "chrX"))
+ printf(" (%d phased, %d diploid, %d haploid)", phasedGts, diploidCount,
+ vcff->genotypeCount - diploidCount);
+else
+ printf(" (%d phased)", phasedGts);
puts("
");
-int totalAlleles = refs + alts + unks;
-double refAf = (double)refs/totalAlleles;
-double altAf = (double)alts/totalAlleles;
-printf("Alleles: %s: %d (%.3f%%); %s: %d (%.3f%%)",
- displayAls[0], refs, 100*refAf, displayAls[1], alts, 100*altAf);
-if (unks > 0)
- printf("; unknown: %d (%.3f%%)", unks, 100 * (double)unks/totalAlleles);
+int totalAlleles = vcff->genotypeCount + diploidCount;
+double refAf = (double)alCounts[0]/totalAlleles;
+printf("Alleles: %s: %d (%.3f%%)", displayAls[0], alCounts[0], 100*refAf);
+for (i = 1; i < rec->alleleCount; i++)
+ {
+ double altAf = (double)alCounts[i]/totalAlleles;
+ printf("; %s: %d (%.3f%%)", displayAls[i], alCounts[i], 100*altAf);
+ }
+if (alCounts[rec->alleleCount] > 0)
+ printf("; unknown: %d (%.3f%%)", alCounts[rec->alleleCount],
+ 100 * (double)alCounts[rec->alleleCount]/totalAlleles);
puts("
");
-// Should be a better way to detect haploid chromosomes than comparison with "chrY":
-if (vcff->genotypeCount > 1 && differentString(seqName, "chrY"))
- {
- printf("Genotypes: %s/%s: %d (%.3f%%); %s/%s: %d (%.3f%%); %s/%s: %d (%.3f%%)",
- displayAls[0], displayAls[0], refRefs, 100*(double)refRefs/vcff->genotypeCount,
- displayAls[0], displayAls[1], refAlts, 100*(double)refAlts/vcff->genotypeCount,
- displayAls[1], displayAls[1], altAlts, 100*(double)altAlts/vcff->genotypeCount);
- if (gtUnk > 0)
- printf("; unknown: %d (%.3f%%)", gtUnk, 100*(double)gtUnk/vcff->genotypeCount);
- if (gtOther > 0)
- printf("; other: %d (%.3f%%)", gtOther, 100*(double)gtOther/vcff->genotypeCount);
+if (vcff->genotypeCount > 1 && diploidCount > 0)
+ {
+ printf("Genotypes: %s/%s: %d (%.3f%%)",
+ displayAls[0], displayAls[0], gtCounts[0], 100*(double)gtCounts[0]/diploidCount);
+ for (i = 1; i < rec->alleleCount + 1; i++)
+ {
+ int j;
+ for (j = 0; j <= i; j++)
+ {
+ int gtIx = genotypeIndex(j, i);
+ if (gtCounts[gtIx] > 0)
+ {
+ char *alJ = (j == rec->alleleCount) ? "?" : displayAls[j];
+ char *alI = (i == rec->alleleCount) ? "?" : displayAls[i];
+ printf("; %s/%s: %d (%.3f%%)", alJ, alI, gtCounts[gtIx],
+ 100*(double)gtCounts[gtIx]/diploidCount);
+ }
+ }
+ }
printf("
\n");
if (rec->alleleCount == 2)
{
boolean showHW = cartOrTdbBoolean(cart, tdb, VCF_SHOW_HW_VAR, FALSE);
if (showHW)
+ {
+ double altAf = (double)alCounts[1]/totalAlleles;
printf("Hardy-Weinberg equilibrium: "
"P(%s/%s) = %.3f%%; P(%s/%s) = %.3f%%; P(%s/%s) = %.3f%%
",
displayAls[0], displayAls[0], 100*refAf*refAf,
displayAls[0], displayAls[1], 100*2*refAf*altAf,
displayAls[1], displayAls[1], 100*altAf*altAf);
}
}
+ }
puts("
");
vcfGenotypeTable(rec, tdb->track, displayAls);
puts("
");
}
static void pgSnpCodingDetail(struct vcfRecord *rec)
/* Translate rec into pgSnp (with proper chrom name) and call Belinda's
* coding effect predictor from pgSnp details. */
{
char *genePredTable = "knownGene";
if (hTableExists(database, genePredTable))
{
struct pgSnp *pgs = pgSnpFromVcfRecord(rec);
if (!sameString(rec->chrom, seqName))
// rec->chrom might be missing "chr" prefix: