5c8f8925eb335bb375595e839368c3f7594ea7e4
angie
  Mon Mar 16 12:24:10 2020 -0700
Libify code that counts genotypes correctly from vcfClick.c and use it in vcfTrack.c.  refs #24623, fixes #25165

diff --git src/lib/vcf.c src/lib/vcf.c
index 0213055..2fb4688 100644
--- src/lib/vcf.c
+++ src/lib/vcf.c
@@ -1463,30 +1463,98 @@
 if (ix >= 0)
     return &(record->genotypes[ix]);
 return NULL;
 }
 
 const struct vcfInfoElement *vcfGenotypeFindInfo(const struct vcfGenotype *gt, char *key)
 /* Find the genotype infoElement for key, or return NULL. */
 {
 int i;
 for (i = 0;  i < gt->infoCount;  i++)
     if (sameString(key, gt->infoElements[i].key))
         return &gt->infoElements[i];
 return NULL;
 }
 
+int vcfGenotypeIndex(int h0Ix, int h1Ix)
+/* Return the index in a linear array of distinct genotypes, given two 0-based allele indexes.
+ * This follows the following convention used by GnomAD (GATK?), that has the advantage that
+ * gt indexes of small numbers don't change as the number of alleles increases, and also matches
+ * the ref/ref, ref/alt, alt/alt convention for biallelic variants:
+ * 0/0,
+ * 0/1, 1/1,
+ * 0/2, 1/2, 2/2,
+ * 0/3, 1/3, 2/3, 3/3,
+ * ... */
+{
+// Note that in that scheme, h0Ix <= h1Ix; if h0Ix > h1Ix, swap them.
+if (h0Ix > h1Ix)
+    {
+    int tmp = h0Ix;
+    h0Ix = h1Ix;
+    h1Ix = tmp;
+    }
+return (h1Ix * (h1Ix+1) / 2) + h0Ix;
+}
+
+static int genotypeArraySize(int alleleCount)
+/* Return the number of distinct genotypes given the number of alleles.  That is the same as the
+ * array index of the first genotype that would follow for alleleCount+1. */
+{
+return vcfGenotypeIndex(0, alleleCount + 1);
+}
+
+void vcfCountGenotypes(struct vcfRecord *rec, int **retGtCounts, int **retAlleleCounts,
+                       int *retPhasedCount, int *retDiploidCount)
+/* Tally genotypes and alleles for summary, adding 1 to rec->alleleCount to represent missing data */
+{
+vcfParseGenotypes(rec);
+int *alCounts;
+AllocArray(alCounts, rec->alleleCount + 1);
+int *gtCounts;
+AllocArray(gtCounts, genotypeArraySize(rec->alleleCount + 1));
+int phasedGts = 0, diploidCount = 0;
+int i;
+for (i = 0;  i < rec->file->genotypeCount;  i++)
+    {
+    struct vcfGenotype *gt = &(rec->genotypes[i]);
+    if (gt->isPhased)
+	phasedGts++;
+    int hapIxA = gt->hapIxA;
+    if (hapIxA < 0)
+        hapIxA = rec->alleleCount;
+    alCounts[hapIxA]++;
+    if (!gt->isHaploid)
+	{
+        diploidCount++;
+        int hapIxB = gt->hapIxB;
+        if (hapIxB < 0)
+            hapIxB = rec->alleleCount;
+        alCounts[hapIxB]++;
+        gtCounts[vcfGenotypeIndex(hapIxA, hapIxB)]++;
+        }
+    }
+if (retGtCounts)
+    *retGtCounts = gtCounts;
+if (retAlleleCounts)
+    *retAlleleCounts = alCounts;
+if (retPhasedCount)
+    *retPhasedCount = phasedGts;
+if (retDiploidCount)
+    *retDiploidCount = diploidCount;
+}
+
 static char *vcfDataLineAutoSqlString =
         "table vcfDataLine"
         "\"The fields of a Variant Call Format data line\""
         "    ("
         "    string chrom;      \"An identifier from the reference genome\""
         "    uint pos;          \"The reference position, with the 1st base having position 1\""
         "    string id;         \"Semi-colon separated list of unique identifiers where available\""
         "    string ref;                \"Reference base(s)\""
         "    string alt;                \"Comma separated list of alternate non-reference alleles "
                                          "called on at least one of the samples\""
         "    string qual;       \"Phred-scaled quality score for the assertion made in ALT. i.e. "
                                  "give -10log_10 prob(call in ALT is wrong)\""
         "    string filter;     \"PASS if this position has passed all filters. Otherwise, a "
                                   "semicolon-separated list of codes for filters that fail\""
         "    string info;       \"Additional information encoded as a semicolon-separated series "