94699b698cc07983f2b10c5b9d4523502a4e5467
angie
  Fri Oct 28 17:18:25 2016 -0700
Adapted from code contributed by Belinda Giardine: adding support for VCF from cancer somatic variant callers Strelka and Mutect2, when genotype columns are TUMOR and NORMAL and the hgTracks display shows allele counts for the tumor sample.  For Strelka VCF, genotypes are parsed out of INFO SGT.
refs #18275

diff --git src/hg/lib/pgSnp.c src/hg/lib/pgSnp.c
index 2412e6d..7e3524f 100644
--- src/hg/lib/pgSnp.c
+++ src/hg/lib/pgSnp.c
@@ -702,30 +702,41 @@
 item->alleleFreq = cloneString(row[5]);
 char pattern[128];
 safef(pattern, sizeof(pattern), "^[0-9]+(,[0-9]+){%d}$", item->alleleCount - 1);
 if (! regexMatchNoCase(row[5], pattern))
     lineFileAbort(lf, "invalid allele frequency, %s with count of %d", row[5], item->alleleCount);
 /* scores, comma separated list of numbers with above # of items */
 item->alleleScores = cloneString(row[6]);
 safef(pattern, sizeof(pattern), "^[0-9.]+(,[0-9.]+){%d}$", item->alleleCount - 1);
 if (! regexMatchNoCase(row[6], pattern))
     lineFileAbort(lf, "invalid allele scores, %s with count of %d", row[6], item->alleleCount);
 return item;
 }
 
 #define VCF_MAX_ALLELE_LEN 80
 
+static char *alleleCountsFromVcfTumorAd(const struct vcfInfoElement *ad)
+/* Build up comma-sep list of read counts supporting tumor alleles */
+{
+struct dyString *dy = dyStringNew(0);
+// Assume only one alternate allele
+int r = ad->values[0].datInt;
+int a = ad->values[1].datInt;
+dyStringPrintf(dy, "%d,%d", r, a);
+return dyStringCannibalize(&dy);
+}
+
 static char *alleleCountsFromVcfRecord(struct vcfRecord *rec, int alDescCount)
 /* Build up comma-sep list of per-allele counts, if available, up to alDescCount
  * which may be less than rec->alleleCount: */
 {
 struct dyString *dy = dyStringNew(0);
 int alCounts[VCF_MAX_ALLELE_LEN];
 boolean gotTotalCount = FALSE, gotAltCounts = FALSE;
 int i;
 for (i = 0;  i < rec->infoCount;  i++)
     if (sameString(rec->infoElements[i].key, "AN"))
 	{
 	if (rec->infoElements[i].missingData[0])
 	    break;
 	gotTotalCount = TRUE;
 	// Set ref allele to total count, subtract alt counts below.
@@ -816,57 +827,160 @@
 	struct vcfGenotype *gt = &(rec->genotypes[i]);
 	if (gt == NULL)
 	    uglyf("i=%d gt=NULL wtf?\n", i);
 	if (gt->hapIxA >= 0)
 	    alCounts[(unsigned char)gt->hapIxA]++;
 	if (!gt->isHaploid && gt->hapIxB >= 0)
 	    alCounts[(unsigned char)gt->hapIxB]++;
 	}
     dyStringPrintf(dy, "%d", alCounts[0]);
     for (i = 1;  i < alDescCount;  i++)
 	dyStringPrintf(dy, ",%d", alCounts[i]);
     }
 return dyStringCannibalize(&dy);
 }
 
+static void pgSnpFromVcfSgtIndelRecord(struct vcfRecord *rec, struct pgSnp *pgs)
+/* parse alleles and frequency from vcf file with SGT for indels */
+{
+pgs->alleleCount = 2; // assume always 2: normal->tumor
+struct dyString *dy = dyStringCreate("%s/%s", rec->alleles[0], rec->alleles[1]);
+pgs->name = dyStringCannibalize(&dy);
+int refCnt = -1, altCnt = -1;
+const struct vcfGenotype *gt = vcfRecordFindGenotype(rec, "TUMOR");
+if (gt)
+    {
+    const struct vcfInfoElement *dp = vcfGenotypeFindInfo(gt, "DP");
+    const struct vcfInfoElement *tar = vcfGenotypeFindInfo(gt, "TAR");
+    if (dp && tar)
+        {
+        int totCnt = dp->values[0].datInt;
+        altCnt = tar->values[0].datInt;
+        refCnt = totCnt - altCnt;
+        }
+    }
+if (refCnt == -1)
+    pgs->alleleFreq = cloneString("?,?");
+else
+    {
+    char freq[64];
+    safef(freq, sizeof(freq), "%d,%d", refCnt, altCnt);
+    pgs->alleleFreq = cloneString(freq);
+    }
+}
+
+static struct pgSnp *pgSnpFromVcfSgtRecord(struct vcfRecord *rec)
+/* Convert VCF rec with SGT (from Strelka) to pgSnp; don't free rec->file (vcfFile) until
+ * you're done with pgSnp because pgSnp points to rec->chrom. */
+{
+struct pgSnp *pgs;
+AllocVar(pgs);
+pgs->chrom = rec->chrom;
+pgs->chromStart = rec->chromStart;
+pgs->chromEnd = rec->chromEnd;
+pgs->alleleScores = cloneString("-1,-1");
+//set alleles from INFO SGT
+const struct vcfInfoElement *sgtEl = vcfRecordFindInfo(rec, "SGT");
+if (sgtEl)
+    {
+    char *val = sgtEl->values[0].datString;
+    // Indels encode allele frequencies differently
+    if (startsWith("ref->", val))
+        {
+        pgSnpFromVcfSgtIndelRecord(rec, pgs);
+        return pgs;
+        }
+    // Get tumor genotype, assuming SNVs only
+    char last = val[strlen(val)-1];
+    char nlast = val[strlen(val)-2];
+    struct dyString *dy = dyStringCreate("%c/%c", nlast, last);
+    pgs->name = dyStringCannibalize(&dy);
+    //add read counts to frequency field
+    if (last == nlast)
+        pgs->alleleCount = 1;
+    else
+        pgs->alleleCount = 2;
+    //set freqs from AU,CU,GU,TU for TUMOR genotype
+    const struct vcfGenotype *gt = vcfRecordFindGenotype(rec, "TUMOR");
+    if (gt)
+        {
+        // Find counts for alleles
+        int first = -1, second = -1;
+        int k;
+        for (k = 0; k < gt->infoCount; k++)
+            {
+            struct vcfInfoElement *info = &gt->infoElements[k];
+            if (sameString(info->key, "AU") ||
+                sameString(info->key, "CU") ||
+                sameString(info->key, "GU") ||
+                sameString(info->key, "TU"))
+                {
+                char base = info->key[0];
+                if (nlast == base)
+                    first = info->values[0].datInt;
+                else if (last == base)
+                    second = info->values[0].datInt;
+                }
+            }
+        struct dyString *freq = dyStringCreate("%d,%d", first, second);
+        pgs->alleleFreq = dyStringCannibalize(&freq);
+        }
+    }
+else
+    errAbort("pcfSnpFromVcfSgtRecord: no SGT found in INFO for VCF at %s:%d-%d %s",
+             rec->chrom, rec->chromStart+1, rec->chromEnd, rec->name);
+return pgs;
+}
+
 struct pgSnp *pgSnpFromVcfRecord(struct vcfRecord *rec)
 /* Convert VCF rec to pgSnp; don't free rec->file (vcfFile) until
  * you're done with pgSnp because pgSnp points to rec->chrom. */
 {
+// If this is Strelka VCF with SGT (Somatic GenoType) info then deduce genotypes from
+// other Strelka-specific info keys.
+if (vcfRecordFindInfo(rec, "SGT"))
+    return pgSnpFromVcfSgtRecord(rec);
 struct dyString *dy = dyStringNew(0);
 struct pgSnp *pgs;
 AllocVar(pgs);
 pgs->chrom = rec->chrom;
 pgs->chromStart = rec->chromStart;
 pgs->chromEnd = rec->chromEnd;
 // Build up slash-separated allele string from rec->alleles, starting with ref allele:
 dyStringAppend(dy, rec->alleles[0]);
 int alCount = rec->alleleCount, i;
 if (rec->alleleCount == 2 &&
     (sameString(rec->alleles[1], ".") ||
      sameString(rec->alleles[1], "<X>") ||
      sameString(rec->alleles[1], "<*>")))
     // ignore N/A alternate allele
     alCount = 1;
 else if (rec->alleleCount >= 2)
     {
     // append /-sep'd alternate alleles
     for (i = 1;  i < rec->alleleCount;  i++)
 	dyStringPrintf(dy, "/%s", rec->alleles[i]);
     }
 pgs->name = cloneStringZ(dy->string, dy->stringSize+1);
 pgs->alleleCount = alCount;
+// If this is Mutect2 TUMOR/NORMAL VCF with AD (allelic depths for ref and alt)
+// then display read counts from tumor sample as allele counts.
+const struct vcfGenotype *tumorGt = vcfRecordFindGenotype(rec, "TUMOR");
+const struct vcfInfoElement *tumorAd = tumorGt ? vcfGenotypeFindInfo(tumorGt, "AD") : NULL;
+if (rec->file->genotypeCount == 2 && tumorAd && vcfRecordFindGenotype(rec, "NORMAL"))
+    pgs->alleleFreq = alleleCountsFromVcfTumorAd(tumorAd);
+else
     pgs->alleleFreq = alleleCountsFromVcfRecord(rec, alCount);
 // Build up comma-sep list... supposed to be per-allele quality scores but I think
 // the VCF spec only gives us one BQ... for the reference position?  should ask.
 dyStringClear(dy);
 for (i = 0;  i < rec->infoCount;  i++)
     if (sameString(rec->infoElements[i].key, "BQ") && !rec->infoElements[i].missingData[0])
 	{
 	float qual = rec->infoElements[i].values[0].datFloat;
 	dyStringPrintf(dy, "%.1f", qual);
 	int j;
 	for (j = 1;  j < rec->alleleCount;  j++)
 	    dyStringPrintf(dy, ",%.1f", qual);
 	break;
 	}
 pgs->alleleScores = dyStringCannibalize(&dy);