94699b698cc07983f2b10c5b9d4523502a4e5467
angie
  Fri Oct 28 17:18:25 2016 -0700
Adapted from code contributed by Belinda Giardine: adding support for VCF from cancer somatic variant callers Strelka and Mutect2, when genotype columns are TUMOR and NORMAL and the hgTracks display shows allele counts for the tumor sample.  For Strelka VCF, genotypes are parsed out of INFO SGT.
refs #18275

diff --git src/lib/vcf.c src/lib/vcf.c
index 5c67e21..0e80f39 100644
--- src/lib/vcf.c
+++ src/lib/vcf.c
@@ -1162,30 +1162,91 @@
     gt->isPhased = TRUE;
 else
     sep = strchr(genotype, '/');
 if (genotype[0] == '.')
     gt->hapIxA = -1;
 else
     gt->hapIxA = atoi(genotype);
 if (sep == NULL)
     gt->isHaploid = TRUE;
 else if (sep[1] == '.')
     gt->hapIxB = -1;
 else
     gt->hapIxB = atoi(sep+1);
 }
 
+static void parseSgtAsGt(struct vcfRecord *rec, struct vcfGenotype *gt)
+/* Parse SGT to normal and tumor genotypes */
+{
+const struct vcfInfoElement *sgtEl = vcfRecordFindInfo(rec, "SGT");
+if (sgtEl)
+    {
+    char *val = sgtEl->values[0].datString;
+    // set hapIxA and hapIxB where 0 = ref, 1 = alt
+    if (startsWith("ref->", val))
+        {
+        //indel, use 0/0 for normal and 0/1 for tumor
+        if (sameString(gt->id, "NORMAL"))
+            gt->hapIxA = gt->hapIxB = 0;
+        else if (sameString(gt->id, "TUMOR"))
+            {
+            gt->hapIxA = 0;
+            gt->hapIxB = 1;
+            }
+        }
+    else
+        {
+        if (sameString(gt->id, "NORMAL"))
+            {
+            char str[2];
+            safef(str, sizeof(str), "%c", val[0]);
+            if (sameString(rec->alleles[0], str))
+                gt->hapIxA = 0;
+            else if (sameString(rec->alleles[1], str))
+                gt->hapIxA = 1;
+            else
+                gt->hapIxA = -1;
+            safef(str, sizeof(str), "%c", val[1]);
+            if (sameString(rec->alleles[1], str))
+                gt->hapIxB = 1;
+            else if (sameString(rec->alleles[0], str))
+                gt->hapIxB = 0;
+            else
+                gt->hapIxB = -1;
+            }
+        else if (sameString(gt->id, "TUMOR"))
+            {
+            char str[2];
+            safef(str, sizeof(str), "%c", val[4]);
+            if (sameString(rec->alleles[0], str))
+                gt->hapIxA = 0;
+            else if (sameString(rec->alleles[1], str))
+                gt->hapIxA = 1;
+            else
+                gt->hapIxA = -1;
+            safef(str, sizeof(str), "%c", val[5]);
+            if (sameString(rec->alleles[1], str))
+                gt->hapIxB = 1;
+            else if (sameString(rec->alleles[0], str))
+                gt->hapIxB = 0;
+            else
+                gt->hapIxB = -1;
+            }
+        }
+    }
+}
+
 static void parsePlAsGt(char *pl, struct vcfGenotype *gt)
 /* Parse PL value, which is usually a list of 3 numbers, one of which is 0 (the selected haplotype),
  * into gt fields. */
 {
 char plCopy[strlen(pl)+1];
 safecpy(plCopy, sizeof(plCopy), pl);
 char *words[32];
 int wordCount = chopCommas(plCopy, words);
 if (wordCount > 0 && sameString(words[0], "0"))
     {
     // genotype is AA (ref/ref)
     gt->hapIxA = 0;
     gt->hapIxB = 0;
     }
 else if (wordCount > 1 && sameString(words[1], "0"))
@@ -1284,51 +1345,67 @@
         else if (!foundGT && sameString(formatWords[j], vcfGtPhred))
             {
             parsePlAsGt(gtWords[j], gt);
             foundGT = TRUE;
             }
 	struct vcfInfoElement *el = &(gt->infoElements[j]);
 	el->key = formatWords[j];
 	el->count = parseInfoValue(record, formatWords[j], formatTypes[j], gtWords[j],
 				   &(el->values), &(el->missingData));
 	if (el->count >= VCF_MAX_INFO)
 	    vcfFileErr(vcff, "A single element of the genotype column for \"%s\" "
 		       "has at least %d values; "
 		       "VCF_MAX_INFO may need to be increased in vcf.c!",
 		       gt->id, VCF_MAX_INFO);
 	}
+    if (!foundGT)   //try SGT
+        {
+        parseSgtAsGt(record, gt);
+        if (gt->hapIxA != -1)
+            foundGT = TRUE;
+        }
     if (i == 0 && !foundGT)
         vcfFileErr(vcff,
                    "Genotype FORMAT column includes neither GT nor PL; unable to parse genotypes.");
     }
 record->genotypeUnparsedStrings = NULL;
 }
 
 const struct vcfGenotype *vcfRecordFindGenotype(struct vcfRecord *record, char *sampleId)
 /* Find the genotype and associated info for the individual, or return NULL.
  * This calls vcfParseGenotypes if it has not already been called. */
 {
 struct vcfFile *vcff = record->file;
 if (sampleId == NULL || vcff->genotypeCount == 0)
     return NULL;
 vcfParseGenotypes(record);
 int ix = stringArrayIx(sampleId, vcff->genotypeIds, vcff->genotypeCount);
 if (ix >= 0)
     return &(record->genotypes[ix]);
 return NULL;
 }
 
+const struct vcfInfoElement *vcfGenotypeFindInfo(const struct vcfGenotype *gt, char *key)
+/* Find the genotype infoElement for key, or return NULL. */
+{
+int i;
+for (i = 0;  i < gt->infoCount;  i++)
+    if (sameString(key, gt->infoElements[i].key))
+        return &gt->infoElements[i];
+return NULL;
+}
+
 static char *vcfDataLineAutoSqlString =
         "table vcfDataLine"
         "\"The fields of a Variant Call Format data line\""
         "    ("
         "    string chrom;      \"An identifier from the reference genome\""
         "    uint pos;          \"The reference position, with the 1st base having position 1\""
         "    string id;         \"Semi-colon separated list of unique identifiers where available\""
         "    string ref;                \"Reference base(s)\""
         "    string alt;                \"Comma separated list of alternate non-reference alleles "
                                          "called on at least one of the samples\""
         "    string qual;       \"Phred-scaled quality score for the assertion made in ALT. i.e. "
                                  "give -10log_10 prob(call in ALT is wrong)\""
         "    string filter;     \"PASS if this position has passed all filters. Otherwise, a "
                                   "semicolon-separated list of codes for filters that fail\""
         "    string info;       \"Additional information encoded as a semicolon-separated series "