src/lib/vcf.c 3640a4d6b3303a6bebc7c5b2fc5abcf7f4fae0b2

3640a4d6b3303a6bebc7c5b2fc5abcf7f4fae0b2
angie
  Wed Sep 28 11:56:00 2016 -0700
Partial support for changes in VCF4.2 and latest samtools mpileup output:
- Tolerate 'Number=R' and new INFO attributes Source and Version
- Tolerate mpileup's '<X>' alt (no alternate allele was observed)
- The 4.3 spec includes '<*>' from gVCF, also meaning no alt al obsvd.
- GT is no longer required; user's example has PL instead, so parse that
into genotypes.
- hgVai now annotates "variants" with <X> and <*> as no_sequence_alteration
- annoFormatVep now uses html encoding for html output in various places so
that "<X>" is displayed properly (custom track labels and various item
names could also have undesirable characters).  I am not encoding the
extras' descriptions because those are internal and some have <a>'s.
refs #15625

diff --git src/lib/vcf.c src/lib/vcf.c
index 91e3eb3..2b8cf7f 100644
--- src/lib/vcf.c
+++ src/lib/vcf.c
@@ -36,34 +36,36 @@
 const char *vcfInfoMappingQuality = "MQ";	// RMS mapping quality, e.g. MQ=52
 const char *vcfInfoMapQual0Count = "MQ0";	// number of MAPQ == 0 reads covering this record
 const char *vcfInfoNumSamples = "NS";		// Number of samples with data
 const char *vcfInfoStrandBias = "SB";		// strand bias at this position
 const char *vcfInfoIsSomatic = "SOMATIC";	// indicates that the record is a somatic mutation,
 						// for cancer genomics
 const char *vcfInfoIsValidated = "VALIDATED";	// validated by follow-up experiment
 
 /* Reserved but optional per-genotype keys: */
 const char *vcfGtGenotype = "GT";	// Integer allele indices separated by "/" (unphased)
 					// or "|" (phased). Allele values are 0 for
 					// reference allele, 1 for the first allele in ALT,
 					// 2 for the second allele in ALT and so on.
 const char *vcfGtDepth = "DP";		// Read depth at this position for this sample
 const char *vcfGtFilter = "FT";		// Analogous to variant's FILTER field
-const char *vcfGtLikelihoods = "GL";	// Three floating point log10-scaled likelihoods for
-					// AA,AB,BB genotypes where A=ref and B=alt;
-					// not applicable if site is not biallelic.
+const char *vcfGtLikelihoods = "GL";	// Floating point log10-scaled likelihoods for genotypes.
+					// If bi-allelic, order of genotypes is AA,AB,BB
+					// where A=ref and B=alt.
+					// If tri-allelic, AA,AB,BB,AC,BC,CC.
 const char *vcfGtPhred = "PL";		// Phred-scaled genotype likelihoods rounded to closest int
+					// Same ordering as GL
 const char *vcfGtConditionalQual = "GQ";// Conditional genotype quality
 					// i.e. phred quality -10log_10 P(genotype call is wrong,
 					// conditioned on the site's being variant)
 const char *vcfGtHaplotypeQualities = "HQ";	// two phred qualities comma separated
 const char *vcfGtPhaseSet = "PS";	// Set of phased genotypes to which this genotype belongs
 const char *vcfGtPhasingQuality = "PQ";	// Phred-scaled P(alleles ordered wrongly in heterozygote)
 const char *vcfGtExpectedAltAlleleCount = "EC";	// Typically used in association analyses
 
 
 // Make definitions of reserved INFO and genotype keys, in case people take them for
 // granted and don't make ##INFO headers for them:
 static struct vcfInfoDef *vcfSpecInfoDefs = NULL;
 static struct vcfInfoDef *vcfSpecGtFormatDefs = NULL;
 
 static void addInfoDef(struct vcfInfoDef **pList,
@@ -253,31 +255,31 @@
 if (sameString("Flag", typeWord))
     return vcfInfoFlag;
 if (sameString("Character", typeWord))
     return vcfInfoCharacter;
 if (sameString("String", typeWord))
     return vcfInfoString;
 vcfFileErr(vcff, "Unrecognized type word \"%s\" in metadata line \"%s\"", typeWord, line);
 return vcfInfoString;
 }
 
 // Regular expressions to check format and extract information from header lines:
 static const char *fileformatRegex = "^##(file)?format=VCFv([0-9]+)(\\.([0-9]+))?$";
 static const char *infoOrFormatRegex =
     "^##(INFO|FORMAT)="
     "<ID=([\\.+A-Za-z0-9_:-]+),"
-    "Number=(\\.|A|G|[0-9-]+),"
+    "Number=([\\.AGR]|[0-9-]+),"
     "Type=([A-Za-z]+),"
     "Description=\"?(.*)\"?>$";
 static const char *filterOrAltRegex =
     "^##(FILTER|ALT)="
     "<ID=([^,]+),"
     "(Description|Type)=\"?(.*)\"?>$";
 // VCF version 3.3 was different enough to warrant separate regexes:
 static const char *infoOrFormatRegex3_3 =
     "^##(INFO|FORMAT)="
     "([A-Za-z0-9_:-]+),"
     "(\\.|A|G|[0-9-]+),"
     "([A-Za-z]+),"
     "\"?(.*)\"?$";
 static const char *filterRegex3_3 =
     "^##(FILTER)="
@@ -333,30 +335,34 @@
 	struct vcfInfoDef *def = vcfFileAlloc(vcff, sizeof(struct vcfInfoDef));
 	def->key = vcfFileCloneSubstr(vcff, line, substrs[2]);
 	char *number = vcfFileCloneSubstr(vcff, line, substrs[3]);
 	if (sameString(number, ".") || sameString(number, "A") || sameString(number, "G"))
 	    // A is #alts which varies line-to-line; "G" is #genotypes which we haven't
 	    // yet seen.  Why is there a G here -- shouldn't such attributes go in the
 	    // genotype columns?
 	    def->fieldCount = -1;
 	else
 	    def->fieldCount = atoi(number);
 	def->type = vcfInfoTypeFromSubstr(vcff, line, substrs[4]);
 	// greedy regex pulls in end quote, trim if found:
 	if (line[substrs[5].rm_eo-1] == '"')
 	    line[substrs[5].rm_eo-1] = '\0';
 	def->description = vcfFileCloneSubstr(vcff, line, substrs[5]);
+        char *p = NULL;
+        if ((p = strstr(def->description, "\",Source=\"")) ||
+            (p = strstr(def->description, "\",Version=\"")))
+            *p = '\0';
 	slAddHead((isInfo ? &(vcff->infoDefs) : &(vcff->gtFormatDefs)), def);
 	}
     else
 	vcfFileErr(vcff, "##%s line does not match expected pattern /%s/ or /%s/: \"%s\"",
 		   (isInfo ? "INFO" : "FORMAT"), infoOrFormatRegex, infoOrFormatRegex3_3, line);
     }
 else if (startsWith("##FILTER=", line) || startsWith("##ALT=", line))
     {
     boolean isFilter = startsWith("##FILTER", line);
     if (regexMatchSubstr(line, filterOrAltRegex, substrs, ArraySize(substrs)) ||
 	regexMatchSubstr(line, filterRegex3_3, substrs, ArraySize(substrs)))
 	{
 	// substrs[2] is ID/key, substrs[4] is Description.
 	struct vcfInfoDef *def = vcfFileAlloc(vcff, sizeof(struct vcfInfoDef));
 	def->key = vcfFileCloneSubstr(vcff, line, substrs[2]);
@@ -748,33 +754,39 @@
 boolean hasPaddingBase = TRUE;
 char firstBase = '\0';
 if (isAllNt(alleles[0], strlen(alleles[0])))
     firstBase = alleles[0][0];
 int i;
 for (i = 1;  i < alleleCount;  i++)
     {
     if (isAllNt(alleles[i], strlen(alleles[i])))
 	{
 	if (firstBase == '\0')
 	    firstBase = alleles[i][0];
 	if (alleles[i][0] != firstBase)
 	    // Different first base implies unpadded alleles.
 	    hasPaddingBase = FALSE;
 	}
+    else if (sameString(alleles[i], "<X>") || sameString(alleles[i], "<*>"))
+        {
+        // Special case for samtools mpileup "<X>" or gVCF "<*>" (no alternate allele observed) --
+        // being symbolic doesn't make this an indel and ref base is not necessarily padded.
+        hasPaddingBase = FALSE;
+        }
     else
 	{
-	// Symbolic ALT allele: REF must have the padding base.
+	// Symbolic ALT allele: Indel, so REF must have the padding base.
 	hasPaddingBase = TRUE;
 	break;
 	}
     }
 return hasPaddingBase;
 }
 
 unsigned int vcfRecordTrimIndelLeftBase(struct vcfRecord *rec)
 /* For indels, VCF includes the left neighboring base; for example, if the alleles are
  * AA/- following a G base, then the VCF record will start one base to the left and have
  * "GAA" and "G" as the alleles.  Also, if any alt allele is symbolic (e.g. <DEL>) then
  * the ref allele must have a padding base.
  * That is not nice for display for two reasons:
  * 1. Indels appear one base wider than their dbSNP entries.
  * 2. In pgSnp display mode, the two alleles are always the same color.
@@ -1130,110 +1142,174 @@
     {
     if (sameString(key, def->key))
 	return def;
     }
 return NULL;
 }
 
 static enum vcfInfoType typeForGtFormat(struct vcfFile *vcff, const char *key)
 /* Look up the type of FORMAT component key, in the definitions from the header,
  * and failing that, from the keys reserved in the spec. */
 {
 struct vcfInfoDef *def = vcfInfoDefForGtKey(vcff, key);
 return def ? def->type : vcfInfoString;
 }
 
+static void parseGt(char *genotype, struct vcfGenotype *gt)
+/* Parse genotype, which should be something like "0/0", "0/1", "1|0" or "0/." into gt fields. */
+{
+char *sep = strchr(genotype, '|');
+if (sep != NULL)
+    gt->isPhased = TRUE;
+else
+    sep = strchr(genotype, '/');
+if (genotype[0] == '.')
+    gt->hapIxA = -1;
+else
+    gt->hapIxA = atoi(genotype);
+if (sep == NULL)
+    gt->isHaploid = TRUE;
+else if (sep[1] == '.')
+    gt->hapIxB = -1;
+else
+    gt->hapIxB = atoi(sep+1);
+}
+
+static void parsePlAsGt(char *pl, struct vcfGenotype *gt)
+/* Parse PL value, which is usually a list of 3 numbers, one of which is 0 (the selected haplotype),
+ * into gt fields. */
+{
+char plCopy[strlen(pl)+1];
+safecpy(plCopy, sizeof(plCopy), pl);
+char *words[32];
+int wordCount = chopCommas(plCopy, words);
+if (wordCount > 0 && sameString(words[0], "0"))
+    {
+    // genotype is AA (ref/ref)
+    gt->hapIxA = 0;
+    gt->hapIxB = 0;
+    }
+else if (wordCount > 1 && sameString(words[1], "0"))
+    {
+    // genotype is AB (ref/alt)
+    gt->hapIxA = 0;
+    gt->hapIxB = 1;
+    }
+else if (wordCount > 2 && sameString(words[2], "0"))
+    {
+    // genotype is BB (alt/alt)
+    gt->hapIxA = 1;
+    gt->hapIxB = 1;
+    }
+else if (wordCount > 3 && sameString(words[3], "0"))
+    {
+    // genotype is AC (ref/alt1)
+    gt->hapIxA = 0;
+    gt->hapIxB = 2;
+    }
+else if (wordCount > 4 && sameString(words[4], "0"))
+    {
+    // genotype is BC (alt/alt1)
+    gt->hapIxA = 1;
+    gt->hapIxB = 2;
+    }
+else if (wordCount > 5 && sameString(words[5], "0"))
+    {
+    // genotype is CC (alt1/alt1)
+    gt->hapIxA = 2;
+    gt->hapIxB = 2;
+    }
+else
+    {
+    // too fancy, treat as "./."
+    gt->hapIxA = -1;
+    gt->hapIxB = -1;
+    }
+}
+
 #define VCF_MAX_FORMAT VCF_MAX_INFO
 #define VCF_MAX_FORMAT_LEN (VCF_MAX_FORMAT * 4)
 
 void vcfParseGenotypes(struct vcfRecord *record)
 /* Translate record->genotypesUnparsedStrings[] into proper struct vcfGenotype[].
  * This destroys genotypesUnparsedStrings. */
 {
 if (record->genotypeUnparsedStrings == NULL)
     return;
 struct vcfFile *vcff = record->file;
 record->genotypes = vcfFileAlloc(vcff, vcff->genotypeCount * sizeof(struct vcfGenotype));
 char format[VCF_MAX_FORMAT_LEN];
 safecpy(format, sizeof(format), record->format);
 char *formatWords[VCF_MAX_FORMAT];
 int formatWordCount = chopByChar(format, ':', formatWords, ArraySize(formatWords));
 if (formatWordCount >= VCF_MAX_FORMAT)
     {
     vcfFileErr(vcff, "The FORMAT column has at least %d words; "
 	       "VCF_MAX_FORMAT may need to be increased in vcf.c!", VCF_MAX_FORMAT);
     formatWordCount = VCF_MAX_FORMAT;
     }
-if (differentString(formatWords[0], vcfGtGenotype))
-    vcfFileErr(vcff, "FORMAT column should begin with \"%s\" but begins with \"%s\"",
-	       vcfGtGenotype, formatWords[0]);
 int i;
 // Store the pooled format word pointers and associated types for use in inner loop below.
 enum vcfInfoType formatTypes[VCF_MAX_FORMAT];
 for (i = 0;  i < formatWordCount;  i++)
     {
     formatTypes[i] = typeForGtFormat(vcff, formatWords[i]);
     formatWords[i] = vcfFilePooledStr(vcff, formatWords[i]);
     }
 for (i = 0;  i < vcff->genotypeCount;  i++)
     {
     char *string = record->genotypeUnparsedStrings[i];
     struct vcfGenotype *gt = &(record->genotypes[i]);
     // Each genotype can have multiple :-separated info elements:
     char *gtWords[VCF_MAX_FORMAT];
     int gtWordCount = chopByChar(string, ':', gtWords, ArraySize(gtWords));
     if (gtWordCount != formatWordCount)
 	vcfFileErr(vcff, "The FORMAT column has %d words but the genotype column for %s "
 		   "has %d words", formatWordCount, vcff->genotypeIds[i], gtWordCount);
     if (gtWordCount > formatWordCount)
 	gtWordCount = formatWordCount;
     gt->id = vcff->genotypeIds[i];
     gt->infoCount = gtWordCount;
     gt->infoElements = vcfFileAlloc(vcff, gtWordCount * sizeof(struct vcfInfoElement));
+    boolean foundGT = FALSE;
     int j;
     for (j = 0;  j < gtWordCount;  j++)
 	{
 	// Special parsing of genotype:
 	if (sameString(formatWords[j], vcfGtGenotype))
 	    {
-	    char *genotype = gtWords[j];
-	    char *sep = strchr(genotype, '|');
-	    if (sep != NULL)
-		gt->isPhased = TRUE;
-	    else
-		sep = strchr(genotype, '/');
-	    if (genotype[0] == '.')
-		gt->hapIxA = -1;
-	    else
-		gt->hapIxA = atoi(genotype);
-	    if (sep == NULL)
-		gt->isHaploid = TRUE;
-	    else if (sep[1] == '.')
-		gt->hapIxB = -1;
-	    else
-		gt->hapIxB = atoi(sep+1);
+            parseGt(gtWords[j], gt);
+            foundGT = TRUE;
+	    }
+        else if (!foundGT && sameString(formatWords[j], vcfGtPhred))
+            {
+            parsePlAsGt(gtWords[j], gt);
+            foundGT = TRUE;
             }
 	struct vcfInfoElement *el = &(gt->infoElements[j]);
 	el->key = formatWords[j];
 	el->count = parseInfoValue(record, formatWords[j], formatTypes[j], gtWords[j],
 				   &(el->values), &(el->missingData));
 	if (el->count >= VCF_MAX_INFO)
 	    vcfFileErr(vcff, "A single element of the genotype column for \"%s\" "
 		       "has at least %d values; "
 		       "VCF_MAX_INFO may need to be increased in vcf.c!",
 		       gt->id, VCF_MAX_INFO);
 	}
+    if (i == 0 && !foundGT)
+        vcfFileErr(vcff,
+                   "Genotype FORMAT column includes neither GT nor PL; unable to parse genotypes.");
     }
 record->genotypeUnparsedStrings = NULL;
 }
 
 const struct vcfGenotype *vcfRecordFindGenotype(struct vcfRecord *record, char *sampleId)
 /* Find the genotype and associated info for the individual, or return NULL.
  * This calls vcfParseGenotypes if it has not already been called. */
 {
 struct vcfFile *vcff = record->file;
 if (sampleId == NULL || vcff->genotypeCount == 0)
     return NULL;
 vcfParseGenotypes(record);
 int ix = stringArrayIx(sampleId, vcff->genotypeIds, vcff->genotypeCount);
 if (ix >= 0)
     return &(record->genotypes[ix]);