3640a4d6b3303a6bebc7c5b2fc5abcf7f4fae0b2 angie Wed Sep 28 11:56:00 2016 -0700 Partial support for changes in VCF4.2 and latest samtools mpileup output: - Tolerate 'Number=R' and new INFO attributes Source and Version - Tolerate mpileup's '' alt (no alternate allele was observed) - The 4.3 spec includes '<*>' from gVCF, also meaning no alt al obsvd. - GT is no longer required; user's example has PL instead, so parse that into genotypes. - hgVai now annotates "variants" with and <*> as no_sequence_alteration - annoFormatVep now uses html encoding for html output in various places so that "" is displayed properly (custom track labels and various item names could also have undesirable characters). I am not encoding the extras' descriptions because those are internal and some have 's. refs #15625 diff --git src/lib/vcf.c src/lib/vcf.c index 91e3eb3..2b8cf7f 100644 --- src/lib/vcf.c +++ src/lib/vcf.c @@ -36,34 +36,36 @@ const char *vcfInfoMappingQuality = "MQ"; // RMS mapping quality, e.g. MQ=52 const char *vcfInfoMapQual0Count = "MQ0"; // number of MAPQ == 0 reads covering this record const char *vcfInfoNumSamples = "NS"; // Number of samples with data const char *vcfInfoStrandBias = "SB"; // strand bias at this position const char *vcfInfoIsSomatic = "SOMATIC"; // indicates that the record is a somatic mutation, // for cancer genomics const char *vcfInfoIsValidated = "VALIDATED"; // validated by follow-up experiment /* Reserved but optional per-genotype keys: */ const char *vcfGtGenotype = "GT"; // Integer allele indices separated by "/" (unphased) // or "|" (phased). Allele values are 0 for // reference allele, 1 for the first allele in ALT, // 2 for the second allele in ALT and so on. const char *vcfGtDepth = "DP"; // Read depth at this position for this sample const char *vcfGtFilter = "FT"; // Analogous to variant's FILTER field -const char *vcfGtLikelihoods = "GL"; // Three floating point log10-scaled likelihoods for - // AA,AB,BB genotypes where A=ref and B=alt; - // not applicable if site is not biallelic. +const char *vcfGtLikelihoods = "GL"; // Floating point log10-scaled likelihoods for genotypes. + // If bi-allelic, order of genotypes is AA,AB,BB + // where A=ref and B=alt. + // If tri-allelic, AA,AB,BB,AC,BC,CC. const char *vcfGtPhred = "PL"; // Phred-scaled genotype likelihoods rounded to closest int + // Same ordering as GL const char *vcfGtConditionalQual = "GQ";// Conditional genotype quality // i.e. phred quality -10log_10 P(genotype call is wrong, // conditioned on the site's being variant) const char *vcfGtHaplotypeQualities = "HQ"; // two phred qualities comma separated const char *vcfGtPhaseSet = "PS"; // Set of phased genotypes to which this genotype belongs const char *vcfGtPhasingQuality = "PQ"; // Phred-scaled P(alleles ordered wrongly in heterozygote) const char *vcfGtExpectedAltAlleleCount = "EC"; // Typically used in association analyses // Make definitions of reserved INFO and genotype keys, in case people take them for // granted and don't make ##INFO headers for them: static struct vcfInfoDef *vcfSpecInfoDefs = NULL; static struct vcfInfoDef *vcfSpecGtFormatDefs = NULL; static void addInfoDef(struct vcfInfoDef **pList, @@ -253,31 +255,31 @@ if (sameString("Flag", typeWord)) return vcfInfoFlag; if (sameString("Character", typeWord)) return vcfInfoCharacter; if (sameString("String", typeWord)) return vcfInfoString; vcfFileErr(vcff, "Unrecognized type word \"%s\" in metadata line \"%s\"", typeWord, line); return vcfInfoString; } // Regular expressions to check format and extract information from header lines: static const char *fileformatRegex = "^##(file)?format=VCFv([0-9]+)(\\.([0-9]+))?$"; static const char *infoOrFormatRegex = "^##(INFO|FORMAT)=" "$"; static const char *filterOrAltRegex = "^##(FILTER|ALT)=" "$"; // VCF version 3.3 was different enough to warrant separate regexes: static const char *infoOrFormatRegex3_3 = "^##(INFO|FORMAT)=" "([A-Za-z0-9_:-]+)," "(\\.|A|G|[0-9-]+)," "([A-Za-z]+)," "\"?(.*)\"?$"; static const char *filterRegex3_3 = "^##(FILTER)=" @@ -333,30 +335,34 @@ struct vcfInfoDef *def = vcfFileAlloc(vcff, sizeof(struct vcfInfoDef)); def->key = vcfFileCloneSubstr(vcff, line, substrs[2]); char *number = vcfFileCloneSubstr(vcff, line, substrs[3]); if (sameString(number, ".") || sameString(number, "A") || sameString(number, "G")) // A is #alts which varies line-to-line; "G" is #genotypes which we haven't // yet seen. Why is there a G here -- shouldn't such attributes go in the // genotype columns? def->fieldCount = -1; else def->fieldCount = atoi(number); def->type = vcfInfoTypeFromSubstr(vcff, line, substrs[4]); // greedy regex pulls in end quote, trim if found: if (line[substrs[5].rm_eo-1] == '"') line[substrs[5].rm_eo-1] = '\0'; def->description = vcfFileCloneSubstr(vcff, line, substrs[5]); + char *p = NULL; + if ((p = strstr(def->description, "\",Source=\"")) || + (p = strstr(def->description, "\",Version=\""))) + *p = '\0'; slAddHead((isInfo ? &(vcff->infoDefs) : &(vcff->gtFormatDefs)), def); } else vcfFileErr(vcff, "##%s line does not match expected pattern /%s/ or /%s/: \"%s\"", (isInfo ? "INFO" : "FORMAT"), infoOrFormatRegex, infoOrFormatRegex3_3, line); } else if (startsWith("##FILTER=", line) || startsWith("##ALT=", line)) { boolean isFilter = startsWith("##FILTER", line); if (regexMatchSubstr(line, filterOrAltRegex, substrs, ArraySize(substrs)) || regexMatchSubstr(line, filterRegex3_3, substrs, ArraySize(substrs))) { // substrs[2] is ID/key, substrs[4] is Description. struct vcfInfoDef *def = vcfFileAlloc(vcff, sizeof(struct vcfInfoDef)); def->key = vcfFileCloneSubstr(vcff, line, substrs[2]); @@ -748,33 +754,39 @@ boolean hasPaddingBase = TRUE; char firstBase = '\0'; if (isAllNt(alleles[0], strlen(alleles[0]))) firstBase = alleles[0][0]; int i; for (i = 1; i < alleleCount; i++) { if (isAllNt(alleles[i], strlen(alleles[i]))) { if (firstBase == '\0') firstBase = alleles[i][0]; if (alleles[i][0] != firstBase) // Different first base implies unpadded alleles. hasPaddingBase = FALSE; } + else if (sameString(alleles[i], "") || sameString(alleles[i], "<*>")) + { + // Special case for samtools mpileup "" or gVCF "<*>" (no alternate allele observed) -- + // being symbolic doesn't make this an indel and ref base is not necessarily padded. + hasPaddingBase = FALSE; + } else { - // Symbolic ALT allele: REF must have the padding base. + // Symbolic ALT allele: Indel, so REF must have the padding base. hasPaddingBase = TRUE; break; } } return hasPaddingBase; } unsigned int vcfRecordTrimIndelLeftBase(struct vcfRecord *rec) /* For indels, VCF includes the left neighboring base; for example, if the alleles are * AA/- following a G base, then the VCF record will start one base to the left and have * "GAA" and "G" as the alleles. Also, if any alt allele is symbolic (e.g. ) then * the ref allele must have a padding base. * That is not nice for display for two reasons: * 1. Indels appear one base wider than their dbSNP entries. * 2. In pgSnp display mode, the two alleles are always the same color. @@ -1130,110 +1142,174 @@ { if (sameString(key, def->key)) return def; } return NULL; } static enum vcfInfoType typeForGtFormat(struct vcfFile *vcff, const char *key) /* Look up the type of FORMAT component key, in the definitions from the header, * and failing that, from the keys reserved in the spec. */ { struct vcfInfoDef *def = vcfInfoDefForGtKey(vcff, key); return def ? def->type : vcfInfoString; } +static void parseGt(char *genotype, struct vcfGenotype *gt) +/* Parse genotype, which should be something like "0/0", "0/1", "1|0" or "0/." into gt fields. */ +{ +char *sep = strchr(genotype, '|'); +if (sep != NULL) + gt->isPhased = TRUE; +else + sep = strchr(genotype, '/'); +if (genotype[0] == '.') + gt->hapIxA = -1; +else + gt->hapIxA = atoi(genotype); +if (sep == NULL) + gt->isHaploid = TRUE; +else if (sep[1] == '.') + gt->hapIxB = -1; +else + gt->hapIxB = atoi(sep+1); +} + +static void parsePlAsGt(char *pl, struct vcfGenotype *gt) +/* Parse PL value, which is usually a list of 3 numbers, one of which is 0 (the selected haplotype), + * into gt fields. */ +{ +char plCopy[strlen(pl)+1]; +safecpy(plCopy, sizeof(plCopy), pl); +char *words[32]; +int wordCount = chopCommas(plCopy, words); +if (wordCount > 0 && sameString(words[0], "0")) + { + // genotype is AA (ref/ref) + gt->hapIxA = 0; + gt->hapIxB = 0; + } +else if (wordCount > 1 && sameString(words[1], "0")) + { + // genotype is AB (ref/alt) + gt->hapIxA = 0; + gt->hapIxB = 1; + } +else if (wordCount > 2 && sameString(words[2], "0")) + { + // genotype is BB (alt/alt) + gt->hapIxA = 1; + gt->hapIxB = 1; + } +else if (wordCount > 3 && sameString(words[3], "0")) + { + // genotype is AC (ref/alt1) + gt->hapIxA = 0; + gt->hapIxB = 2; + } +else if (wordCount > 4 && sameString(words[4], "0")) + { + // genotype is BC (alt/alt1) + gt->hapIxA = 1; + gt->hapIxB = 2; + } +else if (wordCount > 5 && sameString(words[5], "0")) + { + // genotype is CC (alt1/alt1) + gt->hapIxA = 2; + gt->hapIxB = 2; + } +else + { + // too fancy, treat as "./." + gt->hapIxA = -1; + gt->hapIxB = -1; + } +} + #define VCF_MAX_FORMAT VCF_MAX_INFO #define VCF_MAX_FORMAT_LEN (VCF_MAX_FORMAT * 4) void vcfParseGenotypes(struct vcfRecord *record) /* Translate record->genotypesUnparsedStrings[] into proper struct vcfGenotype[]. * This destroys genotypesUnparsedStrings. */ { if (record->genotypeUnparsedStrings == NULL) return; struct vcfFile *vcff = record->file; record->genotypes = vcfFileAlloc(vcff, vcff->genotypeCount * sizeof(struct vcfGenotype)); char format[VCF_MAX_FORMAT_LEN]; safecpy(format, sizeof(format), record->format); char *formatWords[VCF_MAX_FORMAT]; int formatWordCount = chopByChar(format, ':', formatWords, ArraySize(formatWords)); if (formatWordCount >= VCF_MAX_FORMAT) { vcfFileErr(vcff, "The FORMAT column has at least %d words; " "VCF_MAX_FORMAT may need to be increased in vcf.c!", VCF_MAX_FORMAT); formatWordCount = VCF_MAX_FORMAT; } -if (differentString(formatWords[0], vcfGtGenotype)) - vcfFileErr(vcff, "FORMAT column should begin with \"%s\" but begins with \"%s\"", - vcfGtGenotype, formatWords[0]); int i; // Store the pooled format word pointers and associated types for use in inner loop below. enum vcfInfoType formatTypes[VCF_MAX_FORMAT]; for (i = 0; i < formatWordCount; i++) { formatTypes[i] = typeForGtFormat(vcff, formatWords[i]); formatWords[i] = vcfFilePooledStr(vcff, formatWords[i]); } for (i = 0; i < vcff->genotypeCount; i++) { char *string = record->genotypeUnparsedStrings[i]; struct vcfGenotype *gt = &(record->genotypes[i]); // Each genotype can have multiple :-separated info elements: char *gtWords[VCF_MAX_FORMAT]; int gtWordCount = chopByChar(string, ':', gtWords, ArraySize(gtWords)); if (gtWordCount != formatWordCount) vcfFileErr(vcff, "The FORMAT column has %d words but the genotype column for %s " "has %d words", formatWordCount, vcff->genotypeIds[i], gtWordCount); if (gtWordCount > formatWordCount) gtWordCount = formatWordCount; gt->id = vcff->genotypeIds[i]; gt->infoCount = gtWordCount; gt->infoElements = vcfFileAlloc(vcff, gtWordCount * sizeof(struct vcfInfoElement)); + boolean foundGT = FALSE; int j; for (j = 0; j < gtWordCount; j++) { // Special parsing of genotype: if (sameString(formatWords[j], vcfGtGenotype)) { - char *genotype = gtWords[j]; - char *sep = strchr(genotype, '|'); - if (sep != NULL) - gt->isPhased = TRUE; - else - sep = strchr(genotype, '/'); - if (genotype[0] == '.') - gt->hapIxA = -1; - else - gt->hapIxA = atoi(genotype); - if (sep == NULL) - gt->isHaploid = TRUE; - else if (sep[1] == '.') - gt->hapIxB = -1; - else - gt->hapIxB = atoi(sep+1); + parseGt(gtWords[j], gt); + foundGT = TRUE; + } + else if (!foundGT && sameString(formatWords[j], vcfGtPhred)) + { + parsePlAsGt(gtWords[j], gt); + foundGT = TRUE; } struct vcfInfoElement *el = &(gt->infoElements[j]); el->key = formatWords[j]; el->count = parseInfoValue(record, formatWords[j], formatTypes[j], gtWords[j], &(el->values), &(el->missingData)); if (el->count >= VCF_MAX_INFO) vcfFileErr(vcff, "A single element of the genotype column for \"%s\" " "has at least %d values; " "VCF_MAX_INFO may need to be increased in vcf.c!", gt->id, VCF_MAX_INFO); } + if (i == 0 && !foundGT) + vcfFileErr(vcff, + "Genotype FORMAT column includes neither GT nor PL; unable to parse genotypes."); } record->genotypeUnparsedStrings = NULL; } const struct vcfGenotype *vcfRecordFindGenotype(struct vcfRecord *record, char *sampleId) /* Find the genotype and associated info for the individual, or return NULL. * This calls vcfParseGenotypes if it has not already been called. */ { struct vcfFile *vcff = record->file; if (sampleId == NULL || vcff->genotypeCount == 0) return NULL; vcfParseGenotypes(record); int ix = stringArrayIx(sampleId, vcff->genotypeIds, vcff->genotypeCount); if (ix >= 0) return &(record->genotypes[ix]);