b534e5e167df93880881df0478ecec0225fdf136 angie Wed Aug 31 17:01:24 2016 -0700 This commit adds the capability to pick apart complex HGVS sequence change descriptions, and apply those changes to reference sequence, in order to translate HGVS nucleotide terms into a variant representation suitable for functional prediction in hgVai. VCF was chosen since it is easy to integrate into hgVai. refs #11460 Changes to existing code: * hgvsMapToGenome maps to BED6 instead of BED3 because we need to know strand in order to convert transcript changes into VCF forward-strand genomic changes. * hgvsMapToGenome maps insertions to zero-length points instead of 2-base ranges as in HGVS. New file hgHgvsParse.c contains a tokenizer and parser for HGVS sequence change descriptions; top-level interface is hgvsParseNucleotideChange. hgHgvs.c has new code to translate parsed HGVS nucleotide change(s) into VCF, optionally left-shifting ambiguous alignments (VCF convention, at odds with HGVS right-shifting convention); top-level interface is hgvsToVcfRow. New hgvsToVcf utility enables testing of corner cases and may come in handy as a command-line util. HGVS terms for testing have been taken from ClinVar and do not reflect the diversity of terms in the wild, nor do they cover the full HGVS spec. For example, the HGVS repeat notation can be parsed but not mapped to the genome because all of the ClinVar repeat terms that I looked at looked wonky to me and I believe the HGVS repeat notation is inherently error-prone. The repeat notation is supposed to use the position of the first repeat unit and to specify the number of repeated copies starting at that point (right-shifted if ambiguous). However, in ClinVar, sometimes the given repeat unit sequence did not match the reference sequence at the given position; sometimes the number of sepeats made sense only if they were not perfect repeats (some differing bases); sometimes ranges of repeat numbers were given. Also, the reference assembly's number of repeats can change from one assembly to the next. So it is hard given an HGVS repeat term to determine 1) whether it makes sense in relation to the reference assembly with/without fuzzy matching and 2) what the exact change is relative to the reference assembly. Insertions of inverted sequence from elsewhere in the same reference have not yet been tested. http://varnomen.hgvs.org/recommendations/DNA/variant/inversion/ gives some complicated examples like "g.122_123ins213_234invinsAins123_211inv" but I have not yet seen terms like that in the wild. diff --git src/hg/inc/hgHgvs.h src/hg/inc/hgHgvs.h index 7b2fada..95c4b74 100644 --- src/hg/inc/hgHgvs.h +++ src/hg/inc/hgHgvs.h @@ -1,79 +1,234 @@ /* hgHgvs - support for a subset of the Human Genome Variation Society's (HGVS) nomenclature * for describing variants with respect to a specific gene sequence (or genome assembly). * See http://www.hgvs.org/mutnomen/ */ /* Copyright (C) 2016 The Regents of the University of California * See README in this or parent directory for licensing information. */ #ifndef HGHGVS_H #define HGHGVS_H -#include "basicBed.h" +#include "bed.h" /* The full nomenclature is extremely complicated, able to encode things such as gene fusions and * advanced clinical info (e.g. "=/" for somatic mosaicism, "=//" for chimerism). I am starting * simple with single-base substitutions, which should cover the majority of use cases, and will * work up from there. * * At the same time, since the spec has repeatedly changed, we will need to be flexible in our * parsing in order to support previously published HGVS (or HGVS-like) terms. */ enum hgvsSeqType -/* HGVS describes changes relative to several types of sequence: genomic, coding, protein, etc */ + // HGVS describes changes relative to several types of sequence: genomic, coding, protein, etc { hgvstUndefined, // the usual int default value means we haven't actually checked hgvstCoding, // "c.": Coding DNA sequence only. Beware: complex coords for intron & UTR. hgvstGenomic, // "g.": Genomic DNA hgvstMito, // "m.": Mitochondrial DNA hgvstNoncoding, // "n.": non-coding RNA hgvstRna, // "r.": RNA (like DNA but lowercase and 'u' instead of 'T') hgvstProtein, // "p.": Protein }; struct hgvsVariant -/* Components that we are able parse out of an HGVS term. (there's more to HGVS than just this) */ + // Components that we are able parse out of an HGVS term. (there's more to HGVS than just this) { struct hgvsVariant *next; char *seqAcc; // The reference sequence for the variant (optional -- may be NULL!) char *seqGeneSymbol; // Usually NULL, but DNA/RNA can have HGNC gene symbol in ()'s. - char *change; // The sequence change (following the position in nucleotide terms, - // with embedded position in protein terms); currently not parsed. + char *changes; // The sequence change(s) (following the position in nucleotide terms, + // with embedded position in protein terms); can be parsed into + // syntax tree by hgvsParseNucleotideChange (below) int start1; // 1-based start of the variant in reference seq: can be negative ! // For coding terms with complex intron or UTR coords, this is the // anchor coord (nearest CDS exon boundary) int end; // End of the variant in the reference seq; can be negative ! // For a range, this is the second anchor coord; else same as start // These two fields apply only to hgvstCoding ("c.") terms and Rna ("r.") for coding RNAs: int startOffset; // offset into intron; 1-based, 0 means N/A; can be negative! int endOffset; // offset into intron; 0 means N/A; can be negative! // For a range, this is the second offset; else same as startOffset enum hgvsSeqType type; // type of sequence: genomic, coding, protein, etc boolean startIsUtr3; // TRUE if start is relative to *end* of coding sequence boolean endIsUtr3; // TRUE if end is relative to *end* of coding sequence }; +// +// HGVS sequence change(s) syntax tree +// +// An HGVS description contains two parts: position and sequence change. +// Positions may be simple 1-based, nested ranges, or relative to various CDS boundaries. +// Currently positions are parsed using regular expressions in hgHgvs.c +// Sequence changes may contain nested HGVS terms (insertion from some other sequence). +// They are tokenized and parsed in hgHgvsParse.c. + +enum hgvsChangeType + // There may be no change, or a change that specifies ref and/or alt bases, or a change + // in which the sequence is all implied by the reference sequence. + { + hgvsctUndefined, + hgvsctNoChange, // No change; ref sequence is implied, alt sequence is the same. + hgvsctSubst, // Substitution: ref sequence and (possibly complex) alternate sequence + hgvsctDel, // Deletion; ref sequence is implied (or maybe asserted). + hgvsctDup, // Duplication; ref sequence is implied (or maybe asserted) + hgvsctInv, // Inversion; ref sequence is implied (or maybe asserted) + hgvsctIns, // Insertion: ref is implied; (possibly complex) inserted sequence follows + hgvsctCon, // Conversion: basically Deletion followed by Insertion + hgvsctRepeat, // Ref sequence is implied (first unit); alt sequence is some # of copies + hgvsctFrameshift, // First alt amino acid and possibly offset of early stop + }; + +// Sometimes instead of a sequence (or in addition to a del, dup etc), a length is given, +// sometimes not. Use int's sign to specify whether length was included or not. +#define HGVS_LENGTH_UNSPECIFIED -1 + +enum hgvsSeqChangeType + // A sequence is usually just a sequence, but it might come from a nested expression and/or + // be subjected to yet another change. + { + hgvsstUndefined, + hgvsstSimple, // IUPAC single-letter codes + '?' if some unspecified + hgvsstLength, // Only the length of the sequence is provided + hgvsstNestedChange, // Simple sequence... and another change that's applied to this one. + hgvsstNestedTerm, // HGVS term for sequence copied from some other reference sequence. + }; + +union hgvsSeqValue + // A sequence string, nested change, or nested term (external sequence) + { + char *seq; // IUPAC single-letter codes + '?' if some unspecified + int length; // Sequence length (content unspecified) + struct hgvsVariant term; // Nested HGVS term: acc, pos, possibly change + }; + +struct hgvsChangeRefAlt + // 'Before and after' (reference and alternate) sequence representations + { + char *refSequence; // NULL if the reference sequence is implied by the position, + // or if only length is asserted; otherwise it's asserted + // reference sequence in IUPAC single-letter codes + // and when some but not all bases are included in HGVS term, + // '?' for unspecified bases. + int refLength; // Sometimes, (redundant) length is given after del, dup, etc. + enum hgvsSeqChangeType altType; // Type of alternate sequence -- can be complex. + union hgvsSeqValue altValue; // IUPAC single-letter sequence, or a sequence of nested changes + }; + +struct hgvsChangeFrameshift + // Info that may or may not be given along with a frameshift that disrupts the protein sequence + { + int offset; // The number of AA bases downstream of a new termination + boolean offsetValid; // FALSE if offset was not provided + boolean elongation; // TRUE if offset appears after '*', i.e. after old termination + char *altSequence; // Sequence of IUPAC single-letter AA codes or NULL if not given + char refBase; // IUPAC single-letter AA code (or '\0' if not given) + }; + +struct hgvsChangeRepeat + // Repeat count (range) and optional repeated sequence + { + char *seq; // The sequence that is repeated, may be NULL (implied from ref) + int min; // Minimum repeat count specified + int max; // Max repeat count specified + }; + +union hgvsChangeValue + // Depending on type of change, ref & alt sequences, frameshift description or repeat count + { + struct hgvsChangeRefAlt refAlt; // A description of reference and alternate sequence + struct hgvsChangeFrameshift fs; // Optional info about early termination of translation + struct hgvsChangeRepeat repeat; // Optional repeated sequence and min/max observed counts. + int count; // Repeat unit count in alternate sequence (ref count is 1) + }; + +struct hgvsChange + // Usually this contains a sequence of bases from the reference and a sequence of bases from + // the alternate allele, but can contain nested HGVS terms. For deletions, reference bases + // are usually not specified, except for protein which includes the first and last amino acid. + { + struct hgvsChange *next; // One HGVS term can specify a sequence of changes + enum hgvsChangeType type; // HGVS operator: >, del, ins, fs etc. + union hgvsChangeValue value; // the actual sequences changed (possibly complex) + }; + +void hgvsVariantFree(struct hgvsVariant **pHgvs); +// Free *pHgvs and its contents, and set *pHgvs to NULL. + struct hgvsVariant *hgvsParseTerm(char *term); /* If term is a parseable form of HGVS, return the parsed representation, otherwise NULL. * This does not check validity of accessions or alleles. */ struct hgvsVariant *hgvsParsePseudoHgvs(char *db, char *term); /* Attempt to parse things that are not strict HGVS, but that people might intend as HGVS. */ boolean hgvsValidate(char *db, struct hgvsVariant *hgvs, char **retFoundAcc, int *retFoundVersion, char **retDiffRefAllele); /* Return TRUE if hgvs coords are within the bounds of the sequence for hgvs->seqAcc. * Note: Coding terms may contain coords outside the bounds (upstream, intron, downstream) so * those can't be checked without mapping the term to the genome; this returns TRUE if seq is found. * If retFoundAcc is not NULL, set it to our local accession (which may be missing the .version * of hgvs->seqAcc) or NULL if we can't find any match. * If retFoundVersion is not NULL and hgvs->seqAcc has a version number (e.g. NM_005957.4), * set retFoundVersion to our version from latest GenBank, otherwise 0 (no version for LRG). * If coords are OK and retDiffRefAllele is not NULL: if our sequence at the coords * matches hgvs->refAllele then set it to NULL; if mismatch then set it to our sequence. */ -struct bed3 *hgvsMapToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable); -/* Return a psl with target coords from db, mapping the variant to the genome. - * Return NULL if unable to map. +struct bed *hgvsMapToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable); +/* Return a bed6 with the variant's span on the genome and strand, or NULL if unable to map. * If successful and retPslTable is not NULL, set it to the name of the PSL table used. */ +struct bed *hgvsValidateAndMap(struct hgvsVariant *hgvs, char *db, char *term, + struct dyString *dyWarn, char **retPslTable); +/* If we have sequence for hgvs->seqAcc and the hgvs coords are within the bounds + * of that sequence, and we are able to map the coords in seqAcc to genomic coords in + * db, return a bed6 with those coords and strand. If unsuccessful, or if the reference + * sequence differs at the mapped location, then write a warning message to dyWarn. + * If mapping is successful and retPslTable is not NULL, set it to the psl table + * used for mapping. */ + +struct hgvsChange *hgvsParseNucleotideChange(char *changeStr, enum hgvsSeqType type, + struct dyString *dyError); +/* Return a parse tree of an HGVS nt sequence change (the part after the position range) + * possibly followed by additional changes. If there are any parse errors, they will be + * appended to dyError. */ + +struct vcfRow +/* char* fields of VCF row (no genotypes) except POS (int for sorting) and QUAL (omitted). */ +// See src/inc/vcf.h and VCF spec for definitions of fields. +// Currently this is used only in the context of HGVS. If it turns out to be useful elsewhere +// then it could be libified on its own or perhaps into vcf.h. + { + struct vcfRow *next; + char *chrom; + int pos; + char *id; + char *ref; + char *alt; + char *filter; + char *info; + }; + +int vcfRowCmp(const void *va, const void *vb); +/* Compare for sorting based on chrom,pos. */ + +void vcfRowTabOutList(FILE *f, struct vcfRow *rowList); +/* Write out each row of VCF to f. */ + +// VCF header definitions of FILTER and INFO terms created by hgvsToVcfRow: +#define HGVS_VCF_HEADER_DEFS \ + "##FILTER=\n" \ + "##FILTER=\n" \ + "##INFO=\n" \ + "##INFO=\n" + +struct vcfRow *hgvsToVcfRow(char *db, char *term, boolean doLeftShift, struct dyString *dyError); +/* Convert HGVS to a row of VCF suitable for sorting & printing. If unable, return NULL and + * put the reason in dyError. */ + #endif /* HGHGVS_H */