b534e5e167df93880881df0478ecec0225fdf136 angie Wed Aug 31 17:01:24 2016 -0700 This commit adds the capability to pick apart complex HGVS sequence change descriptions, and apply those changes to reference sequence, in order to translate HGVS nucleotide terms into a variant representation suitable for functional prediction in hgVai. VCF was chosen since it is easy to integrate into hgVai. refs #11460 Changes to existing code: * hgvsMapToGenome maps to BED6 instead of BED3 because we need to know strand in order to convert transcript changes into VCF forward-strand genomic changes. * hgvsMapToGenome maps insertions to zero-length points instead of 2-base ranges as in HGVS. New file hgHgvsParse.c contains a tokenizer and parser for HGVS sequence change descriptions; top-level interface is hgvsParseNucleotideChange. hgHgvs.c has new code to translate parsed HGVS nucleotide change(s) into VCF, optionally left-shifting ambiguous alignments (VCF convention, at odds with HGVS right-shifting convention); top-level interface is hgvsToVcfRow. New hgvsToVcf utility enables testing of corner cases and may come in handy as a command-line util. HGVS terms for testing have been taken from ClinVar and do not reflect the diversity of terms in the wild, nor do they cover the full HGVS spec. For example, the HGVS repeat notation can be parsed but not mapped to the genome because all of the ClinVar repeat terms that I looked at looked wonky to me and I believe the HGVS repeat notation is inherently error-prone. The repeat notation is supposed to use the position of the first repeat unit and to specify the number of repeated copies starting at that point (right-shifted if ambiguous). However, in ClinVar, sometimes the given repeat unit sequence did not match the reference sequence at the given position; sometimes the number of sepeats made sense only if they were not perfect repeats (some differing bases); sometimes ranges of repeat numbers were given. Also, the reference assembly's number of repeats can change from one assembly to the next. So it is hard given an HGVS repeat term to determine 1) whether it makes sense in relation to the reference assembly with/without fuzzy matching and 2) what the exact change is relative to the reference assembly. Insertions of inverted sequence from elsewhere in the same reference have not yet been tested. http://varnomen.hgvs.org/recommendations/DNA/variant/inversion/ gives some complicated examples like "g.122_123ins213_234invinsAins123_211inv" but I have not yet seen terms like that in the wild. diff --git src/hg/lib/hgHgvsParse.c src/hg/lib/hgHgvsParse.c new file mode 100644 index 0000000..e97b6ae --- /dev/null +++ src/hg/lib/hgHgvsParse.c @@ -0,0 +1,803 @@ +/* hgHgvsParse.c - Parse the Human Genome Variation Society (HGVS) nomenclature for variants. */ +/* See http://varnomen.hgvs.org/ and https://github.com/mutalyzer/mutalyzer/ */ + +/* Copyright (C) 2016 The Regents of the University of California + * See README in this or parent directory for licensing information. */ +#include "common.h" +#include "hgHgvs.h" +#include "regexHelper.h" + +// +// Tokenizer for HGVS description (the part after the /[cgnmpr]\./) +// +// Some tokens are valid only for certain prefixes/types of sequence, e.g. some are valid +// only after "p." (protein), some only after "c." (CDS), some only for /[cgmnr]\./ (nucleotides). +// Some operator tokens have different meanings depending on what prefix they follow. + +enum hdTokenType +{ + tk_undefined, + tk_eof, + tk_int, + tk_seq3Letter, // IUPAC protein 3-letter codes + tk_seq1Letter, // IUPAC single-letter codes for protein, DNA or RNA + // Operators + tk_del, // deletion + tk_inv, // inversion + tk_dup, // duplication + tk_ins, // insertion + tk_con, // conversion http://varnomen.hgvs.org/recommendations/DNA/variant/conversion + // http://varnomen.hgvs.org/recommendations/RNA/variant/conversion + // -- looks like "delins" to me. + tk_fs, // frameshift (protein only) + tk_star, // Stop codon position anchor + tk_underscore, // Coordinate range separator + tk_minus, // CDS intron relative to start of next exon | offset for protein ext operator + tk_plus, // CDS intron relative to end of previous exon | offset for protein ext operator + tk_equal, // synonymous (protein) or no change from reference (nucleotide) + tk_ntSubst, // nucleotide substitution + tk_question, // unknown position + tk_leftParen, // begin uncertain subrange | predicted consequence | grouping of "or" terms | + // surrounding semicolon with no square brackets when synteny is uncertain + tk_rightParen, // end " + + // Advanced operators, not likely to be supported soon: + tk_colon, // oops, the previous tokens, possibly including a "." + version, and + // possibly followed by a /[cgmnpr]\./, were + // actually the accession of a sequence from which some bases are copied here. + // Fortunately it's irrelevant for mapping to genomic coords in browser. + // example from http://varnomen.hgvs.org/recommendations/DNA/variant/insertion/ : + // g.123_124insL37425.1:23_361 + // "the insertion of nucleotides 23 to 361 as described in GenBank file L37425.1 + // between nucleotides g.123 and g.124" + // and ClinVar: NM_020533.2(MCOLN1):c.236_237insNC_012920.1:m.12435_12527 + tk_period, // hopefully this is followed by a version number and colon, or after a colon + // and [cgmnpr]. + tk_caret, // "or" (e.g. c.(370A>C^372C>R) as back translation of p.Ser124Arg) + tk_ext, // extension http://varnomen.hgvs.org/recommendations/protein/variant/extension + tk_leftSquare, // begin haplotype allele; or, if followed by number, repeat count. + tk_rightSquare,// " + tk_semicolon, // separator for variants of a genomic haplotype allele inside []s, + // separator for multiple haplotypes (e.g. diploid) outside []s + // enclosed in parens "(;)" when not certain if variants are on same allele + tk_comma, // sep transcript or protein (not genomic) variants inside []s + tk_doubleSlash,// chimerism http://varnomen.hgvs.org/recommendations/DNA/variant/complex/ + tk_singleSlash,// mosaicism http://varnomen.hgvs.org/recommendations/DNA/variant/complex/ + tk_leftCurly, // see http://varnomen.hgvs.org/recommendations/open-issues/#imperfectcopy + tk_rightCurly, // " +}; + +struct hdTokenProps + // Associate the token string that we'll scan for in input with type and attributes + { + char *value; // token string literal (do not clobber!) + enum hdTokenType type; // token as enum + }; + +static struct hdTokenProps dnaTokens[] = + // Tokens that may appear after "g." (genomic), "m." (mitochondrial) and "n." (non-coding) + { + // Operators + { "ins", tk_ins }, + { "del", tk_del }, + { "dup", tk_dup }, + { "inv", tk_inv }, + { "_", tk_underscore }, + { "=", tk_equal }, + { ">", tk_ntSubst }, + { "?", tk_question }, + { "(", tk_leftParen }, + { ")", tk_rightParen }, + { "[", tk_leftSquare }, + { "]", tk_rightSquare }, + // IUPAC nucleotide single-letter codes + // Note: the HGVS website doesn't spell out whether ambiguous codes are OK, + // but ClinVar uses them so let's support them. + { "A", tk_seq1Letter }, + { "C", tk_seq1Letter }, + { "G", tk_seq1Letter }, + { "T", tk_seq1Letter }, + { "B", tk_seq1Letter }, + { "D", tk_seq1Letter }, + { "H", tk_seq1Letter }, + { "K", tk_seq1Letter }, + { "M", tk_seq1Letter }, + { "N", tk_seq1Letter }, + { "R", tk_seq1Letter }, + { "S", tk_seq1Letter }, + { "V", tk_seq1Letter }, + { "W", tk_seq1Letter }, + { "Y", tk_seq1Letter }, + { NULL, tk_undefined }, + }; + +static struct hdTokenProps codingTokens[] = + // Tokens that may appear after "c." (cDNA with CDS numbering). + { + // Operators + { "ins", tk_ins }, + { "del", tk_del }, + { "dup", tk_dup }, + { "inv", tk_inv }, + { "*", tk_star }, + { "_", tk_underscore }, + { "-", tk_minus }, + { "+", tk_plus }, + { "=", tk_equal }, + { ">", tk_ntSubst }, + { "?", tk_question }, + { "(", tk_leftParen }, + { ")", tk_rightParen }, + { "[", tk_leftSquare }, + { "]", tk_rightSquare }, + // IUPAC nucleotide single-letter codes + // Note: the HGVS website doesn't spell out whether ambiguous codes are OK, + // but ClinVar uses them so let's support them. + { "A", tk_seq1Letter }, + { "C", tk_seq1Letter }, + { "G", tk_seq1Letter }, + { "T", tk_seq1Letter }, + { "B", tk_seq1Letter }, + { "D", tk_seq1Letter }, + { "H", tk_seq1Letter }, + { "K", tk_seq1Letter }, + { "M", tk_seq1Letter }, + { "N", tk_seq1Letter }, + { "R", tk_seq1Letter }, + { "S", tk_seq1Letter }, + { "V", tk_seq1Letter }, + { "W", tk_seq1Letter }, + { "Y", tk_seq1Letter }, + { NULL, tk_undefined }, + }; + +static struct hdTokenProps rnaTokens[] = + // Tokens that may appear after "r." (RNA) -- maddeningly, these may use CDS or noncoding + // numbering, unspecified, I guess you have to guess from NM_ vs NR_ but who knows about LRG. + // I hope this isn't used too often. + { + // Operators + { "ins", tk_ins }, + { "del", tk_del }, + { "dup", tk_dup }, + { "inv", tk_inv }, + { "*", tk_star }, + { "_", tk_underscore }, + { "-", tk_minus }, + { "+", tk_plus }, + { "=", tk_equal }, + { ">", tk_ntSubst }, + { "?", tk_question }, + { "(", tk_leftParen }, + { ")", tk_rightParen }, + { "[", tk_leftSquare }, + { "]", tk_rightSquare }, + // IUPAC RNA bases: lower-case, u instead of t + // Note: the HGVS website doesn't spell out whether ambiguous codes are OK, + // but ClinVar uses them so let's support them. + { "a", tk_seq1Letter }, + { "c", tk_seq1Letter }, + { "g", tk_seq1Letter }, + { "u", tk_seq1Letter }, + { "b", tk_seq1Letter }, + { "d", tk_seq1Letter }, + { "h", tk_seq1Letter }, + { "k", tk_seq1Letter }, + { "m", tk_seq1Letter }, + { "n", tk_seq1Letter }, + { "r", tk_seq1Letter }, + { "s", tk_seq1Letter }, + { "v", tk_seq1Letter }, + { "w", tk_seq1Letter }, + { "y", tk_seq1Letter }, + { NULL, tk_undefined }, + }; + +static struct hdTokenProps proteinTokens[] = + // Tokens that may appear after "p." (protein) + { + // Operators + { "ins", tk_ins }, + { "del", tk_del }, + { "dup", tk_dup }, + { "inv", tk_inv }, + { "fs", tk_fs }, + { "_", tk_underscore }, + { "=", tk_equal }, + { "?", tk_question }, + { "(", tk_leftParen }, + { ")", tk_rightParen }, + { "*", tk_star }, + // IUPAC protein 3-letter codes + { "Ala", tk_seq3Letter }, + { "Arg", tk_seq3Letter }, + { "Asn", tk_seq3Letter }, + { "Asp", tk_seq3Letter }, + { "Cys", tk_seq3Letter }, + { "Gln", tk_seq3Letter }, + { "Glu", tk_seq3Letter }, + { "Gly", tk_seq3Letter }, + { "His", tk_seq3Letter }, + { "Ile", tk_seq3Letter }, + { "Leu", tk_seq3Letter }, + { "Lys", tk_seq3Letter }, + { "Met", tk_seq3Letter }, + { "Phe", tk_seq3Letter }, + { "Pro", tk_seq3Letter }, + { "Ser", tk_seq3Letter }, + { "Thr", tk_seq3Letter }, + { "Trp", tk_seq3Letter }, + { "Tyr", tk_seq3Letter }, + { "Val", tk_seq3Letter }, + { "Ter", tk_seq3Letter }, + // IUPAC protein (and/or nucleotide ACGT) single-letter codes + { "A", tk_seq1Letter }, + { "R", tk_seq1Letter }, + { "N", tk_seq1Letter }, + { "D", tk_seq1Letter }, + { "C", tk_seq1Letter }, + { "Q", tk_seq1Letter }, + { "E", tk_seq1Letter }, + { "G", tk_seq1Letter }, + { "H", tk_seq1Letter }, + { "I", tk_seq1Letter }, + { "L", tk_seq1Letter }, + { "K", tk_seq1Letter }, + { "M", tk_seq1Letter }, + { "F", tk_seq1Letter }, + { "P", tk_seq1Letter }, + { "S", tk_seq1Letter }, + { "T", tk_seq1Letter }, + { "W", tk_seq1Letter }, + { "Y", tk_seq1Letter }, + { "V", tk_seq1Letter }, + { "X", tk_seq1Letter }, + { NULL, tk_undefined }, + }; + +static struct hdTokenProps *hdTokenAlphabetFromType(enum hgvsSeqType type) +/* Return a set of acceptable tokens for type. */ +{ +struct hdTokenProps *props = NULL; +switch (type) + { + case hgvstCoding: + props = codingTokens; + break; + case hgvstGenomic: + case hgvstMito: + case hgvstNoncoding: + props = dnaTokens; + break; + case hgvstProtein: + props = proteinTokens; + break; + case hgvstRna: + props = rnaTokens; + break; + default: + errAbort("Unrecognized hgvsSeqType %d", type); + } +return props; +} + +struct hdTokenizer +/* Tokenizer on string input, with properties of current token, that can reuse the current token. */ +{ + char *input; // Text to be parsed (after current token) + struct hdTokenProps *tokenAlphabet; // Array of all tokens that we might encounter + enum hdTokenType type; // Type of the current token + char *string; // String value of current token + int sAlloc; // Allocated string size + bool reuse; // TRUE if want to give back this token + bool eof; // TRUE at end of input +}; + +// Expand sAlloc as needed, this many bytes at a time: +#define HDT_BUF_SIZE 128 + +static void hdtReuse(struct hdTokenizer *hdt) +/* Reuse current token, i.e. don't advance on the next call to hdtNext(). */ +{ +hdt->reuse = TRUE; +} + +static void hdtAdvance(struct hdTokenizer *hdt, int len) +/* Copy len chars of hdt->input into hdt->string & null-terminate, resizing if necessary, + * and advance hdt->input by len. */ +{ +if (len >= hdt->sAlloc) + { + while (len >= hdt->sAlloc) + hdt->sAlloc += HDT_BUF_SIZE; + hdt->string = needMoreMem(hdt->string, 0, hdt->sAlloc); + } +memcpy(hdt->string, hdt->input, len); +hdt->string[len] = '\0'; +hdt->input += len; +} + +static boolean hdtEof(struct hdTokenizer *hdt) +/* If input is empty, update hdt to EOF state and return TRUE; */ +{ +if (isEmpty(hdt->input)) + { + hdt->eof = TRUE; + hdt->type = tk_eof; + hdt->string[0] = '\0'; + return TRUE; + } +return FALSE; +} + +static boolean hdtNextFromAlphabet(struct hdTokenizer *hdt) +/* If hdt->input starts with a static string in the tokenAlphabet, update hdt and return TRUE */ +{ +if (hdtEof(hdt)) + return FALSE; +int i; +for (i = 0; isNotEmpty(hdt->tokenAlphabet[i].value); i++) + { + struct hdTokenProps *props = &hdt->tokenAlphabet[i]; + if (startsWith(props->value, hdt->input)) + { + hdtAdvance(hdt, strlen(props->value)); + hdt->type = props->type; + return TRUE; + } + } +hdt->type = tk_undefined; +return FALSE; +} + +static void dyPrependf(struct dyString *dyError, char *format, ...) +/* Add a warning to *beginning* of dyError so that when we pop the call stack, higher + * level errors appear before lower lever errors. */ +{ +char oldErr[dyStringLen(dyError)+1]; +safecpy(oldErr, sizeof(oldErr), dyStringContents(dyError)); +dyStringClear(dyError); +va_list args; +va_start(args, format); +dyStringVaPrintf(dyError, format, args); +va_end(args); +if (isNotEmpty(oldErr)) + { + char newErr[dyStringLen(dyError)+1]; + safecpy(newErr, sizeof(newErr), dyStringContents(dyError)); + dyStringClear(dyError); + dyStringAppend(dyError, newErr); + dyStringAppend(dyError, "; "); + dyStringAppend(dyError, oldErr); + } +} + +static boolean hdtNext(struct hdTokenizer *hdt, struct dyString *dyError, char *context) +/* If hdt can advance to the next token in input, then make that hdt's current token and return + * TRUE. EOF is considered valid. If input starts with something unexpected then return FALSE. */ +{ +if (hdt->reuse) + { + // Stay on current token. + hdt->reuse = FALSE; + return TRUE; + } +if (hdtEof(hdt)) + return TRUE; +if (isdigit(hdt->input[0])) + { + // Handle numbers separately from static tokens. + int len = 1; + while (isdigit(hdt->input[len])) + len++; + hdtAdvance(hdt, len); + hdt->type = tk_int; + return TRUE; + } +else + { + char *start = hdt->input; + if (hdtNextFromAlphabet(hdt)) + { + enum hdTokenType firstType = hdt->type; + if (firstType == tk_seq1Letter || firstType == tk_seq3Letter) + { + // Accumulate sequence into string instead of returning one base at a time. + boolean isValid = TRUE; + while (hdt->type == firstType && isValid) + isValid = hdtNextFromAlphabet(hdt); + // We advanced to something that is not sequence, so we'll need to back up. + int len = hdt->input - start; + if (isValid) + len -= strlen(hdt->string); + hdt->input = start; + hdt->type = firstType; + hdt->eof = FALSE; + hdtAdvance(hdt, len); + } + return TRUE; + } + } +dyPrependf(dyError, "bad token %s at '%s'", context, hdt->input); +return FALSE; +} + +static struct hdTokenizer *hdTokenizerNew(char *input, struct hdTokenProps *tokenAlphabet) +/* Return a new tokenizer on input that will use tokenAlphabet. Call htdNext on the returned + * tokenizer to get the first token (and the next...) loaded into tokenizer. Note that this + * does not clone input, it simply scans it. */ +{ +struct hdTokenizer *hdt; +AllocVar(hdt); +hdt->input = input; +hdt->tokenAlphabet = tokenAlphabet; +hdt->sAlloc = HDT_BUF_SIZE; +hdt->string = needMem(hdt->sAlloc); +return hdt; +} + +static void hdTokenizerFree(struct hdTokenizer **pHdt) +/* Free up what we allocated (which does not include hdt->input). */ +{ +struct hdTokenizer *hdt = *pHdt; +if (hdt != NULL) + { + freeMem(hdt->string); + freez(pHdt); + } +} + +static boolean parseNonNegNumOrRange(struct hdTokenizer *hdt, int *retMin, int *retMax, + struct dyString *dyError) +/* Parse a nonnegative number or _-separated range of nonneg numbers, possibly '?' for + * unknown value. + * Set retMin and retMax to -1 for '?', otherwise the min and max number. */ +{ +int min = 0, max = 0; +// Expect a number +if (hdt->type == tk_int || hdt->type == tk_question) + { + min = max = (hdt->type == tk_int) ? atoi(hdt->string) : HGVS_LENGTH_UNSPECIFIED; + if (!hdtNext(hdt, dyError, "after number")) + return FALSE; + if (hdt->type == tk_underscore) + { + // range of counts -- get next number + if (!hdtNext(hdt, dyError, "after underscore")) + return FALSE; + if (hdt->type == tk_int || hdt->type == tk_question) + max = (hdt->type == tk_int) ? atoi(hdt->string) : HGVS_LENGTH_UNSPECIFIED; + else + { + dyPrependf(dyError, + "expecting nonnegative numeric range but got non-number after '%d_' at '%s'", + min, hdt->string); + return FALSE; + } + } + else + // just a number, not a range; don't consume this token + hdtReuse(hdt); + } +else + { + dyPrependf(dyError, "expecting nonnegative number or range (or '?') but got '%s'", hdt->string); + return FALSE; + } +*retMin = min; +*retMax = max; +return TRUE; +} + +static boolean parseNonNegNumOrRangeMaybeParens(struct hdTokenizer *hdt, int *retMin, int *retMax, + struct dyString *dyError) +{ +/* Parse a nonnegative number or _-separated range of nonneg numbers, possibly '?' for + * unknown value, possibly enclosed in parens. + * Set retMin and retMax to -1 for '?', otherwise the min and max number. */ +if (hdt->type == tk_leftParen) + { + if (!hdtNext(hdt, dyError, "after '('")) + return FALSE; + if (! parseNonNegNumOrRange(hdt, retMin, retMax, dyError)) + { + dyPrependf(dyError, "after '('"); + return FALSE; + } + if (!hdtNext(hdt, dyError, "after '(' followed by number/range")) + return FALSE; + if (hdt->type != tk_rightParen) + { + dyPrependf(dyError, "expected ')' after number/range, got '%s'", hdt->string); + return FALSE; + } + return TRUE; + } +else + return parseNonNegNumOrRange(hdt, retMin, retMax, dyError); +} + +static struct hgvsChange *hgvsChangeNewRepeat(char *seq, int min, int max) +/* Allocate and return an hgvsChange specifying a repeated sequence. */ +{ +struct hgvsChange *change; +AllocVar(change); +change->type = hgvsctRepeat; +change->value.repeat.seq = seq; +change->value.repeat.min = min; +change->value.repeat.max = max; +return change; +} + +static struct hgvsChange *parseNtRepeat(char *seq, struct hdTokenizer *hdt, + struct dyString *dyError) +/* Optionally preceded by seq, enclosed in square brackets and/or parens, a number or range. */ +{ +// Make sure we start with '(' or '[' +enum hdTokenType lType = hdt->type; +if (lType != tk_leftParen && lType != tk_leftSquare) + errAbort("parseNtRepeat: expected to start with ( or ["); +if (!hdtNext(hdt, dyError, "after '[' or '(' in repeat")) + return NULL; +// Next should be a repeat count or range of counts +int min, max; +if (! parseNonNegNumOrRangeMaybeParens(hdt, &min, &max, dyError)) + { + dyPrependf(dyError, "in brackets for repeat"); + return NULL; + } +// Now matching ')' or ']' +if (!hdtNext(hdt, dyError, "after '[' or '(' followed by number")) + return NULL; +struct hgvsChange *change = NULL; +if ((lType == tk_leftParen && hdt->type == tk_rightParen) || + (lType == tk_leftSquare && hdt->type == tk_rightSquare)) + { + change = hgvsChangeNewRepeat(seq, min, max); + } +else + dyPrependf(dyError, "mismatching brackets/parens for repeat"); +return change; +} + +static struct hgvsChange *hgvsChangeNewSimple(enum hgvsChangeType type, char *ref, int refLength, + char *alt, int altLength) +/* Allocate and return a simple change with optional ref and optional alt (or just lengths). + * ref and alt are not cloned here, they must be allocated by caller! */ +{ +struct hgvsChange *change; +AllocVar(change); +change->type = type; +change->value.refAlt.refSequence = ref; +change->value.refAlt.refLength = refLength; +if (alt == NULL && altLength != HGVS_LENGTH_UNSPECIFIED) + { + change->value.refAlt.altType = hgvsstLength; + change->value.refAlt.altValue.length = altLength; + } +else + { + change->value.refAlt.altType = hgvsstSimple; + change->value.refAlt.altValue.seq = alt; + } +return change; +} + +static struct hgvsChange *parseNtIupac(struct hdTokenizer *hdt, struct dyString *dyError) +/* IUPAC sequence should be followed by '>' (subst) or '=' (no change) or '[' (repeat) */ +{ +if (hdt->type != tk_seq1Letter) + errAbort("parseNtIupac: expected to start with nucleotide IUPAC sequence"); +char *ref = cloneString(hdt->string); +if (!hdtNext(hdt, dyError, "after IUPAC sequence")) + return NULL; +if (hdt->type == tk_ntSubst) + { + if (!hdtNext(hdt, dyError, "after '>'")) + return NULL; + else if (hdt->type == tk_seq1Letter) + return hgvsChangeNewSimple(hgvsctSubst, ref, HGVS_LENGTH_UNSPECIFIED, + cloneString(hdt->string), HGVS_LENGTH_UNSPECIFIED); + else + dyPrependf(dyError, "expected IUPAC sequence after '>', got '%s'", hdt->string); + } +else if (hdt->type == tk_equal) + { + // Reference sequence was asserted, so store that for comparison with actual reference. + return hgvsChangeNewSimple(hgvsctNoChange, ref, HGVS_LENGTH_UNSPECIFIED, + NULL, HGVS_LENGTH_UNSPECIFIED); + } +else if (hdt->type == tk_leftSquare || hdt->type == tk_leftParen) + return parseNtRepeat(ref, hdt, dyError); +else + dyPrependf(dyError, "something unexpected after IUPAC sequence"); +return NULL; +} + +static struct hgvsChange *parseNtDelOrDupOrInv(struct hdTokenizer *hdt, struct dyString *dyError) +/* del, dup or inv may be followed by an asserted reference sequence, a redundant length (ignore), + * or a new change. */ +{ +struct hgvsChange *change = NULL; +if (hdt->type != tk_del && hdt->type != tk_dup && hdt->type != tk_inv) + errAbort("parseNtDelOrDupOrInv: expecting 'del', 'dup' or 'inv'"); +enum hdTokenType opType = hdt->type; +if (!hdtNext(hdt, dyError, "after del, dup or inv")) + return NULL; +char *ref = NULL; +int refLength = HGVS_LENGTH_UNSPECIFIED; +if (hdt->type == tk_seq1Letter) + ref = cloneString(hdt->string); +else if (hdt->type == tk_int) + refLength = atoi(hdt->string); +else + // Something else -- hopefully the beginning of the next change or eof + hdtReuse(hdt); +enum hgvsChangeType type = hgvsctUndefined; +if (opType == tk_del) + type = hgvsctDel; +else if (opType == tk_dup) + type = hgvsctDup; +else if (opType == tk_inv) + type = hgvsctInv; +change = hgvsChangeNewSimple(type, ref, refLength, NULL, HGVS_LENGTH_UNSPECIFIED); +return change; +} + +static enum hgvsSeqType hgvsSeqTypeFromString(char changeType) +/* Translate [cgmnrp] to enum hgvsSeqType */ +{ +switch (changeType) + { + // Not using break below, because return does the job + case 'c': + return hgvstCoding; + case 'g': + return hgvstGenomic; + case 'm': + return hgvstMito; + case 'n': + return hgvstNoncoding; + case 'r': + return hgvstRna; + case 'p': + return hgvstProtein; + default: + errAbort("hgvsSeqTypeFromString: unrecognized input '%c'", changeType); + } +return hgvstUndefined; +} + +// After ins or con, it's possible to have a nested term to specify sequence from outside the ref: +#define nestedExp "^([A-Z_]+[0-9]+(\\.[0-9]+)?):([cgmnr])\\.([0-9]+)_([0-9]+)(inv[0-9]*)?" +// 1 ....................... accession +// 2 ......... dot suffix? +// 3 .... type +// 4 .... start +// 5 .... end +// 6 ... inv? + +static struct hgvsChange *hgvsChangeNewNested(struct hdTokenizer *hdt, regmatch_t *substrs) +/* hdt's current token is inv or con; next up is a nested HGVS term with components in + * substrs as outlined for nestedExp above. Return an hgvsChange with nested hgvsVariant. */ +{ +struct hgvsChange *change; +AllocVar(change); +change->type = (hdt->type == tk_ins) ? hgvsctIns : hgvsctCon; +struct hgvsChangeRefAlt *refAlt = &change->value.refAlt; +refAlt->refLength = HGVS_LENGTH_UNSPECIFIED; +refAlt->altType = hgvsstNestedTerm; +struct hgvsVariant *nestedTerm = &refAlt->altValue.term; +nestedTerm->seqAcc = regexSubstringClone(hdt->input, substrs[1]); +char changeType = hdt->input[substrs[3].rm_so]; // [cgmnrp] +nestedTerm->type = hgvsSeqTypeFromString(changeType); +nestedTerm->start1 = regexSubstringInt(hdt->input, substrs[4]); +nestedTerm->end = regexSubstringInt(hdt->input, substrs[5]); +nestedTerm->changes = regexSubstringClone(hdt->input, substrs[6]); +return change; +} + +static struct hgvsChange *parseNtInsOrCon(struct hdTokenizer *hdt, struct dyString *dyError) +/* ins or con may be followed by a nested term (accession, type dot, pos range, possibly inv) + * so do some extra lookahead. ins can also be followed by numbers in parens (or missing the + * parens...) to indicate a number of unknown bases (Ns); if so then we do an end run around the + * tokenizer. */ +{ +struct hgvsChange *change = NULL; +if (hdt->type != tk_ins && hdt->type != tk_con) + errAbort("parseNtInsOrCon: expecting 'ins' or 'con', got '%s' (followed by '%s')", + hdt->string, hdt->input); +// Don't advance hdt until we first look ahead for accession:... +regmatch_t substrs[8]; +if (regexMatchSubstr(hdt->input, nestedExp, substrs, ArraySize(substrs))) + { + change = hgvsChangeNewNested(hdt, substrs); + // Skip over everything that nestedRegex matched, and update hdt->type. + hdtAdvance(hdt, substrs[0].rm_eo - substrs[0].rm_so); + hdt->type = (regexSubstrMatched(substrs[6])) ? tk_inv : tk_seq1Letter; + } +else + { + // Tokenize and parse the usual way -- a sequence or length might be asserted after this. + enum hdTokenType type = hdt->type; + char context[hdt->sAlloc + 256]; + safef(context, sizeof(context), "after '%s'", hdt->string); + if (!hdtNext(hdt, dyError, context)) + return NULL; + char *insSeq = NULL; + int insLen = HGVS_LENGTH_UNSPECIFIED; + if (hdt->type == tk_seq1Letter) + insSeq = cloneString(hdt->string); + else if (hdt->type == tk_int || hdt->type == tk_leftParen) + { + // Length only, should be in parens but ClinVar just puts number. + int min, max; + if (! parseNonNegNumOrRangeMaybeParens(hdt, &min, &max, dyError)) + { + dyPrependf(dyError, "expecting number (optionally in parens) after '%s'", + (type == tk_ins) ? "ins" : "con"); + return NULL; + } + if (min != max) + { +//#*** A range is valid -- it means the insertion is from some other position range in this sequence! And it can be inverted. + dyPrependf(dyError, "expecting number after %s but got range (%d_%d)", + (type == tk_ins) ? "ins" : "con", min, max); + return NULL; + } + insLen = max; + } + if (insSeq || insLen != HGVS_LENGTH_UNSPECIFIED) + { + enum hgvsChangeType changeType = (type == tk_ins) ? hgvsctIns : hgvsctCon; + change = hgvsChangeNewSimple(changeType, NULL, HGVS_LENGTH_UNSPECIFIED, + insSeq, insLen); + } + } +return change; +} + +struct hgvsChange *hgvsParseNucleotideChange(char *changeStr, enum hgvsSeqType type, + struct dyString *dyError) +/* Return a parse tree of a coding HGVS sequence change (the part after the position range) + * possibly followed by additional changes. If there are any parse errors, they will be + * appended to dyError. */ +{ +struct hdTokenProps *alphabet = hdTokenAlphabetFromType(type); +struct hdTokenizer *hdt = hdTokenizerNew(changeStr, alphabet); +struct hgvsChange *changeList = NULL; +char context[strlen(changeStr) + 256]; +safef(context, sizeof(context), "at start of '%s'", changeStr); +if (!hdtNext(hdt, dyError, context)) + { + hdTokenizerFree(&hdt); + return NULL; + } +while (! hdt->eof) + { + struct hgvsChange *change = NULL; + if (hdt->type == tk_seq1Letter) + change = parseNtIupac(hdt, dyError); + else if (hdt->type == tk_leftSquare || hdt->type == tk_leftParen) + change = parseNtRepeat(NULL, hdt, dyError); + else if (hdt->type == tk_del || hdt->type == tk_dup || + hdt->type == tk_inv) + change = parseNtDelOrDupOrInv(hdt, dyError); + else if (hdt->type == tk_ins || hdt->type == tk_con) + change = parseNtInsOrCon(hdt, dyError); + if (change != NULL) + slAddHead(&changeList, change); + else + { + dyPrependf(dyError, "couldn't parse HGVS nucleotide change string '%s'", changeStr); + break; + } + if (!hdtNext(hdt, dyError, "after valid change")) + { + break; + } + } +slReverse(&changeList); +hdTokenizerFree(&hdt); +return changeList; +}