812dec9c5d8a5c947bd30442feae4a4449f50257 angie Wed Aug 9 13:41:58 2017 -0700 Oops, forgot to handle '=' in hgvsParseNucleotideChange. diff --git src/hg/lib/hgHgvsParse.c src/hg/lib/hgHgvsParse.c index e97b6ae..41c070d 100644 --- src/hg/lib/hgHgvsParse.c +++ src/hg/lib/hgHgvsParse.c @@ -1,803 +1,806 @@ /* hgHgvsParse.c - Parse the Human Genome Variation Society (HGVS) nomenclature for variants. */ /* See http://varnomen.hgvs.org/ and https://github.com/mutalyzer/mutalyzer/ */ /* Copyright (C) 2016 The Regents of the University of California * See README in this or parent directory for licensing information. */ #include "common.h" #include "hgHgvs.h" #include "regexHelper.h" // // Tokenizer for HGVS description (the part after the /[cgnmpr]\./) // // Some tokens are valid only for certain prefixes/types of sequence, e.g. some are valid // only after "p." (protein), some only after "c." (CDS), some only for /[cgmnr]\./ (nucleotides). // Some operator tokens have different meanings depending on what prefix they follow. enum hdTokenType { tk_undefined, tk_eof, tk_int, tk_seq3Letter, // IUPAC protein 3-letter codes tk_seq1Letter, // IUPAC single-letter codes for protein, DNA or RNA // Operators tk_del, // deletion tk_inv, // inversion tk_dup, // duplication tk_ins, // insertion tk_con, // conversion http://varnomen.hgvs.org/recommendations/DNA/variant/conversion // http://varnomen.hgvs.org/recommendations/RNA/variant/conversion // -- looks like "delins" to me. tk_fs, // frameshift (protein only) tk_star, // Stop codon position anchor tk_underscore, // Coordinate range separator tk_minus, // CDS intron relative to start of next exon | offset for protein ext operator tk_plus, // CDS intron relative to end of previous exon | offset for protein ext operator tk_equal, // synonymous (protein) or no change from reference (nucleotide) tk_ntSubst, // nucleotide substitution tk_question, // unknown position tk_leftParen, // begin uncertain subrange | predicted consequence | grouping of "or" terms | // surrounding semicolon with no square brackets when synteny is uncertain tk_rightParen, // end " // Advanced operators, not likely to be supported soon: tk_colon, // oops, the previous tokens, possibly including a "." + version, and // possibly followed by a /[cgmnpr]\./, were // actually the accession of a sequence from which some bases are copied here. // Fortunately it's irrelevant for mapping to genomic coords in browser. // example from http://varnomen.hgvs.org/recommendations/DNA/variant/insertion/ : // g.123_124insL37425.1:23_361 // "the insertion of nucleotides 23 to 361 as described in GenBank file L37425.1 // between nucleotides g.123 and g.124" // and ClinVar: NM_020533.2(MCOLN1):c.236_237insNC_012920.1:m.12435_12527 tk_period, // hopefully this is followed by a version number and colon, or after a colon // and [cgmnpr]. tk_caret, // "or" (e.g. c.(370A>C^372C>R) as back translation of p.Ser124Arg) tk_ext, // extension http://varnomen.hgvs.org/recommendations/protein/variant/extension tk_leftSquare, // begin haplotype allele; or, if followed by number, repeat count. tk_rightSquare,// " tk_semicolon, // separator for variants of a genomic haplotype allele inside []s, // separator for multiple haplotypes (e.g. diploid) outside []s // enclosed in parens "(;)" when not certain if variants are on same allele tk_comma, // sep transcript or protein (not genomic) variants inside []s tk_doubleSlash,// chimerism http://varnomen.hgvs.org/recommendations/DNA/variant/complex/ tk_singleSlash,// mosaicism http://varnomen.hgvs.org/recommendations/DNA/variant/complex/ tk_leftCurly, // see http://varnomen.hgvs.org/recommendations/open-issues/#imperfectcopy tk_rightCurly, // " }; struct hdTokenProps // Associate the token string that we'll scan for in input with type and attributes { char *value; // token string literal (do not clobber!) enum hdTokenType type; // token as enum }; static struct hdTokenProps dnaTokens[] = // Tokens that may appear after "g." (genomic), "m." (mitochondrial) and "n." (non-coding) { // Operators { "ins", tk_ins }, { "del", tk_del }, { "dup", tk_dup }, { "inv", tk_inv }, { "_", tk_underscore }, { "=", tk_equal }, { ">", tk_ntSubst }, { "?", tk_question }, { "(", tk_leftParen }, { ")", tk_rightParen }, { "[", tk_leftSquare }, { "]", tk_rightSquare }, // IUPAC nucleotide single-letter codes // Note: the HGVS website doesn't spell out whether ambiguous codes are OK, // but ClinVar uses them so let's support them. { "A", tk_seq1Letter }, { "C", tk_seq1Letter }, { "G", tk_seq1Letter }, { "T", tk_seq1Letter }, { "B", tk_seq1Letter }, { "D", tk_seq1Letter }, { "H", tk_seq1Letter }, { "K", tk_seq1Letter }, { "M", tk_seq1Letter }, { "N", tk_seq1Letter }, { "R", tk_seq1Letter }, { "S", tk_seq1Letter }, { "V", tk_seq1Letter }, { "W", tk_seq1Letter }, { "Y", tk_seq1Letter }, { NULL, tk_undefined }, }; static struct hdTokenProps codingTokens[] = // Tokens that may appear after "c." (cDNA with CDS numbering). { // Operators { "ins", tk_ins }, { "del", tk_del }, { "dup", tk_dup }, { "inv", tk_inv }, { "*", tk_star }, { "_", tk_underscore }, { "-", tk_minus }, { "+", tk_plus }, { "=", tk_equal }, { ">", tk_ntSubst }, { "?", tk_question }, { "(", tk_leftParen }, { ")", tk_rightParen }, { "[", tk_leftSquare }, { "]", tk_rightSquare }, // IUPAC nucleotide single-letter codes // Note: the HGVS website doesn't spell out whether ambiguous codes are OK, // but ClinVar uses them so let's support them. { "A", tk_seq1Letter }, { "C", tk_seq1Letter }, { "G", tk_seq1Letter }, { "T", tk_seq1Letter }, { "B", tk_seq1Letter }, { "D", tk_seq1Letter }, { "H", tk_seq1Letter }, { "K", tk_seq1Letter }, { "M", tk_seq1Letter }, { "N", tk_seq1Letter }, { "R", tk_seq1Letter }, { "S", tk_seq1Letter }, { "V", tk_seq1Letter }, { "W", tk_seq1Letter }, { "Y", tk_seq1Letter }, { NULL, tk_undefined }, }; static struct hdTokenProps rnaTokens[] = // Tokens that may appear after "r." (RNA) -- maddeningly, these may use CDS or noncoding // numbering, unspecified, I guess you have to guess from NM_ vs NR_ but who knows about LRG. // I hope this isn't used too often. { // Operators { "ins", tk_ins }, { "del", tk_del }, { "dup", tk_dup }, { "inv", tk_inv }, { "*", tk_star }, { "_", tk_underscore }, { "-", tk_minus }, { "+", tk_plus }, { "=", tk_equal }, { ">", tk_ntSubst }, { "?", tk_question }, { "(", tk_leftParen }, { ")", tk_rightParen }, { "[", tk_leftSquare }, { "]", tk_rightSquare }, // IUPAC RNA bases: lower-case, u instead of t // Note: the HGVS website doesn't spell out whether ambiguous codes are OK, // but ClinVar uses them so let's support them. { "a", tk_seq1Letter }, { "c", tk_seq1Letter }, { "g", tk_seq1Letter }, { "u", tk_seq1Letter }, { "b", tk_seq1Letter }, { "d", tk_seq1Letter }, { "h", tk_seq1Letter }, { "k", tk_seq1Letter }, { "m", tk_seq1Letter }, { "n", tk_seq1Letter }, { "r", tk_seq1Letter }, { "s", tk_seq1Letter }, { "v", tk_seq1Letter }, { "w", tk_seq1Letter }, { "y", tk_seq1Letter }, { NULL, tk_undefined }, }; static struct hdTokenProps proteinTokens[] = // Tokens that may appear after "p." (protein) { // Operators { "ins", tk_ins }, { "del", tk_del }, { "dup", tk_dup }, { "inv", tk_inv }, { "fs", tk_fs }, { "_", tk_underscore }, { "=", tk_equal }, { "?", tk_question }, { "(", tk_leftParen }, { ")", tk_rightParen }, { "*", tk_star }, // IUPAC protein 3-letter codes { "Ala", tk_seq3Letter }, { "Arg", tk_seq3Letter }, { "Asn", tk_seq3Letter }, { "Asp", tk_seq3Letter }, { "Cys", tk_seq3Letter }, { "Gln", tk_seq3Letter }, { "Glu", tk_seq3Letter }, { "Gly", tk_seq3Letter }, { "His", tk_seq3Letter }, { "Ile", tk_seq3Letter }, { "Leu", tk_seq3Letter }, { "Lys", tk_seq3Letter }, { "Met", tk_seq3Letter }, { "Phe", tk_seq3Letter }, { "Pro", tk_seq3Letter }, { "Ser", tk_seq3Letter }, { "Thr", tk_seq3Letter }, { "Trp", tk_seq3Letter }, { "Tyr", tk_seq3Letter }, { "Val", tk_seq3Letter }, { "Ter", tk_seq3Letter }, // IUPAC protein (and/or nucleotide ACGT) single-letter codes { "A", tk_seq1Letter }, { "R", tk_seq1Letter }, { "N", tk_seq1Letter }, { "D", tk_seq1Letter }, { "C", tk_seq1Letter }, { "Q", tk_seq1Letter }, { "E", tk_seq1Letter }, { "G", tk_seq1Letter }, { "H", tk_seq1Letter }, { "I", tk_seq1Letter }, { "L", tk_seq1Letter }, { "K", tk_seq1Letter }, { "M", tk_seq1Letter }, { "F", tk_seq1Letter }, { "P", tk_seq1Letter }, { "S", tk_seq1Letter }, { "T", tk_seq1Letter }, { "W", tk_seq1Letter }, { "Y", tk_seq1Letter }, { "V", tk_seq1Letter }, { "X", tk_seq1Letter }, { NULL, tk_undefined }, }; static struct hdTokenProps *hdTokenAlphabetFromType(enum hgvsSeqType type) /* Return a set of acceptable tokens for type. */ { struct hdTokenProps *props = NULL; switch (type) { case hgvstCoding: props = codingTokens; break; case hgvstGenomic: case hgvstMito: case hgvstNoncoding: props = dnaTokens; break; case hgvstProtein: props = proteinTokens; break; case hgvstRna: props = rnaTokens; break; default: errAbort("Unrecognized hgvsSeqType %d", type); } return props; } struct hdTokenizer /* Tokenizer on string input, with properties of current token, that can reuse the current token. */ { char *input; // Text to be parsed (after current token) struct hdTokenProps *tokenAlphabet; // Array of all tokens that we might encounter enum hdTokenType type; // Type of the current token char *string; // String value of current token int sAlloc; // Allocated string size bool reuse; // TRUE if want to give back this token bool eof; // TRUE at end of input }; // Expand sAlloc as needed, this many bytes at a time: #define HDT_BUF_SIZE 128 static void hdtReuse(struct hdTokenizer *hdt) /* Reuse current token, i.e. don't advance on the next call to hdtNext(). */ { hdt->reuse = TRUE; } static void hdtAdvance(struct hdTokenizer *hdt, int len) /* Copy len chars of hdt->input into hdt->string & null-terminate, resizing if necessary, * and advance hdt->input by len. */ { if (len >= hdt->sAlloc) { while (len >= hdt->sAlloc) hdt->sAlloc += HDT_BUF_SIZE; hdt->string = needMoreMem(hdt->string, 0, hdt->sAlloc); } memcpy(hdt->string, hdt->input, len); hdt->string[len] = '\0'; hdt->input += len; } static boolean hdtEof(struct hdTokenizer *hdt) /* If input is empty, update hdt to EOF state and return TRUE; */ { if (isEmpty(hdt->input)) { hdt->eof = TRUE; hdt->type = tk_eof; hdt->string[0] = '\0'; return TRUE; } return FALSE; } static boolean hdtNextFromAlphabet(struct hdTokenizer *hdt) /* If hdt->input starts with a static string in the tokenAlphabet, update hdt and return TRUE */ { if (hdtEof(hdt)) return FALSE; int i; for (i = 0; isNotEmpty(hdt->tokenAlphabet[i].value); i++) { struct hdTokenProps *props = &hdt->tokenAlphabet[i]; if (startsWith(props->value, hdt->input)) { hdtAdvance(hdt, strlen(props->value)); hdt->type = props->type; return TRUE; } } hdt->type = tk_undefined; return FALSE; } static void dyPrependf(struct dyString *dyError, char *format, ...) /* Add a warning to *beginning* of dyError so that when we pop the call stack, higher * level errors appear before lower lever errors. */ { char oldErr[dyStringLen(dyError)+1]; safecpy(oldErr, sizeof(oldErr), dyStringContents(dyError)); dyStringClear(dyError); va_list args; va_start(args, format); dyStringVaPrintf(dyError, format, args); va_end(args); if (isNotEmpty(oldErr)) { char newErr[dyStringLen(dyError)+1]; safecpy(newErr, sizeof(newErr), dyStringContents(dyError)); dyStringClear(dyError); dyStringAppend(dyError, newErr); dyStringAppend(dyError, "; "); dyStringAppend(dyError, oldErr); } } static boolean hdtNext(struct hdTokenizer *hdt, struct dyString *dyError, char *context) /* If hdt can advance to the next token in input, then make that hdt's current token and return * TRUE. EOF is considered valid. If input starts with something unexpected then return FALSE. */ { if (hdt->reuse) { // Stay on current token. hdt->reuse = FALSE; return TRUE; } if (hdtEof(hdt)) return TRUE; if (isdigit(hdt->input[0])) { // Handle numbers separately from static tokens. int len = 1; while (isdigit(hdt->input[len])) len++; hdtAdvance(hdt, len); hdt->type = tk_int; return TRUE; } else { char *start = hdt->input; if (hdtNextFromAlphabet(hdt)) { enum hdTokenType firstType = hdt->type; if (firstType == tk_seq1Letter || firstType == tk_seq3Letter) { // Accumulate sequence into string instead of returning one base at a time. boolean isValid = TRUE; while (hdt->type == firstType && isValid) isValid = hdtNextFromAlphabet(hdt); // We advanced to something that is not sequence, so we'll need to back up. int len = hdt->input - start; if (isValid) len -= strlen(hdt->string); hdt->input = start; hdt->type = firstType; hdt->eof = FALSE; hdtAdvance(hdt, len); } return TRUE; } } dyPrependf(dyError, "bad token %s at '%s'", context, hdt->input); return FALSE; } static struct hdTokenizer *hdTokenizerNew(char *input, struct hdTokenProps *tokenAlphabet) /* Return a new tokenizer on input that will use tokenAlphabet. Call htdNext on the returned * tokenizer to get the first token (and the next...) loaded into tokenizer. Note that this * does not clone input, it simply scans it. */ { struct hdTokenizer *hdt; AllocVar(hdt); hdt->input = input; hdt->tokenAlphabet = tokenAlphabet; hdt->sAlloc = HDT_BUF_SIZE; hdt->string = needMem(hdt->sAlloc); return hdt; } static void hdTokenizerFree(struct hdTokenizer **pHdt) /* Free up what we allocated (which does not include hdt->input). */ { struct hdTokenizer *hdt = *pHdt; if (hdt != NULL) { freeMem(hdt->string); freez(pHdt); } } static boolean parseNonNegNumOrRange(struct hdTokenizer *hdt, int *retMin, int *retMax, struct dyString *dyError) /* Parse a nonnegative number or _-separated range of nonneg numbers, possibly '?' for * unknown value. * Set retMin and retMax to -1 for '?', otherwise the min and max number. */ { int min = 0, max = 0; // Expect a number if (hdt->type == tk_int || hdt->type == tk_question) { min = max = (hdt->type == tk_int) ? atoi(hdt->string) : HGVS_LENGTH_UNSPECIFIED; if (!hdtNext(hdt, dyError, "after number")) return FALSE; if (hdt->type == tk_underscore) { // range of counts -- get next number if (!hdtNext(hdt, dyError, "after underscore")) return FALSE; if (hdt->type == tk_int || hdt->type == tk_question) max = (hdt->type == tk_int) ? atoi(hdt->string) : HGVS_LENGTH_UNSPECIFIED; else { dyPrependf(dyError, "expecting nonnegative numeric range but got non-number after '%d_' at '%s'", min, hdt->string); return FALSE; } } else // just a number, not a range; don't consume this token hdtReuse(hdt); } else { dyPrependf(dyError, "expecting nonnegative number or range (or '?') but got '%s'", hdt->string); return FALSE; } *retMin = min; *retMax = max; return TRUE; } static boolean parseNonNegNumOrRangeMaybeParens(struct hdTokenizer *hdt, int *retMin, int *retMax, struct dyString *dyError) { /* Parse a nonnegative number or _-separated range of nonneg numbers, possibly '?' for * unknown value, possibly enclosed in parens. * Set retMin and retMax to -1 for '?', otherwise the min and max number. */ if (hdt->type == tk_leftParen) { if (!hdtNext(hdt, dyError, "after '('")) return FALSE; if (! parseNonNegNumOrRange(hdt, retMin, retMax, dyError)) { dyPrependf(dyError, "after '('"); return FALSE; } if (!hdtNext(hdt, dyError, "after '(' followed by number/range")) return FALSE; if (hdt->type != tk_rightParen) { dyPrependf(dyError, "expected ')' after number/range, got '%s'", hdt->string); return FALSE; } return TRUE; } else return parseNonNegNumOrRange(hdt, retMin, retMax, dyError); } static struct hgvsChange *hgvsChangeNewRepeat(char *seq, int min, int max) /* Allocate and return an hgvsChange specifying a repeated sequence. */ { struct hgvsChange *change; AllocVar(change); change->type = hgvsctRepeat; change->value.repeat.seq = seq; change->value.repeat.min = min; change->value.repeat.max = max; return change; } static struct hgvsChange *parseNtRepeat(char *seq, struct hdTokenizer *hdt, struct dyString *dyError) /* Optionally preceded by seq, enclosed in square brackets and/or parens, a number or range. */ { // Make sure we start with '(' or '[' enum hdTokenType lType = hdt->type; if (lType != tk_leftParen && lType != tk_leftSquare) errAbort("parseNtRepeat: expected to start with ( or ["); if (!hdtNext(hdt, dyError, "after '[' or '(' in repeat")) return NULL; // Next should be a repeat count or range of counts int min, max; if (! parseNonNegNumOrRangeMaybeParens(hdt, &min, &max, dyError)) { dyPrependf(dyError, "in brackets for repeat"); return NULL; } // Now matching ')' or ']' if (!hdtNext(hdt, dyError, "after '[' or '(' followed by number")) return NULL; struct hgvsChange *change = NULL; if ((lType == tk_leftParen && hdt->type == tk_rightParen) || (lType == tk_leftSquare && hdt->type == tk_rightSquare)) { change = hgvsChangeNewRepeat(seq, min, max); } else dyPrependf(dyError, "mismatching brackets/parens for repeat"); return change; } static struct hgvsChange *hgvsChangeNewSimple(enum hgvsChangeType type, char *ref, int refLength, char *alt, int altLength) /* Allocate and return a simple change with optional ref and optional alt (or just lengths). * ref and alt are not cloned here, they must be allocated by caller! */ { struct hgvsChange *change; AllocVar(change); change->type = type; change->value.refAlt.refSequence = ref; change->value.refAlt.refLength = refLength; if (alt == NULL && altLength != HGVS_LENGTH_UNSPECIFIED) { change->value.refAlt.altType = hgvsstLength; change->value.refAlt.altValue.length = altLength; } else { change->value.refAlt.altType = hgvsstSimple; change->value.refAlt.altValue.seq = alt; } return change; } static struct hgvsChange *parseNtIupac(struct hdTokenizer *hdt, struct dyString *dyError) /* IUPAC sequence should be followed by '>' (subst) or '=' (no change) or '[' (repeat) */ { if (hdt->type != tk_seq1Letter) errAbort("parseNtIupac: expected to start with nucleotide IUPAC sequence"); char *ref = cloneString(hdt->string); if (!hdtNext(hdt, dyError, "after IUPAC sequence")) return NULL; if (hdt->type == tk_ntSubst) { if (!hdtNext(hdt, dyError, "after '>'")) return NULL; else if (hdt->type == tk_seq1Letter) return hgvsChangeNewSimple(hgvsctSubst, ref, HGVS_LENGTH_UNSPECIFIED, cloneString(hdt->string), HGVS_LENGTH_UNSPECIFIED); else dyPrependf(dyError, "expected IUPAC sequence after '>', got '%s'", hdt->string); } else if (hdt->type == tk_equal) { // Reference sequence was asserted, so store that for comparison with actual reference. return hgvsChangeNewSimple(hgvsctNoChange, ref, HGVS_LENGTH_UNSPECIFIED, NULL, HGVS_LENGTH_UNSPECIFIED); } else if (hdt->type == tk_leftSquare || hdt->type == tk_leftParen) return parseNtRepeat(ref, hdt, dyError); else dyPrependf(dyError, "something unexpected after IUPAC sequence"); return NULL; } static struct hgvsChange *parseNtDelOrDupOrInv(struct hdTokenizer *hdt, struct dyString *dyError) /* del, dup or inv may be followed by an asserted reference sequence, a redundant length (ignore), * or a new change. */ { struct hgvsChange *change = NULL; if (hdt->type != tk_del && hdt->type != tk_dup && hdt->type != tk_inv) errAbort("parseNtDelOrDupOrInv: expecting 'del', 'dup' or 'inv'"); enum hdTokenType opType = hdt->type; if (!hdtNext(hdt, dyError, "after del, dup or inv")) return NULL; char *ref = NULL; int refLength = HGVS_LENGTH_UNSPECIFIED; if (hdt->type == tk_seq1Letter) ref = cloneString(hdt->string); else if (hdt->type == tk_int) refLength = atoi(hdt->string); else // Something else -- hopefully the beginning of the next change or eof hdtReuse(hdt); enum hgvsChangeType type = hgvsctUndefined; if (opType == tk_del) type = hgvsctDel; else if (opType == tk_dup) type = hgvsctDup; else if (opType == tk_inv) type = hgvsctInv; change = hgvsChangeNewSimple(type, ref, refLength, NULL, HGVS_LENGTH_UNSPECIFIED); return change; } static enum hgvsSeqType hgvsSeqTypeFromString(char changeType) /* Translate [cgmnrp] to enum hgvsSeqType */ { switch (changeType) { // Not using break below, because return does the job case 'c': return hgvstCoding; case 'g': return hgvstGenomic; case 'm': return hgvstMito; case 'n': return hgvstNoncoding; case 'r': return hgvstRna; case 'p': return hgvstProtein; default: errAbort("hgvsSeqTypeFromString: unrecognized input '%c'", changeType); } return hgvstUndefined; } // After ins or con, it's possible to have a nested term to specify sequence from outside the ref: #define nestedExp "^([A-Z_]+[0-9]+(\\.[0-9]+)?):([cgmnr])\\.([0-9]+)_([0-9]+)(inv[0-9]*)?" // 1 ....................... accession // 2 ......... dot suffix? // 3 .... type // 4 .... start // 5 .... end // 6 ... inv? static struct hgvsChange *hgvsChangeNewNested(struct hdTokenizer *hdt, regmatch_t *substrs) /* hdt's current token is inv or con; next up is a nested HGVS term with components in * substrs as outlined for nestedExp above. Return an hgvsChange with nested hgvsVariant. */ { struct hgvsChange *change; AllocVar(change); change->type = (hdt->type == tk_ins) ? hgvsctIns : hgvsctCon; struct hgvsChangeRefAlt *refAlt = &change->value.refAlt; refAlt->refLength = HGVS_LENGTH_UNSPECIFIED; refAlt->altType = hgvsstNestedTerm; struct hgvsVariant *nestedTerm = &refAlt->altValue.term; nestedTerm->seqAcc = regexSubstringClone(hdt->input, substrs[1]); char changeType = hdt->input[substrs[3].rm_so]; // [cgmnrp] nestedTerm->type = hgvsSeqTypeFromString(changeType); nestedTerm->start1 = regexSubstringInt(hdt->input, substrs[4]); nestedTerm->end = regexSubstringInt(hdt->input, substrs[5]); nestedTerm->changes = regexSubstringClone(hdt->input, substrs[6]); return change; } static struct hgvsChange *parseNtInsOrCon(struct hdTokenizer *hdt, struct dyString *dyError) /* ins or con may be followed by a nested term (accession, type dot, pos range, possibly inv) * so do some extra lookahead. ins can also be followed by numbers in parens (or missing the * parens...) to indicate a number of unknown bases (Ns); if so then we do an end run around the * tokenizer. */ { struct hgvsChange *change = NULL; if (hdt->type != tk_ins && hdt->type != tk_con) errAbort("parseNtInsOrCon: expecting 'ins' or 'con', got '%s' (followed by '%s')", hdt->string, hdt->input); // Don't advance hdt until we first look ahead for accession:... regmatch_t substrs[8]; if (regexMatchSubstr(hdt->input, nestedExp, substrs, ArraySize(substrs))) { change = hgvsChangeNewNested(hdt, substrs); // Skip over everything that nestedRegex matched, and update hdt->type. hdtAdvance(hdt, substrs[0].rm_eo - substrs[0].rm_so); hdt->type = (regexSubstrMatched(substrs[6])) ? tk_inv : tk_seq1Letter; } else { // Tokenize and parse the usual way -- a sequence or length might be asserted after this. enum hdTokenType type = hdt->type; char context[hdt->sAlloc + 256]; safef(context, sizeof(context), "after '%s'", hdt->string); if (!hdtNext(hdt, dyError, context)) return NULL; char *insSeq = NULL; int insLen = HGVS_LENGTH_UNSPECIFIED; if (hdt->type == tk_seq1Letter) insSeq = cloneString(hdt->string); else if (hdt->type == tk_int || hdt->type == tk_leftParen) { // Length only, should be in parens but ClinVar just puts number. int min, max; if (! parseNonNegNumOrRangeMaybeParens(hdt, &min, &max, dyError)) { dyPrependf(dyError, "expecting number (optionally in parens) after '%s'", (type == tk_ins) ? "ins" : "con"); return NULL; } if (min != max) { //#*** A range is valid -- it means the insertion is from some other position range in this sequence! And it can be inverted. dyPrependf(dyError, "expecting number after %s but got range (%d_%d)", (type == tk_ins) ? "ins" : "con", min, max); return NULL; } insLen = max; } if (insSeq || insLen != HGVS_LENGTH_UNSPECIFIED) { enum hgvsChangeType changeType = (type == tk_ins) ? hgvsctIns : hgvsctCon; change = hgvsChangeNewSimple(changeType, NULL, HGVS_LENGTH_UNSPECIFIED, insSeq, insLen); } } return change; } struct hgvsChange *hgvsParseNucleotideChange(char *changeStr, enum hgvsSeqType type, struct dyString *dyError) /* Return a parse tree of a coding HGVS sequence change (the part after the position range) * possibly followed by additional changes. If there are any parse errors, they will be * appended to dyError. */ { struct hdTokenProps *alphabet = hdTokenAlphabetFromType(type); struct hdTokenizer *hdt = hdTokenizerNew(changeStr, alphabet); struct hgvsChange *changeList = NULL; char context[strlen(changeStr) + 256]; safef(context, sizeof(context), "at start of '%s'", changeStr); if (!hdtNext(hdt, dyError, context)) { hdTokenizerFree(&hdt); return NULL; } while (! hdt->eof) { struct hgvsChange *change = NULL; if (hdt->type == tk_seq1Letter) change = parseNtIupac(hdt, dyError); else if (hdt->type == tk_leftSquare || hdt->type == tk_leftParen) change = parseNtRepeat(NULL, hdt, dyError); else if (hdt->type == tk_del || hdt->type == tk_dup || hdt->type == tk_inv) change = parseNtDelOrDupOrInv(hdt, dyError); else if (hdt->type == tk_ins || hdt->type == tk_con) change = parseNtInsOrCon(hdt, dyError); + else if (hdt->type == tk_equal) + change = hgvsChangeNewSimple(hgvsctNoChange, "", HGVS_LENGTH_UNSPECIFIED, + NULL, HGVS_LENGTH_UNSPECIFIED); if (change != NULL) slAddHead(&changeList, change); else { dyPrependf(dyError, "couldn't parse HGVS nucleotide change string '%s'", changeStr); break; } if (!hdtNext(hdt, dyError, "after valid change")) { break; } } slReverse(&changeList); hdTokenizerFree(&hdt); return changeList; }