3712ae6f8f6f212bae33ecc5499feeea598d83e4 angie Mon Aug 14 11:52:33 2017 -0700 HGVS pos/search: support parentheses around p. terms. Also watch out for start>end in hgvsValidateGene. refs #19968 note-6 diff --git src/hg/lib/hgHgvs.c src/hg/lib/hgHgvs.c index e00c269..2bed75a 100644 --- src/hg/lib/hgHgvs.c +++ src/hg/lib/hgHgvs.c @@ -73,38 +73,38 @@ // ... intron offset separator // ... intron offset number // It's pretty common for users to omit the '.' so if it's missing but the rest of the regex fits, // roll with it. #define hgvsCDotPosExp "c\\.?" hgvsCdsPosExp #define hgvsGMDotPosExp "([gm])\\.?" hgvsGenoPosExp #define hgvsNDotPosExp "n\\.?" hgvsCdsPosExp // Not supporting RDot at this point because r. terms may use either n. or c. numbering! // #define hgvsRDotPosExp "r\\.?" hgvsCdsPosExp // Protein substitution regex #define aa3Exp "Ala|Arg|Asn|Asp|Cys|Gln|Glu|Gly|His|Ile|Leu|Lys|Met|Phe|Pro|Ser|Thr|Trp|Tyr|Val|Ter" #define hgvsAminoAcidExp "[ARNDCQEGHILKMFPSTWYVX*]|" aa3Exp #define hgvsAminoAcidSubstExp "(" hgvsAminoAcidExp ")" posIntExp "(" hgvsAminoAcidExp "|=)" -#define hgvsPDotSubstExp "p\\." hgvsAminoAcidSubstExp +#define hgvsPDotSubstExp "p\\.\\(?" hgvsAminoAcidSubstExp "\\)?" // ... // original sequence // ...... // 1-based position // ... // replacement sequence // Protein range (or just single pos) regex #define hgvsAaRangeExp "(" hgvsAminoAcidExp ")" posIntExp "(_(" hgvsAminoAcidExp ")" posIntExp ")?(.*)" -#define hgvsPDotRangeExp "p\\." hgvsAaRangeExp +#define hgvsPDotRangeExp "p\\.\\(?" hgvsAaRangeExp "\\)?" // original start AA ... // 1-based start position ... // optional range sep and AA+pos ..................................... // original end AA ... // 1-based end position ... // change description ... // Complete HGVS term regexes combining sequence identifiers and change operations #define hgvsFullRegex(seq, op) "^" seq "[ :]+" op #define hgvsRefSeqNCGDotPosExp hgvsFullRegex(versionedRefSeqNCExp, hgvsGMDotPosExp) #define hgvsRefSeqNCGDotExp hgvsRefSeqNCGDotPosExp "(.*)" // substring numbering: // 0..................................... whole matching string // 1................. accession and optional dot version @@ -218,76 +218,79 @@ // 9... 1-based end position // 10..... change description // Pseudo-HGVS in common usage // g. with "chr" ID: #define pseudoHgvsChrGDotExp hgvsFullRegex("(chr[0-9A-Za-z_]+)", hgvsGMDotPosExp) "(.*)" // substring numbering: // 0..................................... whole matching string // 1........... chr... // 2. g or m // 3...... 1-based start position // 4....... optional range separator and end position // 5..... 1-based end position // 6.... change description + // Sometimes users give an NM_ accession, but a protein change. -#define pseudoHgvsNMPDotSubstExp "^" versionedRefSeqNMExp "[ :]+p?\\.?" hgvsAminoAcidSubstExp +#define maybePDot "[ :]+p?\\.?\\(?" +#define pseudoHgvsNMPDotSubstExp "^" versionedRefSeqNMExp maybePDot hgvsAminoAcidSubstExp "\\)?" // substring numbering: // 0..................................................... whole matching string // 1............... acc & optional dot version // 2........ optional dot version // 3..... optional gene sym in ()s // 4... optional gene symbol // 5..... original sequence // 6...... 1-based position // 7...... replacement sequence -#define pseudoHgvsNMPDotRangeExp "^" versionedRefSeqNMExp "[ :]+p?\\.?" hgvsAaRangeExp +#define pseudoHgvsNMPDotRangeExp "^" versionedRefSeqNMExp maybePDot hgvsAaRangeExp "\\)?" + // substring numbering: // 0..................................................... whole matching string // 1............... acc & optional dot version // 2........ optional dot version // 3..... optional gene sym in ()s // 4... optional gene symbol // 5... original start AA // 6... 1-based start position // 7.......... optional range sep and AA+pos // 8... original end AA // 9... 1-based end position // 10.... change description // Common: gene symbol followed by space and/or punctuation followed by protein change -#define pseudoHgvsGeneSymbolProtSubstExp "^" geneSymbolExp "[ :]+p?\\.?" hgvsAminoAcidSubstExp +#define pseudoHgvsGeneSymbolProtSubstExp "^" geneSymbolExp maybePDot hgvsAminoAcidSubstExp "\\)?" // 0..................................................... whole matching string // 1................... gene symbol // 2..... original sequence // 3...... 1-based position // 4...... replacement sequence -#define pseudoHgvsGeneSymbolProtRangeExp "^" geneSymbolExp "[ :]+p?\\.?" hgvsAaRangeExp +#define pseudoHgvsGeneSymbolProtRangeExp "^" geneSymbolExp maybePDot hgvsAaRangeExp "\\)?" // 0..................................................... whole matching string // 1................... gene symbol // 2... original start AA // 3... 1-based start position // 4................ optional range sep and AA+pos // 5... original end AA // 6... 1-based end position // 7..... change description // As above but omitting the protein change -#define pseudoHgvsGeneSymbolProtPosExp "^" geneSymbolExp "[ :]+p?\\.?" posIntExp +#define pseudoHgvsGeneSymbolProtPosExp "^" geneSymbolExp maybePDot posIntExp "\\)?" // 0.......................... whole matching string // 1................... gene symbol // 2..... 1-based position // Gene symbol, maybe punctuation, and a clear "c." position (and possibly change) #define pseudoHgvsGeneSympolCDotPosExp "^" geneSymbolExp "[: ]+" hgvsCDotPosExp // 0..................................................... whole matching string // 1................... gene symbol // 2..... optional beginning of position exp // 3..... beginning of position exp static struct hgvsVariant *hgvsParseGDotPos(char *term) /* If term is parseable as an HGVS NC_...g. term, return the parsed representation, else NULL. */ { @@ -1002,44 +1005,50 @@ return FALSE; boolean coordsOK = FALSE; char *accSeq = (hgvs->type == hgvstProtein ? getProteinSeq(db, acc) : getCdnaSeq(db, acc)); if (accSeq) { // By convention, foundAcc is always versionless because it's accompanied by foundVersion. if (retFoundAcc) *retFoundAcc = cloneFirstWordByDelimiter(acc, '.'); int seqLen = strlen(accSeq); int start, end; hgvsStartEndToZeroBasedHalfOpen(hgvs, &start, &end); if (hgvs->type == hgvstCoding) { struct genbankCds cds; coordsOK = getCds(db, acc, &cds); - if (coordsOK && retDiffRefAllele) + if (coordsOK) { start += (hgvs->startIsUtr3 ? cds.end : cds.start); - if (hgvs->startOffset == 0 && start >= 0 && start < seqLen) + end += (hgvs->endIsUtr3 ? cds.end : cds.start); + if (start > end) + coordsOK = FALSE; + else if (retDiffRefAllele && hgvs->startOffset == 0 && start >= 0 && start <= seqLen) checkRefAllele(hgvs, start, accSeq, retDiffRefAllele); } } else { + if (start <= end) + { coordsOK = TRUE; if (retDiffRefAllele && hgvs->startOffset == 0 && start >= 0 && start < seqLen) checkRefAllele(hgvs, start, accSeq, retDiffRefAllele); } } + } freeMem(accSeq); freeMem(acc); return coordsOK; } boolean hgvsValidate(char *db, struct hgvsVariant *hgvs, char **retFoundAcc, int *retFoundVersion, char **retDiffRefAllele) /* Return TRUE if hgvs coords are within the bounds of the sequence for hgvs->seqAcc. * Note: Transcript terms may contain coords outside the bounds (upstream, intron, downstream) so * those can't be checked without mapping the term to the genome; this returns TRUE if seq is found. * If retFoundAcc is not NULL, set it to our local accession (which may be missing the .version * of hgvs->seqAcc) or NULL if we can't find any match. * If retFoundVersion is not NULL and hgvs->seqAcc has a version number (e.g. NM_005957.4), * set retFoundVersion to our version from latest GenBank, otherwise 0 (no version for LRG). * If coords are OK and retDiffRefAllele is not NULL: if our sequence at the coords