8de038378e49e6e1ba6b2de24bddc334c8b75d99 angie Tue May 9 11:51:59 2017 -0700 Adding support for HGVS m. and n. terms in position/search. refs #19423 diff --git src/hg/lib/hgHgvs.c src/hg/lib/hgHgvs.c index 66a0c48..4ba4bfa 100644 --- src/hg/lib/hgHgvs.c +++ src/hg/lib/hgHgvs.c @@ -62,69 +62,89 @@ // ... optional anchor "-" or "*" // ... mandatory first number // ....... optional offset separator and offset // ... offset separator // ... offset number // ............... optional range separator and complex num // ... optional anchor "-" or "*" // ... first number // ....... optional offset separator and offset // ... offset separator // ... offset number // It's pretty common for users to omit the '.' so if it's missing but the rest of the regex fits, // roll with it. #define hgvsCDotPosExp "c\\.?" hgvsCdsPosExp -#define hgvsGDotPosExp "g\\.?" hgvsNtPosExp -#define hgvsMDotPosExp "m\\.?" hgvsNtPosExp +#define hgvsGMDotPosExp "([gm])\\.?" hgvsNtPosExp #define hgvsNDotPosExp "n\\.?" hgvsNtPosExp #define hgvsRDotPosExp "r\\.?" hgvsNtPosExp // Protein substitution regex #define aa3Exp "Ala|Arg|Asn|Asp|Cys|Gln|Glu|Gly|His|Ile|Leu|Lys|Met|Phe|Pro|Ser|Thr|Trp|Tyr|Val|Ter" #define hgvsAminoAcidExp "[ARNDCQEGHILKMFPSTWYVX*]|" aa3Exp #define hgvsAminoAcidSubstExp "(" hgvsAminoAcidExp ")" posIntExp "(" hgvsAminoAcidExp "|=)" #define hgvsPDotSubstExp "p\\." hgvsAminoAcidSubstExp // ... // original sequence // ...... // 1-based position // ... // replacement sequence // Protein range (or just single pos) regex #define hgvsAaRangeExp "(" hgvsAminoAcidExp ")" posIntExp "(_(" hgvsAminoAcidExp ")" posIntExp ")?(.*)" #define hgvsPDotRangeExp "p\\." hgvsAaRangeExp // original start AA ... // 1-based start position ... // optional range sep and AA+pos ..................................... // original end AA ... // 1-based end position ... // change description ... // Complete HGVS term regexes combining sequence identifiers and change operations #define hgvsFullRegex(seq, op) "^" seq ":" op -#define hgvsRefSeqNCGDotPosExp hgvsFullRegex(versionedRefSeqNCExp, hgvsGDotPosExp) +#define hgvsRefSeqNCGDotPosExp hgvsFullRegex(versionedRefSeqNCExp, hgvsGMDotPosExp) #define hgvsRefSeqNCGDotExp hgvsRefSeqNCGDotPosExp "(.*)" // substring numbering: // 0..................................... whole matching string // 1................. accession and optional dot version // 2........ optional dot version // 3...... (n/a) optional gene symbol in ()s // 4.... (n/a) optional gene symbol -// 5... 1-based start position -// 6.... optional range separator and end position -// 7... 1-based end position -// 8.... change description +// 5. g or m +// 6.. 1-based start position +// 7... optional range separator and end position +// 8.. 1-based end position +// 9... change description + +#define hgvsLrgNDotExp hgvsFullRegex(lrgTranscriptExp, hgvsNDotPosExp) "(.*)" +// 0..................................... whole matching string +// 1................... LRG transcript +// 2...... 1-based start position +// 3.......... optional range separator and end position +// 4..... 1-based end position +// 5... change description + +#define hgvsRefSeqNMNDotExp hgvsFullRegex(versionedRefSeqNMExp, hgvsNDotPosExp) "(.*)" +// substring numbering: +// 0..................................... whole matching string +// 1................. accession and optional dot version +// 2........ optional dot version +// 3...... (n/a) optional gene symbol in ()s +// 4.... (n/a) optional gene symbol +// 5.. 1-based start position +// 6... optional range separator and end position +// 7.. 1-based end position +// 8... change description #define hgvsLrgCDotPosExp hgvsFullRegex(lrgTranscriptExp, hgvsCDotPosExp) #define hgvsLrgCDotExp hgvsLrgCDotPosExp "(.*)" // substring numbering: // 0..................................................... whole matching string // 1................... LRG transcript // 2... optional anchor "-" or "*" // 3... mandatory first number // 4....... optional offset separator and offset // 5... offset separator // 6... offset number // 7............... optional range sep and complex num // 8... optional anchor "-" or "*" // 9... first number // 10....... optional offset separator and offset @@ -233,46 +253,90 @@ // Gene symbol, maybe punctuation, and a clear "c." position (and possibly change) #define pseudoHgvsGeneSympolCDotPosExp "^" geneSymbolExp "[: ]+" hgvsCDotPosExp // 0..................................................... whole matching string // 1................... gene symbol // 2..... optional beginning of position exp // 3..... beginning of position exp static struct hgvsVariant *hgvsParseGDotPos(char *term) /* If term is parseable as an HGVS NC_...g. term, return the parsed representation, else NULL. */ { struct hgvsVariant *hgvs = NULL; regmatch_t substrs[17]; if (regexMatchSubstr(term, hgvsRefSeqNCGDotExp, substrs, ArraySize(substrs))) { int accIx = 1; - int startPosIx = 5; - int endPosIx = 7; - int changeIx = 8; + int startPosIx = 6; + int endPosIx = 8; + int changeIx = 9; AllocVar(hgvs); + // HGVS recommendation May 2017: replace m. with g. since mitochondrion is genomic too hgvs->type = hgvstGenomic; hgvs->seqAcc = regexSubstringClone(term, substrs[accIx]); hgvs->start1 = regexSubstringInt(term, substrs[startPosIx]); if (regexSubstrMatched(substrs[endPosIx])) hgvs->end = regexSubstringInt(term, substrs[endPosIx]); else hgvs->end = hgvs->start1; hgvs->changes = regexSubstringClone(term, substrs[changeIx]); } return hgvs; } +static struct hgvsVariant *hgvsParseNDotPos(char *term) +/* If term is parseable as an HGVS n. term, return the parsed representation, otherwise NULL. */ +{ +struct hgvsVariant *hgvs = NULL; +boolean matches = FALSE; +int accIx = 1; +int startPosIx = 2; +int endPosIx = 4; +int changeIx = 5; +// The LRG accession regex has only one substring but the RefSeq acc regex has 4, so that +// affects all substring offsets after the accession. +int refSeqExtra = 3; +int geneSymbolIx = -1; +regmatch_t substrs[10]; +if (regexMatchSubstr(term, hgvsLrgNDotExp, substrs, ArraySize(substrs))) + { + matches = TRUE; + } +else if (regexMatchSubstr(term, hgvsRefSeqNMNDotExp, substrs, ArraySize(substrs))) + { + matches = TRUE; + geneSymbolIx = 4; + startPosIx += refSeqExtra; + endPosIx += refSeqExtra; + changeIx += refSeqExtra; + } +if (matches) + { + AllocVar(hgvs); + hgvs->type = hgvstNoncoding; + hgvs->seqAcc = regexSubstringClone(term, substrs[accIx]); + if (geneSymbolIx >= 0 && regexSubstrMatched(substrs[geneSymbolIx])) + hgvs->seqGeneSymbol = regexSubstringClone(term, substrs[geneSymbolIx]); + hgvs->start1 = regexSubstringInt(term, substrs[startPosIx]); + if (regexSubstrMatched(substrs[endPosIx])) + hgvs->end = regexSubstringInt(term, substrs[endPosIx]); + else + hgvs->end = hgvs->start1; + hgvs->changes = regexSubstringClone(term, substrs[changeIx]); + } +return hgvs; +} + static void extractComplexNum(char *term, regmatch_t *substrs, int substrOffset, boolean *retIsUtr3, int *retPos, int *retOffset) /* Extract matches from substrs starting at substrOffset to parse a complex number * description into anchor type, 1-based position and optional offset. */ { int anchorIx = substrOffset; int posIx = substrOffset + 1; int offsetOpIx = substrOffset + 3; int offsetIx = substrOffset + 4; // Determine what startPos is relative to, based on optional anchor and optional offset char anchor[16]; // string length 0 or 1 regexSubstringCopy(term, substrs[anchorIx], anchor, sizeof(anchor)); char offsetOp[16]; // string length 0 or 1 regexSubstringCopy(term, substrs[offsetOpIx], offsetOp, sizeof(offsetOp)); *retIsUtr3 = (anchor[0] == '*'); @@ -400,30 +464,32 @@ hgvs->start1 = regexSubstringInt(term, substrs[startPosIx]); if (regexSubstrMatched(substrs[endPosIx])) hgvs->end = regexSubstringInt(term, substrs[endPosIx]); else hgvs->end = hgvs->start1; } return hgvs; } struct hgvsVariant *hgvsParseTerm(char *term) /* If term is a parseable form of HGVS, return the parsed representation, otherwise NULL. * This does not check validity of accessions, coordinates or alleles. */ { struct hgvsVariant *hgvs = hgvsParseCDotPos(term); if (hgvs == NULL) + hgvs = hgvsParseNDotPos(term); +if (hgvs == NULL) hgvs = hgvsParsePDotSubst(term); if (hgvs == NULL) hgvs = hgvsParsePDotRange(term); if (hgvs == NULL) hgvs = hgvsParseGDotPos(term); //#*** TODO: MDot, RDot, NDot return hgvs; } static boolean dbHasNcbiRefSeq(char *db) /* Test whether NCBI's RefSeq alignments are available in db. */ { // hTableExists() caches results so this shouldn't make for loads of new SQL queries if called // more than once. return (hTableExists(db, "ncbiRefSeq") && hTableExists(db, "ncbiRefSeqPsl") &&