a476f750f106fbcde8fedeb6488389d728f46396 angie Thu Jun 29 11:26:34 2017 -0700 Oops, HGVS n. terms can have intron offsets too (like c. except for no UTR [-*] indicators). fixes #19702 diff --git src/hg/lib/hgHgvs.c src/hg/lib/hgHgvs.c index ec5efb2..e10ef5a 100644 --- src/hg/lib/hgHgvs.c +++ src/hg/lib/hgHgvs.c @@ -38,147 +38,166 @@ #define optionalGeneSymbolExp "(\\(" geneSymbolExp "\\))?" #define versionedAccPrefixExp(p) "(" p "_[0-9]+(\\.[0-9]+)?)" optionalGeneSymbolExp // ........................ accession and optional dot version // ........... optional dot version // ...... optional gene symbol in ()s // .... optional gene symbol #define versionedRefSeqNCExp versionedAccPrefixExp("NC") #define versionedRefSeqNGExp versionedAccPrefixExp("NG") #define versionedRefSeqNMExp versionedAccPrefixExp("NM") #define versionedRefSeqNPExp versionedAccPrefixExp("NP") #define versionedRefSeqNMRExp versionedAccPrefixExp("N[MR]") // Nucleotide position regexes // (c. = CDS, g. = genomic, m. = mitochondrial, n.= non-coding RNA, r. = RNA) #define posIntExp "([0-9]+)" -#define hgvsNtPosExp posIntExp "(_" posIntExp ")?" +#define hgvsGenoPosExp posIntExp "(_" posIntExp ")?" // ...... 1-based start position // ............. optional range separator and end position // ...... 1-based end position -// Now for a regex that can handle positions like "26" or "*80" or "-24+75_-24+92"... -#define anchorExp "([-*])?" +// n. terms can have exonic anchor base and intron offset for both start and end: #define offsetExp "([-+])" +// c. terms may also have a UTR indicator before the anchor base (- for UTR5, * for UTR3) +#define anchorExp "([-*])?" #define complexNumExp anchorExp posIntExp "(" offsetExp posIntExp ")?" #define hgvsCdsPosExp complexNumExp "(_" complexNumExp ")?" -// ... optional anchor "-" or "*" -// ... mandatory first number +// ... optional UTR anchor "-" or "*" +// ... mandatory 1-based start anchor base offset // ....... optional offset separator and offset -// ... offset separator -// ... offset number -// ............... optional range separator and complex num -// ... optional anchor "-" or "*" -// ... first number +// ... intron offset separator +// ... intron offset number +// ............... optional range separator and complex end +// ... optional UTR anchor "-" or "*" +// ... 1-based end anchor base offset // ....... optional offset separator and offset -// ... offset separator -// ... offset number +// ... intron offset separator +// ... intron offset number // It's pretty common for users to omit the '.' so if it's missing but the rest of the regex fits, // roll with it. #define hgvsCDotPosExp "c\\.?" hgvsCdsPosExp -#define hgvsGMDotPosExp "([gm])\\.?" hgvsNtPosExp -#define hgvsNDotPosExp "n\\.?" hgvsNtPosExp -#define hgvsRDotPosExp "r\\.?" hgvsNtPosExp +#define hgvsGMDotPosExp "([gm])\\.?" hgvsGenoPosExp +#define hgvsNDotPosExp "n\\.?" hgvsCdsPosExp +// Not supporting RDot at this point because r. terms may use either n. or c. numbering! +// #define hgvsRDotPosExp "r\\.?" hgvsCdsPosExp // Protein substitution regex #define aa3Exp "Ala|Arg|Asn|Asp|Cys|Gln|Glu|Gly|His|Ile|Leu|Lys|Met|Phe|Pro|Ser|Thr|Trp|Tyr|Val|Ter" #define hgvsAminoAcidExp "[ARNDCQEGHILKMFPSTWYVX*]|" aa3Exp #define hgvsAminoAcidSubstExp "(" hgvsAminoAcidExp ")" posIntExp "(" hgvsAminoAcidExp "|=)" #define hgvsPDotSubstExp "p\\." hgvsAminoAcidSubstExp // ... // original sequence // ...... // 1-based position // ... // replacement sequence // Protein range (or just single pos) regex #define hgvsAaRangeExp "(" hgvsAminoAcidExp ")" posIntExp "(_(" hgvsAminoAcidExp ")" posIntExp ")?(.*)" #define hgvsPDotRangeExp "p\\." hgvsAaRangeExp // original start AA ... // 1-based start position ... // optional range sep and AA+pos ..................................... // original end AA ... // 1-based end position ... // change description ... // Complete HGVS term regexes combining sequence identifiers and change operations -#define hgvsFullRegex(seq, op) "^" seq ":" op +#define hgvsFullRegex(seq, op) "^" seq "[ :]+" op #define hgvsRefSeqNCGDotPosExp hgvsFullRegex(versionedRefSeqNCExp, hgvsGMDotPosExp) #define hgvsRefSeqNCGDotExp hgvsRefSeqNCGDotPosExp "(.*)" // substring numbering: // 0..................................... whole matching string // 1................. accession and optional dot version // 2........ optional dot version // 3...... (n/a) optional gene symbol in ()s // 4.... (n/a) optional gene symbol // 5. g or m // 6.. 1-based start position // 7... optional range separator and end position // 8.. 1-based end position // 9... change description -#define hgvsLrgNDotExp hgvsFullRegex(lrgTranscriptExp, hgvsNDotPosExp) "(.*)" -// 0..................................... whole matching string +#define hgvsLrgCDotPosExp hgvsFullRegex(lrgTranscriptExp, hgvsCDotPosExp) +#define hgvsLrgCDotExp hgvsLrgCDotPosExp "(.*)" +// substring numbering: +// 0..................................................... whole matching string // 1................... LRG transcript -// 2...... 1-based start position -// 3.......... optional range separator and end position -// 4..... 1-based end position -// 5... change description +// 2... optional UTR anchor "-" or "*" +// 3... mandatory first number +// 4....... optional offset separator and offset +// 5... offset separator +// 6... offset number +// 7............... optional range sep and complex num +// 8... optional UTR anchor "-" or "*" +// 9... first number +// 10....... optional offset separator and offset +// 11... offset separator +// 12... offset number +// 13........ sequence change -#define hgvsRefSeqNMRNDotExp hgvsFullRegex(versionedRefSeqNMRExp, hgvsNDotPosExp) "(.*)" +#define hgvsRefSeqNMCDotPosExp hgvsFullRegex(versionedRefSeqNMExp, hgvsCDotPosExp) +#define hgvsRefSeqNMCDotExp hgvsRefSeqNMCDotPosExp "(.*)" // substring numbering: -// 0..................................... whole matching string -// 1................. accession and optional dot version +// 0............................................................... whole matching string +// 1............... acc & optional dot version // 2........ optional dot version -// 3...... (n/a) optional gene symbol in ()s -// 4.... (n/a) optional gene symbol -// 5.. 1-based start position -// 6... optional range separator and end position -// 7.. 1-based end position -// 8... change description +// 3..... optional gene sym in ()s +// 4... optional gene symbol +// 5... optional UTR anchor +// 6... mandatory first number +// 7....... optional offset +// 8... offset separator +// 9... offset number +// 10............... optional range +// 11... optional UTR anchor +// 12... first number +// 13....... optional offset +// 14... offset separator +// 15... offset number +// 16........ sequence change -#define hgvsLrgCDotPosExp hgvsFullRegex(lrgTranscriptExp, hgvsCDotPosExp) -#define hgvsLrgCDotExp hgvsLrgCDotPosExp "(.*)" +#define hgvsLrgNDotExp hgvsFullRegex(lrgTranscriptExp, hgvsNDotPosExp) "(.*)" // substring numbering: // 0..................................................... whole matching string // 1................... LRG transcript -// 2... optional anchor "-" or "*" +// 2... n/a 4 n.: UTR anchor "-" or "*" // 3... mandatory first number // 4....... optional offset separator and offset // 5... offset separator // 6... offset number // 7............... optional range sep and complex num -// 8... optional anchor "-" or "*" +// 8... n/a 4 n.: UTR anchor "-" or "*" // 9... first number // 10....... optional offset separator and offset // 11... offset separator // 12... offset number // 13........ sequence change -#define hgvsRefSeqNMCDotPosExp hgvsFullRegex(versionedRefSeqNMExp, hgvsCDotPosExp) -#define hgvsRefSeqNMCDotExp hgvsRefSeqNMCDotPosExp "(.*)" +#define hgvsRefSeqNMRNDotExp hgvsFullRegex(versionedRefSeqNMRExp, hgvsNDotPosExp) "(.*)" // substring numbering: // 0............................................................... whole matching string // 1............... acc & optional dot version // 2........ optional dot version // 3..... optional gene sym in ()s // 4... optional gene symbol -// 5... optional anchor +// 5... n/a 4 n.: UTR anchor // 6... mandatory first number // 7....... optional offset // 8... offset separator // 9... offset number // 10............... optional range -// 11... optional anchor +// 11... n/a 4 n.: UTR anchor // 12... first number // 13....... optional offset // 14... offset separator // 15... offset number // 16........ sequence change #define hgvsRefSeqNPPDotSubstExp hgvsFullRegex(versionedRefSeqNPExp, hgvsPDotSubstExp) // substring numbering: // 0..................................................... whole matching string // 1................ accession and optional dot version // 2..... optional dot version // 3..... optional gene symbol in ()s // 4... optional gene symbol // 5..... original sequence // 6...... 1-based position @@ -270,142 +289,108 @@ int changeIx = 9; AllocVar(hgvs); // HGVS recommendation May 2017: replace m. with g. since mitochondrion is genomic too hgvs->type = hgvstGenomic; hgvs->seqAcc = regexSubstringClone(term, substrs[accIx]); hgvs->start1 = regexSubstringInt(term, substrs[startPosIx]); if (regexSubstrMatched(substrs[endPosIx])) hgvs->end = regexSubstringInt(term, substrs[endPosIx]); else hgvs->end = hgvs->start1; hgvs->changes = regexSubstringClone(term, substrs[changeIx]); } return hgvs; } -static struct hgvsVariant *hgvsParseNDotPos(char *term) -/* If term is parseable as an HGVS n. term, return the parsed representation, otherwise NULL. */ -{ -struct hgvsVariant *hgvs = NULL; -boolean matches = FALSE; -int accIx = 1; -int startPosIx = 2; -int endPosIx = 4; -int changeIx = 5; -// The LRG accession regex has only one substring but the RefSeq acc regex has 4, so that -// affects all substring offsets after the accession. -int refSeqExtra = 3; -int geneSymbolIx = -1; -regmatch_t substrs[10]; -if (regexMatchSubstr(term, hgvsLrgNDotExp, substrs, ArraySize(substrs))) - { - matches = TRUE; - } -else if (regexMatchSubstr(term, hgvsRefSeqNMRNDotExp, substrs, ArraySize(substrs))) - { - matches = TRUE; - geneSymbolIx = 4; - startPosIx += refSeqExtra; - endPosIx += refSeqExtra; - changeIx += refSeqExtra; - } -if (matches) - { - AllocVar(hgvs); - hgvs->type = hgvstNoncoding; - hgvs->seqAcc = regexSubstringClone(term, substrs[accIx]); - if (geneSymbolIx >= 0 && regexSubstrMatched(substrs[geneSymbolIx])) - hgvs->seqGeneSymbol = regexSubstringClone(term, substrs[geneSymbolIx]); - hgvs->start1 = regexSubstringInt(term, substrs[startPosIx]); - if (regexSubstrMatched(substrs[endPosIx])) - hgvs->end = regexSubstringInt(term, substrs[endPosIx]); - else - hgvs->end = hgvs->start1; - hgvs->changes = regexSubstringClone(term, substrs[changeIx]); - } -return hgvs; -} - static void extractComplexNum(char *term, regmatch_t *substrs, int substrOffset, boolean *retIsUtr3, int *retPos, int *retOffset) /* Extract matches from substrs starting at substrOffset to parse a complex number * description into anchor type, 1-based position and optional offset. */ { int anchorIx = substrOffset; int posIx = substrOffset + 1; int offsetOpIx = substrOffset + 3; int offsetIx = substrOffset + 4; // Determine what startPos is relative to, based on optional anchor and optional offset char anchor[16]; // string length 0 or 1 regexSubstringCopy(term, substrs[anchorIx], anchor, sizeof(anchor)); char offsetOp[16]; // string length 0 or 1 regexSubstringCopy(term, substrs[offsetOpIx], offsetOp, sizeof(offsetOp)); *retIsUtr3 = (anchor[0] == '*'); *retPos = regexSubstringInt(term, substrs[posIx]); // Is pos negative? if (anchor[0] == '-') *retPos = -*retPos; // Grab offset, if there is one. if (isNotEmpty(offsetOp)) { *retOffset = regexSubstringInt(term, substrs[offsetIx]); if (offsetOp[0] == '-') *retOffset = -*retOffset; } } -static struct hgvsVariant *hgvsParseCDotPos(char *term) +static struct hgvsVariant *hgvsParseCNDotPos(char *term) /* If term is parseable as an HGVS CDS term, return the parsed representation, otherwise NULL. */ { struct hgvsVariant *hgvs = NULL; +boolean isNoncoding = FALSE; boolean matches = FALSE; int accIx = 1; int startAnchorIx = 2; int endAnchorIx = 8; int endPosIx = 9; int changeIx = 13; // The LRG accession regex has only one substring but the RefSeq acc regex has 4, so that // affects all substring offsets after the accession. int refSeqExtra = 3; int geneSymbolIx = -1; regmatch_t substrs[17]; -if (regexMatchSubstr(term, hgvsLrgCDotExp, substrs, ArraySize(substrs))) +if (regexMatchSubstr(term, hgvsLrgCDotExp, substrs, ArraySize(substrs)) || + (isNoncoding = regexMatchSubstr(term, hgvsLrgNDotExp, substrs, ArraySize(substrs)))) { matches = TRUE; } -else if (regexMatchSubstr(term, hgvsRefSeqNMCDotExp, substrs, ArraySize(substrs))) +else if (regexMatchSubstr(term, hgvsRefSeqNMCDotExp, substrs, ArraySize(substrs)) || + (isNoncoding = regexMatchSubstr(term, hgvsRefSeqNMRNDotExp, substrs, ArraySize(substrs)))) { matches = TRUE; geneSymbolIx = 4; startAnchorIx += refSeqExtra; endAnchorIx += refSeqExtra; endPosIx += refSeqExtra; changeIx += refSeqExtra; } if (matches) { AllocVar(hgvs); - hgvs->type = hgvstCoding; + hgvs->type = isNoncoding ? hgvstNoncoding : hgvstCoding; hgvs->seqAcc = regexSubstringClone(term, substrs[accIx]); extractComplexNum(term, substrs, startAnchorIx, &hgvs->startIsUtr3, &hgvs->start1, &hgvs->startOffset); + if (isNoncoding && (hgvs->startIsUtr3 || hgvs->start1 < 0)) + warn("hgvsParseCNDotPos: noncoding term '%s' appears to start in UTR, " + "not applicable for noncoding", term); if (geneSymbolIx >= 0 && regexSubstrMatched(substrs[geneSymbolIx])) hgvs->seqGeneSymbol = regexSubstringClone(term, substrs[geneSymbolIx]); if (regexSubstrMatched(substrs[endPosIx])) { extractComplexNum(term, substrs, endAnchorIx, &hgvs->endIsUtr3, &hgvs->end, &hgvs->endOffset); + if (isNoncoding && (hgvs->endIsUtr3 || hgvs->end < 0)) + warn("hgvsParseCNDotPos: noncoding term '%s' appears to end in UTR, " + "not applicable for noncoding", term); } else { hgvs->end = hgvs->start1; hgvs->endIsUtr3 = hgvs->startIsUtr3; hgvs->endOffset = hgvs->startOffset; } hgvs->changes = regexSubstringClone(term, substrs[changeIx]); } return hgvs; } static struct hgvsVariant *hgvsParsePDotSubst(char *term) /* If term is parseable as an HGVS protein substitution, return the parsed representation, * otherwise NULL. */ @@ -462,40 +447,37 @@ if (regexSubstrMatched(substrs[geneSymbolIx])) hgvs->seqGeneSymbol = regexSubstringClone(term, substrs[geneSymbolIx]); hgvs->start1 = regexSubstringInt(term, substrs[startPosIx]); if (regexSubstrMatched(substrs[endPosIx])) hgvs->end = regexSubstringInt(term, substrs[endPosIx]); else hgvs->end = hgvs->start1; } return hgvs; } struct hgvsVariant *hgvsParseTerm(char *term) /* If term is a parseable form of HGVS, return the parsed representation, otherwise NULL. * This does not check validity of accessions, coordinates or alleles. */ { -struct hgvsVariant *hgvs = hgvsParseCDotPos(term); -if (hgvs == NULL) - hgvs = hgvsParseNDotPos(term); +struct hgvsVariant *hgvs = hgvsParseCNDotPos(term); if (hgvs == NULL) hgvs = hgvsParsePDotSubst(term); if (hgvs == NULL) hgvs = hgvsParsePDotRange(term); if (hgvs == NULL) hgvs = hgvsParseGDotPos(term); -//#*** TODO: MDot, RDot, NDot return hgvs; } static boolean dbHasNcbiRefSeq(char *db) /* Test whether NCBI's RefSeq alignments are available in db. */ { // hTableExists() caches results so this shouldn't make for loads of new SQL queries if called // more than once. return (hTableExists(db, "ncbiRefSeq") && hTableExists(db, "ncbiRefSeqPsl") && hTableExists(db, "ncbiRefSeqCds") && hTableExists(db, "ncbiRefSeqLink") && hTableExists(db, "ncbiRefSeqPepTable") && hTableExists(db, "seqNcbiRefSeq") && hTableExists(db, "extNcbiRefSeq")); } static char *npForGeneSymbol(char *db, char *geneSymbol)