63bcbd28b1571a7efdb6628f80751481f3bd98f2 max Tue Apr 30 01:15:16 2024 -0700
adding n. notation to HGVS parser, refs #25982

diff --git src/hg/lib/hgHgvs.c src/hg/lib/hgHgvs.c
index 504172b..e14faef 100644
--- src/hg/lib/hgHgvs.c
+++ src/hg/lib/hgHgvs.c
@@ -376,31 +376,38 @@
 // 2... original start AA
 // 3... 1-based start position
 // 4................ optional range sep and AA+pos
 // 5... original end AA
 // 6... 1-based end position
 // 7..... change description
 
 // As above but omitting the protein change
 #define pseudoHgvsGeneSymbolProtPosExp "^" geneSymbolExp maybePDot posIntExp "\\)?"
 // 0.......................... whole matching string
 // 1................... gene symbol
 // 2..... 1-based position
 
 // Gene symbol, maybe punctuation, and a clear "c." position (and possibly change)
-#define pseudoHgvsGeneSympolCDotPosExp "^" geneSymbolExp "[: ]+" hgvsCDotPosExp
+#define pseudoHgvsGeneSymbolCDotPosExp "^" geneSymbolExp "[: ]+" hgvsCDotPosExp
+// 0..................................................... whole matching string
+// 1................... gene symbol
+// 2..... optional beginning of position exp
+// 3..... beginning of position exp
+
+// Gene symbol, maybe punctuation, and a clear "n." position (and possibly change)
+#define pseudoHgvsGeneSymbolNDotPosExp "^" geneSymbolExp "[: ]+" hgvsNDotPosExp
 // 0..................................................... whole matching string
 // 1................... gene symbol
 // 2..... optional beginning of position exp
 // 3..... beginning of position exp
 
 static struct hgvsVariant *hgvsParseGDotPos(char *term)
 /* If term is parseable as an HGVS NC_...g. term, return the parsed representation, else NULL. */
 {
 struct hgvsVariant *hgvs = NULL;
 int accIx = 1;
 int startPosIx = 3;
 int endPosIx = 5;
 int changeIx = 6;
 boolean matches = FALSE;
 regmatch_t substrs[17];
@@ -878,30 +885,52 @@
     seq = aaSeq->dna;
     }
 }
 return seq;
 }
 
 static char refBaseForNp(char *db, char *npAcc, int pos)
 // Get the amino acid base in NP_'s sequence at 1-based offset pos.
 {
 char *seq = getProteinSeq(db, npAcc);
 char base = seq ? seq[pos-1] : '\0';
 freeMem(seq);
 return base;
 }
 
+static struct hgvsVariant* hgvsPseudoToRealHgvs(regmatch_t substrs[], char* term, char* db, int geneSymbolIx, char *prefix)
+/* rewrite a pseudo-HGVS with symbol: to a real NM_xxx HGVS */
+{
+int len = substrs[geneSymbolIx].rm_eo - substrs[geneSymbolIx].rm_so;
+char geneSymbol[len+1];
+safencpy(geneSymbol, sizeof(geneSymbol), term, len);
+char *nmAcc = nmForGeneSymbol(db, geneSymbol);
+struct hgvsVariant *hgvs = NULL;
+if (isNotEmpty(nmAcc))
+    {
+    // Make it a real HGVS term with the NM and pass that on to the usual parser.
+    int descStartIx = regexSubstrMatched(substrs[2]) ? 2 : 3;
+    char *description = term + substrs[descStartIx].rm_so;
+    struct dyString *nmTerm = dyStringCreate("%s(%s):%s%s",
+                                             nmAcc, geneSymbol, prefix, description);
+    hgvs = hgvsParseTerm(nmTerm->string);
+    dyStringFree(&nmTerm);
+    freeMem(nmAcc);
+    }
+return hgvs;
+}
+
 struct hgvsVariant *hgvsParsePseudoHgvs(char *db, char *term)
 /* Attempt to parse things that are not strict HGVS, but that people might intend as HGVS:
  * Return a list of struct hgvsVariant that may be what was intended */
 // Note: this doesn't support non-coding gene symbol terms (which should have nt alleles)
 {
 struct hgvsVariant *hgvs = NULL;
 regmatch_t substrs[11];
 int geneSymbolIx = 1;
 boolean isSubst;
 if ((isSubst = regexMatchSubstr(term, pseudoHgvsNMPDotSubstExp, substrs, ArraySize(substrs))) ||
     regexMatchSubstr(term, pseudoHgvsNMPDotRangeExp, substrs, ArraySize(substrs)))
     {
     // User gave an NM_ accession but a protein change -- swap in the right NP_.
     int nmAccIx = 1;
@@ -970,48 +999,34 @@
             char *npAcc = npItem->name;
             // Only position was provided, no change. Look up ref base and make a synonymous subst
             // so it's parseable HGVS.
             int posIx = 2;
             int pos = regexSubstringInt(term, substrs[posIx]);
             char refBase = refBaseForNp(db, npAcc, pos);
             struct dyString *npTerm = dyStringCreate("%s(%s):p.%c%d=",
                                                      npAcc, geneSymbol, refBase, pos);
             struct hgvsVariant *newTerm = hgvsParseTerm(npTerm->string);
             if (newTerm)
                 slAddHead(&hgvs, newTerm);
             dyStringFree(&npTerm);
             }
         }
     }
-else if (regexMatchSubstr(term, pseudoHgvsGeneSympolCDotPosExp, substrs, ArraySize(substrs)))
-    {
-    int len = substrs[geneSymbolIx].rm_eo - substrs[geneSymbolIx].rm_so;
-    char geneSymbol[len+1];
-    safencpy(geneSymbol, sizeof(geneSymbol), term, len);
-    char *nmAcc = nmForGeneSymbol(db, geneSymbol);
-    if (isNotEmpty(nmAcc))
-        {
-        // Make it a real HGVS term with the NM and pass that on to the usual parser.
-        int descStartIx = regexSubstrMatched(substrs[2]) ? 2 : 3;
-        char *description = term + substrs[descStartIx].rm_so;
-        struct dyString *nmTerm = dyStringCreate("%s(%s):c.%s",
-                                                 nmAcc, geneSymbol, description);
-        hgvs = hgvsParseTerm(nmTerm->string);
-        dyStringFree(&nmTerm);
-        freeMem(nmAcc);
-        }
-    }
+else if (regexMatchSubstr(term, pseudoHgvsGeneSymbolCDotPosExp, substrs, ArraySize(substrs)))
+    hgvs = hgvsPseudoToRealHgvs(substrs, term, db, geneSymbolIx, "c.");
+else if (regexMatchSubstr(term, pseudoHgvsGeneSymbolNDotPosExp, substrs, ArraySize(substrs)))
+    hgvs = hgvsPseudoToRealHgvs(substrs, term, db, geneSymbolIx, "n.");
 else if (regexMatchSubstr(term, pseudoHgvsChrGDotExp, substrs, ArraySize(substrs)))
     {
     int chrIx = 1;
     int startPosIx = 3;
     int endPosIx = 5;
     int changeIx = 6;
     AllocVar(hgvs);
     hgvs->type = hgvstGenomic;
     hgvs->seqAcc = regexSubstringClone(term, substrs[chrIx]);
     hgvs->start1 = regexSubstringInt(term, substrs[startPosIx]);
     if (regexSubstrMatched(substrs[endPosIx]))
         hgvs->end = regexSubstringInt(term, substrs[endPosIx]);
     else
         hgvs->end = hgvs->start1;
     hgvs->changes = regexSubstringClone(term, substrs[changeIx]);