b17ae6ec773d69464d8c526d71040441c30da265 chmalee Wed Mar 12 12:54:15 2025 -0700 Implicitly convert ENST accessions to ENSP accessions during hgvs search if the hgvs term is a p. term, refs #35335 diff --git src/hg/lib/hgHgvs.c src/hg/lib/hgHgvs.c index e14faefa20d..52c71d065c2 100644 --- src/hg/lib/hgHgvs.c +++ src/hg/lib/hgHgvs.c @@ -350,30 +350,53 @@ #define pseudoHgvsNMPDotRangeExp "^" versionedRefSeqNMExp maybePDot hgvsAaRangeExp "\\)?" // substring numbering: // 0..................................................... whole matching string // 1............... acc & optional dot version // 2........ optional dot version // 3..... optional gene sym in ()s // 4... optional gene symbol // 5... original start AA // 6... 1-based start position // 7.......... optional range sep and AA+pos // 8... original end AA // 9... 1-based end position // 10.... change description +// Sometimes users give an ENST_ accession, but a protein change. +#define pseudoHgvsENSPDotSubstExp "^" ensTranscriptExp maybePDot hgvsAminoAcidSubstExp "\\)?" +// substring numbering: +// 0..................................... whole matching string +// 1..................................... ENS transcript ID including optional lift suffix +// 2... optional non-human species code e.g. MUS for mouse +// 3..... original sequence +// 4...... 1-based position +// 5...... replacement sequence + +#define pseudoHgvsENSPDotRangeExp "^" ensTranscriptExp maybePDot hgvsAaRangeExp "\\)?" + +// substring numbering: +// 0..................................... whole matching string +// 1..................................... ENS transcript ID including optional lift suffix +// 2... optional non-human species code e.g. MUS for mouse +// 3... original start AA +// 4... 1-based start position +// 5.......... optional range sep and AA+pos +// 6... original end AA +// 7... 1-based end position +// 8.... change description + // Common: gene symbol followed by space and/or punctuation followed by protein change #define pseudoHgvsGeneSymbolProtSubstExp "^" geneSymbolExp maybePDot hgvsAminoAcidSubstExp "\\)?" // 0..................................................... whole matching string // 1................... gene symbol // 2..... original sequence // 3...... 1-based position // 4...... replacement sequence #define pseudoHgvsGeneSymbolProtRangeExp "^" geneSymbolExp maybePDot hgvsAaRangeExp "\\)?" // 0..................................................... whole matching string // 1................... gene symbol // 2... original start AA // 3... 1-based start position // 4................ optional range sep and AA+pos // 5... original end AA @@ -744,30 +767,49 @@ refLinkTable, trimmed); } else return NULL; struct sqlConnection *conn = hAllocConn(db); char *npAcc = sqlQuickString(conn, query); // if user passed in old versioned transcript, check in *Old tables: if (npAcc == NULL && hDbHasNcbiRefSeqHistorical(db) && strchr(nmAcc, '.')) { sqlSafef(query, sizeof(query), "select protAcc from ncbiRefSeqLinkHistorical where id = '%s'", nmAcc); npAcc = sqlQuickString(conn, query); } hFreeConn(&conn); return npAcc; } +static char *enspForEnst(char *db, char *enstAcc) +/* Given an ENST_ accession, look up and return its ENSP_ accession; if not found return NULL. */ +{ +if (trackHubDatabase(db)) + return NULL; +char *txAcc = NULL; +struct sqlConnection *conn = hAllocConn(db); +char *attrsTable = hFindLatestGencodeTableConn(conn, "Attrs"); +if (attrsTable && hHasField(db, attrsTable, "proteinId")) + { + char query[2048]; + sqlSafef(query, sizeof(query), "select proteinId from %s where transcriptId = '%s'", + attrsTable, enstAcc); + txAcc = sqlQuickString(conn, query); + } +hFreeConn(&conn); +return txAcc; +} + static char *lrgProteinToTx(char *db, char *protAcc) /* Return the LRG_ transcript accession for protAcc. Each LRG_NpM has a corresponding LRG_NtM. */ { int accLen = strlen(protAcc); char txAcc[accLen+1]; safecpy(txAcc, sizeof(txAcc), protAcc); char *p = strrchr(txAcc, 'p'); if (p) *p = 't'; return cloneString(txAcc); } static char *gencodeProteinToTx(char *db, char *protAcc) /* Return the ENS*T transcript accession for protAcc, or NULL if not found. */ { @@ -916,31 +958,53 @@ dyStringFree(&nmTerm); freeMem(nmAcc); } return hgvs; } struct hgvsVariant *hgvsParsePseudoHgvs(char *db, char *term) /* Attempt to parse things that are not strict HGVS, but that people might intend as HGVS: * Return a list of struct hgvsVariant that may be what was intended */ // Note: this doesn't support non-coding gene symbol terms (which should have nt alleles) { struct hgvsVariant *hgvs = NULL; regmatch_t substrs[11]; int geneSymbolIx = 1; boolean isSubst; -if ((isSubst = regexMatchSubstr(term, pseudoHgvsNMPDotSubstExp, +if ((isSubst = regexMatchSubstr(term, pseudoHgvsENSPDotSubstExp, + substrs, ArraySize(substrs))) || + regexMatchSubstr(term, pseudoHgvsENSPDotRangeExp, substrs, ArraySize(substrs))) + { + // User gave an ENST_ accession but a protein change -- swap in the right ENSP_. + int ensAccIx = 1; + int len = substrs[ensAccIx].rm_eo - substrs[ensAccIx].rm_so; + char ensAcc[len+1]; + safencpy(ensAcc, sizeof(ensAcc), term, len); + char *enspAcc = enspForEnst(db, ensAcc); + if (isNotEmpty(enspAcc)) + { + // Make it a real HGVS term with the ENSP and pass that on to the usual parser. + int descStartIx = 3; + char *description = term + substrs[descStartIx].rm_so; + struct dyString *enspTerm; + enspTerm = dyStringCreate("%s:p.%s", enspAcc, description); + hgvs = hgvsParseTerm(enspTerm->string); + dyStringFree(&enspTerm); + freeMem(enspAcc); + } + } +else if ((isSubst = regexMatchSubstr(term, pseudoHgvsNMPDotSubstExp, substrs, ArraySize(substrs))) || regexMatchSubstr(term, pseudoHgvsNMPDotRangeExp, substrs, ArraySize(substrs))) { // User gave an NM_ accession but a protein change -- swap in the right NP_. int nmAccIx = 1; int geneSymbolIx = 4; int len = substrs[nmAccIx].rm_eo - substrs[nmAccIx].rm_so; char nmAcc[len+1]; safencpy(nmAcc, sizeof(nmAcc), term, len); char *npAcc = npForNm(db, nmAcc); if (isNotEmpty(npAcc)) { // Make it a real HGVS term with the NP and pass that on to the usual parser. int descStartIx = 5; char *description = term + substrs[descStartIx].rm_so;