b17ae6ec773d69464d8c526d71040441c30da265
chmalee
  Wed Mar 12 12:54:15 2025 -0700
Implicitly convert ENST accessions to ENSP accessions during hgvs search if the hgvs term is a p. term, refs #35335

diff --git src/hg/lib/hgHgvs.c src/hg/lib/hgHgvs.c
index e14faefa20d..52c71d065c2 100644
--- src/hg/lib/hgHgvs.c
+++ src/hg/lib/hgHgvs.c
@@ -350,30 +350,53 @@
 #define pseudoHgvsNMPDotRangeExp "^" versionedRefSeqNMExp maybePDot hgvsAaRangeExp "\\)?"
 
 // substring numbering:
 //      0.....................................................  whole matching string
 //      1...............                                        acc & optional dot version
 //             2........                                        optional dot version
 //                       3.....                                 optional gene sym in ()s
 //                        4...                                  optional gene symbol
 //                                 5...                         original start AA
 //                                       6...                   1-based start position
 //                                           7..........        optional range sep and AA+pos
 //                                             8...             original end AA
 //                                                  9...        1-based end position
 //                                                      10....  change description
 
+// Sometimes users give an Ensembl transcript accession (ENST...), but a protein change (p.).
+#define pseudoHgvsENSPDotSubstExp "^" ensTranscriptExp maybePDot hgvsAminoAcidSubstExp "\\)?"
+// substring numbering:
+//      0  whole matching string
+//      1  ENS transcript ID including optional lift suffix
+//      2  optional non-human species code e.g. MUS for mouse
+//      3  original sequence
+//      4  1-based position
+//      5  replacement sequence
+
+#define pseudoHgvsENSPDotRangeExp "^" ensTranscriptExp maybePDot hgvsAaRangeExp "\\)?"
+
+// substring numbering:
+//      0.....................................  whole matching string
+//      1.....................................  ENS transcript ID including optional lift suffix
+//         2...                                 optional non-human species code e.g. MUS for mouse
+//                 3...                         original start AA
+//                       4...                   1-based start position
+//                           5..........        optional range sep and AA+pos
+//                             6...             original end AA
+//                                  7...        1-based end position
+//                                       8....  change description
+
 // Common: gene symbol followed by space and/or punctuation followed by protein change
 #define pseudoHgvsGeneSymbolProtSubstExp "^" geneSymbolExp maybePDot hgvsAminoAcidSubstExp "\\)?"
 //      0.....................................................  whole matching string
 //      1...................                                    gene symbol
 //                                   2.....                     original sequence
 //                                           3......            1-based position
 //                                                     4......  replacement sequence
 
 #define pseudoHgvsGeneSymbolProtRangeExp "^" geneSymbolExp maybePDot hgvsAaRangeExp "\\)?"
 //      0.....................................................  whole matching string
 //      1...................                                    gene symbol
 //                                 2...                         original start AA
 //                                       3...                   1-based start position
 //                                           4................  optional range sep and AA+pos
 //                                             5...             original end AA
@@ -744,30 +767,49 @@
              refLinkTable, trimmed);
     }
 else return NULL;
 struct sqlConnection *conn = hAllocConn(db);
 char *npAcc = sqlQuickString(conn, query);
 // if user passed in old versioned transcript, check in *Old tables:
 if (npAcc == NULL && hDbHasNcbiRefSeqHistorical(db) && strchr(nmAcc, '.'))
     {
     sqlSafef(query, sizeof(query), "select protAcc from ncbiRefSeqLinkHistorical where id = '%s'", nmAcc);
     npAcc = sqlQuickString(conn, query);
     }
 hFreeConn(&conn);
 return npAcc;
 }
 
+static char *enspForEnst(char *db, char *enstAcc)
+/* Return the proteinId that db's GENCODE Attrs table lists for transcript enstAcc, else NULL. */
+{
+if (trackHubDatabase(db))
+    return NULL;   // no table lookup is attempted for track hub databases
+struct sqlConnection *conn = hAllocConn(db);
+char *gencodeAttrs = hFindLatestGencodeTableConn(conn, "Attrs");
+char *proteinId = NULL;
+if (gencodeAttrs != NULL && hHasField(db, gencodeAttrs, "proteinId"))
+    {
+    char sql[2048];
+    sqlSafef(sql, sizeof(sql),
+             "select proteinId from %s where transcriptId = '%s'", gencodeAttrs, enstAcc);
+    proteinId = sqlQuickString(conn, sql);
+    }
+hFreeConn(&conn);
+return proteinId;
+}
+
 static char *lrgProteinToTx(char *db, char *protAcc)
 /* Return the LRG_ transcript accession for protAcc.  Each LRG_NpM has a corresponding LRG_NtM. */
 {
 int accLen = strlen(protAcc);
 char txAcc[accLen+1];
 safecpy(txAcc, sizeof(txAcc), protAcc);
 char *p = strrchr(txAcc, 'p');
 if (p)
     *p = 't';
 return cloneString(txAcc);
 }
 
 static char *gencodeProteinToTx(char *db, char *protAcc)
 /* Return the ENS*T transcript accession for protAcc, or NULL if not found. */
 {
@@ -916,31 +958,53 @@
     dyStringFree(&nmTerm);
     freeMem(nmAcc);
     }
 return hgvs;
 }
 
 struct hgvsVariant *hgvsParsePseudoHgvs(char *db, char *term)
 /* Attempt to parse things that are not strict HGVS, but that people might intend as HGVS:
  * Return a list of struct hgvsVariant that may be what was intended  */
 // Note: this doesn't support non-coding gene symbol terms (which should have nt alleles)
 {
 struct hgvsVariant *hgvs = NULL;
 regmatch_t substrs[11];
 int geneSymbolIx = 1;
 boolean isSubst;
-if ((isSubst = regexMatchSubstr(term, pseudoHgvsNMPDotSubstExp,
+if ((isSubst = regexMatchSubstr(term, pseudoHgvsENSPDotSubstExp,
+                                     substrs, ArraySize(substrs))) ||
+         regexMatchSubstr(term, pseudoHgvsENSPDotRangeExp, substrs, ArraySize(substrs)))
+    {
+    // User gave an ENST_ accession but a protein change -- swap in the right ENSP_.
+    int ensAccIx = 1;
+    int len = substrs[ensAccIx].rm_eo - substrs[ensAccIx].rm_so;
+    char ensAcc[len+1];
+    safencpy(ensAcc, sizeof(ensAcc), term, len);
+    char *enspAcc = enspForEnst(db, ensAcc);
+    if (isNotEmpty(enspAcc))
+        {
+        // Make it a real HGVS term with the ENSP and pass that on to the usual parser.
+        int descStartIx = 3;
+        char *description = term + substrs[descStartIx].rm_so;
+        struct dyString *enspTerm;
+        enspTerm = dyStringCreate("%s:p.%s", enspAcc, description);
+        hgvs = hgvsParseTerm(enspTerm->string);
+        dyStringFree(&enspTerm);
+        freeMem(enspAcc);
+        }
+    }
+else if ((isSubst = regexMatchSubstr(term, pseudoHgvsNMPDotSubstExp,
                                      substrs, ArraySize(substrs))) ||
          regexMatchSubstr(term, pseudoHgvsNMPDotRangeExp, substrs, ArraySize(substrs)))
     {
     // User gave an NM_ accession but a protein change -- swap in the right NP_.
     int nmAccIx = 1;
     int geneSymbolIx = 4;
     int len = substrs[nmAccIx].rm_eo - substrs[nmAccIx].rm_so;
     char nmAcc[len+1];
     safencpy(nmAcc, sizeof(nmAcc), term, len);
     char *npAcc = npForNm(db, nmAcc);
     if (isNotEmpty(npAcc))
         {
         // Make it a real HGVS term with the NP and pass that on to the usual parser.
         int descStartIx = 5;
         char *description = term + substrs[descStartIx].rm_so;