63bcbd28b1571a7efdb6628f80751481f3bd98f2
max
  Tue Apr 30 01:15:16 2024 -0700
adding n. notation to HGVS parser, refs #25982

diff --git src/hg/lib/hgHgvs.c src/hg/lib/hgHgvs.c
index 504172b..e14faef 100644
--- src/hg/lib/hgHgvs.c
+++ src/hg/lib/hgHgvs.c
@@ -376,31 +376,38 @@
 //                                 2...                         original start AA
 //                                       3...                   1-based start position
 //                                           4................  optional range sep and AA+pos
 //                                             5...             original end AA
 //                                                  6...         1-based end position
 //                                                       7.....  change description
 
 // As above but omitting the protein change
 #define pseudoHgvsGeneSymbolProtPosExp "^" geneSymbolExp maybePDot posIntExp "\\)?"
 //      0..........................                             whole matching string
 //      1...................                                    gene symbol
 //                           2.....                             1-based position
 
 
 // Gene symbol, maybe punctuation, and a clear "c." position (and possibly change)
-#define pseudoHgvsGeneSympolCDotPosExp "^" geneSymbolExp "[: ]+" hgvsCDotPosExp
+#define pseudoHgvsGeneSymbolCDotPosExp "^" geneSymbolExp "[: ]+" hgvsCDotPosExp
+//      0.....................................................  whole matching string
+//      1...................                                    gene symbol
+//                           2.....                             optional beginning of position exp
+//                                   3.....                     beginning of position exp
+
+// Gene symbol, maybe punctuation, and a clear "n." position (and possibly change)
+// Same shape as the "c." pattern above, with hgvsNDotPosExp as the position expression.
+// Substrings 2 and 3 are what the rewrite to a real HGVS term reads to find the description.
+#define pseudoHgvsGeneSymbolNDotPosExp "^" geneSymbolExp "[: ]+" hgvsNDotPosExp
 //      0.....................................................  whole matching string
 //      1...................                                    gene symbol
 //                           2.....                             optional beginning of position exp
 //                                   3.....                     beginning of position exp
 
 static struct hgvsVariant *hgvsParseGDotPos(char *term)
 /* If term is parseable as an HGVS NC_...g. term, return the parsed representation, else NULL. */
 {
 struct hgvsVariant *hgvs = NULL;
 int accIx = 1;
 int startPosIx = 3;
 int endPosIx = 5;
 int changeIx = 6;
 boolean matches = FALSE;
 regmatch_t substrs[17];
@@ -878,30 +885,52 @@
             seq = aaSeq->dna;
         }
     }
 return seq;
 }
 
 static char refBaseForNp(char *db, char *npAcc, int pos)
 // Get the amino acid base in NP_'s sequence at 1-based offset pos.
 {
 char *seq = getProteinSeq(db, npAcc);
 char base = seq ? seq[pos-1] : '\0';
 freeMem(seq);
 return base;
 }
 
+static struct hgvsVariant *hgvsPseudoToRealHgvs(regmatch_t substrs[], char *term, char *db,
+                                                int geneSymbolIx, char *prefix)
+/* Rewrite a pseudo-HGVS term of the form symbol:<desc> as NM_xxx(symbol):<prefix><desc>
+ * and pass it to hgvsParseTerm.
+ * substrs holds the regex substring matches for term; geneSymbolIx is the index of the
+ * gene symbol substring; prefix is the HGVS type prefix to insert, e.g. "c." or "n.".
+ * Return hgvsParseTerm's result, or NULL if nmForGeneSymbol gives no accession for the
+ * symbol in db. */
+{
+struct hgvsVariant *hgvs = NULL;
+// Copy the symbol from its own match offset instead of assuming it begins at term[0].
+int symStart = substrs[geneSymbolIx].rm_so;
+int len = substrs[geneSymbolIx].rm_eo - symStart;
+char geneSymbol[len+1];
+safencpy(geneSymbol, sizeof(geneSymbol), term + symStart, len);
+char *nmAcc = nmForGeneSymbol(db, geneSymbol);
+if (isNotEmpty(nmAcc))
+    {
+    // Make it a real HGVS term with the NM and pass that on to the usual parser.
+    // The description starts at the optional substring 2 if it matched, else at substring 3.
+    int descStartIx = regexSubstrMatched(substrs[2]) ? 2 : 3;
+    char *description = term + substrs[descStartIx].rm_so;
+    struct dyString *nmTerm = dyStringCreate("%s(%s):%s%s",
+                                             nmAcc, geneSymbol, prefix, description);
+    hgvs = hgvsParseTerm(nmTerm->string);
+    dyStringFree(&nmTerm);
+    }
+// Free outside the if so an allocated-but-empty result is released too.
+freeMem(nmAcc);
+return hgvs;
+}
+
 struct hgvsVariant *hgvsParsePseudoHgvs(char *db, char *term)
 /* Attempt to parse things that are not strict HGVS, but that people might intend as HGVS:
  * Return a list of struct hgvsVariant that may be what was intended  */
 // Note: this doesn't support non-coding gene symbol terms (which should have nt alleles)
 {
 struct hgvsVariant *hgvs = NULL;
 regmatch_t substrs[11];
 int geneSymbolIx = 1;
 boolean isSubst;
 if ((isSubst = regexMatchSubstr(term, pseudoHgvsNMPDotSubstExp,
                                      substrs, ArraySize(substrs))) ||
          regexMatchSubstr(term, pseudoHgvsNMPDotRangeExp, substrs, ArraySize(substrs)))
     {
     // User gave an NM_ accession but a protein change -- swap in the right NP_.
     int nmAccIx = 1;
@@ -970,48 +999,34 @@
             char *npAcc = npItem->name;
             // Only position was provided, no change.  Look up ref base and make a synonymous subst
             // so it's parseable HGVS.
             int posIx = 2;
             int pos = regexSubstringInt(term, substrs[posIx]);
             char refBase = refBaseForNp(db, npAcc, pos);
             struct dyString *npTerm = dyStringCreate("%s(%s):p.%c%d=",
                                                      npAcc, geneSymbol, refBase, pos);
             struct hgvsVariant *newTerm = hgvsParseTerm(npTerm->string);
             if (newTerm)
                 slAddHead(&hgvs, newTerm);
             dyStringFree(&npTerm);
             }
         }
     }
-else if (regexMatchSubstr(term, pseudoHgvsGeneSympolCDotPosExp, substrs, ArraySize(substrs)))
-    {
-    int len = substrs[geneSymbolIx].rm_eo - substrs[geneSymbolIx].rm_so;
-    char geneSymbol[len+1];
-    safencpy(geneSymbol, sizeof(geneSymbol), term, len);
-    char *nmAcc = nmForGeneSymbol(db, geneSymbol);
-    if (isNotEmpty(nmAcc))
-        {
-        // Make it a real HGVS term with the NM and pass that on to the usual parser.
-        int descStartIx = regexSubstrMatched(substrs[2]) ? 2 : 3;
-        char *description = term + substrs[descStartIx].rm_so;
-        struct dyString *nmTerm = dyStringCreate("%s(%s):c.%s",
-                                                 nmAcc, geneSymbol, description);
-        hgvs = hgvsParseTerm(nmTerm->string);
-        dyStringFree(&nmTerm);
-        freeMem(nmAcc);
-        }
-    }
+else if (regexMatchSubstr(term, pseudoHgvsGeneSymbolCDotPosExp, substrs, ArraySize(substrs)))
+    hgvs = hgvsPseudoToRealHgvs(substrs, term, db, geneSymbolIx, "c.");
+else if (regexMatchSubstr(term, pseudoHgvsGeneSymbolNDotPosExp, substrs, ArraySize(substrs)))
+    hgvs = hgvsPseudoToRealHgvs(substrs, term, db, geneSymbolIx, "n.");
 else if (regexMatchSubstr(term, pseudoHgvsChrGDotExp, substrs, ArraySize(substrs)))
     {
     int chrIx = 1;
     int startPosIx = 3;
     int endPosIx = 5;
     int changeIx = 6;
     AllocVar(hgvs);
     hgvs->type = hgvstGenomic;
     hgvs->seqAcc = regexSubstringClone(term, substrs[chrIx]);
     hgvs->start1 = regexSubstringInt(term, substrs[startPosIx]);
     if (regexSubstrMatched(substrs[endPosIx]))
         hgvs->end = regexSubstringInt(term, substrs[endPosIx]);
     else
         hgvs->end = hgvs->start1;
     hgvs->changes = regexSubstringClone(term, substrs[changeIx]);