8de038378e49e6e1ba6b2de24bddc334c8b75d99
angie
  Tue May 9 11:51:59 2017 -0700
Adding support for HGVS m. and n. terms in position/search.  refs #19423

diff --git src/hg/lib/hgHgvs.c src/hg/lib/hgHgvs.c
index 66a0c48..4ba4bfa 100644
--- src/hg/lib/hgHgvs.c
+++ src/hg/lib/hgHgvs.c
@@ -62,69 +62,89 @@
 //                    ...                               optional anchor "-" or "*"
 //                        ...                           mandatory first number
 //                            .......                   optional offset separator and offset
 //                            ...                       offset separator
 //                                ...                   offset number
 //                                    ...............   optional range separator and complex num
 //                                    ...               optional anchor "-" or "*"
 //                                        ...           first number
 //                                            .......   optional offset separator and offset
 //                                            ...       offset separator
 //                                                ...   offset number
 
 // It's pretty common for users to omit the '.' so if it's missing but the rest of the regex fits,
 // roll with it.
 #define hgvsCDotPosExp "c\\.?" hgvsCdsPosExp
-#define hgvsGDotPosExp "g\\.?" hgvsNtPosExp
-#define hgvsMDotPosExp "m\\.?" hgvsNtPosExp
+#define hgvsGMDotPosExp "([gm])\\.?" hgvsNtPosExp
 #define hgvsNDotPosExp "n\\.?" hgvsNtPosExp
 #define hgvsRDotPosExp "r\\.?" hgvsNtPosExp
 
 // Protein substitution regex
 #define aa3Exp "Ala|Arg|Asn|Asp|Cys|Gln|Glu|Gly|His|Ile|Leu|Lys|Met|Phe|Pro|Ser|Thr|Trp|Tyr|Val|Ter"
 #define hgvsAminoAcidExp "[ARNDCQEGHILKMFPSTWYVX*]|" aa3Exp
 #define hgvsAminoAcidSubstExp "(" hgvsAminoAcidExp ")" posIntExp "(" hgvsAminoAcidExp "|=)"
 #define hgvsPDotSubstExp "p\\." hgvsAminoAcidSubstExp
 //                                 ...                                  // original sequence
 //                                              ......                  // 1-based position
 //                                                           ...        // replacement sequence
 
 // Protein range (or just single pos) regex
 #define hgvsAaRangeExp "(" hgvsAminoAcidExp ")" posIntExp "(_(" hgvsAminoAcidExp ")" posIntExp ")?(.*)"
 #define hgvsPDotRangeExp "p\\." hgvsAaRangeExp
 //  original start AA           ...
 //  1-based start position                       ...
 //  optional range sep and AA+pos                          .....................................
 //  original end AA                                              ...
 //  1-based end position                                                          ...
 //  change description                                                                    ...
 
 // Complete HGVS term regexes combining sequence identifiers and change operations
 #define hgvsFullRegex(seq, op) "^" seq ":" op
 
-#define hgvsRefSeqNCGDotPosExp hgvsFullRegex(versionedRefSeqNCExp, hgvsGDotPosExp)
+#define hgvsRefSeqNCGDotPosExp hgvsFullRegex(versionedRefSeqNCExp, hgvsGMDotPosExp)
 #define hgvsRefSeqNCGDotExp hgvsRefSeqNCGDotPosExp "(.*)"
 // substring numbering:
 //      0.....................................  whole matching string
 //      1.................                      accession and optional dot version
 //               2........                      optional dot version
 //                       3......                (n/a) optional gene symbol in ()s
 //                        4....                 (n/a) optional gene symbol
-//                               5...           1-based start position
-//                                   6....      optional range separator and end position
-//                                    7...      1-based end position
-//                                       8....  change description
+//                              5.              g or m
+//                                6..           1-based start position
+//                                   7...       optional range separator and end position
+//                                     8..      1-based end position
+//                                        9...  change description
+
+#define hgvsLrgNDotExp hgvsFullRegex(lrgTranscriptExp, hgvsNDotPosExp) "(.*)"
+//      0.....................................  whole matching string
+//      1...................                    LRG transcript
+//                   2......                    1-based start position
+//                           3..........        optional range separator and end position
+//                               4.....         1-based end position
+//                                        5...  change description
+
+#define hgvsRefSeqNMNDotExp hgvsFullRegex(versionedRefSeqNMExp, hgvsNDotPosExp) "(.*)"
+// substring numbering:
+//      0.....................................  whole matching string
+//      1.................                      accession and optional dot version
+//               2........                      optional dot version
+//                       3......                (n/a) optional gene symbol in ()s
+//                        4....                 (n/a) optional gene symbol
+//                                5..           1-based start position
+//                                   6...       optional range separator and end position
+//                                     7..      1-based end position
+//                                        8...  change description
 
 #define hgvsLrgCDotPosExp hgvsFullRegex(lrgTranscriptExp, hgvsCDotPosExp)
 #define hgvsLrgCDotExp hgvsLrgCDotPosExp "(.*)"
 // substring numbering:
 //      0.....................................................  whole matching string
 //      1...................                                    LRG transcript
 //                   2...                                       optional anchor "-" or "*"
 //                       3...                                   mandatory first number
 //                           4.......                           optional offset separator and offset
 //                           5...                               offset separator
 //                               6...                           offset number
 //                                   7...............           optional range sep and complex num
 //                                   8...                       optional anchor "-" or "*"
 //                                       9...                   first number
 //                                          10.......           optional offset separator and offset
@@ -233,46 +253,90 @@
 // Gene symbol, maybe punctuation, and a clear "c." position (and possibly change)
 #define pseudoHgvsGeneSympolCDotPosExp "^" geneSymbolExp "[: ]+" hgvsCDotPosExp
 //      0.....................................................  whole matching string
 //      1...................                                    gene symbol
 //                           2.....                             optional beginning of position exp
 //                                   3.....                     beginning of position exp
 
 static struct hgvsVariant *hgvsParseGDotPos(char *term)
-/* If term is parseable as an HGVS NC_...g. term, return the parsed representation, else NULL. */
+/* If term is parseable as an HGVS NC_...g. (or m.) term, return the parsed form, else NULL. */
 {
 struct hgvsVariant *hgvs = NULL;
 regmatch_t substrs[17];
 if (regexMatchSubstr(term, hgvsRefSeqNCGDotExp, substrs, ArraySize(substrs)))
     {
     int accIx = 1;
-    int startPosIx = 5;
-    int endPosIx = 7;
-    int changeIx = 8;
+    int startPosIx = 6;
+    int endPosIx = 8;
+    int changeIx = 9;
     AllocVar(hgvs);
+    // HGVS recommendation May 2017: replace m. with g. since mitochondrion is genomic too
     hgvs->type = hgvstGenomic;
     hgvs->seqAcc = regexSubstringClone(term, substrs[accIx]);
     hgvs->start1 = regexSubstringInt(term, substrs[startPosIx]);
     if (regexSubstrMatched(substrs[endPosIx]))
         hgvs->end = regexSubstringInt(term, substrs[endPosIx]);
     else
         hgvs->end = hgvs->start1;
     hgvs->changes = regexSubstringClone(term, substrs[changeIx]);
     }
 return hgvs;
 }
 
+static struct hgvsVariant *hgvsParseNDotPos(char *term)
+/* If term is parseable as an HGVS n. term, return the parsed representation, otherwise NULL. */
+{
+struct hgvsVariant *hgvs = NULL;
+boolean matches = FALSE;
+int accIx = 1;
+int startPosIx = 2;
+int endPosIx = 4;
+int changeIx = 5;
+// The LRG accession regex has only one substring but the RefSeq acc regex has 4, so for RefSeq
+// every substring index after the accession shifts by the 3 extra substrings (refSeqExtra).
+int refSeqExtra = 3;
+int geneSymbolIx = -1;
+regmatch_t substrs[10];
+if (regexMatchSubstr(term, hgvsLrgNDotExp, substrs, ArraySize(substrs)))
+    {
+    matches = TRUE;
+    }
+else if (regexMatchSubstr(term, hgvsRefSeqNMNDotExp, substrs, ArraySize(substrs)))
+    {
+    matches = TRUE;
+    geneSymbolIx = 4;
+    startPosIx += refSeqExtra;
+    endPosIx += refSeqExtra;
+    changeIx += refSeqExtra;
+    }
+if (matches)
+    {
+    AllocVar(hgvs);
+    hgvs->type = hgvstNoncoding;
+    hgvs->seqAcc = regexSubstringClone(term, substrs[accIx]);
+    if (geneSymbolIx >= 0 && regexSubstrMatched(substrs[geneSymbolIx]))
+        hgvs->seqGeneSymbol = regexSubstringClone(term, substrs[geneSymbolIx]);
+    hgvs->start1 = regexSubstringInt(term, substrs[startPosIx]);
+    if (regexSubstrMatched(substrs[endPosIx]))
+        hgvs->end = regexSubstringInt(term, substrs[endPosIx]);
+    else
+        hgvs->end = hgvs->start1;
+    hgvs->changes = regexSubstringClone(term, substrs[changeIx]);
+    }
+return hgvs;
+}
+
 static void extractComplexNum(char *term, regmatch_t *substrs, int substrOffset,
                               boolean *retIsUtr3, int *retPos, int *retOffset)
 /* Extract matches from substrs starting at substrOffset to parse a complex number
  * description into anchor type, 1-based position and optional offset. */
 {
 int anchorIx = substrOffset;
 int posIx = substrOffset + 1;
 int offsetOpIx = substrOffset + 3;
 int offsetIx = substrOffset + 4;
 // Determine what startPos is relative to, based on optional anchor and optional offset
 char anchor[16]; // string length 0 or 1
 regexSubstringCopy(term, substrs[anchorIx], anchor, sizeof(anchor));
 char offsetOp[16]; // string length 0 or 1
 regexSubstringCopy(term, substrs[offsetOpIx], offsetOp, sizeof(offsetOp));
 *retIsUtr3 = (anchor[0] == '*');
@@ -400,30 +464,32 @@
     hgvs->start1 = regexSubstringInt(term, substrs[startPosIx]);
     if (regexSubstrMatched(substrs[endPosIx]))
         hgvs->end = regexSubstringInt(term, substrs[endPosIx]);
     else
         hgvs->end = hgvs->start1;
     }
 return hgvs;
 }
 
 struct hgvsVariant *hgvsParseTerm(char *term)
 /* If term is a parseable form of HGVS, return the parsed representation, otherwise NULL.
  * This does not check validity of accessions, coordinates or alleles. */
 {
 struct hgvsVariant *hgvs = hgvsParseCDotPos(term);
 if (hgvs == NULL)
+    hgvs = hgvsParseNDotPos(term);
+if (hgvs == NULL)
     hgvs = hgvsParsePDotSubst(term);
 if (hgvs == NULL)
     hgvs = hgvsParsePDotRange(term);
 if (hgvs == NULL)
     hgvs = hgvsParseGDotPos(term);
-//#*** TODO: MDot, RDot, NDot
+//#*** TODO: RDot
 return hgvs;
 }
 
 static boolean dbHasNcbiRefSeq(char *db)
 /* Test whether NCBI's RefSeq alignments are available in db. */
 {
 // hTableExists() caches results so this shouldn't make for loads of new SQL queries if called
 // more than once.
 return (hTableExists(db, "ncbiRefSeq") && hTableExists(db, "ncbiRefSeqPsl") &&