3712ae6f8f6f212bae33ecc5499feeea598d83e4
angie
  Mon Aug 14 11:52:33 2017 -0700
HGVS pos/search: support parentheses around p. terms.  Also watch out for start>end in hgvsValidateGene.  refs #19968 note-6

diff --git src/hg/lib/hgHgvs.c src/hg/lib/hgHgvs.c
index e00c269..2bed75a 100644
--- src/hg/lib/hgHgvs.c
+++ src/hg/lib/hgHgvs.c
@@ -73,38 +73,38 @@
 //                                            ...       intron offset separator
 //                                                ...   intron offset number
 
 // It's pretty common for users to omit the '.' so if it's missing but the rest of the regex fits,
 // roll with it.
 #define hgvsCDotPosExp "c\\.?" hgvsCdsPosExp
 #define hgvsGMDotPosExp "([gm])\\.?" hgvsGenoPosExp
 #define hgvsNDotPosExp "n\\.?" hgvsCdsPosExp
 // Not supporting RDot at this point because r. terms may use either n. or c. numbering!
 // #define hgvsRDotPosExp "r\\.?" hgvsCdsPosExp
 
 // Protein substitution regex
 #define aa3Exp "Ala|Arg|Asn|Asp|Cys|Gln|Glu|Gly|His|Ile|Leu|Lys|Met|Phe|Pro|Ser|Thr|Trp|Tyr|Val|Ter"
 #define hgvsAminoAcidExp "[ARNDCQEGHILKMFPSTWYVX*]|" aa3Exp
 #define hgvsAminoAcidSubstExp "(" hgvsAminoAcidExp ")" posIntExp "(" hgvsAminoAcidExp "|=)"
-#define hgvsPDotSubstExp "p\\." hgvsAminoAcidSubstExp
+#define hgvsPDotSubstExp "p\\.\\(?" hgvsAminoAcidSubstExp "\\)?"
 //                                 ...                                  // original sequence
 //                                              ......                  // 1-based position
 //                                                           ...        // replacement sequence
 
 // Protein range (or just single pos) regex
 #define hgvsAaRangeExp "(" hgvsAminoAcidExp ")" posIntExp "(_(" hgvsAminoAcidExp ")" posIntExp ")?(.*)"
-#define hgvsPDotRangeExp "p\\." hgvsAaRangeExp
+#define hgvsPDotRangeExp "p\\.\\(?" hgvsAaRangeExp "\\)?"
 //  original start AA           ...
 //  1-based start position                       ...
 //  optional range sep and AA+pos                          .....................................
 //  original end AA                                              ...
 //  1-based end position                                                          ...
 //  change description                                                                    ...
 
 // Complete HGVS term regexes combining sequence identifiers and change operations
 #define hgvsFullRegex(seq, op) "^" seq "[ :]+" op
 
 #define hgvsRefSeqNCGDotPosExp hgvsFullRegex(versionedRefSeqNCExp, hgvsGMDotPosExp)
 #define hgvsRefSeqNCGDotExp hgvsRefSeqNCGDotPosExp "(.*)"
 // substring numbering:
 //      0.....................................  whole matching string
 //      1.................                      accession and optional dot version
@@ -218,76 +218,79 @@
 //                                                 9...         1-based end position
 //                                                     10.....  change description
 
 // Pseudo-HGVS in common usage
 // g. with "chr" ID:
 #define pseudoHgvsChrGDotExp hgvsFullRegex("(chr[0-9A-Za-z_]+)", hgvsGMDotPosExp) "(.*)"
 // substring numbering:
 //      0.....................................  whole matching string
 //      1...........                            chr...
 //                   2.                         g or m
 //                     3......                  1-based start position
 //                            4.......          optional range separator and end position
 //                              5.....          1-based end position
 //                                    6....     change description
 
+
 // Sometimes users give an NM_ accession, but a protein change.
-#define pseudoHgvsNMPDotSubstExp "^" versionedRefSeqNMExp "[ :]+p?\\.?" hgvsAminoAcidSubstExp
+#define maybePDot "[ :]+p?\\.?\\(?"
+#define pseudoHgvsNMPDotSubstExp "^" versionedRefSeqNMExp maybePDot hgvsAminoAcidSubstExp "\\)?"
 // substring numbering:
 //      0.....................................................  whole matching string
 //      1...............                                        acc & optional dot version
 //             2........                                        optional dot version
 //                       3.....                                 optional gene sym in ()s
 //                        4...                                  optional gene symbol
 //                                   5.....                     original sequence
 //                                           6......            1-based position
 //                                                     7......  replacement sequence
 
-#define pseudoHgvsNMPDotRangeExp "^" versionedRefSeqNMExp "[ :]+p?\\.?" hgvsAaRangeExp
+#define pseudoHgvsNMPDotRangeExp "^" versionedRefSeqNMExp maybePDot hgvsAaRangeExp "\\)?"
+
 // substring numbering:
 //      0.....................................................  whole matching string
 //      1...............                                        acc & optional dot version
 //             2........                                        optional dot version
 //                       3.....                                 optional gene sym in ()s
 //                        4...                                  optional gene symbol
 //                                 5...                         original start AA
 //                                       6...                   1-based start position
 //                                           7..........        optional range sep and AA+pos
 //                                             8...             original end AA
 //                                                  9...        1-based end position
 //                                                      10....  change description
 
 // Common: gene symbol followed by space and/or punctuation followed by protein change
-#define pseudoHgvsGeneSymbolProtSubstExp "^" geneSymbolExp "[ :]+p?\\.?" hgvsAminoAcidSubstExp
+#define pseudoHgvsGeneSymbolProtSubstExp "^" geneSymbolExp maybePDot hgvsAminoAcidSubstExp "\\)?"
 //      0.....................................................  whole matching string
 //      1...................                                    gene symbol
 //                                   2.....                     original sequence
 //                                           3......            1-based position
 //                                                     4......  replacement sequence
 
-#define pseudoHgvsGeneSymbolProtRangeExp "^" geneSymbolExp "[ :]+p?\\.?" hgvsAaRangeExp
+#define pseudoHgvsGeneSymbolProtRangeExp "^" geneSymbolExp maybePDot hgvsAaRangeExp "\\)?"
 //      0.....................................................  whole matching string
 //      1...................                                    gene symbol
 //                                 2...                         original start AA
 //                                       3...                   1-based start position
 //                                           4................  optional range sep and AA+pos
 //                                             5...             original end AA
 //                                                  6...         1-based end position
 //                                                       7.....  change description
 
 // As above but omitting the protein change
-#define pseudoHgvsGeneSymbolProtPosExp "^" geneSymbolExp "[ :]+p?\\.?" posIntExp
+#define pseudoHgvsGeneSymbolProtPosExp "^" geneSymbolExp maybePDot posIntExp "\\)?"
 //      0..........................                             whole matching string
 //      1...................                                    gene symbol
 //                           2.....                             1-based position
 
 
 // Gene symbol, maybe punctuation, and a clear "c." position (and possibly change)
 #define pseudoHgvsGeneSympolCDotPosExp "^" geneSymbolExp "[: ]+" hgvsCDotPosExp
 //      0.....................................................  whole matching string
 //      1...................                                    gene symbol
 //                           2.....                             optional beginning of position exp
 //                                   3.....                     beginning of position exp
 
 static struct hgvsVariant *hgvsParseGDotPos(char *term)
 /* If term is parseable as an HGVS NC_...g. term, return the parsed representation, else NULL. */
 {
@@ -1002,44 +1005,50 @@
     return FALSE;
 boolean coordsOK = FALSE;
 char *accSeq = (hgvs->type == hgvstProtein ? getProteinSeq(db, acc) : getCdnaSeq(db, acc));
 if (accSeq)
     {
     // By convention, foundAcc is always versionless because it's accompanied by foundVersion.
     if (retFoundAcc)
         *retFoundAcc = cloneFirstWordByDelimiter(acc, '.');
     int seqLen = strlen(accSeq);
     int start, end;
     hgvsStartEndToZeroBasedHalfOpen(hgvs, &start, &end);
     if (hgvs->type == hgvstCoding)
         {
         struct genbankCds cds;
         coordsOK = getCds(db, acc, &cds);
-        if (coordsOK && retDiffRefAllele)
+        if (coordsOK)
             {
             start += (hgvs->startIsUtr3 ? cds.end : cds.start);
-            if (hgvs->startOffset == 0 && start >= 0 && start < seqLen)
+            end += (hgvs->endIsUtr3 ? cds.end : cds.start);
+            if (start > end)
+                coordsOK = FALSE;
+            else if (retDiffRefAllele && hgvs->startOffset == 0 && start >= 0 && start <= seqLen)
                 checkRefAllele(hgvs, start, accSeq, retDiffRefAllele);
             }
         }
     else
         {
+        if (start <= end)
+            {
             coordsOK = TRUE;
             if (retDiffRefAllele && hgvs->startOffset == 0 && start >= 0 && start < seqLen)
                 checkRefAllele(hgvs, start, accSeq, retDiffRefAllele);
             }
         }
+    }
 freeMem(accSeq);
 freeMem(acc);
 return coordsOK;
 }
 
 boolean hgvsValidate(char *db, struct hgvsVariant *hgvs,
                      char **retFoundAcc, int *retFoundVersion, char **retDiffRefAllele)
 /* Return TRUE if hgvs coords are within the bounds of the sequence for hgvs->seqAcc.
  * Note: Transcript terms may contain coords outside the bounds (upstream, intron, downstream) so
  * those can't be checked without mapping the term to the genome; this returns TRUE if seq is found.
  * If retFoundAcc is not NULL, set it to our local accession (which may be missing the .version
  * of hgvs->seqAcc) or NULL if we can't find any match.
  * If retFoundVersion is not NULL and hgvs->seqAcc has a version number (e.g. NM_005957.4),
  * set retFoundVersion to our version from latest GenBank, otherwise 0 (no version for LRG).
  * If coords are OK and retDiffRefAllele is not NULL: if our sequence at the coords