a1c329068ad18aa4093972b501e6324f8bcdb398
angie
  Fri Sep 16 17:19:30 2016 -0700
Parse HGVS position ranges and coding terms' UTR and intron coordinates.
When the chromAlias table is present, this also now supports NC_*:g. terms.
The sequence change part is no longer parsed -- it's not necessary for mapping
to genomic ranges, although it will be necessary in order for hgVai to take
HGVS input.
This does not yet support ranges-of-ranges but neither does Mutalyzer.  This
also doesn't support uncertain positions ('?').  Like complex sequence changes,
those can wait until we have a sophisticated parser.
We also support some new not-quite-HGVS forms: geneSymbol and protein pos only,
or geneSymbol and c.<valid position range>.
refs #15071

diff --git src/hg/lib/hgHgvs.c src/hg/lib/hgHgvs.c
index 38ed891..3074a77 100644
--- src/hg/lib/hgHgvs.c
+++ src/hg/lib/hgHgvs.c
@@ -1,531 +1,1033 @@
 /* hgHgvs.c - Parse the Human Genome Variation Society (HGVS) nomenclature for variants. */
 /* See http://varnomen.hgvs.org/ and https://github.com/mutalyzer/mutalyzer/ */
 
 /* Copyright (C) 2016 The Regents of the University of California
  * See README in this or parent directory for licensing information. */
 #include "common.h"
 #include "hgHgvs.h"
+#include "chromInfo.h"
 #include "genbank.h"
 #include "hdb.h"
 #include "pslTransMap.h"
 #include "regexHelper.h"
 
 // Regular expressions for HGVS-recognized sequence accessions: LRG or versioned RefSeq:
 #define lrgTranscriptExp "(LRG_[0-9]+t[0-9+])"
 #define lrgRegionExp "(LRG_[0-9+])"
 
 // NC = RefSeq reference assembly chromosome
 // NG = RefSeq incomplete genomic region (e.g. gene locus)
 // NM = RefSeq curated mRNA
 // NP = RefSeq curated protein
 // NR = RefSeq curated (non-coding) RNA
-#define geneSymbolExp "([A-Z0-9./_-]+)"
+#define geneSymbolExp "([A-Za-z0-9./_-]+)"
 #define optionalGeneSymbolExp "(\\(" geneSymbolExp "\\))?"
 #define versionedAccPrefixExp(p) "(" p "_[0-9]+(\\.[0-9]+)?)" optionalGeneSymbolExp
 //                                 ........................       accession and optional dot version
 //                                              ...........       optional dot version
 //                                                         ...... optional gene symbol in ()s
 //                                                          ....  optional gene symbol
 #define versionedRefSeqNCExp versionedAccPrefixExp("NC")
 #define versionedRefSeqNGExp versionedAccPrefixExp("NG")
 #define versionedRefSeqNMExp versionedAccPrefixExp("NM")
 #define versionedRefSeqNPExp versionedAccPrefixExp("NP")
 #define versionedRefSeqNRExp versionedAccPrefixExp("NR")
 
-// Nucleotide substitution regexes
+// Nucleotide position regexes
 // (c. = CDS, g. = genomic, m. = mitochondrial, n.= non-coding RNA, r. = RNA)
-#define hgvsNtSubstExp "([0-9]+)([ACGT]+)>([ACGT]+)"
-//                       ......                     1-based position
-//                               .......            original sequence
-//                                         .......  replacement sequence
-#define hgvsCDotSubstExp "c\\." hgvsNtSubstExp
-#define hgvsGDotSubstExp "g\\." hgvsNtSubstExp
-#define hgvsMDotSubstExp "m\\." hgvsNtSubstExp
-#define hgvsNDotSubstExp "n\\." hgvsNtSubstExp
-#define hgvsRDotSubstExp "r\\." hgvsNtSubstExp
+#define posIntExp "([0-9]+)"
+#define hgvsNtPosExp posIntExp "(_" posIntExp ")?"
+//                   ......                     1-based start position
+//                           .............      optional range separator and end position
+//                               ......         1-based end position
+// Now for a regex that can handle positions like "26" or "*80" or "-24+75_-24+92"...
+#define anchorExp "([-*])?"
+#define offsetExp "([-+])"
+#define complexNumExp anchorExp posIntExp "(" offsetExp posIntExp ")?"
+#define hgvsCdsPosExp complexNumExp "(_" complexNumExp ")?"
+//                    ...                               optional anchor "-" or "*"
+//                        ...                           mandatory first number
+//                            .......                   optional offset separator and offset
+//                            ...                       offset separator
+//                                ...                   offset number
+//                                    ...............   optional range separator and complex num
+//                                    ...               optional anchor "-" or "*"
+//                                        ...           first number
+//                                            .......   optional offset separator and offset
+//                                            ...       offset separator
+//                                                ...   offset number
+
+#define hgvsCDotPosExp "c\\." hgvsCdsPosExp
+#define hgvsGDotPosExp "g\\." hgvsNtPosExp
+#define hgvsMDotPosExp "m\\." hgvsNtPosExp
+#define hgvsNDotPosExp "n\\." hgvsNtPosExp
+#define hgvsRDotPosExp "r\\." hgvsNtPosExp
 
 // Protein substitution regex
-#define AA3 "Ala|Arg|Asn|Asp|Cys|Gln|Glu|Gly|His|Ile|Leu|Lys|Met|Phe|Pro|Ser|Thr|Trp|Tyr|Val|Ter"
-#define hgvsAminoAcidExp "([ARNDCQEGHILKMFPSTWYVX*]|" AA3 ")"
-#define hgvsAminoAcidSubstExp hgvsAminoAcidExp "([0-9]+)" hgvsAminoAcidExp
+#define aa3Exp "Ala|Arg|Asn|Asp|Cys|Gln|Glu|Gly|His|Ile|Leu|Lys|Met|Phe|Pro|Ser|Thr|Trp|Tyr|Val|Ter"
+#define hgvsAminoAcidExp "[ARNDCQEGHILKMFPSTWYVX*]|" aa3Exp
+#define hgvsAminoAcidSubstExp "(" hgvsAminoAcidExp ")" posIntExp "(" hgvsAminoAcidExp "|=)"
 #define hgvsPDotSubstExp "p\\." hgvsAminoAcidSubstExp
 //                                 ...                                  // original sequence
 //                                              ......                  // 1-based position
 //                                                           ...        // replacement sequence
 
+// Protein range (or just single pos) regex
+#define hgvsAaRangeExp "(" hgvsAminoAcidExp ")" posIntExp "(_(" hgvsAminoAcidExp ")" posIntExp ")?(.*)"
+#define hgvsPDotRangeExp "p\\." hgvsAaRangeExp
+//  original start AA           ...
+//  1-based start position                       ...
+//  optional range sep and AA+pos                          .....................................
+//  original end AA                                              ...
+//  1-based end position                                                          ...
+//  change description                                                                    ...
 
 // Complete HGVS term regexes combining sequence identifiers and change operations
-#define hgvsFullRegex(seq, op) "^" seq ":" op "$"
+#define hgvsFullRegex(seq, op) "^" seq ":" op
 
-#define hgvsLrgCDotSubstExp hgvsFullRegex(lrgTranscriptExp, hgvsCDotSubstExp)
+#define hgvsRefSeqNCGDotPosExp hgvsFullRegex(versionedRefSeqNCExp, hgvsGDotPosExp)
+#define hgvsRefSeqNCGDotExp hgvsRefSeqNCGDotPosExp "(.*)"
+// substring numbering:
+//      0.....................................  whole matching string
+//      1.................                      accession and optional dot version
+//               2........                      optional dot version
+//                       3......                (n/a) optional gene symbol in ()s
+//                        4....                 (n/a) optional gene symbol
+//                               5...           1-based start position
+//                                   6....      optional range separator and end position
+//                                    7...      1-based end position
+//                                       8....  change description
+
+#define hgvsLrgCDotPosExp hgvsFullRegex(lrgTranscriptExp, hgvsCDotPosExp)
+#define hgvsLrgCDotExp hgvsLrgCDotPosExp "(.*)"
 // substring numbering:
 //      0.....................................................  whole matching string
 //      1...................                                    LRG transcript
-//                                   2.....                     1-based position
-//                                           3......            original sequence
-//                                                     4......  replacement sequence
+//                   2...                                       optional anchor "-" or "*"
+//                       3...                                   mandatory first number
+//                           4.......                           optional offset separator and offset
+//                           5...                               offset separator
+//                               6...                           offset number
+//                                   7...............           optional range sep and complex num
+//                                   8...                       optional anchor "-" or "*"
+//                                       9...                   first number
+//                                          10.......           optional offset separator and offset
+//                                          11...               offset separator
+//                                              12...           offset number
+//                                                  13........  sequence change
 
-#define hgvsRefSeqNMCDotSubstExp hgvsFullRegex(versionedRefSeqNMExp, hgvsCDotSubstExp)
+#define hgvsRefSeqNMCDotPosExp hgvsFullRegex(versionedRefSeqNMExp, hgvsCDotPosExp)
+#define hgvsRefSeqNMCDotExp hgvsRefSeqNMCDotPosExp "(.*)"
 // substring numbering:
-//      0.....................................................  whole matching string
-//      1...............                                        accession and optional dot version
+//      0...............................................................  whole matching string
+//      1...............                                                  acc & optional dot version
 //             2........                                                  optional dot version
-//                         3.....                               optional gene symbol in ()s
+//                       3.....                                           optional gene sym in ()s
 //                        4...                                            optional gene symbol
-//                                   5.....                     1-based position
-//                                           6......            original sequence
-//                                                     7......  replacement sequence
+//                              5...                                      optional anchor
+//                                  6...                                  mandatory first number
+//                                      7.......                          optional offset
+//                                      8...                              offset separator
+//                                          9...                          offset number
+//                                             10...............          optional range
+//                                             11...                      optional anchor
+//                                                12...                   first number
+//                                                     13.......          optional offset
+//                                                     14...              offset separator
+//                                                         15...          offset number
+//                                                             16........ sequence change
 
 #define hgvsRefSeqNPPDotSubstExp hgvsFullRegex(versionedRefSeqNPExp, hgvsPDotSubstExp)
 // substring numbering:
 //      0.....................................................  whole matching string
 //      1................                                       accession and optional dot version
 //                 2.....                                       optional dot version
 //                         3.....                               optional gene symbol in ()s
 //                          4...                                optional gene symbol
 //                                   5.....                     original sequence
 //                                           6......            1-based position
 //                                                     7......  replacement sequence
 
+#define hgvsRefSeqNPPDotRangeExp hgvsFullRegex(versionedRefSeqNPExp, hgvsPDotRangeExp)
+// substring numbering:
+//      0.....................................................  whole matching string
+//      1................                                       accession and optional dot version
+//                 2.....                                       optional dot version
+//                         3.....                               optional gene symbol in ()s
+//                          4...                                optional gene symbol
+//                                 5...                         original start AA
+//                                       6...                   1-based start position
+//                                          7.................  optional range sep and AA+pos
+//                                            8...              original end AA
+//                                                 9...         1-based end position
+//                                                     10.....  change description
+
 // Pseudo-HGVS in common usage
-#define pseudoHgvsGeneSymbolSubstExp "^" geneSymbolExp "[: p.]+" hgvsAminoAcidSubstExp
+#define pseudoHgvsGeneSymbolProtSubstExp "^" geneSymbolExp "[: p.]+" hgvsAminoAcidSubstExp
 //      0.....................................................  whole matching string
 //      1...................                                    gene symbol
 //                                   2.....                     original sequence
 //                                           3......            1-based position
 //                                                     4......  replacement sequence
 
-static struct hgvsVariant *hgvsParseCDotSubst(char *term)
-/* If term is parseable as an HGVS CDS substitution, return the parsed representation,
- * otherwise NULL. */
+#define pseudoHgvsGeneSymbolProtRangeExp "^" geneSymbolExp "[: p.]+" hgvsAaRangeExp
+//      0.....................................................  whole matching string
+//      1...................                                    gene symbol
+//                                 2...                         original start AA
+//                                       3...                   1-based start position
+//                                           4................  optional range sep and AA+pos
+//                                             5...             original end AA
+//                                                  6...         1-based end position
+//                                                       7.....  change description
+
+#define pseudoHgvsGeneSymbolProtPosExp "^" geneSymbolExp "[: p.]+" posIntExp
+//      0..........................                             whole matching string
+//      1...................                                    gene symbol
+//                           2.....                             1-based position
+
+
+#define pseudoHgvsGeneSympolCDotPosExp "^" geneSymbolExp "[: ]+" hgvsCDotPosExp
+//      0.....................................................  whole matching string
+//      1...................                                    gene symbol
+//                           2.....                             optional beginning of position exp
+//                                   3.....                     beginning of position exp
+
+
+static struct hgvsVariant *hgvsParseGDotPos(char *term)
+/* If term is parseable as an HGVS NC_...g. term, return the parsed representation, else NULL. */
+{
+struct hgvsVariant *hgvs = NULL;
+regmatch_t substrs[17];
+if (regexMatchSubstr(term, hgvsRefSeqNCGDotExp, substrs, ArraySize(substrs)))
+    {
+    int accIx = 1;
+    int startPosIx = 5;
+    int endPosIx = 7;
+    int changeIx = 8;
+    AllocVar(hgvs);
+    hgvs->type = hgvstGenomic;
+    hgvs->seqAcc = regexSubstringClone(term, substrs[accIx]);
+    hgvs->start1 = regexSubstringInt(term, substrs[startPosIx]);
+    if (regexSubstrMatched(substrs[endPosIx]))
+        hgvs->end = regexSubstringInt(term, substrs[endPosIx]);
+    else
+        hgvs->end = hgvs->start1;
+    hgvs->change = regexSubstringClone(term, substrs[changeIx]);
+    }
+return hgvs;
+}
+
+static void extractComplexNum(char *term, regmatch_t *substrs, int substrOffset,
+                              boolean *retIsUtr3, int *retPos, int *retOffset)
+/* Extract matches from substrs starting at substrOffset to parse a complex number
+ * description into anchor type, 1-based position and optional offset. */
+{
+int anchorIx = substrOffset;
+int posIx = substrOffset + 1;
+int offsetOpIx = substrOffset + 3;
+int offsetIx = substrOffset + 4;
+// Determine what startPos is relative to, based on optional anchor and optional offset
+char anchor[16]; // string length 0 or 1
+regexSubstringCopy(term, substrs[anchorIx], anchor, sizeof(anchor));
+char offsetOp[16]; // string length 0 or 1
+regexSubstringCopy(term, substrs[offsetOpIx], offsetOp, sizeof(offsetOp));
+*retIsUtr3 = (anchor[0] == '*');
+*retPos = regexSubstringInt(term, substrs[posIx]);
+// Is pos negative?
+if (anchor[0] == '-')
+    *retPos = -*retPos;
+// Grab offset, if there is one.
+if (isNotEmpty(offsetOp))
+    {
+    *retOffset = regexSubstringInt(term, substrs[offsetIx]);
+    if (offsetOp[0] == '-')
+        *retOffset = -*retOffset;
+    }
+}
+
+static struct hgvsVariant *hgvsParseCDotPos(char *term)
+/* If term is parseable as an HGVS CDS term, return the parsed representation, otherwise NULL. */
 {
 struct hgvsVariant *hgvs = NULL;
 boolean matches = FALSE;
 int accIx = 1;
-int posIx = 0;
-int refIx = 0;
-int altIx = 0;
+int startAnchorIx = 2;
+int endAnchorIx = 8;
+int endPosIx = 9;
+int changeIx = 13;
+// The LRG accession regex has only one substring but the RefSeq acc regex has 4, so that
+// affects all substring offsets after the accession.
+int refSeqExtra = 3;
 int geneSymbolIx = -1;
-regmatch_t substrs[8];
-if (regexMatchSubstrNoCase(term, hgvsLrgCDotSubstExp, substrs, ArraySize(substrs)))
+regmatch_t substrs[17];
+if (regexMatchSubstr(term, hgvsLrgCDotExp, substrs, ArraySize(substrs)))
     {
     matches = TRUE;
-    posIx = 2;
-    refIx = 3;
-    altIx = 4;
     }
-else if (regexMatchSubstrNoCase(term, hgvsRefSeqNMCDotSubstExp, substrs, ArraySize(substrs)))
+else if (regexMatchSubstr(term, hgvsRefSeqNMCDotExp, substrs, ArraySize(substrs)))
     {
     matches = TRUE;
     geneSymbolIx = 4;
-    posIx = 5;
-    refIx = 6;
-    altIx = 7;
+    startAnchorIx += refSeqExtra;
+    endAnchorIx += refSeqExtra;
+    endPosIx += refSeqExtra;
+    changeIx += refSeqExtra;
     }
 if (matches)
     {
     AllocVar(hgvs);
     hgvs->type = hgvstCoding;
-    hgvs->changeSymbol = cloneString(">");
     hgvs->seqAcc = regexSubstringClone(term, substrs[accIx]);
-    hgvs->start1 = regexSubstringInt(term, substrs[posIx]);
-    hgvs->refAllele = regexSubstringClone(term, substrs[refIx]);
-    hgvs->altAllele = regexSubstringClone(term, substrs[altIx]);
+    extractComplexNum(term, substrs, startAnchorIx,
+                      &hgvs->startIsUtr3, &hgvs->start1, &hgvs->startOffset);
     if (geneSymbolIx >= 0 && regexSubstrMatched(substrs[geneSymbolIx]))
         hgvs->seqGeneSymbol = regexSubstringClone(term, substrs[geneSymbolIx]);
-    hgvs->end = hgvs->start1 - 1 + strlen(hgvs->refAllele);
-    }
-return hgvs;
-}
-
-static char *aaAbbrsToLetters(char *aaStr)
-/* If the input string is composed of AA abbreviations like "Ala", "Asp" etc., convert them to
- * single letter AAs like "A", "D" etc.  Otherwise return a copy of aaStr. */
-{
-char *letters = NULL;
-if (regexMatch(aaStr, "^(" AA3 ")+$"))
+    if (regexSubstrMatched(substrs[endPosIx]))
         {
-    int len = strlen(aaStr) / 3;
-    letters = needMem(len + 1);
-    int i;
-    for (i = 0;  i < len;  i++)
-        letters[i] = aaAbbrToLetter(&aaStr[i*3]);
+        extractComplexNum(term, substrs, endAnchorIx,
+                          &hgvs->endIsUtr3, &hgvs->end, &hgvs->endOffset);
         }
     else
-    letters = cloneString(aaStr);
-return letters;
+        {
+        hgvs->end = hgvs->start1;
+        hgvs->endIsUtr3 = hgvs->startIsUtr3;
+        hgvs->endOffset = hgvs->startOffset;
+        }
+    hgvs->change = regexSubstringClone(term, substrs[changeIx]);
+    }
+return hgvs;
 }
 
 static struct hgvsVariant *hgvsParsePDotSubst(char *term)
 /* If term is parseable as an HGVS protein substitution, return the parsed representation,
  * otherwise NULL. */
 {
 struct hgvsVariant *hgvs = NULL;
 regmatch_t substrs[8];
-if (regexMatchSubstrNoCase(term, hgvsRefSeqNPPDotSubstExp, substrs, ArraySize(substrs)))
+if (regexMatchSubstr(term, hgvsRefSeqNPPDotSubstExp, substrs, ArraySize(substrs)))
     {
     int accIx = 1;
     int geneSymbolIx = 4;
     int refIx = 5;
     int posIx = 6;
     int altIx = 7;
     AllocVar(hgvs);
     hgvs->type = hgvstProtein;
     hgvs->seqAcc = regexSubstringClone(term, substrs[accIx]);
+    int changeStart = substrs[refIx].rm_so;
+    int changeEnd = substrs[altIx].rm_eo;
+    int len = changeEnd - changeStart;
+    char change[len+1];
+    safencpy(change, sizeof(change), term+changeStart, len);
+    hgvs->change = cloneString(change);
     if (regexSubstrMatched(substrs[geneSymbolIx]))
         hgvs->seqGeneSymbol = regexSubstringClone(term, substrs[geneSymbolIx]);
     hgvs->start1 = regexSubstringInt(term, substrs[posIx]);
-    // Convert 3-letter abbreviations to single-letter codes like "Ala" -> "A",
-    // so strlen(allele) is the number of amino acids.
-    char buf[2048];
-    regexSubstringCopy(term, substrs[refIx], buf, sizeof(buf));
-    hgvs->refAllele = aaAbbrsToLetters(buf);
-    regexSubstringCopy(term, substrs[altIx], buf, sizeof(buf));
-    hgvs->altAllele = aaAbbrsToLetters(buf);
-    hgvs->end = hgvs->start1 - 1 + strlen(hgvs->refAllele);
+    hgvs->end = hgvs->start1;
+    }
+return hgvs;
+}
+
+static struct hgvsVariant *hgvsParsePDotRange(char *term)
+/* If term is parseable as an HGVS protein range change, return the parsed representation,
+ * otherwise NULL. */
+{
+struct hgvsVariant *hgvs = NULL;
+regmatch_t substrs[11];
+if (regexMatchSubstr(term, hgvsRefSeqNPPDotRangeExp, substrs, ArraySize(substrs)))
+    {
+    int accIx = 1;
+    int geneSymbolIx = 4;
+    int startRefIx = 5;
+    int startPosIx = 6;
+    int endPosIx = 9;
+    int changeIx = 10;
+    AllocVar(hgvs);
+    hgvs->type = hgvstProtein;
+    hgvs->seqAcc = regexSubstringClone(term, substrs[accIx]);
+    int changeStart = substrs[startRefIx].rm_so;
+    int changeEnd = substrs[changeIx].rm_eo;
+    int len = changeEnd - changeStart;
+    char change[len+1];
+    safencpy(change, sizeof(change), term+changeStart, len);
+    hgvs->change = cloneString(change);
+    if (regexSubstrMatched(substrs[geneSymbolIx]))
+        hgvs->seqGeneSymbol = regexSubstringClone(term, substrs[geneSymbolIx]);
+    hgvs->start1 = regexSubstringInt(term, substrs[startPosIx]);
+    if (regexSubstrMatched(substrs[endPosIx]))
+        hgvs->end = regexSubstringInt(term, substrs[endPosIx]);
+    else
+        hgvs->end = hgvs->start1;
     }
 return hgvs;
 }
 
 struct hgvsVariant *hgvsParseTerm(char *term)
 /* If term is a parseable form of HGVS, return the parsed representation, otherwise NULL.
- * This does not check validity of accessions or alleles. */
+ * This does not check validity of accessions, coordinates or alleles. */
 {
-struct hgvsVariant *hgvs = hgvsParseCDotSubst(term);
+struct hgvsVariant *hgvs = hgvsParseCDotPos(term);
 if (hgvs == NULL)
     hgvs = hgvsParsePDotSubst(term);
+if (hgvs == NULL)
+    hgvs = hgvsParsePDotRange(term);
+if (hgvs == NULL)
+    hgvs = hgvsParseGDotPos(term);
 return hgvs;
 }
 
-struct hgvsVariant *hgvsParsePseudoHgvs(char *db, char *term)
-/* Attempt to parse things that are not strict HGVS, but that people might intend as HGVS: */
+static char *npForGeneSymbol(char *db, char *geneSymbol)
+/* Given a gene symbol, look it up to find its NP_ accession; if not found return NULL. */
 {
-struct hgvsVariant *hgvs = NULL;
-regmatch_t substrs[5];
-if (regexMatchSubstrNoCase(term, pseudoHgvsGeneSymbolSubstExp, substrs, ArraySize(substrs)))
-    {
-    int geneSymbolIx = 1;
-    int refIx = 2;
-    int posIx = 3;
-    int altIx = 4;
-    char *geneSymbol = regexSubstringClone(term, substrs[geneSymbolIx]);
-    // Try to find an NP_ for geneSymbol, using refGene to make sure it's an NP for this species.
 struct sqlConnection *conn = hAllocConn(db);
 char query[2048];
+// Use refGene to make sure it's an NP for this species.
 sqlSafef(query, sizeof(query), "select l.protAcc from %s l, refGene r "
          "where l.name = '%s' and l.mrnaAcc = r.name "
-             "and l.protAcc != '' order by l.protAcc"
+         "and l.protAcc != '' order by length(l.protAcc), l.protAcc"
          , refLinkTable, geneSymbol);
 char *npAcc = sqlQuickString(conn, query);
 hFreeConn(&conn);
-    if (isNotEmpty(npAcc))
-        {
-        AllocVar(hgvs);
-        hgvs->type = hgvstProtein;
-        hgvs->seqAcc = npAcc;
-        hgvs->seqGeneSymbol = geneSymbol;
-        hgvs->start1 = regexSubstringInt(term, substrs[posIx]);
-        // Convert 3-letter abbreviations to single-letter codes like "Ala" -> "A",
-        // so strlen(allele) is the number of amino acids.
-        char buf[2048];
-        regexSubstringCopy(term, substrs[refIx], buf, sizeof(buf));
-        hgvs->refAllele = aaAbbrsToLetters(buf);
-        regexSubstringCopy(term, substrs[altIx], buf, sizeof(buf));
-        hgvs->altAllele = aaAbbrsToLetters(buf);
-        int refLen = strlen(hgvs->refAllele);
-        hgvs->end = hgvs->start1 - 1 + refLen;
-        }
-    // Note: this doesn't support non-coding gene symbol substs (which should have nt alleles)
+return npAcc;
 }
-return hgvs;
+
+static char *nmForGeneSymbol(char *db, char *geneSymbol)
+/* Given a gene symbol, look it up to find its NM_ accession; if not found return NULL. */
+{
+struct sqlConnection *conn = hAllocConn(db);
+char query[2048];
+sqlSafef(query, sizeof(query), "select name from refGene where name2 = '%s' "
+         "order by length(name), name", geneSymbol);
+char *nmAcc = sqlQuickString(conn, query);
+hFreeConn(&conn);
+return nmAcc;
 }
 
-static char *getCdnaSeq(char *db, char *acc,  char **retFoundAcc)
-/* Return cdna sequence for acc, or NULL if not found.  If retFoundAcc is not NULL,
+static char *getProteinSeq(char *db, char *acc,  char **retFoundAcc)
+/* Return amino acid sequence for acc, or NULL if not found.  If retFoundAcc is not NULL,
  * set it to the accession for which we grabbed seq (might have a .version chopped off). */
 {
 char *seq = NULL;
 char *foundAcc = NULL;
 if (startsWith("LRG_", acc))
     {
-    if (hTableExists(db, "lrgCdna"))
+    if (hTableExists(db, "lrgPep"))
         {
         char query[2048];
-        sqlSafef(query, sizeof(query), "select seq from lrgCdna where name = '%s'", acc);
+        sqlSafef(query, sizeof(query), "select seq from lrgPep where name = '%s'", acc);
         struct sqlConnection *conn = hAllocConn(db);
         seq = sqlQuickString(conn, query);
         hFreeConn(&conn);
         if (seq)
             foundAcc = cloneString(acc);
         }
     }
 else
     {
     char *trimmedAcc = cloneFirstWordByDelimiter(acc, '.');
-    struct dnaSeq *gbSeq = hGenBankGetMrna(db, trimmedAcc, gbSeqTable);
-    if (gbSeq)
+    aaSeq *aaSeq = hGenBankGetPep(db, trimmedAcc, gbSeqTable);
+    if (aaSeq)
         {
-        seq = gbSeq->dna;
+        seq = aaSeq->dna;
         foundAcc = trimmedAcc;
         }
     }
 if (retFoundAcc)
     *retFoundAcc = foundAcc;
 return seq;
 }
 
-static char *getProteinSeq(char *db, char *acc,  char **retFoundAcc)
-/* Return amino acid sequence for acc, or NULL if not found.  If retFoundAcc is not NULL,
+static char refBaseForNp(char *db, char *npAcc, int pos)
+// Get the amino acid base in NP_'s sequence at 1-based offset pos.
+{
+char *seq = getProteinSeq(db, npAcc, NULL);
+char base = seq[pos-1];
+freeMem(seq);
+return base;
+}
+
+struct hgvsVariant *hgvsParsePseudoHgvs(char *db, char *term)
+/* Attempt to parse things that are not strict HGVS, but that people might intend as HGVS: */
+// Note: this doesn't support non-coding gene symbol terms (which should have nt alleles)
+{
+struct hgvsVariant *hgvs = NULL;
+regmatch_t substrs[11];
+int geneSymbolIx = 1;
+boolean isSubst = regexMatchSubstr(term, pseudoHgvsGeneSymbolProtSubstExp,
+                                   substrs, ArraySize(substrs));
+if (isSubst ||
+    regexMatchSubstr(term, pseudoHgvsGeneSymbolProtRangeExp, substrs, ArraySize(substrs)))
+    {
+    int len = substrs[geneSymbolIx].rm_eo - substrs[geneSymbolIx].rm_so;
+    char geneSymbol[len+1];
+    safencpy(geneSymbol, sizeof(geneSymbol), term, len);
+    char *npAcc = npForGeneSymbol(db, geneSymbol);
+    if (isNotEmpty(npAcc))
+        {
+        // Make it a real HGVS term with the NP and pass that on to the usual parser.
+        int descStartIx = 2;
+        char *description = term + substrs[descStartIx].rm_so;
+        struct dyString *npTerm = dyStringCreate("%s(%s):p.%s",
+                                                 npAcc, geneSymbol, description);
+        hgvs = hgvsParseTerm(npTerm->string);
+        dyStringFree(&npTerm);
+        }
+    }
+else if (regexMatchSubstr(term, pseudoHgvsGeneSymbolProtPosExp, substrs, ArraySize(substrs)))
+    {
+    int len = substrs[geneSymbolIx].rm_eo - substrs[geneSymbolIx].rm_so;
+    char geneSymbol[len+1];
+    safencpy(geneSymbol, sizeof(geneSymbol), term, len);
+    char *npAcc = npForGeneSymbol(db, geneSymbol);
+    if (isNotEmpty(npAcc))
+        {
+        // Only position was provided, no change.  Look up ref base and make a synonymous subst
+        // so it's parseable HGVS.
+        int posIx = 2;
+        int pos = regexSubstringInt(term, substrs[posIx]);
+        char refBase = refBaseForNp(db, npAcc, pos);
+        struct dyString *npTerm = dyStringCreate("%s(%s):p.%c%d=",
+                                                 npAcc, geneSymbol, refBase, pos);
+        hgvs = hgvsParseTerm(npTerm->string);
+        dyStringFree(&npTerm);
+        }
+    }
+else if (regexMatchSubstr(term, pseudoHgvsGeneSympolCDotPosExp, substrs, ArraySize(substrs)))
+    {
+    int len = substrs[geneSymbolIx].rm_eo - substrs[geneSymbolIx].rm_so;
+    char geneSymbol[len+1];
+    safencpy(geneSymbol, sizeof(geneSymbol), term, len);
+    char *nmAcc = nmForGeneSymbol(db, geneSymbol);
+    if (isNotEmpty(nmAcc))
+        {
+        // Make it a real HGVS term with the NM and pass that on to the usual parser.
+        int descStartIx = regexSubstrMatched(substrs[2]) ? 2 : 3;
+        char *description = term + substrs[descStartIx].rm_so;
+        struct dyString *nmTerm = dyStringCreate("%s(%s):c.%s",
+                                                 nmAcc, geneSymbol, description);
+        hgvs = hgvsParseTerm(nmTerm->string);
+        dyStringFree(&nmTerm);
+        }
+    }
+return hgvs;
+}
+
+static char refBaseFromNucSubst(char *change)
+/* If sequence change is a nucleotide substitution, return the reference base, else null char. */
+{
+if (regexMatchNoCase(change, "^([ACGTU])>"))
+    return toupper(change[0]);
+return '\0';
+}
+
+static boolean hgvsValidateGenomic(char *db, struct hgvsVariant *hgvs,
+                                   char **retFoundAcc, int *retFoundVersion,
+                                   char **retDiffRefAllele)
+/* Return TRUE if hgvs->seqAcc can be mapped to a chrom in db and coords are legal for the chrom.
+ * If retFoundAcc is non-NULL, set it to the versionless seqAcc if chrom is found, else NULL.
+ * If retFoundVersion is non-NULL, set it to seqAcc's version if chrom is found, else NULL.
+ * If retDiffRefAllele is non-NULL and hgvs specifies a reference allele then compare it to
+ * the genomic sequence at that location; set *retDiffRefAllele to NULL if they match, or to
+ * genomic sequence if they differ. */
+{
+boolean coordsOK = FALSE;
+if (retDiffRefAllele)
+    *retDiffRefAllele = NULL;
+if (retFoundAcc)
+    *retFoundAcc = NULL;
+if (retFoundVersion)
+    *retFoundVersion = 0;
+char *chrom = hgOfficialChromName(db, hgvs->seqAcc);
+if (isNotEmpty(chrom))
+    {
+    struct chromInfo *ci = hGetChromInfo(db, chrom);
+    if ((hgvs->start1 >= 1 && hgvs->start1 <= ci->size) &&
+        (hgvs->end >=1 && hgvs->end <= ci->size))
+        {
+        coordsOK = TRUE;
+        if (retDiffRefAllele)
+            {
+            char hgvsBase = refBaseFromNucSubst(hgvs->change);
+            if (hgvsBase != '\0')
+                {
+                struct dnaSeq *refBase = hFetchSeq(ci->fileName, chrom,
+                                                   hgvs->start1-1, hgvs->start1);
+                touppers(refBase->dna);
+                if (refBase->dna[0] != hgvsBase)
+                    *retDiffRefAllele = cloneString(refBase->dna);
+                freeMem(refBase);
+                }
+            }
+        }
+    // Since we found hgvs->seqAcc, split it into versionless and version parts.
+    if (retFoundAcc)
+        *retFoundAcc = cloneFirstWordByDelimiter(hgvs->seqAcc, '.');
+    if (retFoundVersion)
+        {
+        char *p = strchr(hgvs->seqAcc, '.');
+        if (p)
+            *retFoundVersion = atoi(p+1);
+        }
+    // Don't free chromInfo, it's cached!
+    }
+return coordsOK;
+}
+
+static char *getCdnaSeq(char *db, char *acc,  char **retFoundAcc)
+/* Return cdna sequence for acc, or NULL if not found.  If retFoundAcc is not NULL,
  * set it to the accession for which we grabbed seq (might have a .version chopped off). */
 {
 char *seq = NULL;
 char *foundAcc = NULL;
 if (startsWith("LRG_", acc))
     {
-    if (hTableExists(db, "lrgPep"))
+    if (hTableExists(db, "lrgCdna"))
         {
         char query[2048];
-        sqlSafef(query, sizeof(query), "select seq from lrgPep where name = '%s'", acc);
+        sqlSafef(query, sizeof(query), "select seq from lrgCdna where name = '%s'", acc);
         struct sqlConnection *conn = hAllocConn(db);
         seq = sqlQuickString(conn, query);
         hFreeConn(&conn);
         if (seq)
             foundAcc = cloneString(acc);
         }
     }
 else
     {
     char *trimmedAcc = cloneFirstWordByDelimiter(acc, '.');
-    aaSeq *aaSeq = hGenBankGetPep(db, trimmedAcc, gbSeqTable);
-    if (aaSeq)
+    struct dnaSeq *gbSeq = hGenBankGetMrna(db, trimmedAcc, gbSeqTable);
+    if (gbSeq)
         {
-        seq = aaSeq->dna;
+        seq = gbSeq->dna;
         foundAcc = trimmedAcc;
         }
     }
 if (retFoundAcc)
     *retFoundAcc = foundAcc;
 return seq;
 }
 
-static int getCdsStart(char *db, char *acc)
-/* Return the start coordinate for CDS in genbank or LRG acc, or -1 if not found. */
+static boolean getCds(char *db, char *acc, struct genbankCds *retCds)
+/* Get the CDS info for genbank or LRG acc; return FALSE if not found or not applicable. */
 {
-int cdsStart = -1;
+boolean gotCds = FALSE;
 char query[1024];
 if (startsWith("LRG_", acc))
     sqlSafef(query, sizeof(query),
              "select cds from lrgCds where id = '%s'", acc);
 else
     sqlSafef(query, sizeof(query),
              "SELECT c.name FROM %s as c, %s as g WHERE (g.acc = '%s') AND "
              "(g.cds != 0) AND (g.cds = c.id)", cdsTable, gbCdnaInfoTable, acc);
 struct sqlConnection *conn = hAllocConn(db);
 char cdsBuf[2048];
 char *cdsStr = sqlQuickQuery(conn, query, cdsBuf, sizeof(cdsBuf));
 hFreeConn(&conn);
 if (isNotEmpty(cdsStr))
-    {
-    struct genbankCds cds;
-    if (genbankCdsParse(cdsStr, &cds) && cds.startComplete && cds.start != cds.end)
-        cdsStart = cds.start;
-    }
-return cdsStart;
+    gotCds = (genbankCdsParse(cdsStr, retCds) &&
+              retCds->startComplete && retCds->start != retCds->end);
+return gotCds;
 }
 
 static int getGbVersion(char *db, char *acc)
 /* Return the local version that we have for acc. */
 {
 char query[2048];
 sqlSafef(query, sizeof(query), "select version from %s where acc = '%s'", gbSeqTable, acc);
 struct sqlConnection *conn = hAllocConn(db);
 int version = sqlQuickNum(conn, query);
 hFreeConn(&conn);
 return version;
 }
 
-boolean hgvsValidate(char *db, struct hgvsVariant *hgvs, char **retFoundAcc, int *retFoundVersion,
+static char refBaseFromProt(char *change)
+/* If change starts with an amino acid 3-letter or 1-letter code then return the 1-letter code,
+ * else null char. */
+{
+regmatch_t substrs[1];
+if (regexMatchSubstr(change, "^" hgvsAminoAcidExp, substrs, ArraySize(substrs)))
+    {
+    char match[256];  // 1- or 3-letter code
+    regexSubstringCopy(change, substrs[0], match, sizeof(match));
+    if (strlen(match) == 1)
+        return toupper(match[0]);
+    else
+        return aaAbbrToLetter(match);
+    }
+return '\0';
+}
+
+static char *cloneStringFromChar(char c)
+/* Return a cloned string from a single character. */
+{
+char str[2];
+str[0] = c;
+str[1] = '\0';
+return cloneString(str);
+}
+
+static void checkRefAllele(struct hgvsVariant *hgvs, int start1, char *accSeq,
                            char **retDiffRefAllele)
+/* If hgvs change includes a reference allele, and if accSeq at start1 does not match,
+ *  then set retDiffRefAllele to the accSeq version.  retDiffRefAllele must be non-NULL. */
+{
+char hgvsRefBase = (hgvs->type == hgvstProtein) ? refBaseFromProt(hgvs->change) :
+                                                  refBaseFromNucSubst(hgvs->change);
+if (hgvsRefBase != '\0')
+    {
+    char seqRefBase = toupper(accSeq[start1-1]);
+    if (seqRefBase != hgvsRefBase)
+        *retDiffRefAllele = cloneStringFromChar(seqRefBase);
+    }
+if (hgvs->type == hgvstProtein && *retDiffRefAllele == NULL)
+    {
+    // Protein ranges have a second ref allele base to check
+    char *p = strchr(hgvs->change, '_');
+    if (p != NULL)
+        {
+        hgvsRefBase = refBaseFromProt(p+1);
+        if (hgvsRefBase != '\0')
+            {
+            int end1 = atoi(skipToNumeric(p+1));
+            char seqRefBase = toupper(accSeq[end1-1]);
+            if (seqRefBase != hgvsRefBase)
+                *retDiffRefAllele = cloneStringFromChar(seqRefBase);
+            }
+        }
+    }
+}
+
+static boolean hgvsValidateGene(char *db, struct hgvsVariant *hgvs,
+                                char **retFoundAcc, int *retFoundVersion, char **retDiffRefAllele)
 /* Return TRUE if hgvs coords are within the bounds of the sequence for hgvs->seqAcc.
+ * Note: Coding terms may contain coords outside the bounds (upstream, intron, downstream) so
+ * those can't be checked without mapping the term to the genome.
  * If retFoundAcc is not NULL, set it to our local accession (which may be missing the .version
  * of hgvs->seqAcc) or NULL if we can't find any match.
  * If retFoundVersion is not NULL and hgvs->seqAcc has a version number (e.g. NM_005957.4),
  * set retFoundVersion to our version from latest GenBank, otherwise 0 (no version for LRG).
  * If coords are OK and retDiffRefAllele is not NULL: if our sequence at the coords
  * matches hgvs->refAllele then set it to NULL; if mismatch then set it to our sequence. */
 {
+char *foundAcc = NULL;
+int foundVersion = 0;
 boolean coordsOK = FALSE;
 char *acc = hgvs->seqAcc;
-char *foundAcc = NULL;
 char *accSeq = NULL;
-int cdsOffset = 0;
-int refLen = strlen(hgvs->refAllele);
 if (hgvs->type == hgvstProtein)
     accSeq = getProteinSeq(db, acc, &foundAcc);
 else
     accSeq = getCdnaSeq(db, acc, &foundAcc);
-if (retFoundVersion)
-    {
-    *retFoundVersion = 0;
 if (foundAcc && ! startsWith("LRG_", foundAcc))
-        *retFoundVersion = getGbVersion(db, foundAcc);
-    }
-if (hgvs->type == hgvstCoding && foundAcc)
-    cdsOffset = getCdsStart(db, foundAcc);
-if (accSeq && cdsOffset >= 0 &&
-    (cdsOffset + hgvs->start1 - 1 + refLen) <= strlen(accSeq))
+    foundVersion = getGbVersion(db, foundAcc);
+if (foundAcc)
     {
-    coordsOK = TRUE;
-    }
+    int seqLen = strlen(accSeq);
+    if (hgvs->type == hgvstCoding)
+        {
+        // Coding term coords can extend beyond the bounds of the transcript so
+        // we can't check them without mapping to the genome.  However, if the coords
+        // are in bounds and a reference allele is provided, we can check that.
+        struct genbankCds cds;
+        coordsOK = getCds(db, foundAcc, &cds);
         if (coordsOK && retDiffRefAllele)
             {
-    char *ourSeq = cloneStringZ(accSeq + cdsOffset + hgvs->start1 - 1, refLen);
-    toUpperN(ourSeq, refLen);
-    if (sameString(hgvs->refAllele, ourSeq))
-        *retDiffRefAllele = NULL;
+            int start = hgvs->start1 + (hgvs->startIsUtr3 ? cds.end : cds.start);
+            if (hgvs->startOffset == 0 && start >= 1 && start <= seqLen)
+                checkRefAllele(hgvs, start, accSeq, retDiffRefAllele);
+            }
+        }
     else
-        *retDiffRefAllele = ourSeq;
+        {
+        if (hgvs->start1 >= 1 && hgvs->start1 <= seqLen &&
+            hgvs->end >= 1 && hgvs->end <= seqLen)
+            {
+            coordsOK = TRUE;
+            if (retDiffRefAllele)
+                checkRefAllele(hgvs, hgvs->start1, accSeq, retDiffRefAllele);
+            }
+        }
     }
 if (retFoundAcc)
     *retFoundAcc = foundAcc;
+if (retFoundVersion)
+    *retFoundVersion = foundVersion;
 return coordsOK;
 }
 
-static struct psl *pslFromHgvs(struct hgvsVariant *hgvs, int accSize, int cdsStart)
-/* Allocate and return a PSL modeling the variant in transcript named acc. */
+boolean hgvsValidate(char *db, struct hgvsVariant *hgvs,
+                     char **retFoundAcc, int *retFoundVersion, char **retDiffRefAllele)
+/* Return TRUE if hgvs coords are within the bounds of the sequence for hgvs->seqAcc.
+ * Note: Coding terms may contain coords outside the bounds (upstream, intron, downstream) so
+ * those can't be checked without mapping the term to the genome; this returns TRUE if seq is found.
+ * If retFoundAcc is not NULL, set it to our local accession (which may be missing the .version
+ * of hgvs->seqAcc) or NULL if we can't find any match.
+ * If retFoundVersion is not NULL and hgvs->seqAcc has a version number (e.g. NM_005957.4),
+ * set retFoundVersion to our version from latest GenBank, otherwise 0 (no version for LRG).
+ * If coords are OK and retDiffRefAllele is not NULL: if our sequence at the coords
+ * matches hgvs->refAllele then set it to NULL; if mismatch then set it to our sequence. */
+{
+if (hgvs->type == hgvstGenomic || hgvs->type == hgvstMito)
+    return hgvsValidateGenomic(db, hgvs, retFoundAcc, retFoundVersion, retDiffRefAllele);
+else
+    return hgvsValidateGene(db, hgvs, retFoundAcc, retFoundVersion, retDiffRefAllele);
+}
+
+static struct bed3 *hgvsMapGDotToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable)
+/* Given an NC_*g. term, if we can map NC_ to chrom then return the region, else NULL. */
+{
+struct bed3 *region = NULL;
+char *chrom = hgOfficialChromName(db, hgvs->seqAcc);
+if (isNotEmpty(chrom))
+    {
+    AllocVar(region);
+    region->chrom = chrom;
+    region->chromStart = hgvs->start1 - 1;
+    region->chromEnd = hgvs->end;
+    }
+if (retPslTable)
+    *retPslTable = NULL;
+return region;
+}
+
+static void hgvsCodingToZeroBasedHalfOpen(struct hgvsVariant *hgvs,
+                                          int maxCoord, struct genbankCds cds,
+                                          int *retStart, int *retEnd,
+                                          int *retUpstreamBases, int *retDownstreamBases)
+/* Convert a coding HGVS's start1 and end into UCSC coords plus upstream and downstream lengths
+ * for when the coding HGVS has coordinates that extend beyond its sequence boundaries.
+ * ret* args must be non-NULL. */
+{
+// If hgvs->start1 is negative, it is effectively 0-based, so subtract 1 only if positive.
+int start = hgvs->start1;
+if (start > 0)
+    start -= 1;
+// If hgvs->end is negative, it is effectively 0-based, so add 1 to get half-open end.
+int end = hgvs->end;
+if (end < 0)
+    end += 1;
+// If the position follows '*' that means it's relative to cdsEnd; otherwise rel to cdsStart
+if (hgvs->startIsUtr3)
+    *retStart = cds.end + start;
+else
+    *retStart = cds.start + start;
+if (hgvs->endIsUtr3)
+    *retEnd = cds.end + end;
+else
+    *retEnd = cds.start + end;
+// Now check for coords that extend beyond the transcript('s alignment to the genome)
+if (*retStart < 0)
+    {
+    // hgvs->start1 is upstream of coding transcript.
+    *retUpstreamBases = -*retStart;
+    *retStart = 0;
+    }
+else if (*retStart > maxCoord)
+    {
+    // Even the start coord is downstream of coding transcript -- make a negative "upstream"
+    // for adjusting start.
+    *retUpstreamBases = -(*retStart - maxCoord + 1);
+    *retStart = maxCoord - 1;
+    }
+else
+    *retUpstreamBases = 0;
+if (*retEnd > maxCoord)
+    {
+    // hgvs->end is downstream of coding transcript.
+    *retDownstreamBases = *retEnd - maxCoord;
+    *retEnd = maxCoord;
+    }
+else if (*retEnd < 0)
+    {
+    // Even the end coord is upstream of coding transcript -- make a negative "downstream"
+    // for adjusting end.
+    *retDownstreamBases = *retEnd;
+    *retEnd = 0;
+    }
+else
+    *retDownstreamBases = 0;
+}
+
+static struct psl *pslFromHgvsNuc(struct hgvsVariant *hgvs, int accSize, int accEnd,
+                                  struct genbankCds cds,
+                                  int *retUpstreamBases, int *retDownstreamBases)
+/* Allocate and return a PSL modeling the variant in nucleotide sequence acc.
+ * The PSL target is the sequence and the query is the changed part of the sequence.
+ * accEnd is given in case accSize includes an unaligned poly-A tail.
+ * If hgvs is coding ("c.") then the caller must pass in a valid cds.
+ * In case the start or end position is outside the bounds of the sequence, set retUpstreamBases
+ * or retDownstreamBases to the number of bases beyond the beginning or end of sequence.
+ * NOTE: coding intron offsets are ignored; the PSL contains the exon anchor point
+ * and the caller will have to map that to the genome and then apply the intron offset. */
 {
 if (hgvs == NULL)
     return NULL;
+if (hgvs->type == hgvstProtein)
+    errAbort("pslFromHgvsNuc must be called only on nucleotide HGVSs, not protein.");
 struct psl *psl;
 AllocVar(psl);
-int refLen = strlen(hgvs->refAllele);
-int altLen = strlen(hgvs->altAllele);
-struct dyString *dy = dyStringCreate("%s%s%s",
-                                     hgvs->refAllele, hgvs->changeSymbol, hgvs->altAllele);
-psl->qName = dyStringCannibalize(&dy);
+psl->tName = cloneFirstWordByDelimiter(hgvs->seqAcc, '.');
+safecpy(psl->strand, sizeof(psl->strand), "+");
+psl->tSize = accSize;
+if (hgvs->type != hgvstCoding)
+    {
+    // Sane 1-based fully closed coords.
+    psl->tStart = hgvs->start1 - 1;
+    psl->tEnd = hgvs->end;
+    }
+else
+    {
+    // Simple or insanely complex CDS-relative coords.
+    hgvsCodingToZeroBasedHalfOpen(hgvs, accEnd, cds, &psl->tStart, &psl->tEnd,
+                                  retUpstreamBases, retDownstreamBases);
+    }
+int refLen = psl->tEnd - psl->tStart;
+// Just use refLen for alt until we parse the sequence change portion of the term:
+int altLen = refLen;
+psl->qName = cloneString(hgvs->change);
 psl->qSize = altLen;
 psl->qStart = 0;
 psl->qEnd = altLen;
-psl->tName = cloneFirstWordByDelimiter(hgvs->seqAcc, '.');
-psl->tSize = accSize;
-safecpy(psl->strand, sizeof(psl->strand), "+");
-psl->tStart = cdsStart + hgvs->start1 - 1;
-psl->tEnd = psl->tStart + refLen;
 psl->blockCount = 1;
 AllocArray(psl->blockSizes, psl->blockCount);
 AllocArray(psl->qStarts, psl->blockCount);
 AllocArray(psl->tStarts, psl->blockCount);
 psl->blockSizes[0] = refLen <= altLen ? refLen : altLen;
 psl->qStarts[0] = psl->qStart;
 psl->tStarts[0] = psl->tStart;
 return psl;
 }
 
-static struct psl *hgvsMapCDotToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable)
-/* Return a psl with target coords from db, mapping the variant to the genome.
- * Return NULL if unable to map.
+#define limitToRange(val, min, max) { if (val < min) { val = min; }  \
+                                      if (val > max) { val = max; } }
+
+static struct bed3 *hgvsMapNucToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable)
+/* Return a bed3 with the variant's span on the genome, or NULL if unable to map.
  * If successful and retPslTable is not NULL, set it to the name of the PSL table used. */
 {
-struct psl *mappedToGenome = NULL;
+struct bed3 *region = NULL;
 char *trackTable = NULL;
 char *pslTable = NULL;
 char *acc = cloneFirstWordByDelimiter(hgvs->seqAcc, '.');
-int cdsStart = -1;
+if (hgvs->type == hgvstGenomic)
+    return hgvsMapGDotToGenome(db, hgvs, retPslTable);
 if (startsWith("NM_", acc))
     {
     // TODO: replace these with NCBI's alignments
     trackTable = "refGene";
     pslTable = "refSeqAli";
     }
 else if (startsWith("LRG_", acc))
     {
     trackTable = "lrgTranscriptAli";
     pslTable = "lrgTranscriptAli";
     }
-cdsStart = getCdsStart(db, acc);
-if (trackTable && pslTable && cdsStart >= 0 &&
+struct genbankCds cds;
+boolean gotCds = (hgvs->type == hgvstCoding) ? getCds(db, acc, &cds) : FALSE;
+if (trackTable && pslTable && (hgvs->type != hgvstCoding || gotCds) &&
     hTableExists(db, trackTable) && hTableExists(db, pslTable))
     {
     struct dyString *dyQuery = sqlDyStringCreate("select * from %s where qName = '%s'",
                                                  pslTable, acc);
     struct sqlConnection *conn = hAllocConn(db);
     struct sqlResult *sr = sqlGetResult(conn, dyQuery->string);
     int bin = hIsBinned(db, pslTable);
     char **row;
     if (sr && (row = sqlNextRow(sr)))
         {
         struct psl *txAli = pslLoad(row+bin);
-        struct psl *variantPsl = pslFromHgvs(hgvs, txAli->qSize, cdsStart);
-        mappedToGenome = pslTransMap(pslTransMapNoOpts, variantPsl, txAli);
+        // variantPsl contains the anchor if a non-cdsStart anchor is used because
+        // the actual position might be outside the bounds of the transcript sequence (intron/UTR)
+        int upstream = 0, downstream = 0;
+        struct psl *variantPsl = pslFromHgvsNuc(hgvs, txAli->qSize, txAli->qEnd, cds,
+                                                &upstream, &downstream);
+        struct psl *mappedToGenome = pslTransMap(pslTransMapNoOpts, variantPsl, txAli);
+        if (mappedToGenome)
+            {
+            AllocVar(region);
+            region->chrom = cloneString(mappedToGenome->tName);
+            region->chromStart = mappedToGenome->tStart;
+            region->chromEnd = mappedToGenome->tEnd;
+            // If the start and/or end is in an intron, add the intron offset now.
+            boolean revStrand = (mappedToGenome->strand[0] == '-');
+            if (hgvs->startOffset != 0)
+                {
+                if (revStrand)
+                    region->chromEnd -= hgvs->startOffset;
+                else
+                    region->chromStart += hgvs->startOffset;
+                }
+            if (hgvs->endOffset != 0)
+                {
+                if (revStrand)
+                    region->chromStart -= hgvs->endOffset;
+                else
+                    region->chromEnd += hgvs->endOffset;
+                }
+            // Apply extra up/downstream offsets (usually 0)
+            if (revStrand)
+                {
+                region->chromStart -= downstream;
+                region->chromEnd += upstream;
+                }
+            else
+                {
+                region->chromStart -= upstream;
+                region->chromEnd += downstream;
+                }
+            limitToRange(region->chromStart, 0, mappedToGenome->tSize)
+            limitToRange(region->chromEnd, 0, mappedToGenome->tSize)
+            freez(&mappedToGenome);
+            }
         }
     sqlFreeResult(&sr);
     hFreeConn(&conn);
     }
-if (mappedToGenome && retPslTable)
+if (region && retPslTable)
     *retPslTable = cloneString(pslTable);
-return mappedToGenome;
-}
-
-static char *aaToArbitraryCodons(char *aaStr)
-/* Translate amino acid string into codon string where we just take the first codon we
- * find for each amino. */
-{
-int aaLen = strlen(aaStr);
-char *codonStr = needMem(aaLen*3 + 1);
-int i;
-for (i = 0;  i < aaLen;  i++)
-    aaToArbitraryCodon(aaStr[i], &codonStr[i*3]);
-return codonStr;
+return region;
 }
 
-static struct psl *hgvsMapPDotToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable)
-/* Return a psl with target coords from db, mapping the variant to the genome.
- * Return NULL if unable to map.
+static struct bed3 *hgvsMapPDotToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable)
+/* Return a bed3 with the variant's span on the genome, or NULL if unable to map.
  * If successful and retPslTable is not NULL, set it to the name of the PSL table used. */
 {
-struct psl *mappedToGenome = NULL;
+struct bed3 *region = NULL;
 char *acc = cloneFirstWordByDelimiter(hgvs->seqAcc, '.');
 if (startsWith("NP_", acc))
     {
     // Translate the NP_*:p. to NM_*:c. and map NM_*:c. to the genome.
     struct sqlConnection *conn = hAllocConn(db);
     char query[2048];
     sqlSafef(query, sizeof(query), "select mrnaAcc from %s l, refGene r "
           "where l.protAcc = '%s' and r.name = l.mrnaAcc", refLinkTable, acc);
     char *nmAcc = sqlQuickString(conn, query);
     hFreeConn(&conn);
     if (nmAcc)
         {
         struct hgvsVariant cDot;
         zeroBytes(&cDot, sizeof(cDot));
         cDot.type = hgvstCoding;
         cDot.seqAcc = nmAcc;
-        cDot.changeSymbol = ">";
-        // These aren't necessarily right since AA doesn't have as much info as codon.
-        // We could get the refAllele right using the reference sequence -- but altAllele
-        // could still be ambiguous.  In the meantime we're not really using the alleles
-        // for anything besides length so far...
-        cDot.refAllele = aaToArbitraryCodons(hgvs->refAllele);
-        cDot.altAllele = aaToArbitraryCodons(hgvs->altAllele);
         cDot.start1 = ((hgvs->start1 - 1) * 3) + 1;
         cDot.end = ((hgvs->end - 1) * 3) + 3;
-        mappedToGenome = hgvsMapCDotToGenome(db, &cDot, retPslTable);
+        cDot.change = hgvs->change;
+        region = hgvsMapNucToGenome(db, &cDot, retPslTable);
         }
     }
-return mappedToGenome;
+return region;
 }
 
-struct psl *hgvsMapToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable)
-/* Return a psl with target coords from db, mapping the variant to the genome.
- * Return NULL if unable to map.
+struct bed3 *hgvsMapToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable)
+/* Return a bed3 with the variant's span on the genome, or NULL if unable to map.
  * If successful and retPslTable is not NULL, set it to the name of the PSL table used. */
 {
-if (hgvs == NULL || !hgvsValidate(db, hgvs, NULL, NULL, NULL))
+if (hgvs == NULL)
     return NULL;
-struct psl *mappedToGenome = NULL;
-if (hgvs->type == hgvstCoding)
-    mappedToGenome = hgvsMapCDotToGenome(db, hgvs, retPslTable);
-else if (hgvs->type == hgvstProtein)
-    mappedToGenome = hgvsMapPDotToGenome(db, hgvs, retPslTable);
-return mappedToGenome;
+struct bed3 *region = NULL;
+if (hgvs->type == hgvstProtein)
+    region = hgvsMapPDotToGenome(db, hgvs, retPslTable);
+else
+    region = hgvsMapNucToGenome(db, hgvs, retPslTable);
+return region;
 }