5f9ba1691bd3051146f2f9464c9d1375e42f6ff4
angie
  Thu Mar 8 11:36:57 2018 -0800
Added support for ENS*T* transcript IDs in HGVS position search, using the latest Gencode.
Added support for parsing ENS*P* protein IDs, but can't yet map those to the genome because
our Gencode tables don't yet include a mapping between transcript and protein IDs.
refs #21076

diff --git src/hg/lib/hgHgvs.c src/hg/lib/hgHgvs.c
index c295038..44ab4f3 100644
--- src/hg/lib/hgHgvs.c
+++ src/hg/lib/hgHgvs.c
@@ -16,35 +16,44 @@
 #include "trackHub.h"
 
 void hgvsVariantFree(struct hgvsVariant **pHgvs)
 // Free *pHgvs and its contents, and set *pHgvs to NULL.
 {
 if (pHgvs && *pHgvs)
     {
     struct hgvsVariant *hgvs = *pHgvs;
     freez(&hgvs->seqAcc);
     freez(&hgvs->seqGeneSymbol);
     freez(&hgvs->changes);
     freez(pHgvs);
     }
 }
 
-// Regular expressions for HGVS-recognized sequence accessions: LRG or versioned RefSeq:
+// Regular expressions for HGVS-recognized sequence accessions: LRG, ENS* or versioned RefSeq:
 #define lrgTranscriptExp "(LRG_[0-9]+t[0-9]+)"
 #define lrgProteinExp "(LRG_[0-9]+p[0-9]+)"
 #define lrgRegionExp "(LRG_[0-9]+)"
 
+#define ensTranscriptExp "(ENS([A-Z]{3})?T[0-9]+\\.[0-9]+)"
+//      0.....................................  whole matching string
+//      1.....................................  ENS transcript ID including optional lift suffix
+//         2...                                 optional non-human species code e.g. MUS for mouse
+#define ensProteinExp "(ENS([A-Z]{3})?P[0-9]+\\.[0-9]+)"
+//      0.....................................  whole matching string
+//      1.....................................  ENS protein ID
+//         2...                                 optional non-human species code e.g. MUS for mouse
+
 // NC = RefSeq reference assembly chromosome (NT = contig (e.g. alt), NW = patch)
 // NG = RefSeq incomplete genomic region (e.g. gene locus)
 // NM = RefSeq curated mRNA
 // NP = RefSeq curated protein
 // NR = RefSeq curated (non-coding) RNA
 #define geneSymbolExp "([A-Za-z0-9./_-]+)"
 #define optionalGeneSymbolExp "(\\(" geneSymbolExp "\\))?"
 #define versionedAccPrefixExp(p) "(" p "_[0-9]+(\\.[0-9]+)?)" optionalGeneSymbolExp
 //                                 ........................       accession and optional dot version
 //                                              ...........       optional dot version
 //                                                         ...... optional gene symbol in ()s
 //                                                          ....  optional gene symbol
 #define versionedRefSeqNCExp versionedAccPrefixExp("[NX][CTW]")
 #define versionedRefSeqNGExp versionedAccPrefixExp("[NX]G")
 #define versionedRefSeqNMExp versionedAccPrefixExp("[NX]M")
@@ -112,144 +121,202 @@
 //      0.....................................  whole matching string
 //      1.......................                LRG region
 //                              2.              g or m
 //                                3..           1-based start position
 //                                   4...       optional range separator and end position
 //                                     5..      1-based end position
 //                                        6...  change description
 
 
 #define hgvsRefSeqNCGDotPosExp hgvsFullRegex(versionedRefSeqNCExp, hgvsGMDotPosExp)
 #define hgvsRefSeqNCGDotExp hgvsRefSeqNCGDotPosExp "(.*)"
 // substring numbering:
 //      0.....................................  whole matching string
 //      1.................                      accession and optional dot version
 //               2........                      optional dot version
-//                       3......                (n/a) optional gene symbol in ()s
-//                        4....                 (n/a) optional gene symbol
+//                       3......                optional gene symbol in ()s
+//                        4....                 optional gene symbol
 //                              5.              g or m
 //                                6..           1-based start position
 //                                   7...       optional range separator and end position
 //                                     8..      1-based end position
 //                                        9...  change description
 
 #define hgvsLrgCDotPosExp hgvsFullRegex(lrgTranscriptExp, hgvsCDotPosExp)
 #define hgvsLrgCDotExp hgvsLrgCDotPosExp "(.*)"
 // substring numbering:
 //      0.....................................................  whole matching string
-//      1...................                                    LRG transcript
+//      1.............                                          LRG transcript
 //                   2...                                       optional UTR anchor "-" or "*"
 //                       3...                                   mandatory first number
 //                           4.......                           optional offset separator and offset
 //                           5...                               offset separator
 //                               6...                           offset number
 //                                   7...............           optional range sep and complex num
 //                                   8...                       optional UTR anchor "-" or "*"
 //                                       9...                   first number
 //                                          10.......           optional offset separator and offset
 //                                          11...               offset separator
 //                                              12...           offset number
 //                                                  13........  sequence change
 
+#define hgvsEnsCDotPosExp hgvsFullRegex(ensTranscriptExp, hgvsCDotPosExp)
+#define hgvsEnsCDotExp hgvsEnsCDotPosExp "(.*)"
+// substring numbering:
+//      0.....................................................  whole matching string
+//      1............                                           ENS transcript ID
+//         2...                                                 optional non-human species code
+//                   3...                                       optional UTR anchor "-" or "*"
+//                       4...                                   mandatory first number
+//                           5.......                           optional offset separator and offset
+//                           6...                               offset separator
+//                               7...                           offset number
+//                                   8...............           optional range sep and complex num
+//                                   9...                       optional UTR anchor "-" or "*"
+//                                      10...                   first number
+//                                          11.......           optional offset separator and offset
+//                                          12...               offset separator
+//                                              13...           offset number
+//                                                  14........  sequence change
+
 #define hgvsRefSeqNMCDotPosExp hgvsFullRegex(versionedRefSeqNMExp, hgvsCDotPosExp)
 #define hgvsRefSeqNMCDotExp hgvsRefSeqNMCDotPosExp "(.*)"
 // substring numbering:
 //      0...............................................................  whole matching string
 //      1...............                                                  acc & optional dot version
 //             2........                                                  optional dot version
 //                       3.....                                           optional gene sym in ()s
 //                        4...                                            optional gene symbol
 //                              5...                                      optional UTR anchor
 //                                  6...                                  mandatory first number
 //                                      7.......                          optional offset
 //                                      8...                              offset separator
 //                                          9...                          offset number
 //                                             10...............          optional range
 //                                             11...                      optional UTR anchor
 //                                                12...                   first number
 //                                                     13.......          optional offset
 //                                                     14...              offset separator
 //                                                         15...          offset number
 //                                                             16........ sequence change
 
 #define hgvsLrgNDotExp hgvsFullRegex(lrgTranscriptExp, hgvsNDotPosExp) "(.*)"
 // substring numbering:
 //      0.....................................................  whole matching string
-//      1...................                                    LRG transcript
-//                   2...                                       n/a 4 n.: UTR anchor "-" or "*"
+//      1.............                                          LRG transcript
+//                   2...                                       UTR anchor "-" or "*"
 //                       3...                                   mandatory first number
 //                           4.......                           optional offset separator and offset
 //                           5...                               offset separator
 //                               6...                           offset number
 //                                   7...............           optional range sep and complex num
-//                                   8...                       n/a 4 n.: UTR anchor "-" or "*"
+//                                   8...                       UTR anchor "-" or "*"
 //                                       9...                   first number
 //                                          10.......           optional offset separator and offset
 //                                          11...               offset separator
 //                                              12...           offset number
 //                                                  13........  sequence change
 
+#define hgvsEnsNDotExp hgvsFullRegex(ensTranscriptExp, hgvsNDotPosExp) "(.*)"
+// substring numbering:
+//      0.....................................................  whole matching string
+//      1............                                           ENS transcript ID
+//         2...                                                 optional non-human species code
+//                   3...                                       UTR anchor "-" or "*"
+//                       4...                                   mandatory first number
+//                           5.......                           optional offset separator and offset
+//                           6...                               offset separator
+//                               7...                           offset number
+//                                   8...............           optional range sep and complex num
+//                                   9...                       UTR anchor "-" or "*"
+//                                      10...                   first number
+//                                          11.......           optional offset separator and offset
+//                                          12...               offset separator
+//                                              13...           offset number
+//                                                  14........  sequence change
+
 #define hgvsRefSeqNMRNDotExp hgvsFullRegex(versionedRefSeqNMRExp, hgvsNDotPosExp) "(.*)"
 // substring numbering:
 //      0...............................................................  whole matching string
 //      1...............                                                  acc & optional dot version
 //             2........                                                  optional dot version
 //                       3.....                                           optional gene sym in ()s
 //                        4...                                            optional gene symbol
-//                              5...                                      n/a 4 n.: UTR anchor
+//                              5...                                      UTR anchor
 //                                  6...                                  mandatory first number
 //                                      7.......                          optional offset
 //                                      8...                              offset separator
 //                                          9...                          offset number
 //                                             10...............          optional range
-//                                             11...                      n/a 4 n.: UTR anchor
+//                                             11...                      UTR anchor
 //                                                12...                   first number
 //                                                     13.......          optional offset
 //                                                     14...              offset separator
 //                                                         15...          offset number
 //                                                             16........ sequence change
 
 #define hgvsLrgPDotSubstExp hgvsFullRegex(lrgProteinExp, hgvsPDotSubstExp)
 // substring numbering:
 //      0.....................................................  whole matching string
 //      1............................                           LRG protein
 //                                   2.....                     original sequence
 //                                           3......            1-based position
 //                                                     4......  replacement sequence
 
+#define hgvsEnsPDotSubstExp hgvsFullRegex(ensProteinExp, hgvsPDotSubstExp)
+// substring numbering:
+//      0.....................................................  whole matching string
+//      1...........................                            ENS protein ID
+//         2...                                                 optional non-human species code
+//                                   3.....                     original sequence
+//                                           4......            1-based position
+//                                                     5......  replacement sequence
+
 #define hgvsRefSeqNPPDotSubstExp hgvsFullRegex(versionedRefSeqNPExp, hgvsPDotSubstExp)
 // substring numbering:
 //      0.....................................................  whole matching string
 //      1................                                       accession and optional dot version
 //                 2.....                                       optional dot version
 //                         3.....                               optional gene symbol in ()s
 //                          4...                                optional gene symbol
 //                                   5.....                     original sequence
 //                                           6......            1-based position
 //                                                     7......  replacement sequence
 
 #define hgvsLrgPDotRangeExp hgvsFullRegex(lrgProteinExp, hgvsPDotRangeExp)
 // substring numbering:
 //      0.....................................................  whole matching string
 //      1..........................                             accession and optional dot version
 //                                 2...                         original start AA
 //                                       3...                   1-based start position
 //                                          4.................  optional range sep and AA+pos
 //                                            5...              original end AA
 //                                                 6...         1-based end position
 //                                                     7......  change description
 
+#define hgvsEnsPDotRangeExp hgvsFullRegex(ensProteinExp, hgvsPDotRangeExp)
+// substring numbering:
+//      0.....................................................  whole matching string
+//      1...........................                            ENS protein ID
+//         2...                                                 optional non-human species code
+//                                 3...                         original start AA
+//                                       4...                   1-based start position
+//                                          5.................  optional range sep and AA+pos
+//                                            6...              original end AA
+//                                                 7...         1-based end position
+//                                                     8......  change description
+
 #define hgvsRefSeqNPPDotRangeExp hgvsFullRegex(versionedRefSeqNPExp, hgvsPDotRangeExp)
 // substring numbering:
 //      0.....................................................  whole matching string
 //      1................                                       accession and optional dot version
 //                 2.....                                       optional dot version
 //                         3.....                               optional gene symbol in ()s
 //                          4...                                optional gene symbol
 //                                 5...                         original start AA
 //                                       6...                   1-based start position
 //                                          7.................  optional range sep and AA+pos
 //                                            8...              original end AA
 //                                                 9...         1-based end position
 //                                                     10.....  change description
 
 // Pseudo-HGVS in common usage
@@ -390,49 +457,59 @@
         *retOffset = -*retOffset;
     }
 }
 
 static struct hgvsVariant *hgvsParseCNDotPos(char *term)
 /* If term is parseable as an HGVS CDS term, return the parsed representation, otherwise NULL. */
 {
 struct hgvsVariant *hgvs = NULL;
 boolean isNoncoding = FALSE;
 boolean matches = FALSE;
 int accIx = 1;
 int startAnchorIx = 2;
 int endAnchorIx = 8;
 int endPosIx = 9;
 int changeIx = 13;
-// The LRG accession regex has only one substring but the RefSeq acc regex has 4, so that
-// affects all substring offsets after the accession.
-int refSeqExtra = 3;
+// The LRG accession regex has only one substring but the RefSeq acc regex has 4 and ENS has 2,
+// so that affects all substring offsets after the accession.
+int ixExtra = 0;
 int geneSymbolIx = -1;
 regmatch_t substrs[17];
 if (regexMatchSubstr(term, hgvsLrgCDotExp, substrs, ArraySize(substrs)) ||
     (isNoncoding = regexMatchSubstr(term, hgvsLrgNDotExp, substrs, ArraySize(substrs))))
     {
     matches = TRUE;
     }
+else if (regexMatchSubstr(term, hgvsEnsCDotExp, substrs, ArraySize(substrs)) ||
+    (isNoncoding = regexMatchSubstr(term, hgvsEnsNDotExp, substrs, ArraySize(substrs))))
+    {
+    matches = TRUE;
+    ixExtra = 1;
+    }
 else if (regexMatchSubstr(term, hgvsRefSeqNMCDotExp, substrs, ArraySize(substrs)) ||
          (isNoncoding = regexMatchSubstr(term, hgvsRefSeqNMRNDotExp, substrs, ArraySize(substrs))))
     {
     matches = TRUE;
+    ixExtra = 3;
     geneSymbolIx = 4;
-    startAnchorIx += refSeqExtra;
-    endAnchorIx += refSeqExtra;
-    endPosIx += refSeqExtra;
-    changeIx += refSeqExtra;
+    }
+if (ixExtra > 0)
+    {
+    startAnchorIx += ixExtra;
+    endAnchorIx += ixExtra;
+    endPosIx += ixExtra;
+    changeIx += ixExtra;
     }
 if (matches)
     {
     AllocVar(hgvs);
     hgvs->type = isNoncoding ? hgvstNoncoding : hgvstCoding;
     hgvs->seqAcc = regexSubstringClone(term, substrs[accIx]);
     extractComplexNum(term, substrs, startAnchorIx,
                       &hgvs->startIsUtr3, &hgvs->start1, &hgvs->startOffset);
     if (isNoncoding && hgvs->startIsUtr3)
         {
         warn("hgvsParseCNDotPos: noncoding term '%s' appears to start in UTR3 (*), "
              "not applicable for noncoding", term);
         hgvs->startIsUtr3 = FALSE;
         }
     if (geneSymbolIx >= 0 && regexSubstrMatched(substrs[geneSymbolIx]))
@@ -458,44 +535,53 @@
     }
 return hgvs;
 }
 
 static struct hgvsVariant *hgvsParsePDotSubst(char *term)
 /* If term is parseable as an HGVS protein substitution, return the parsed representation,
  * otherwise NULL. */
 {
 struct hgvsVariant *hgvs = NULL;
 boolean matches = FALSE;
 int accIx = 1;
 int refIx = 2;
 int posIx = 3;
 int altIx = 4;
 int geneSymbolIx = -1;
+// The LRG accession regex has only one substring but the RefSeq acc regex has 4 and ENS has 2,
+// so that affects all substring offsets after the accession.
+int ixExtra = 0;
 regmatch_t substrs[8];
 if (regexMatchSubstr(term, hgvsLrgPDotSubstExp, substrs, ArraySize(substrs)))
     matches = TRUE;
+else if (regexMatchSubstr(term, hgvsEnsPDotSubstExp, substrs, ArraySize(substrs)))
+    {
+    ixExtra = 1;
+    matches = TRUE;
+    }
 else if (regexMatchSubstr(term, hgvsRefSeqNPPDotSubstExp, substrs, ArraySize(substrs)))
     {
     matches = TRUE;
-    // The LRG accession regex has only one substring but the RefSeq acc regex has 4, so that
-    // affects all substring offsets after the accession.
-    int refSeqExtra = 3;
-    refIx += refSeqExtra;
-    posIx += refSeqExtra;
-    altIx += refSeqExtra;
+    ixExtra = 3;
     geneSymbolIx = 4;
     }
+if (ixExtra > 0)
+    {
+    refIx += ixExtra;
+    posIx += ixExtra;
+    altIx += ixExtra;
+    }
 if (matches)
     {
     AllocVar(hgvs);
     hgvs->type = hgvstProtein;
     hgvs->seqAcc = regexSubstringClone(term, substrs[accIx]);
     int changeStart = substrs[refIx].rm_so;
     int changeEnd = substrs[altIx].rm_eo;
     int len = changeEnd - changeStart;
     char change[len+1];
     safencpy(change, sizeof(change), term+changeStart, len);
     hgvs->changes = cloneString(change);
     if (geneSymbolIx >= 0 && regexSubstrMatched(substrs[geneSymbolIx]))
         hgvs->seqGeneSymbol = regexSubstringClone(term, substrs[geneSymbolIx]);
     hgvs->start1 = regexSubstringInt(term, substrs[posIx]);
     hgvs->end = hgvs->start1;
@@ -503,45 +589,54 @@
 return hgvs;
 }
 
 static struct hgvsVariant *hgvsParsePDotRange(char *term)
 /* If term is parseable as an HGVS protein range change, return the parsed representation,
  * otherwise NULL. */
 {
 struct hgvsVariant *hgvs = NULL;
 boolean matches = FALSE;
 int accIx = 1;
 int startRefIx = 2;
 int startPosIx = 3;
 int endPosIx = 6;
 int changeIx = 7;
 int geneSymbolIx = -1;
+// The LRG accession regex has only one substring but the RefSeq acc regex has 4 and ENS has 2,
+// so that affects all substring offsets after the accession.
+int ixExtra = 0;
 regmatch_t substrs[11];
 if (regexMatchSubstr(term, hgvsLrgPDotRangeExp, substrs, ArraySize(substrs)))
     matches = TRUE;
+else if (regexMatchSubstr(term, hgvsEnsPDotRangeExp, substrs, ArraySize(substrs)))
+    {
+    matches = TRUE;
+    ixExtra = 1;
+    }
 else if (regexMatchSubstr(term, hgvsRefSeqNPPDotRangeExp, substrs, ArraySize(substrs)))
     {
     matches = TRUE;
-    // The LRG accession regex has only one substring but the RefSeq acc regex has 4, so that
-    // affects all substring offsets after the accession.
-    int refSeqExtra = 3;
-    startRefIx += refSeqExtra;
-    startPosIx += refSeqExtra;
-    endPosIx += refSeqExtra;
-    changeIx += refSeqExtra;
+    ixExtra = 3;
     geneSymbolIx = 4;
     }
+if (ixExtra > 0)
+    {
+    startRefIx += ixExtra;
+    startPosIx += ixExtra;
+    endPosIx += ixExtra;
+    changeIx += ixExtra;
+    }
 if (matches)
     {
     AllocVar(hgvs);
     hgvs->type = hgvstProtein;
     hgvs->seqAcc = regexSubstringClone(term, substrs[accIx]);
     int changeStart = substrs[startRefIx].rm_so;
     int changeEnd = substrs[changeIx].rm_eo;
     int len = changeEnd - changeStart;
     char change[len+1];
     safencpy(change, sizeof(change), term+changeStart, len);
     hgvs->changes = cloneString(change);
     if (geneSymbolIx >= 0 && regexSubstrMatched(substrs[geneSymbolIx]))
         hgvs->seqGeneSymbol = regexSubstringClone(term, substrs[geneSymbolIx]);
     hgvs->start1 = regexSubstringInt(term, substrs[startPosIx]);
     if (regexSubstrMatched(substrs[endPosIx]))
@@ -958,83 +1053,156 @@
         // Since we found hgvs->seqAcc, split it into versionless and version parts.
         if (retFoundAcc)
             *retFoundAcc = cloneFirstWordByDelimiter(hgvs->seqAcc, '.');
         if (retFoundVersion)
             {
             char *p = strchr(hgvs->seqAcc, '.');
             if (p)
                 *retFoundVersion = atoi(p+1);
             }
         // Don't free chromInfo, it's cached!
         }
     }
 return coordsOK;
 }
 
+static char *txSeqFromGp(char *db, struct genePred *gp)
+/* Return transcribed-from-genome sequence for gp */
+{
+int seqLen = 0;
+int i;
+for (i = 0;  i < gp->exonCount;  i++)
+    seqLen += (gp->exonEnds[i] - gp->exonStarts[i]);
+char *seq = needMem(seqLen+1);
+int offset = 0;
+for (i = 0;  i < gp->exonCount;  i++)
+    {
+    int exonStart = gp->exonStarts[i];
+    int exonEnd = gp->exonEnds[i];
+    struct dnaSeq *exonSeq = hChromSeq(db, gp->chrom, exonStart, exonEnd);
+    safencpy(seq+offset, seqLen+1-offset, exonSeq->dna, exonSeq->size);
+    offset += exonSeq->size;
+    dnaSeqFree(&exonSeq);
+    }
+if (gp->strand[0] == '-')
+    reverseComplement(seq, seqLen);
+return seq;
+}
+
+static struct genePred *getGencodeGp(char *db, char *acc)
+/* Return the genePred for acc in the latest wgEncodeGencodeBasicV* table, or NULL if not found. */
+{
+struct genePred *gp = NULL;
+struct sqlConnection *conn = hAllocConn(db);
+char *gencodeTable = hFindLatestGencodeTableConn(conn, "Basic");
+if (gencodeTable)
+    {
+    int hasBin = 1;
+    char query[2048];
+    sqlSafef(query, sizeof(query), "select * from %s where name = '%s'", gencodeTable, acc);
+    struct sqlResult *sr = sqlGetResult(conn, query);
+    char **row;
+    if ((row = sqlNextRow(sr)) != NULL)
+        gp = genePredExtLoad(row+hasBin, GENEPREDX_NUM_COLS);
+    sqlFreeResult(&sr);
+    }
+hFreeConn(&conn);
+return gp;
+}
+
+static char *getGencodeSeq(char *db, char *acc)
+/* Return transcribed-from-genome sequence for acc, or NULL if not found. */
+{
+char *seq = NULL;
+struct genePred *gp = getGencodeGp(db, acc);
+if (gp)
+    seq = txSeqFromGp(db, gp);
+return seq;
+}
+
 static char *getCdnaSeq(char *db, char *acc)
 /* Return cdna sequence for acc, or NULL if not found. */
 {
 if (trackHubDatabase(db))
     return NULL;
 char *seq = NULL;
 if (startsWith("LRG_", acc))
     {
     if (hTableExists(db, "lrgCdna"))
         {
         char query[2048];
         sqlSafef(query, sizeof(query), "select seq from lrgCdna where name = '%s'", acc);
         struct sqlConnection *conn = hAllocConn(db);
         seq = sqlQuickString(conn, query);
         hFreeConn(&conn);
         }
     }
+else if (startsWith("ENS", acc ))
+    {
+    // Construct it from the genome I guess?
+    seq = getGencodeSeq(db, acc);
+    }
 else
     {
     struct dnaSeq *cdnaSeq = NULL;
     if (hDbHasNcbiRefSeq(db))
         cdnaSeq = hDnaSeqGet(db, acc, "seqNcbiRefSeq", "extNcbiRefSeq");
     else
         cdnaSeq = hGenBankGetMrna(db, acc, NULL);
     if (cdnaSeq)
         seq = dnaSeqCannibalize(&cdnaSeq);
     }
 return seq;
 }
 
 static boolean getCds(char *db, char *acc, struct genbankCds *retCds)
-/* Get the CDS info for genbank or LRG acc; return FALSE if not found or not applicable. */
+/* Get the CDS info for genbank/LRG/ENS acc; return FALSE if not found or not applicable. */
 {
 if (trackHubDatabase(db))
     return FALSE;
 boolean gotCds = FALSE;
+char *cdsStr = NULL;
 char query[1024];
+if (startsWith("ENS", acc))
+    {
+    // Infer CDS from genePred coords
+    struct genePred *gp = getGencodeGp(db, acc);
+    if (gp)
+        {
+        genePredToCds(gp, retCds);
+        gotCds = (retCds->end > retCds->start);
+        }
+    }
+else
+    {
     if (startsWith("LRG_", acc))
         sqlSafef(query, sizeof(query),
                  "select cds from lrgCds where id = '%s'", acc);
     else if (hDbHasNcbiRefSeq(db) &&
              // This is a hack to allow us to fall back on refSeqAli if ncbiRefSeqPsl is incomplete:
              strchr(acc, '.'))
         sqlSafef(query, sizeof(query),
                  "select cds from ncbiRefSeqCds where id = '%s'", acc);
     else
         sqlSafef(query, sizeof(query),
                  "SELECT c.name FROM %s as c, %s as g WHERE (g.acc = '%s') AND "
                  "(g.cds != 0) AND (g.cds = c.id)", cdsTable, gbCdnaInfoTable, acc);
     struct sqlConnection *conn = hAllocConn(db);
     char cdsBuf[2048];
-char *cdsStr = sqlQuickQuery(conn, query, cdsBuf, sizeof(cdsBuf));
+    cdsStr = sqlQuickQuery(conn, query, cdsBuf, sizeof(cdsBuf));
     hFreeConn(&conn);
+    }
 if (isNotEmpty(cdsStr))
     gotCds = (genbankCdsParse(cdsStr, retCds) &&
               retCds->startComplete && retCds->start != retCds->end);
 return gotCds;
 }
 
 static char refBaseFromProt(char *change)
 /* If change starts with an amino acid 3-letter or 1-letter code then return the 1-letter code,
  * else null char. */
 {
 regmatch_t substrs[1];
 if (regexMatchSubstr(change, "^" hgvsAminoAcidExp, substrs, ArraySize(substrs)))
     {
     char match[256];  // 1- or 3-letter code
     regexSubstringCopy(change, substrs[0], match, sizeof(match));
@@ -1089,44 +1257,52 @@
 static int getGbVersion(char *db, char *acc)
 /* Return the local version that we have for acc. */
 {
 if (trackHubDatabase(db))
     return 0;
 char query[2048];
 sqlSafef(query, sizeof(query), "select version from %s where acc = '%s'", gbSeqTable, acc);
 struct sqlConnection *conn = hAllocConn(db);
 int version = sqlQuickNum(conn, query);
 hFreeConn(&conn);
 return version;
 }
 
 static char *normalizeVersion(char *db, char *acc, int *retFoundVersion)
 /* LRG accessions don't have versions, so just return the same acc and set *retFoundVersion to 0.
+ * ENS accessions must have versions.
  * The user may give us a RefSeq accession with or without a .version.
  * If ncbiRefSeq tables are present, return acc with version, looking up version if necessary.
  * If we look it up and can't find it, this returns NULL.
  * If instead we're using genbank tables, return acc with no version.
  * That ensures that acc will be found in our local tables. */
 {
 if (trackHubDatabase(db))
     return NULL;
 char *normalizedAcc = NULL;
 int foundVersion = 0;
 if (startsWith("LRG_", acc))
     {
     normalizedAcc = cloneString(acc);
     }
+else if (startsWith("ENS", acc))
+    {
+    normalizedAcc = cloneString(acc);
+    char *p = strchr(acc, '.');
+    if (p)
+        foundVersion = atoi(p+1);
+    }
 else if (hDbHasNcbiRefSeq(db))
     {
     // ncbiRefSeq tables need versioned accessions.
     if (strchr(acc, '.'))
         normalizedAcc = cloneString(acc);
     else
         {
         char query[2048];
         sqlSafef(query, sizeof(query), "select name from ncbiRefSeq where name like '%s.%%'", acc);
         struct sqlConnection *conn = hAllocConn(db);
         normalizedAcc = sqlQuickString(conn, query);
         hFreeConn(&conn);
         }
     if (isNotEmpty(normalizedAcc))
         {
@@ -1501,65 +1677,51 @@
     int tBlockEnd = txAli->tStarts[i] + txAli->blockSizes[i];
     int tNextBlockStart = txAli->tStarts[i+1];
     if (vStart >= qBlockEnd && vEnd <= qNextBlockStart)
         {
         if (tBlockEnd == tNextBlockStart)
             return pslDelFromCoord(txAli, tBlockEnd, variantPsl);
         else
             return pslFromGap(txAli, i, variantPsl);
         }
     }
 // Not contained in a deletion from reference genome (txAli target) -- return NULL.
 return NULL;
 }
 
 
-static struct psl *mapPsl(char *db, struct hgvsVariant *hgvs, char *pslTable, char *acc,
+static struct psl *mapPsl(struct hgvsVariant *hgvs, char *acc, struct psl *txAli,
                           struct genbankCds *cds, int *retUpstream, int *retDownstream)
-/* If acc is found in pslTable, use pslTransMap to map hgvs onto the genome. */
+/* Use pslTransMap to map hgvs onto the genome. */
 {
-struct psl *mappedToGenome = NULL;
-char query[2048];
-sqlSafef(query, sizeof(query), "select * from %s where qName = '%s'", pslTable, acc);
-struct sqlConnection *conn = hAllocConn(db);
-struct sqlResult *sr = sqlGetResult(conn, query);
-char **row;
-if (sr && (row = sqlNextRow(sr)))
-    {
-    int bin = 1; // All PSL tables used here, and any made in the future, use the bin column.
-    struct psl *txAli = pslLoad(row+bin);
 // variantPsl contains the anchor if a non-cdsStart anchor is used because
 // the actual position might be outside the bounds of the transcript sequence (intron/UTR)
 struct psl *variantPsl = pslFromHgvsNuc(hgvs, acc, txAli->qSize, txAli->qEnd, cds,
                                         retUpstream, retDownstream);
-    mappedToGenome = pslTransMap(pslTransMapNoOpts, variantPsl, txAli);
+struct psl *mappedToGenome = pslTransMap(pslTransMapNoOpts, variantPsl, txAli);
 // If there is a deletion in the genome / insertion in the transcript then pslTransMap cannot
 // map those bases to the genome.  In that case take a harder look and return the deletion
 // coords.
 if (mappedToGenome == NULL)
     mappedToGenome = mapToDeletion(variantPsl, txAli);
 pslFree(&variantPsl);
-    pslFree(&txAli);
-    }
-sqlFreeResult(&sr);
-hFreeConn(&conn);
 return mappedToGenome;
 }
 
 static char *pslTableForAcc(char *db, char *acc)
-/* Based on acc (and whether db has NCBI RefSeq alignments), pick a PSL table where
- * acc should be found (table may or may not exist).  Don't free the returned string. */
+/* Based on acc (and whether db has NCBI RefSeq alignments), pick a PSL (or genePred for GENCODE)
+ * table where acc should be found (table may or may not exist).  Don't free the returned string. */
 {
 char *pslTable = NULL;
 if (startsWith("LRG_", acc))
     pslTable = "lrgTranscriptAli";
 else if (startsWith("NM_", acc) || startsWith("NR_", acc) ||
          startsWith("XM_", acc) || startsWith("XR_", acc))
     {
     // Use NCBI's alignments if they are available
     if (hDbHasNcbiRefSeq(db))
         pslTable = "ncbiRefSeqPsl";
     else
         pslTable = "refSeqAli";
     }
 return pslTable;
 }
@@ -1593,88 +1755,156 @@
 if (revStrand)
     {
     region->chromStart -= downstream;
     region->chromEnd += upstream;
     }
 else
     {
     region->chromStart -= upstream;
     region->chromEnd += downstream;
     }
 limitToRange(region->chromStart, 0, psl->tSize);
 limitToRange(region->chromEnd, 0, psl->tSize);
 return region;
 }
 
+static struct psl *pslForQName(char *db, char *pslTable, char *acc)
+/* Look up acc in pslTable's qName column; return psl or NULL if not found. */
+{
+struct psl *psl = NULL;
+char query[2048];
+struct sqlConnection *conn = hAllocConn(db);
+char **row;
+sqlSafef(query, sizeof(query), "select * from %s where qName = '%s'", pslTable, acc);
+struct sqlResult *sr = sqlGetResult(conn, query);
+if (sr && (row = sqlNextRow(sr)))
+    {
+    // All PSL tables used here, and any made in the future, use the bin column.
+    int bin = 1;
+    psl = pslLoad(row+bin);
+    }
+sqlFreeResult(&sr);
+hFreeConn(&conn);
+return psl;
+}
+
+static struct psl *getPslAndCds(char *db, struct hgvsVariant *hgvs, char **pAcc,
+                                struct genbankCds *cds, char **retPslTable)
+/* Get PSL and cds (if applicable) for acc. */
+{
+struct psl *txAli = NULL;
+cds->start = cds->end = -1;
+cds->startComplete = cds->endComplete = cds->complement = FALSE;
+char *acc = *pAcc;
+if (startsWith("ENS", acc))
+    {
+    struct genePred *gp = getGencodeGp(db, acc);
+    if (gp)
+        {
+        int qSize = 0; // infer -- no polyAs in Gencode because it's genome-based
+        txAli = genePredToPsl(gp, hChromSize(db, gp->chrom), qSize);
+        genePredToCds(gp, cds);
+        if (retPslTable)
+            {
+            // Well, it's not PSL but it's the track table:
+            struct sqlConnection *conn = hAllocConn(db);
+            *retPslTable = hFindLatestGencodeTableConn(conn, "Basic");
+            hFreeConn(&conn);
+            }
+        }
+    }
+else
+    {
+    char *pslTable = pslTableForAcc(db, acc);
+    if (pslTable && hTableExists(db, pslTable))
+        {
+        if (hgvs->type == hgvstCoding)
+            getCds(db, acc, cds);
+        txAli = pslForQName(db, pslTable, acc);
+        }
+    // As of 9/26/16, ncbiRefSeqPsl is missing some items (#13673#note-443) -- so fall back
+    // on UCSC alignments.
+    if (!txAli && sameString(pslTable, "ncbiRefSeqPsl") && hTableExists(db, "refSeqAli"))
+        {
+        char *accNoVersion = cloneFirstWordByDelimiter(acc, '.');
+        if (hgvs->type == hgvstCoding)
+            getCds(db, accNoVersion, cds);
+        txAli = pslForQName(db, "refSeqAli", accNoVersion);
+        if (txAli)
+            {
+            pslTable = "refSeqAli";
+            *pAcc = accNoVersion;
+            }
+        }
+    if (txAli && retPslTable != NULL)
+        *retPslTable = pslTable;
+    }
+return txAli;
+}
+
 static struct bed *hgvsMapNucToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable)
 /* Return a bed6 with the variant's span on the genome and strand, or NULL if unable to map.
  * If successful and retPslTable is not NULL, set it to the name of the PSL table used. */
 {
 if (hgvs->type == hgvstGenomic)
     return hgvsMapGDotToGenome(db, hgvs, retPslTable);
 char *acc = normalizeVersion(db, hgvs->seqAcc, NULL);
 if (isEmpty(acc))
     return NULL;
 struct bed *region = NULL;
-char *pslTable = pslTableForAcc(db, acc);
+char *pslTable = NULL;
 struct genbankCds cds;
-boolean gotCds = (hgvs->type == hgvstCoding) ? getCds(db, acc, &cds) : FALSE;
-if (pslTable && (hgvs->type != hgvstCoding || gotCds) && hTableExists(db, pslTable))
+struct psl *txAli = getPslAndCds(db, hgvs, &acc, &cds, &pslTable);
+boolean gotCds = (cds.end > cds.start);
+if (txAli && (hgvs->type != hgvstCoding || gotCds))
     {
     int upstream = 0, downstream = 0;
-    struct psl *mappedToGenome = mapPsl(db, hgvs, pslTable, acc, &cds, &upstream, &downstream);
-    // As of 9/26/16, ncbiRefSeqPsl is missing some items (#13673#note-443) -- so fall back
-    // on UCSC alignments.
-    if (!mappedToGenome && sameString(pslTable, "ncbiRefSeqPsl") && hTableExists(db, "refSeqAli"))
-        {
-        char *accNoVersion = cloneFirstWordByDelimiter(acc, '.');
-        gotCds = (hgvs->type == hgvstCoding) ? getCds(db, accNoVersion, &cds) : FALSE;
-        if (hgvs->type != hgvstCoding || gotCds)
-            mappedToGenome = mapPsl(db, hgvs, "refSeqAli", accNoVersion, &cds,
-                                    &upstream, &downstream);
-        if (mappedToGenome)
-            {
-            pslTable = "refSeqAli";
-            acc = accNoVersion;
-            }
-        }
+    struct psl *mappedToGenome = mapPsl(hgvs, acc, txAli, &cds, &upstream, &downstream);
     if (mappedToGenome)
         {
         region = pslAndFriendsToRegion(mappedToGenome, hgvs, upstream, downstream);
         pslFree(&mappedToGenome);
         }
+    pslFree(&txAli);
     }
 if (region)
     {
     adjustInsStartEnd(hgvs, &region->chromStart, &region->chromEnd);
     if (retPslTable)
         *retPslTable = cloneString(pslTable);
     }
 return region;
 }
 
 static struct bed *hgvsMapPDotToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable)
 /* Return a bed6 with the variant's span on the genome and strand, or NULL if unable to map.
  * If successful and retPslTable is not NULL, set it to the name of the PSL table used. */
 {
 char *acc = normalizeVersion(db, hgvs->seqAcc, NULL);
 if (isEmpty(acc))
     return NULL;
 struct bed *region = NULL;
 char *txAcc = NULL;
 if (startsWith("LRG_", acc))
     txAcc = lrgProteinToTx(db, acc);
+else if (startsWith("ENS", acc))
+    {
+    // Look up ENS*T* txAcc for ENS*P* acc
+
+    //#*** TODO when we have a Gencode table analogous to ensGtp
+
+    }
 else if (startsWith("NP_", acc) || startsWith("XP_", acc))
     {
     struct sqlConnection *conn = hAllocConn(db);
     char query[2048];
     if (hDbHasNcbiRefSeq(db))
         sqlSafef(query, sizeof(query), "select mrnaAcc from ncbiRefSeqLink where protAcc = '%s'",
                  acc);
     else if (hTableExists(db, "refGene"))
         sqlSafef(query, sizeof(query), "select mrnaAcc from %s l, refGene r "
                  "where l.protAcc = '%s' and r.name = l.mrnaAcc", refLinkTable, acc);
     else
         return NULL;
     txAcc = sqlQuickString(conn, query);
     hFreeConn(&conn);
     }