31744b6e479fbe5e49b3188e651a328cf3374ed0
angie
  Thu Mar 8 13:40:51 2018 -0800
Added support for ENS*P* accessions in HGVS position search.  Thanks Mark for the new Attrs proteinId column!  refs #21076

diff --git src/hg/lib/hgHgvs.c src/hg/lib/hgHgvs.c
index 44ab4f3..ef8889b 100644
--- src/hg/lib/hgHgvs.c
+++ src/hg/lib/hgHgvs.c
@@ -741,37 +741,116 @@
 return npAcc;
 }
 
 static char *lrgProteinToTx(char *db, char *protAcc)
 /* Return the LRG_ transcript accession for protAcc.  Each LRG_NpM has a corresponding LRG_NtM. */
 {
 int accLen = strlen(protAcc);
 char txAcc[accLen+1];
 safecpy(txAcc, sizeof(txAcc), protAcc);
 char *p = strrchr(txAcc, 'p');
 if (p)
     *p = 't';
 return cloneString(txAcc);
 }
 
+static char *gencodeProteinToTx(char *db, char *protAcc)
+/* Return the ENS*T transcript accession for protAcc, or NULL if not found. */
+{
+char *txAcc = NULL;
+struct sqlConnection *conn = hAllocConn(db);
+char *attrsTable = hFindLatestGencodeTableConn(conn, "Attrs");
+if (attrsTable && hHasField(db, attrsTable, "proteinId"))
+    {
+    char query[2048];
+    sqlSafef(query, sizeof(query), "select transcriptId from %s where proteinId = '%s'",
+             attrsTable, protAcc);
+    txAcc = sqlQuickString(conn, query);
+    }
+hFreeConn(&conn);
+return txAcc;
+}
+
+static struct genePred *getGencodeGp(char *db, char *acc)
+/* Return the genePred for acc in the latest wgEncodeGencodeBasicV* table, or NULL if not found. */
+{
+struct genePred *gp = NULL;
+struct sqlConnection *conn = hAllocConn(db);
+char *gencodeTable = hFindLatestGencodeTableConn(conn, "Basic");
+if (gencodeTable)
+    {
+    int hasBin = 1;
+    char query[2048];
+    sqlSafef(query, sizeof(query), "select * from %s where name = '%s'", gencodeTable, acc);
+    struct sqlResult *sr = sqlGetResult(conn, query);
+    char **row;
+    if ((row = sqlNextRow(sr)) != NULL)
+        gp = genePredExtLoad(row+hasBin, GENEPREDX_NUM_COLS);
+    sqlFreeResult(&sr);
+    }
+hFreeConn(&conn);
+return gp;
+}
+
+static char *txProtFromGp(char *db, struct genePred *gp)
+/* Return a string containing the translated CDS portion of gp using genomic sequence. */
+{
+int cdsLen = 0;
+int i, eCdsStart, eCdsEnd;
+for (i = 0;  i < gp->exonCount;  i++)
+    {
+    if (genePredCdsExon(gp, i, &eCdsStart, &eCdsEnd))
+        cdsLen += (eCdsEnd - eCdsStart);
+    }
+char cdsSeq[cdsLen+1];
+int offset = 0;
+for (i = 0;  i < gp->exonCount;  i++)
+    {
+    if (genePredCdsExon(gp, i, &eCdsStart, &eCdsEnd))
+        {
+        struct dnaSeq *exonSeq = hChromSeq(db, gp->chrom, eCdsStart, eCdsEnd);
+        safencpy(cdsSeq+offset, cdsLen+1-offset, exonSeq->dna, exonSeq->size);
+        offset += exonSeq->size;
+        dnaSeqFree(&exonSeq);
+        }
+    }
+if (gp->strand[0] == '-')
+    reverseComplement(cdsSeq, cdsLen);
+char *seq = needMem(cdsLen+1);
+dnaTranslateSome(cdsSeq, seq, cdsLen+1);
+return seq;
+}
+
 static char *getProteinSeq(char *db, char *acc)
 /* Return amino acid sequence for acc, or NULL if not found. */
 {
 if (trackHubDatabase(db))
     return NULL;
 char *seq = NULL;
-if (startsWith("LRG_", acc))
+if (startsWith("ENS", acc))
+    {
+    char *txAcc = gencodeProteinToTx(db, acc);
+    if (txAcc)
+        {
+        struct genePred *gp = getGencodeGp(db, txAcc);
+        if (gp)
+            seq = txProtFromGp(db, gp);
+        genePredFree(&gp);
+        freeMem(txAcc);
+        }
+    }
+else if (startsWith("LRG_", acc))
     {
     if (hTableExists(db, "lrgPep"))
         {
         char query[2048];
         // lrgPep is indexed by transcript ID not protein ID
         char *txAcc = lrgProteinToTx(db, acc);
         sqlSafef(query, sizeof(query), "select seq from lrgPep where name = '%s'", txAcc);
         struct sqlConnection *conn = hAllocConn(db);
         seq = sqlQuickString(conn, query);
         hFreeConn(&conn);
         freeMem(txAcc);
         }
     }
 else
     {
@@ -779,31 +858,30 @@
         {
         char query[2048];
         sqlSafef(query, sizeof(query), "select seq from ncbiRefSeqPepTable "
                  "where name = '%s'", acc);
         struct sqlConnection *conn = hAllocConn(db);
         seq = sqlQuickString(conn, query);
         hFreeConn(&conn);
         }
     else
         {
         aaSeq *aaSeq = hGenBankGetPep(db, acc, NULL);
         if (aaSeq)
             seq = aaSeq->dna;
         }
     }
-
 return seq;
 }
 
 static char refBaseForNp(char *db, char *npAcc, int pos)
 // Get the amino acid base in NP_'s sequence at 1-based offset pos.
 {
 char *seq = getProteinSeq(db, npAcc);
 char base = seq ? seq[pos-1] : '\0';
 freeMem(seq);
 return base;
 }
 
 struct hgvsVariant *hgvsParsePseudoHgvs(char *db, char *term)
 /* Attempt to parse things that are not strict HGVS, but that people might intend as HGVS: */
 // Note: this doesn't support non-coding gene symbol terms (which should have nt alleles)
@@ -1076,51 +1154,30 @@
 int offset = 0;
 for (i = 0;  i < gp->exonCount;  i++)
     {
     int exonStart = gp->exonStarts[i];
     int exonEnd = gp->exonEnds[i];
     struct dnaSeq *exonSeq = hChromSeq(db, gp->chrom, exonStart, exonEnd);
     safencpy(seq+offset, seqLen+1-offset, exonSeq->dna, exonSeq->size);
     offset += exonSeq->size;
     dnaSeqFree(&exonSeq);
     }
 if (gp->strand[0] == '-')
     reverseComplement(seq, seqLen);
 return seq;
 }
 
-static struct genePred *getGencodeGp(char *db, char *acc)
-/* Return the genePred for acc in the latest wgEncodeGencodeBasicV* table, or NULL if not found. */
-{
-struct genePred *gp = NULL;
-struct sqlConnection *conn = hAllocConn(db);
-char *gencodeTable = hFindLatestGencodeTableConn(conn, "Basic");
-if (gencodeTable)
-    {
-    int hasBin = 1;
-    char query[2048];
-    sqlSafef(query, sizeof(query), "select * from %s where name = '%s'", gencodeTable, acc);
-    struct sqlResult *sr = sqlGetResult(conn, query);
-    char **row;
-    if ((row = sqlNextRow(sr)) != NULL)
-        gp = genePredExtLoad(row+hasBin, GENEPREDX_NUM_COLS);
-    sqlFreeResult(&sr);
-    }
-hFreeConn(&conn);
-return gp;
-}
-
 static char *getGencodeSeq(char *db, char *acc)
 /* Return transcribed-from-genome sequence for acc, or NULL if not found. */
 {
 char *seq = NULL;
 struct genePred *gp = getGencodeGp(db, acc);
 if (gp)
     seq = txSeqFromGp(db, gp);
 return seq;
 }
 
 static char *getCdnaSeq(char *db, char *acc)
 /* Return cdna sequence for acc, or NULL if not found. */
 {
 if (trackHubDatabase(db))
     return NULL;
@@ -1875,36 +1932,31 @@
 return region;
 }
 
 static struct bed *hgvsMapPDotToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable)
 /* Return a bed6 with the variant's span on the genome and strand, or NULL if unable to map.
  * If successful and retPslTable is not NULL, set it to the name of the PSL table used. */
 {
 char *acc = normalizeVersion(db, hgvs->seqAcc, NULL);
 if (isEmpty(acc))
     return NULL;
 struct bed *region = NULL;
 char *txAcc = NULL;
 if (startsWith("LRG_", acc))
     txAcc = lrgProteinToTx(db, acc);
 else if (startsWith("ENS", acc))
-    {
-    // Look up ENS*T* txAcc for ENS*P* acc
-
-    //#*** TODO when we have a Gencode table analogous to ensGtp
-
-    }
+    txAcc = gencodeProteinToTx(db, acc);
 else if (startsWith("NP_", acc) || startsWith("XP_", acc))
     {
     struct sqlConnection *conn = hAllocConn(db);
     char query[2048];
     if (hDbHasNcbiRefSeq(db))
         sqlSafef(query, sizeof(query), "select mrnaAcc from ncbiRefSeqLink where protAcc = '%s'",
                  acc);
     else if (hTableExists(db, "refGene"))
         sqlSafef(query, sizeof(query), "select mrnaAcc from %s l, refGene r "
                  "where l.protAcc = '%s' and r.name = l.mrnaAcc", refLinkTable, acc);
     else
         return NULL;
     txAcc = sqlQuickString(conn, query);
     hFreeConn(&conn);
     }