62ccb95d5c71a3bf7658327e6db2bc35c19d1d71
chmalee
  Wed Oct 11 09:57:36 2023 -0700
Adding back hgvs search support for historical refseq transcripts and
historical refSeq transcripts track. Should be more resilient code this
time.

diff --git src/hg/lib/hgHgvs.c src/hg/lib/hgHgvs.c
index 23a2428..6822c6f 100644
--- src/hg/lib/hgHgvs.c
+++ src/hg/lib/hgHgvs.c
@@ -727,30 +727,36 @@
         sqlSafef(query, sizeof(query), "select protAcc from ncbiRefSeqLink where id like '%s.%%'",
                  nmAcc);
     }
 else if (hTableExists(db, "refGene"))
     {
     // Trim .version if present since our genbank tables don't use versioned names.
     char *trimmed = cloneFirstWordByDelimiter(nmAcc, '.');
     sqlSafef(query, sizeof(query), "select l.protAcc from %s l, refGene r "
              "where r.name = '%s' and l.mrnaAcc = r.name "
              "and l.protAcc != '' order by length(l.protAcc), l.protAcc",
              refLinkTable, trimmed);
     }
 else return NULL;
 struct sqlConnection *conn = hAllocConn(db);
 char *npAcc = sqlQuickString(conn, query);
+// if user passed in old versioned transcript, check in *Old tables:
+if (npAcc == NULL && hDbHasNcbiRefSeqHistorical(db) && strchr(nmAcc, '.'))
+    {
+    sqlSafef(query, sizeof(query), "select protAcc from ncbiRefSeqLinkHistorical where id = '%s'", nmAcc);
+    npAcc = sqlQuickString(conn, query);
+    }
 hFreeConn(&conn);
 return npAcc;
 }
 
 static char *lrgProteinToTx(char *db, char *protAcc)
 /* Return the LRG_ transcript accession for protAcc.  Each LRG_NpM has a corresponding LRG_NtM. */
 {
 int accLen = strlen(protAcc);
 char txAcc[accLen+1];
 safecpy(txAcc, sizeof(txAcc), protAcc);
 char *p = strrchr(txAcc, 'p');
 if (p)
     *p = 't';
 return cloneString(txAcc);
 }
@@ -1204,31 +1210,35 @@
         sqlSafef(query, sizeof(query), "select seq from lrgCdna where name = '%s'", acc);
         struct sqlConnection *conn = hAllocConn(db);
         seq = sqlQuickString(conn, query);
         hFreeConn(&conn);
         }
     }
 else if (startsWith("ENS", acc ))
     {
     // Construct it from the genome I guess?
     seq = getGencodeSeq(db, acc);
     }
 else
     {
     struct dnaSeq *cdnaSeq = NULL;
     if (hDbHasNcbiRefSeq(db))
+        {
         cdnaSeq = hDnaSeqGet(db, acc, "seqNcbiRefSeq", "extNcbiRefSeq");
+        if (cdnaSeq == NULL && hDbHasNcbiRefSeqHistorical(db))
+            cdnaSeq = hDnaSeqGet(db, acc, "seqNcbiRefSeqHistorical", "extNcbiRefSeqHistorical");
+        }
     else
         cdnaSeq = hGenBankGetMrna(db, acc, NULL);
     if (cdnaSeq)
         seq = dnaSeqCannibalize(&cdnaSeq);
     }
 return seq;
 }
 
 static boolean getCds(char *db, char *acc, struct genbankCds *retCds)
 /* Get the CDS info for genbank/LRG/ENS acc; return FALSE if not found or not applicable. */
 {
 if (trackHubDatabase(db))
     return FALSE;
 boolean gotCds = FALSE;
 char *cdsStr = NULL;
@@ -1248,30 +1258,36 @@
     if (startsWith("LRG_", acc))
         sqlSafef(query, sizeof(query),
                  "select cds from lrgCds where id = '%s'", acc);
     else if (hDbHasNcbiRefSeq(db) &&
              // This is a hack to allow us to fall back on refSeqAli if ncbiRefSeqPsl is incomplete:
              strchr(acc, '.'))
         sqlSafef(query, sizeof(query),
                  "select cds from ncbiRefSeqCds where id = '%s'", acc);
     else
         sqlSafef(query, sizeof(query),
                  "SELECT c.name FROM %s as c, %s as g WHERE (g.acc = '%s') AND "
                  "(g.cds != 0) AND (g.cds = c.id)", cdsTable, gbCdnaInfoTable, acc);
     struct sqlConnection *conn = hAllocConn(db);
     char cdsBuf[2048];
     cdsStr = sqlQuickQuery(conn, query, cdsBuf, sizeof(cdsBuf));
+    if (isEmpty(cdsStr) && strchr(acc, '.') && hDbHasNcbiRefSeqHistorical(db))
+        {
+        sqlSafef(query, sizeof(query),
+                 "select cds from ncbiRefSeqCdsHistorical where id = '%s'", acc);
+        cdsStr = sqlQuickQuery(conn, query, cdsBuf, sizeof(cdsBuf));
+        }
     hFreeConn(&conn);
     }
 if (isNotEmpty(cdsStr))
     gotCds = (genbankCdsParse(cdsStr, retCds) &&
               retCds->startComplete && retCds->start != retCds->end);
 return gotCds;
 }
 
 static char refBaseFromProt(char *change)
 /* If change starts with an amino acid 3-letter or 1-letter code then return the 1-letter code,
  * else null char. */
 {
 regmatch_t substrs[1];
 if (regexMatchSubstr(change, "^" hgvsAminoAcidExp, substrs, ArraySize(substrs)))
     {
@@ -1331,60 +1347,68 @@
 if (trackHubDatabase(db))
     return 0;
 char query[2048];
 sqlSafef(query, sizeof(query), "select version from %s where acc = '%s'", gbSeqTable, acc);
 struct sqlConnection *conn = hAllocConn(db);
 int version = sqlQuickNum(conn, query);
 hFreeConn(&conn);
 return version;
 }
 
 static char *normalizeVersion(char *db, char *acc, int *retFoundVersion)
 /* LRG accessions don't have versions, so just return the same acc and set *retFoundVersion to 0.
  * ENS accessions must have versions.
  * The user may give us a RefSeq accession with or without a .version.
  * If ncbiRefSeq tables are present, return acc with version, looking up version if necessary.
- * If we look it up and can't find it, this returns NULL.
+ * If we look it up and can't find it, check the *Old tables for the refSeqHistorical
+ *     track. If we can't find it there, this returns NULL.
  * If instead we're using genbank tables, return acc with no version.
  * That ensures that acc will be found in our local tables. */
 {
 if (trackHubDatabase(db))
     return NULL;
 char *normalizedAcc = NULL;
 int foundVersion = 0;
 if (startsWith("LRG_", acc))
     {
     normalizedAcc = cloneString(acc);
     }
 else if (startsWith("ENS", acc))
     {
     normalizedAcc = cloneString(acc);
     char *p = strchr(acc, '.');
     if (p)
         foundVersion = atoi(p+1);
     }
 else if (hDbHasNcbiRefSeq(db))
     {
     // ncbiRefSeq tables need versioned accessions.
     if (strchr(acc, '.'))
         normalizedAcc = cloneString(acc);
     else
         {
         char query[2048];
         sqlSafef(query, sizeof(query), "select name from ncbiRefSeq where name like '%s.%%'", acc);
         struct sqlConnection *conn = hAllocConn(db);
         normalizedAcc = sqlQuickString(conn, query);
+        if (isNotEmpty(normalizedAcc) && hDbHasNcbiRefSeqHistorical(db))
+            {
+            // maybe it is a deprecated transcript
+            sqlSafef(query, sizeof(query), "select name from ncbiRefSeqHistorical where name like '%s.%%'", acc);
+            struct sqlConnection *conn = hAllocConn(db);
+            normalizedAcc = sqlQuickString(conn, query);
+            }
         hFreeConn(&conn);
         }
     if (isNotEmpty(normalizedAcc))
         {
         char *p = strchr(normalizedAcc, '.');
         assert(p);
         foundVersion = atoi(p+1);
         }
     }
 else
     {
     // genbank tables -- no version
     normalizedAcc = cloneFirstWordByDelimiter(acc, '.');
     foundVersion = getGbVersion(db, normalizedAcc);
     }
@@ -1880,44 +1904,54 @@
             struct sqlConnection *conn = hAllocConn(db);
             *retPslTable = hFindLatestGencodeTableConn(conn, "Comp");
             hFreeConn(&conn);
             }
         }
     }
 else
     {
     char *pslTable = pslTableForAcc(db, acc);
     if (pslTable && hTableExists(db, pslTable))
         {
         if (hgvs->type == hgvstCoding)
             getCds(db, acc, cds);
         txAli = pslForQName(db, pslTable, acc);
         }
+    // try the old alignments if we can't find it in the current alignments
+    if (!txAli && sameString(pslTable, "ncbiRefSeqPsl"))
+        {
+        if (hDbHasNcbiRefSeqHistorical(db))
+            {
+            txAli = pslForQName(db, "ncbiRefSeqPslHistorical", acc);
+            if (txAli)
+                pslTable = "ncbiRefSeqPslHistorical";
+            }
         // As of 9/26/16, ncbiRefSeqPsl is missing some items (#13673#note-443) -- so fall back
         // on UCSC alignments.
-    if (!txAli && sameString(pslTable, "ncbiRefSeqPsl") && hTableExists(db, "refSeqAli"))
+        if (!txAli && hTableExists(db, "refSeqAli"))
             {
             char *accNoVersion = cloneFirstWordByDelimiter(acc, '.');
             if (hgvs->type == hgvstCoding)
                 getCds(db, accNoVersion, cds);
             txAli = pslForQName(db, "refSeqAli", accNoVersion);
             if (txAli)
                 {
                 pslTable = "refSeqAli";
                 *pAcc = accNoVersion;
                 }
             }
+        }
     if (txAli && retPslTable != NULL)
         *retPslTable = pslTable;
     }
 return txAli;
 }
 
 static struct bed *hgvsMapNucToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable,
                                       boolean mustBeCds)
 /* Return a bed6 with the variant's span on the genome and strand, or NULL if unable to map.
  * If mustBeCds, but we can't find cds or the variant coords are outside of it, return NULL.
  * If successful and retPslTable is not NULL, set it to the name of the PSL table used. */
 {
 if (hgvs->type == hgvstGenomic)
     return hgvsMapGDotToGenome(db, hgvs, retPslTable);
 char *acc = normalizeVersion(db, hgvs->seqAcc, NULL);
@@ -1957,38 +1991,50 @@
 {
 char *acc = normalizeVersion(db, hgvs->seqAcc, NULL);
 if (isEmpty(acc))
     return NULL;
 struct bed *region = NULL;
 char *txAcc = NULL;
 if (startsWith("LRG_", acc))
     txAcc = lrgProteinToTx(db, acc);
 else if (startsWith("ENS", acc))
     txAcc = gencodeProteinToTx(db, acc);
 else if (startsWith("NP_", acc) || startsWith("XP_", acc))
     {
     struct sqlConnection *conn = hAllocConn(db);
     char query[2048];
     if (hDbHasNcbiRefSeq(db))
+        {
         sqlSafef(query, sizeof(query), "select mrnaAcc from ncbiRefSeqLink where protAcc = '%s'",
                  acc);
+        txAcc = sqlQuickString(conn, query);
+        // user may have passed previous versioned transcript, check the *Old tables:
+        if (!txAcc && hDbHasNcbiRefSeqHistorical(db))
+            {
+            sqlSafef(query, sizeof(query), "select mrnaAcc from ncbiRefSeqLinkHistorical where protAcc = '%s'",
+                     acc);
+            txAcc = sqlQuickString(conn, query);
+            }
+        }
     else if (hTableExists(db, "refGene"))
+        {
         sqlSafef(query, sizeof(query), "select mrnaAcc from %s l, refGene r "
                  "where l.protAcc = '%s' and r.name = l.mrnaAcc", refLinkTable, acc);
+        txAcc = sqlQuickString(conn, query);
+        }
     else
         return NULL;
-    txAcc = sqlQuickString(conn, query);
     hFreeConn(&conn);
     }
 if (txAcc)
     {
     // Translate the p. to c. and map c. to the genome.
     struct hgvsVariant cDot;
     zeroBytes(&cDot, sizeof(cDot));
     cDot.type = hgvstCoding;
     cDot.seqAcc = txAcc;
     cDot.start1 = ((hgvs->start1 - 1) * 3) + 1;
     cDot.end = ((hgvs->end - 1) * 3) + 3;
     cDot.changes = hgvs->changes;
     region = hgvsMapNucToGenome(db, &cDot, retPslTable, TRUE);
     freeMem(txAcc);
     }