bbca1ee6f90a46e7a139434849dbf32bd206b522
chmalee
  Thu Sep 21 12:27:34 2023 -0700
First cut of adding a historical refseq transcripts track, and getting hgvs searching to work with them, refs #26016

diff --git src/hg/lib/hgHgvs.c src/hg/lib/hgHgvs.c
index 23a2428..415becd 100644
--- src/hg/lib/hgHgvs.c
+++ src/hg/lib/hgHgvs.c
@@ -727,30 +727,36 @@
         sqlSafef(query, sizeof(query), "select protAcc from ncbiRefSeqLink where id like '%s.%%'",
                  nmAcc);
     }
 else if (hTableExists(db, "refGene"))
     {
     // Trim .version if present since our genbank tables don't use versioned names.
     char *trimmed = cloneFirstWordByDelimiter(nmAcc, '.');
     sqlSafef(query, sizeof(query), "select l.protAcc from %s l, refGene r "
              "where r.name = '%s' and l.mrnaAcc = r.name "
              "and l.protAcc != '' order by length(l.protAcc), l.protAcc",
              refLinkTable, trimmed);
     }
 else return NULL;
 struct sqlConnection *conn = hAllocConn(db);
 char *npAcc = sqlQuickString(conn, query);
+// if user passed in old versioned transcript, check in *Old tables:
+if (npAcc == NULL && hDbHasNcbiRefSeq(db) && strchr(nmAcc, '.'))
+    {
+    sqlSafef(query, sizeof(query), "select protAcc from ncbiRefSeqLinkOld where id = '%s'", nmAcc);
+    npAcc = sqlQuickString(conn, query);
+    }
 hFreeConn(&conn);
 return npAcc;
 }
 
 static char *lrgProteinToTx(char *db, char *protAcc)
 /* Return the LRG_ transcript accession for protAcc.  Each LRG_NpM has a corresponding LRG_NtM. */
 {
 int accLen = strlen(protAcc);
 char txAcc[accLen+1];
 safecpy(txAcc, sizeof(txAcc), protAcc);
 char *p = strrchr(txAcc, 'p');
 if (p)
     *p = 't';
 return cloneString(txAcc);
 }
@@ -851,30 +857,36 @@
         struct sqlConnection *conn = hAllocConn(db);
         seq = sqlQuickString(conn, query);
         hFreeConn(&conn);
         freeMem(txAcc);
         }
     }
 else
     {
     if (hDbHasNcbiRefSeq(db))
         {
         char query[2048];
         sqlSafef(query, sizeof(query), "select seq from ncbiRefSeqPepTable "
                  "where name = '%s'", acc);
         struct sqlConnection *conn = hAllocConn(db);
         seq = sqlQuickString(conn, query);
+        if (seq == NULL)
+            {
+            sqlSafef(query, sizeof(query), "select seq from ncbiRefSeqPepTableOld "
+                     "where name = '%s'", acc);
+            seq = sqlQuickString(conn, query);
+            }
         hFreeConn(&conn);
         }
     else
         {
         aaSeq *aaSeq = hGenBankGetPep(db, acc, NULL);
         if (aaSeq)
             seq = aaSeq->dna;
         }
     }
 return seq;
 }
 
 static char refBaseForNp(char *db, char *npAcc, int pos)
 // Get the amino acid base in NP_'s sequence at 1-based offset pos.
 {
@@ -1204,31 +1216,35 @@
         sqlSafef(query, sizeof(query), "select seq from lrgCdna where name = '%s'", acc);
         struct sqlConnection *conn = hAllocConn(db);
         seq = sqlQuickString(conn, query);
         hFreeConn(&conn);
         }
     }
 else if (startsWith("ENS", acc ))
     {
     // Construct it from the genome I guess?
     seq = getGencodeSeq(db, acc);
     }
 else
     {
     struct dnaSeq *cdnaSeq = NULL;
     if (hDbHasNcbiRefSeq(db))
+        {
         cdnaSeq = hDnaSeqGet(db, acc, "seqNcbiRefSeq", "extNcbiRefSeq");
+        if (cdnaSeq == NULL)
+            cdnaSeq = hDnaSeqGet(db, acc, "seqNcbiRefSeqOld", "extNcbiRefSeqOld");
+        }
     else
         cdnaSeq = hGenBankGetMrna(db, acc, NULL);
     if (cdnaSeq)
         seq = dnaSeqCannibalize(&cdnaSeq);
     }
 return seq;
 }
 
 static boolean getCds(char *db, char *acc, struct genbankCds *retCds)
 /* Get the CDS info for genbank/LRG/ENS acc; return FALSE if not found or not applicable. */
 {
 if (trackHubDatabase(db))
     return FALSE;
 boolean gotCds = FALSE;
 char *cdsStr = NULL;
@@ -1248,30 +1264,36 @@
     if (startsWith("LRG_", acc))
         sqlSafef(query, sizeof(query),
                  "select cds from lrgCds where id = '%s'", acc);
     else if (hDbHasNcbiRefSeq(db) &&
              // This is a hack to allow us to fall back on refSeqAli if ncbiRefSeqPsl is incomplete:
              strchr(acc, '.'))
         sqlSafef(query, sizeof(query),
                  "select cds from ncbiRefSeqCds where id = '%s'", acc);
     else
         sqlSafef(query, sizeof(query),
                  "SELECT c.name FROM %s as c, %s as g WHERE (g.acc = '%s') AND "
                  "(g.cds != 0) AND (g.cds = c.id)", cdsTable, gbCdnaInfoTable, acc);
     struct sqlConnection *conn = hAllocConn(db);
     char cdsBuf[2048];
     cdsStr = sqlQuickQuery(conn, query, cdsBuf, sizeof(cdsBuf));
+    if (isEmpty(cdsStr) && strchr(acc, '.'))
+        {
+        sqlSafef(query, sizeof(query),
+                 "select cds from ncbiRefSeqCdsOld where id = '%s'", acc);
+        cdsStr = sqlQuickQuery(conn, query, cdsBuf, sizeof(cdsBuf));
+        }
     hFreeConn(&conn);
     }
 if (isNotEmpty(cdsStr))
     gotCds = (genbankCdsParse(cdsStr, retCds) &&
               retCds->startComplete && retCds->start != retCds->end);
 return gotCds;
 }
 
 static char refBaseFromProt(char *change)
 /* If change starts with an amino acid 3-letter or 1-letter code then return the 1-letter code,
  * else null char. */
 {
 regmatch_t substrs[1];
 if (regexMatchSubstr(change, "^" hgvsAminoAcidExp, substrs, ArraySize(substrs)))
     {
@@ -1880,44 +1902,54 @@
             struct sqlConnection *conn = hAllocConn(db);
             *retPslTable = hFindLatestGencodeTableConn(conn, "Comp");
             hFreeConn(&conn);
             }
         }
     }
 else
     {
     char *pslTable = pslTableForAcc(db, acc);
     if (pslTable && hTableExists(db, pslTable))
         {
         if (hgvs->type == hgvstCoding)
             getCds(db, acc, cds);
         txAli = pslForQName(db, pslTable, acc);
         }
+    // try the old alignments if we can't find it in the current alignments
+    if (!txAli && sameString(pslTable, "ncbiRefSeqPsl"))
+        {
+        if (hTableExists(db, "ncbiRefSeqPslOld"))
+            {
+            txAli = pslForQName(db, "ncbiRefSeqPslOld", acc);
+            if (txAli)
+                pslTable = "ncbiRefSeqPslOld";
+            }
         // As of 9/26/16, ncbiRefSeqPsl is missing some items (#13673#note-443) -- so fall back
         // on UCSC alignments.
-    if (!txAli && sameString(pslTable, "ncbiRefSeqPsl") && hTableExists(db, "refSeqAli"))
+        if (!txAli && hTableExists(db, "refSeqAli"))
             {
             char *accNoVersion = cloneFirstWordByDelimiter(acc, '.');
             if (hgvs->type == hgvstCoding)
                 getCds(db, accNoVersion, cds);
             txAli = pslForQName(db, "refSeqAli", accNoVersion);
             if (txAli)
                 {
                 pslTable = "refSeqAli";
                 *pAcc = accNoVersion;
                 }
             }
+        }
     if (txAli && retPslTable != NULL)
         *retPslTable = pslTable;
     }
 return txAli;
 }
 
 static struct bed *hgvsMapNucToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable,
                                       boolean mustBeCds)
 /* Return a bed6 with the variant's span on the genome and strand, or NULL if unable to map.
  * If mustBeCds, but we can't find cds or the variant coords are outside of it, return NULL.
  * If successful and retPslTable is not NULL, set it to the name of the PSL table used. */
 {
 if (hgvs->type == hgvstGenomic)
     return hgvsMapGDotToGenome(db, hgvs, retPslTable);
 char *acc = normalizeVersion(db, hgvs->seqAcc, NULL);
@@ -1957,38 +1989,50 @@
 {
 char *acc = normalizeVersion(db, hgvs->seqAcc, NULL);
 if (isEmpty(acc))
     return NULL;
 struct bed *region = NULL;
 char *txAcc = NULL;
 if (startsWith("LRG_", acc))
     txAcc = lrgProteinToTx(db, acc);
 else if (startsWith("ENS", acc))
     txAcc = gencodeProteinToTx(db, acc);
 else if (startsWith("NP_", acc) || startsWith("XP_", acc))
     {
     struct sqlConnection *conn = hAllocConn(db);
     char query[2048];
     if (hDbHasNcbiRefSeq(db))
+        {
         sqlSafef(query, sizeof(query), "select mrnaAcc from ncbiRefSeqLink where protAcc = '%s'",
                  acc);
+        txAcc = sqlQuickString(conn, query);
+        // user may have passed previous versioned transcript, check the *Old tables:
+        if (!txAcc)
+            {
+            sqlSafef(query, sizeof(query), "select mrnaAcc from ncbiRefSeqLinkOld where protAcc = '%s'",
+                     acc);
+            txAcc = sqlQuickString(conn, query);
+            }
+        }
     else if (hTableExists(db, "refGene"))
+        {
         sqlSafef(query, sizeof(query), "select mrnaAcc from %s l, refGene r "
                  "where l.protAcc = '%s' and r.name = l.mrnaAcc", refLinkTable, acc);
+        txAcc = sqlQuickString(conn, query);
+        }
     else
         return NULL;
-    txAcc = sqlQuickString(conn, query);
     hFreeConn(&conn);
     }
 if (txAcc)
     {
     // Translate the p. to c. and map c. to the genome.
     struct hgvsVariant cDot;
     zeroBytes(&cDot, sizeof(cDot));
     cDot.type = hgvstCoding;
     cDot.seqAcc = txAcc;
     cDot.start1 = ((hgvs->start1 - 1) * 3) + 1;
     cDot.end = ((hgvs->end - 1) * 3) + 3;
     cDot.changes = hgvs->changes;
     region = hgvsMapNucToGenome(db, &cDot, retPslTable, TRUE);
     freeMem(txAcc);
     }