bbca1ee6f90a46e7a139434849dbf32bd206b522 chmalee Thu Sep 21 12:27:34 2023 -0700 First cut of adding a historical refseq transcripts track, and getting hgvs searching to work with them, refs #26016 diff --git src/hg/lib/hgHgvs.c src/hg/lib/hgHgvs.c index 23a2428..415becd 100644 --- src/hg/lib/hgHgvs.c +++ src/hg/lib/hgHgvs.c @@ -727,30 +727,36 @@ sqlSafef(query, sizeof(query), "select protAcc from ncbiRefSeqLink where id like '%s.%%'", nmAcc); } else if (hTableExists(db, "refGene")) { // Trim .version if present since our genbank tables don't use versioned names. char *trimmed = cloneFirstWordByDelimiter(nmAcc, '.'); sqlSafef(query, sizeof(query), "select l.protAcc from %s l, refGene r " "where r.name = '%s' and l.mrnaAcc = r.name " "and l.protAcc != '' order by length(l.protAcc), l.protAcc", refLinkTable, trimmed); } else return NULL; struct sqlConnection *conn = hAllocConn(db); char *npAcc = sqlQuickString(conn, query); +// if user passed in old versioned transcript, check in *Old tables: +if (npAcc == NULL && hDbHasNcbiRefSeq(db) && strchr(nmAcc, '.')) + { + sqlSafef(query, sizeof(query), "select protAcc from ncbiRefSeqLinkOld where id = '%s'", nmAcc); + npAcc = sqlQuickString(conn, query); + } hFreeConn(&conn); return npAcc; } static char *lrgProteinToTx(char *db, char *protAcc) /* Return the LRG_ transcript accession for protAcc. Each LRG_NpM has a corresponding LRG_NtM. */ { int accLen = strlen(protAcc); char txAcc[accLen+1]; safecpy(txAcc, sizeof(txAcc), protAcc); char *p = strrchr(txAcc, 'p'); if (p) *p = 't'; return cloneString(txAcc); } @@ -851,30 +857,36 @@ struct sqlConnection *conn = hAllocConn(db); seq = sqlQuickString(conn, query); hFreeConn(&conn); freeMem(txAcc); } } else { if (hDbHasNcbiRefSeq(db)) { char query[2048]; sqlSafef(query, sizeof(query), "select seq from ncbiRefSeqPepTable " "where name = '%s'", acc); struct sqlConnection *conn = hAllocConn(db); seq = sqlQuickString(conn, query); + if (seq == NULL) + { + sqlSafef(query, sizeof(query), "select seq from ncbiRefSeqPepTableOld " + "where name = '%s'", acc); + seq = sqlQuickString(conn, query); + } hFreeConn(&conn); } else { aaSeq *aaSeq = hGenBankGetPep(db, acc, NULL); if (aaSeq) seq = aaSeq->dna; } } return seq; } static char refBaseForNp(char *db, char *npAcc, int pos) // Get the amino acid base in NP_'s sequence at 1-based offset pos. { @@ -1204,31 +1216,35 @@ sqlSafef(query, sizeof(query), "select seq from lrgCdna where name = '%s'", acc); struct sqlConnection *conn = hAllocConn(db); seq = sqlQuickString(conn, query); hFreeConn(&conn); } } else if (startsWith("ENS", acc )) { // Construct it from the genome I guess? seq = getGencodeSeq(db, acc); } else { struct dnaSeq *cdnaSeq = NULL; if (hDbHasNcbiRefSeq(db)) + { cdnaSeq = hDnaSeqGet(db, acc, "seqNcbiRefSeq", "extNcbiRefSeq"); + if (cdnaSeq == NULL) + cdnaSeq = hDnaSeqGet(db, acc, "seqNcbiRefSeqOld", "extNcbiRefSeqOld"); + } else cdnaSeq = hGenBankGetMrna(db, acc, NULL); if (cdnaSeq) seq = dnaSeqCannibalize(&cdnaSeq); } return seq; } static boolean getCds(char *db, char *acc, struct genbankCds *retCds) /* Get the CDS info for genbank/LRG/ENS acc; return FALSE if not found or not applicable. */ { if (trackHubDatabase(db)) return FALSE; boolean gotCds = FALSE; char *cdsStr = NULL; @@ -1248,30 +1264,36 @@ if (startsWith("LRG_", acc)) sqlSafef(query, sizeof(query), "select cds from lrgCds where id = '%s'", acc); else if (hDbHasNcbiRefSeq(db) && // This is a hack to allow us to fall back on refSeqAli if ncbiRefSeqPsl is incomplete: strchr(acc, '.')) sqlSafef(query, sizeof(query), "select cds from ncbiRefSeqCds where id = '%s'", acc); else sqlSafef(query, sizeof(query), "SELECT c.name FROM %s as c, %s as g WHERE (g.acc = '%s') AND " "(g.cds != 0) AND (g.cds = c.id)", cdsTable, gbCdnaInfoTable, acc); struct sqlConnection *conn = hAllocConn(db); char cdsBuf[2048]; cdsStr = sqlQuickQuery(conn, query, cdsBuf, sizeof(cdsBuf)); + if (isEmpty(cdsStr) && strchr(acc, '.')) + { + sqlSafef(query, sizeof(query), + "select cds from ncbiRefSeqCdsOld where id = '%s'", acc); + cdsStr = sqlQuickQuery(conn, query, cdsBuf, sizeof(cdsBuf)); + } hFreeConn(&conn); } if (isNotEmpty(cdsStr)) gotCds = (genbankCdsParse(cdsStr, retCds) && retCds->startComplete && retCds->start != retCds->end); return gotCds; } static char refBaseFromProt(char *change) /* If change starts with an amino acid 3-letter or 1-letter code then return the 1-letter code, * else null char. */ { regmatch_t substrs[1]; if (regexMatchSubstr(change, "^" hgvsAminoAcidExp, substrs, ArraySize(substrs))) { @@ -1880,44 +1902,54 @@ struct sqlConnection *conn = hAllocConn(db); *retPslTable = hFindLatestGencodeTableConn(conn, "Comp"); hFreeConn(&conn); } } } else { char *pslTable = pslTableForAcc(db, acc); if (pslTable && hTableExists(db, pslTable)) { if (hgvs->type == hgvstCoding) getCds(db, acc, cds); txAli = pslForQName(db, pslTable, acc); } + // try the old alignments if we can't find it in the current alignments + if (!txAli && sameString(pslTable, "ncbiRefSeqPsl")) + { + if (hTableExists(db, "ncbiRefSeqPslOld")) + { + txAli = pslForQName(db, "ncbiRefSeqPslOld", acc); + if (txAli) + pslTable = "ncbiRefSeqPslOld"; + } // As of 9/26/16, ncbiRefSeqPsl is missing some items (#13673#note-443) -- so fall back // on UCSC alignments. - if (!txAli && sameString(pslTable, "ncbiRefSeqPsl") && hTableExists(db, "refSeqAli")) + if (!txAli && hTableExists(db, "refSeqAli")) { char *accNoVersion = cloneFirstWordByDelimiter(acc, '.'); if (hgvs->type == hgvstCoding) getCds(db, accNoVersion, cds); txAli = pslForQName(db, "refSeqAli", accNoVersion); if (txAli) { pslTable = "refSeqAli"; *pAcc = accNoVersion; } } + } if (txAli && retPslTable != NULL) *retPslTable = pslTable; } return txAli; } static struct bed *hgvsMapNucToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable, boolean mustBeCds) /* Return a bed6 with the variant's span on the genome and strand, or NULL if unable to map. * If mustBeCds, but we can't find cds or the variant coords are outside of it, return NULL. * If successful and retPslTable is not NULL, set it to the name of the PSL table used. */ { if (hgvs->type == hgvstGenomic) return hgvsMapGDotToGenome(db, hgvs, retPslTable); char *acc = normalizeVersion(db, hgvs->seqAcc, NULL); @@ -1957,38 +1989,50 @@ { char *acc = normalizeVersion(db, hgvs->seqAcc, NULL); if (isEmpty(acc)) return NULL; struct bed *region = NULL; char *txAcc = NULL; if (startsWith("LRG_", acc)) txAcc = lrgProteinToTx(db, acc); else if (startsWith("ENS", acc)) txAcc = gencodeProteinToTx(db, acc); else if (startsWith("NP_", acc) || startsWith("XP_", acc)) { struct sqlConnection *conn = hAllocConn(db); char query[2048]; if (hDbHasNcbiRefSeq(db)) + { sqlSafef(query, sizeof(query), "select mrnaAcc from ncbiRefSeqLink where protAcc = '%s'", acc); + txAcc = sqlQuickString(conn, query); + // user may have passed previous versioned transcript, check the *Old tables: + if (!txAcc) + { + sqlSafef(query, sizeof(query), "select mrnaAcc from ncbiRefSeqLinkOld where protAcc = '%s'", + acc); + txAcc = sqlQuickString(conn, query); + } + } else if (hTableExists(db, "refGene")) + { sqlSafef(query, sizeof(query), "select mrnaAcc from %s l, refGene r " "where l.protAcc = '%s' and r.name = l.mrnaAcc", refLinkTable, acc); + txAcc = sqlQuickString(conn, query); + } else return NULL; - txAcc = sqlQuickString(conn, query); hFreeConn(&conn); } if (txAcc) { // Translate the p. to c. and map c. to the genome. struct hgvsVariant cDot; zeroBytes(&cDot, sizeof(cDot)); cDot.type = hgvstCoding; cDot.seqAcc = txAcc; cDot.start1 = ((hgvs->start1 - 1) * 3) + 1; cDot.end = ((hgvs->end - 1) * 3) + 3; cDot.changes = hgvs->changes; region = hgvsMapNucToGenome(db, &cDot, retPslTable, TRUE); freeMem(txAcc); }