62ccb95d5c71a3bf7658327e6db2bc35c19d1d71 chmalee Wed Oct 11 09:57:36 2023 -0700 Adding back hgvs search support for historical refseq transcripts and historical refSeq transcripts track. Should be more resilient code this time. diff --git src/hg/lib/hgHgvs.c src/hg/lib/hgHgvs.c index 23a2428..6822c6f 100644 --- src/hg/lib/hgHgvs.c +++ src/hg/lib/hgHgvs.c @@ -727,30 +727,36 @@ sqlSafef(query, sizeof(query), "select protAcc from ncbiRefSeqLink where id like '%s.%%'", nmAcc); } else if (hTableExists(db, "refGene")) { // Trim .version if present since our genbank tables don't use versioned names. char *trimmed = cloneFirstWordByDelimiter(nmAcc, '.'); sqlSafef(query, sizeof(query), "select l.protAcc from %s l, refGene r " "where r.name = '%s' and l.mrnaAcc = r.name " "and l.protAcc != '' order by length(l.protAcc), l.protAcc", refLinkTable, trimmed); } else return NULL; struct sqlConnection *conn = hAllocConn(db); char *npAcc = sqlQuickString(conn, query); +// if user passed in old versioned transcript, check in *Old tables: +if (npAcc == NULL && hDbHasNcbiRefSeqHistorical(db) && strchr(nmAcc, '.')) + { + sqlSafef(query, sizeof(query), "select protAcc from ncbiRefSeqLinkHistorical where id = '%s'", nmAcc); + npAcc = sqlQuickString(conn, query); + } hFreeConn(&conn); return npAcc; } static char *lrgProteinToTx(char *db, char *protAcc) /* Return the LRG_ transcript accession for protAcc. Each LRG_NpM has a corresponding LRG_NtM. */ { int accLen = strlen(protAcc); char txAcc[accLen+1]; safecpy(txAcc, sizeof(txAcc), protAcc); char *p = strrchr(txAcc, 'p'); if (p) *p = 't'; return cloneString(txAcc); } @@ -1204,31 +1210,35 @@ sqlSafef(query, sizeof(query), "select seq from lrgCdna where name = '%s'", acc); struct sqlConnection *conn = hAllocConn(db); seq = sqlQuickString(conn, query); hFreeConn(&conn); } } else if (startsWith("ENS", acc )) { // Construct it from the genome I guess? seq = getGencodeSeq(db, acc); } else { struct dnaSeq *cdnaSeq = NULL; if (hDbHasNcbiRefSeq(db)) + { cdnaSeq = hDnaSeqGet(db, acc, "seqNcbiRefSeq", "extNcbiRefSeq"); + if (cdnaSeq == NULL && hDbHasNcbiRefSeqHistorical(db)) + cdnaSeq = hDnaSeqGet(db, acc, "seqNcbiRefSeqHistorical", "extNcbiRefSeqHistorical"); + } else cdnaSeq = hGenBankGetMrna(db, acc, NULL); if (cdnaSeq) seq = dnaSeqCannibalize(&cdnaSeq); } return seq; } static boolean getCds(char *db, char *acc, struct genbankCds *retCds) /* Get the CDS info for genbank/LRG/ENS acc; return FALSE if not found or not applicable. */ { if (trackHubDatabase(db)) return FALSE; boolean gotCds = FALSE; char *cdsStr = NULL; @@ -1248,30 +1258,36 @@ if (startsWith("LRG_", acc)) sqlSafef(query, sizeof(query), "select cds from lrgCds where id = '%s'", acc); else if (hDbHasNcbiRefSeq(db) && // This is a hack to allow us to fall back on refSeqAli if ncbiRefSeqPsl is incomplete: strchr(acc, '.')) sqlSafef(query, sizeof(query), "select cds from ncbiRefSeqCds where id = '%s'", acc); else sqlSafef(query, sizeof(query), "SELECT c.name FROM %s as c, %s as g WHERE (g.acc = '%s') AND " "(g.cds != 0) AND (g.cds = c.id)", cdsTable, gbCdnaInfoTable, acc); struct sqlConnection *conn = hAllocConn(db); char cdsBuf[2048]; cdsStr = sqlQuickQuery(conn, query, cdsBuf, sizeof(cdsBuf)); + if (isEmpty(cdsStr) && strchr(acc, '.') && hDbHasNcbiRefSeqHistorical(db)) + { + sqlSafef(query, sizeof(query), + "select cds from ncbiRefSeqCdsHistorical where id = '%s'", acc); + cdsStr = sqlQuickQuery(conn, query, cdsBuf, sizeof(cdsBuf)); + } hFreeConn(&conn); } if (isNotEmpty(cdsStr)) gotCds = (genbankCdsParse(cdsStr, retCds) && retCds->startComplete && retCds->start != retCds->end); return gotCds; } static char refBaseFromProt(char *change) /* If change starts with an amino acid 3-letter or 1-letter code then return the 1-letter code, * else null char. */ { regmatch_t substrs[1]; if (regexMatchSubstr(change, "^" hgvsAminoAcidExp, substrs, ArraySize(substrs))) { @@ -1331,60 +1347,68 @@ if (trackHubDatabase(db)) return 0; char query[2048]; sqlSafef(query, sizeof(query), "select version from %s where acc = '%s'", gbSeqTable, acc); struct sqlConnection *conn = hAllocConn(db); int version = sqlQuickNum(conn, query); hFreeConn(&conn); return version; } static char *normalizeVersion(char *db, char *acc, int *retFoundVersion) /* LRG accessions don't have versions, so just return the same acc and set *retFoundVersion to 0. * ENS accessions must have versions. * The user may give us a RefSeq accession with or without a .version. * If ncbiRefSeq tables are present, return acc with version, looking up version if necessary. - * If we look it up and can't find it, this returns NULL. + * If we look it up and can't find it, check the *Old tables for the refSeqHistorical + * track. If we can't find it there, this returns NULL. * If instead we're using genbank tables, return acc with no version. * That ensures that acc will be found in our local tables. */ { if (trackHubDatabase(db)) return NULL; char *normalizedAcc = NULL; int foundVersion = 0; if (startsWith("LRG_", acc)) { normalizedAcc = cloneString(acc); } else if (startsWith("ENS", acc)) { normalizedAcc = cloneString(acc); char *p = strchr(acc, '.'); if (p) foundVersion = atoi(p+1); } else if (hDbHasNcbiRefSeq(db)) { // ncbiRefSeq tables need versioned accessions. if (strchr(acc, '.')) normalizedAcc = cloneString(acc); else { char query[2048]; sqlSafef(query, sizeof(query), "select name from ncbiRefSeq where name like '%s.%%'", acc); struct sqlConnection *conn = hAllocConn(db); normalizedAcc = sqlQuickString(conn, query); + if (isNotEmpty(normalizedAcc) && hDbHasNcbiRefSeqHistorical(db)) + { + // maybe it is a deprecated transcript + sqlSafef(query, sizeof(query), "select name from ncbiRefSeqHistorical where name like '%s.%%'", acc); + struct sqlConnection *conn = hAllocConn(db); + normalizedAcc = sqlQuickString(conn, query); + } hFreeConn(&conn); } if (isNotEmpty(normalizedAcc)) { char *p = strchr(normalizedAcc, '.'); assert(p); foundVersion = atoi(p+1); } } else { // genbank tables -- no version normalizedAcc = cloneFirstWordByDelimiter(acc, '.'); foundVersion = getGbVersion(db, normalizedAcc); } @@ -1880,44 +1904,54 @@ struct sqlConnection *conn = hAllocConn(db); *retPslTable = hFindLatestGencodeTableConn(conn, "Comp"); hFreeConn(&conn); } } } else { char *pslTable = pslTableForAcc(db, acc); if (pslTable && hTableExists(db, pslTable)) { if (hgvs->type == hgvstCoding) getCds(db, acc, cds); txAli = pslForQName(db, pslTable, acc); } + // try the old alignments if we can't find it in the current alignments + if (!txAli && sameString(pslTable, "ncbiRefSeqPsl")) + { + if (hDbHasNcbiRefSeqHistorical(db)) + { + txAli = pslForQName(db, "ncbiRefSeqPslHistorical", acc); + if (txAli) + pslTable = "ncbiRefSeqPslHistorical"; + } // As of 9/26/16, ncbiRefSeqPsl is missing some items (#13673#note-443) -- so fall back // on UCSC alignments. - if (!txAli && sameString(pslTable, "ncbiRefSeqPsl") && hTableExists(db, "refSeqAli")) + if (!txAli && hTableExists(db, "refSeqAli")) { char *accNoVersion = cloneFirstWordByDelimiter(acc, '.'); if (hgvs->type == hgvstCoding) getCds(db, accNoVersion, cds); txAli = pslForQName(db, "refSeqAli", accNoVersion); if (txAli) { pslTable = "refSeqAli"; *pAcc = accNoVersion; } } + } if (txAli && retPslTable != NULL) *retPslTable = pslTable; } return txAli; } static struct bed *hgvsMapNucToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable, boolean mustBeCds) /* Return a bed6 with the variant's span on the genome and strand, or NULL if unable to map. * If mustBeCds, but we can't find cds or the variant coords are outside of it, return NULL. * If successful and retPslTable is not NULL, set it to the name of the PSL table used. */ { if (hgvs->type == hgvstGenomic) return hgvsMapGDotToGenome(db, hgvs, retPslTable); char *acc = normalizeVersion(db, hgvs->seqAcc, NULL); @@ -1957,38 +1991,50 @@ { char *acc = normalizeVersion(db, hgvs->seqAcc, NULL); if (isEmpty(acc)) return NULL; struct bed *region = NULL; char *txAcc = NULL; if (startsWith("LRG_", acc)) txAcc = lrgProteinToTx(db, acc); else if (startsWith("ENS", acc)) txAcc = gencodeProteinToTx(db, acc); else if (startsWith("NP_", acc) || startsWith("XP_", acc)) { struct sqlConnection *conn = hAllocConn(db); char query[2048]; if (hDbHasNcbiRefSeq(db)) + { sqlSafef(query, sizeof(query), "select mrnaAcc from ncbiRefSeqLink where protAcc = '%s'", acc); + txAcc = sqlQuickString(conn, query); + // user may have passed previous versioned transcript, check the *Old tables: + if (!txAcc && hDbHasNcbiRefSeqHistorical(db)) + { + sqlSafef(query, sizeof(query), "select mrnaAcc from ncbiRefSeqLinkHistorical where protAcc = '%s'", + acc); + txAcc = sqlQuickString(conn, query); + } + } else if (hTableExists(db, "refGene")) + { sqlSafef(query, sizeof(query), "select mrnaAcc from %s l, refGene r " "where l.protAcc = '%s' and r.name = l.mrnaAcc", refLinkTable, acc); + txAcc = sqlQuickString(conn, query); + } else return NULL; - txAcc = sqlQuickString(conn, query); hFreeConn(&conn); } if (txAcc) { // Translate the p. to c. and map c. to the genome. struct hgvsVariant cDot; zeroBytes(&cDot, sizeof(cDot)); cDot.type = hgvstCoding; cDot.seqAcc = txAcc; cDot.start1 = ((hgvs->start1 - 1) * 3) + 1; cDot.end = ((hgvs->end - 1) * 3) + 3; cDot.changes = hgvs->changes; region = hgvsMapNucToGenome(db, &cDot, retPslTable, TRUE); freeMem(txAcc); }