c2d33a4ecdb13f97ae828bc7099280f89f1e5dbd angie Wed Oct 19 09:55:33 2016 -0700 hgHgvs was assuming that refGene always exists, oops. hgHgvs was also assuming that db is a mysql database. Thanks Braney! refs #18240 note-5, note-9 diff --git src/hg/lib/hgHgvs.c src/hg/lib/hgHgvs.c index ff10445..22729cf 100644 --- src/hg/lib/hgHgvs.c +++ src/hg/lib/hgHgvs.c @@ -1,27 +1,28 @@ /* hgHgvs.c - Parse the Human Genome Variation Society (HGVS) nomenclature for variants. */ /* See http://varnomen.hgvs.org/ and https://github.com/mutalyzer/mutalyzer/ */ /* Copyright (C) 2016 The Regents of the University of California * See README in this or parent directory for licensing information. */ #include "common.h" #include "hgHgvs.h" #include "chromInfo.h" #include "genbank.h" #include "hdb.h" #include "pslTransMap.h" #include "regexHelper.h" +#include "trackHub.h" // Regular expressions for HGVS-recognized sequence accessions: LRG or versioned RefSeq: #define lrgTranscriptExp "(LRG_[0-9]+t[0-9+])" #define lrgRegionExp "(LRG_[0-9+])" // NC = RefSeq reference assembly chromosome // NG = RefSeq incomplete genomic region (e.g. gene locus) // NM = RefSeq curated mRNA // NP = RefSeq curated protein // NR = RefSeq curated (non-coding) RNA #define geneSymbolExp "([A-Za-z0-9./_-]+)" #define optionalGeneSymbolExp "(\\(" geneSymbolExp "\\))?" #define versionedAccPrefixExp(p) "(" p "_[0-9]+(\\.[0-9]+)?)" optionalGeneSymbolExp // ........................ accession and optional dot version // ........... optional dot version @@ -409,97 +410,112 @@ static boolean dbHasNcbiRefSeq(char *db) /* Test whether NCBI's RefSeq alignments are available in db. */ { // hTableExists() caches results so this shouldn't make for loads of new SQL queries if called // more than once. return (hTableExists(db, "ncbiRefSeq") && hTableExists(db, "ncbiRefSeqPsl") && hTableExists(db, "ncbiRefSeqCds") && hTableExists(db, "ncbiRefSeqLink") && hTableExists(db, "ncbiRefSeqPepTable") && hTableExists(db, "seqNcbiRefSeq") && hTableExists(db, "extNcbiRefSeq")); } static char *npForGeneSymbol(char *db, char *geneSymbol) /* Given a gene symbol, look up and return its NP_ accession; if not found return NULL. */ { -struct sqlConnection *conn = hAllocConn(db); char query[2048]; if (dbHasNcbiRefSeq(db)) { sqlSafef(query, sizeof(query), "select protAcc from ncbiRefSeqLink where name = '%s' " "and protAcc != 'n/a' and protAcc != '' " "order by length(protAcc), protAcc", geneSymbol); } -else +else if (hTableExists(db, "refGene")) { // Join with refGene to make sure it's an NP for this species. sqlSafef(query, sizeof(query), "select l.protAcc from %s l, refGene r " "where l.name = '%s' and l.mrnaAcc = r.name " "and l.protAcc != '' order by length(l.protAcc), l.protAcc" , refLinkTable, geneSymbol); } +else + return NULL; +struct sqlConnection *conn = hAllocConn(db); char *npAcc = sqlQuickString(conn, query); hFreeConn(&conn); return npAcc; } static char *nmForGeneSymbol(char *db, char *geneSymbol) /* Given a gene symbol, look up and return its NM_ accession; if not found return NULL. */ { -struct sqlConnection *conn = hAllocConn(db); +if (trackHubDatabase(db)) + return NULL; char query[2048]; -char *geneTable = (dbHasNcbiRefSeq(db) ? "ncbiRefSeq" : "refGene"); +char *geneTable = NULL; +if (dbHasNcbiRefSeq(db)) + geneTable = "ncbiRefSeq"; +else if (hTableExists(db, "refGene")) + geneTable = "refGene"; +if (geneTable == NULL) + return NULL; sqlSafef(query, sizeof(query), "select name from %s where name2 = '%s' " "order by length(name), name", geneTable, geneSymbol); +struct sqlConnection *conn = hAllocConn(db); char *nmAcc = sqlQuickString(conn, query); hFreeConn(&conn); return nmAcc; } static char *npForNm(char *db, char *nmAcc) /* Given an NM_ accession, look up and return its NP_ accession; if not found return NULL. */ { -struct sqlConnection *conn = hAllocConn(db); +if (trackHubDatabase(db)) + return NULL; char query[2048]; if (dbHasNcbiRefSeq(db)) { // ncbiRefSeq tables use versioned NM_ accs, but the user might have passed in a // versionless NM_, so adjust query accordingly: if (strchr(nmAcc, '.')) sqlSafef(query, sizeof(query), "select protAcc from ncbiRefSeqLink where id = '%s'", nmAcc); else sqlSafef(query, sizeof(query), "select protAcc from ncbiRefSeqLink where id like '%s.%%'", nmAcc); } -else +else if (hTableExists(db, "refGene")) { // Trim .version if present since our genbank tables don't use versioned names. char *trimmed = cloneFirstWordByDelimiter(nmAcc, '.'); sqlSafef(query, sizeof(query), "select l.protAcc from %s l, refGene r " "where r.name = '%s' and l.mrnaAcc = r.name " "and l.protAcc != '' order by length(l.protAcc), l.protAcc", refLinkTable, trimmed); } +else return NULL; +struct sqlConnection *conn = hAllocConn(db); char *npAcc = sqlQuickString(conn, query); hFreeConn(&conn); return npAcc; } static char *getProteinSeq(char *db, char *acc) /* Return amino acid sequence for acc, or NULL if not found. */ { +if (trackHubDatabase(db)) + return NULL; char *seq = NULL; if (startsWith("LRG_", acc)) { if (hTableExists(db, "lrgPep")) { char query[2048]; sqlSafef(query, sizeof(query), "select seq from lrgPep where name = '%s'", acc); struct sqlConnection *conn = hAllocConn(db); seq = sqlQuickString(conn, query); hFreeConn(&conn); } } else { if (dbHasNcbiRefSeq(db)) @@ -514,31 +530,31 @@ else { aaSeq *aaSeq = hGenBankGetPep(db, acc, gbSeqTable); if (aaSeq) seq = aaSeq->dna; } } return seq; } static char refBaseForNp(char *db, char *npAcc, int pos) // Get the amino acid base in NP_'s sequence at 1-based offset pos. { char *seq = getProteinSeq(db, npAcc); -char base = seq[pos-1]; +char base = seq ? seq[pos-1] : '\0'; freeMem(seq); return base; } struct hgvsVariant *hgvsParsePseudoHgvs(char *db, char *term) /* Attempt to parse things that are not strict HGVS, but that people might intend as HGVS: */ // Note: this doesn't support non-coding gene symbol terms (which should have nt alleles) { struct hgvsVariant *hgvs = NULL; regmatch_t substrs[11]; int geneSymbolIx = 1; boolean isSubst; if ((isSubst = regexMatchSubstr(term, pseudoHgvsNMPDotSubstExp, substrs, ArraySize(substrs))) || regexMatchSubstr(term, pseudoHgvsNMPDotRangeExp, substrs, ArraySize(substrs))) @@ -683,58 +699,62 @@ *retFoundAcc = cloneFirstWordByDelimiter(hgvs->seqAcc, '.'); if (retFoundVersion) { char *p = strchr(hgvs->seqAcc, '.'); if (p) *retFoundVersion = atoi(p+1); } // Don't free chromInfo, it's cached! } return coordsOK; } static char *getCdnaSeq(char *db, char *acc) /* Return cdna sequence for acc, or NULL if not found. */ { +if (trackHubDatabase(db)) + return NULL; char *seq = NULL; if (startsWith("LRG_", acc)) { if (hTableExists(db, "lrgCdna")) { char query[2048]; sqlSafef(query, sizeof(query), "select seq from lrgCdna where name = '%s'", acc); struct sqlConnection *conn = hAllocConn(db); seq = sqlQuickString(conn, query); hFreeConn(&conn); } } else { struct dnaSeq *cdnaSeq = NULL; if (dbHasNcbiRefSeq(db)) cdnaSeq = hDnaSeqGet(db, acc, "seqNcbiRefSeq", "extNcbiRefSeq"); else cdnaSeq = hGenBankGetMrna(db, acc, gbSeqTable); if (cdnaSeq) seq = cdnaSeq->dna; } return seq; } static boolean getCds(char *db, char *acc, struct genbankCds *retCds) /* Get the CDS info for genbank or LRG acc; return FALSE if not found or not applicable. */ { +if (trackHubDatabase(db)) + return FALSE; boolean gotCds = FALSE; char query[1024]; if (startsWith("LRG_", acc)) sqlSafef(query, sizeof(query), "select cds from lrgCds where id = '%s'", acc); else if (dbHasNcbiRefSeq(db) && // This is a hack to allow us to fall back on refSeqAli if ncbiRefSeqPsl is incomplete: strchr(acc, '.')) sqlSafef(query, sizeof(query), "select cds from ncbiRefSeqCds where id = '%s'", acc); else sqlSafef(query, sizeof(query), "SELECT c.name FROM %s as c, %s as g WHERE (g.acc = '%s') AND " "(g.cds != 0) AND (g.cds = c.id)", cdsTable, gbCdnaInfoTable, acc); struct sqlConnection *conn = hAllocConn(db); @@ -795,46 +815,50 @@ hgvsRefBase = refBaseFromProt(p+1); if (hgvsRefBase != '\0') { int end1 = atoi(skipToNumeric(p+1)); char seqRefBase = toupper(accSeq[end1-1]); if (seqRefBase != hgvsRefBase) *retDiffRefAllele = cloneStringFromChar(seqRefBase); } } } } static int getGbVersion(char *db, char *acc) /* Return the local version that we have for acc. */ { +if (trackHubDatabase(db)) + return 0; char query[2048]; sqlSafef(query, sizeof(query), "select version from %s where acc = '%s'", gbSeqTable, acc); struct sqlConnection *conn = hAllocConn(db); int version = sqlQuickNum(conn, query); hFreeConn(&conn); return version; } static char *normalizeVersion(char *db, char *acc, int *retFoundVersion) /* LRG accessions don't have versions, so just return the same acc and set *retFoundVersion to 0. * The user may give us a RefSeq accession with or without a .version. * If ncbiRefSeq tables are present, return acc with version, looking up version if necessary. * If we look it up and can't find it, this returns NULL. * If instead we're using genbank tables, return acc with no version. * That ensures that acc will be found in our local tables. */ { +if (trackHubDatabase(db)) + return NULL; char *normalizedAcc = NULL; int foundVersion = 0; if (startsWith("LRG_", acc)) { normalizedAcc = cloneString(acc); } else if (dbHasNcbiRefSeq(db)) { // ncbiRefSeq tables need versioned accessions. if (strchr(acc, '.')) normalizedAcc = cloneString(acc); else { char query[2048]; sqlSafef(query, sizeof(query), "select name from ncbiRefSeq where name like '%s.%%'", acc); @@ -1073,31 +1097,31 @@ // variantPsl contains the anchor if a non-cdsStart anchor is used because // the actual position might be outside the bounds of the transcript sequence (intron/UTR) struct psl *variantPsl = pslFromHgvsNuc(hgvs, acc, txAli->qSize, txAli->qEnd, cds, retUpstream, retDownstream); mappedToGenome = pslTransMap(pslTransMapNoOpts, variantPsl, txAli); pslFree(&variantPsl); pslFree(&txAli); } sqlFreeResult(&sr); hFreeConn(&conn); return mappedToGenome; } static char *pslTableForAcc(char *db, char *acc) /* Based on acc (and whether db has NCBI RefSeq alignments), pick a PSL table where - * acc should be found. Don't free the returned string. */ + * acc should be found (table may or may not exist). Don't free the returned string. */ { char *pslTable = NULL; if (startsWith("LRG_", acc)) pslTable = "lrgTranscriptAli"; else if (startsWith("NM_", acc)) { // Use NCBI's alignments if they are available if (dbHasNcbiRefSeq(db)) pslTable = "ncbiRefSeqPsl"; else pslTable = "refSeqAli"; } return pslTable; } @@ -1187,33 +1211,35 @@ static struct bed3 *hgvsMapPDotToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable) /* Return a bed3 with the variant's span on the genome, or NULL if unable to map. * If successful and retPslTable is not NULL, set it to the name of the PSL table used. */ { struct bed3 *region = NULL; char *acc = normalizeVersion(db, hgvs->seqAcc, NULL); if (acc && startsWith("NP_", acc)) { // Translate the NP_*:p. to NM_*:c. and map NM_*:c. to the genome. struct sqlConnection *conn = hAllocConn(db); char query[2048]; if (dbHasNcbiRefSeq(db)) sqlSafef(query, sizeof(query), "select mrnaAcc from ncbiRefSeqLink where protAcc = '%s'", acc); - else + else if (hTableExists(db, "refGene")) sqlSafef(query, sizeof(query), "select mrnaAcc from %s l, refGene r " "where l.protAcc = '%s' and r.name = l.mrnaAcc", refLinkTable, acc); + else + return NULL; char *nmAcc = sqlQuickString(conn, query); hFreeConn(&conn); if (nmAcc) { struct hgvsVariant cDot; zeroBytes(&cDot, sizeof(cDot)); cDot.type = hgvstCoding; cDot.seqAcc = nmAcc; cDot.start1 = ((hgvs->start1 - 1) * 3) + 1; cDot.end = ((hgvs->end - 1) * 3) + 3; cDot.change = hgvs->change; region = hgvsMapNucToGenome(db, &cDot, retPslTable); freeMem(nmAcc); } }