c2d33a4ecdb13f97ae828bc7099280f89f1e5dbd angie Wed Oct 19 09:55:33 2016 -0700 hgHgvs was assuming that refGene always exists, oops. hgHgvs was also assuming that db is a mysql database. Thanks Braney! refs #18240 note-5, note-9 diff --git src/hg/lib/hgHgvs.c src/hg/lib/hgHgvs.c index ff10445..22729cf 100644 --- src/hg/lib/hgHgvs.c +++ src/hg/lib/hgHgvs.c @@ -1,1235 +1,1261 @@ /* hgHgvs.c - Parse the Human Genome Variation Society (HGVS) nomenclature for variants. */ /* See http://varnomen.hgvs.org/ and https://github.com/mutalyzer/mutalyzer/ */ /* Copyright (C) 2016 The Regents of the University of California * See README in this or parent directory for licensing information. */ #include "common.h" #include "hgHgvs.h" #include "chromInfo.h" #include "genbank.h" #include "hdb.h" #include "pslTransMap.h" #include "regexHelper.h" +#include "trackHub.h" // Regular expressions for HGVS-recognized sequence accessions: LRG or versioned RefSeq: #define lrgTranscriptExp "(LRG_[0-9]+t[0-9+])" #define lrgRegionExp "(LRG_[0-9+])" // NC = RefSeq reference assembly chromosome // NG = RefSeq incomplete genomic region (e.g. gene locus) // NM = RefSeq curated mRNA // NP = RefSeq curated protein // NR = RefSeq curated (non-coding) RNA #define geneSymbolExp "([A-Za-z0-9./_-]+)" #define optionalGeneSymbolExp "(\\(" geneSymbolExp "\\))?" #define versionedAccPrefixExp(p) "(" p "_[0-9]+(\\.[0-9]+)?)" optionalGeneSymbolExp // ........................ accession and optional dot version // ........... optional dot version // ...... optional gene symbol in ()s // .... optional gene symbol #define versionedRefSeqNCExp versionedAccPrefixExp("NC") #define versionedRefSeqNGExp versionedAccPrefixExp("NG") #define versionedRefSeqNMExp versionedAccPrefixExp("NM") #define versionedRefSeqNPExp versionedAccPrefixExp("NP") #define versionedRefSeqNRExp versionedAccPrefixExp("NR") // Nucleotide position regexes // (c. = CDS, g. = genomic, m. = mitochondrial, n.= non-coding RNA, r. = RNA) #define posIntExp "([0-9]+)" #define hgvsNtPosExp posIntExp "(_" posIntExp ")?" // ...... 1-based start position // ............. optional range separator and end position // ...... 1-based end position // Now for a regex that can handle positions like "26" or "*80" or "-24+75_-24+92"... #define anchorExp "([-*])?" #define offsetExp "([-+])" #define complexNumExp anchorExp posIntExp "(" offsetExp posIntExp ")?" #define hgvsCdsPosExp complexNumExp "(_" complexNumExp ")?" // ... optional anchor "-" or "*" // ... mandatory first number // ....... optional offset separator and offset // ... offset separator // ... offset number // ............... optional range separator and complex num // ... optional anchor "-" or "*" // ... first number // ....... optional offset separator and offset // ... offset separator // ... offset number // It's pretty common for users to omit the '.' so if it's missing but the rest of the regex fits, // roll with it. #define hgvsCDotPosExp "c\\.?" hgvsCdsPosExp #define hgvsGDotPosExp "g\\.?" hgvsNtPosExp #define hgvsMDotPosExp "m\\.?" hgvsNtPosExp #define hgvsNDotPosExp "n\\.?" hgvsNtPosExp #define hgvsRDotPosExp "r\\.?" hgvsNtPosExp // Protein substitution regex #define aa3Exp "Ala|Arg|Asn|Asp|Cys|Gln|Glu|Gly|His|Ile|Leu|Lys|Met|Phe|Pro|Ser|Thr|Trp|Tyr|Val|Ter" #define hgvsAminoAcidExp "[ARNDCQEGHILKMFPSTWYVX*]|" aa3Exp #define hgvsAminoAcidSubstExp "(" hgvsAminoAcidExp ")" posIntExp "(" hgvsAminoAcidExp "|=)" #define hgvsPDotSubstExp "p\\." hgvsAminoAcidSubstExp // ... // original sequence // ...... // 1-based position // ... // replacement sequence // Protein range (or just single pos) regex #define hgvsAaRangeExp "(" hgvsAminoAcidExp ")" posIntExp "(_(" hgvsAminoAcidExp ")" posIntExp ")?(.*)" #define hgvsPDotRangeExp "p\\." hgvsAaRangeExp // original start AA ... // 1-based start position ... // optional range sep and AA+pos ..................................... // original end AA ... // 1-based end position ... // change description ... // Complete HGVS term regexes combining sequence identifiers and change operations #define hgvsFullRegex(seq, op) "^" seq ":" op #define hgvsRefSeqNCGDotPosExp hgvsFullRegex(versionedRefSeqNCExp, hgvsGDotPosExp) #define hgvsRefSeqNCGDotExp hgvsRefSeqNCGDotPosExp "(.*)" // substring numbering: // 0..................................... whole matching string // 1................. accession and optional dot version // 2........ optional dot version // 3...... (n/a) optional gene symbol in ()s // 4.... (n/a) optional gene symbol // 5... 1-based start position // 6.... optional range separator and end position // 7... 1-based end position // 8.... change description #define hgvsLrgCDotPosExp hgvsFullRegex(lrgTranscriptExp, hgvsCDotPosExp) #define hgvsLrgCDotExp hgvsLrgCDotPosExp "(.*)" // substring numbering: // 0..................................................... whole matching string // 1................... LRG transcript // 2... optional anchor "-" or "*" // 3... mandatory first number // 4....... optional offset separator and offset // 5... offset separator // 6... offset number // 7............... optional range sep and complex num // 8... optional anchor "-" or "*" // 9... first number // 10....... optional offset separator and offset // 11... offset separator // 12... offset number // 13........ sequence change #define hgvsRefSeqNMCDotPosExp hgvsFullRegex(versionedRefSeqNMExp, hgvsCDotPosExp) #define hgvsRefSeqNMCDotExp hgvsRefSeqNMCDotPosExp "(.*)" // substring numbering: // 0............................................................... whole matching string // 1............... acc & optional dot version // 2........ optional dot version // 3..... optional gene sym in ()s // 4... optional gene symbol // 5... optional anchor // 6... mandatory first number // 7....... optional offset // 8... offset separator // 9... offset number // 10............... optional range // 11... optional anchor // 12... first number // 13....... optional offset // 14... offset separator // 15... offset number // 16........ sequence change #define hgvsRefSeqNPPDotSubstExp hgvsFullRegex(versionedRefSeqNPExp, hgvsPDotSubstExp) // substring numbering: // 0..................................................... whole matching string // 1................ accession and optional dot version // 2..... optional dot version // 3..... optional gene symbol in ()s // 4... optional gene symbol // 5..... original sequence // 6...... 1-based position // 7...... replacement sequence #define hgvsRefSeqNPPDotRangeExp hgvsFullRegex(versionedRefSeqNPExp, hgvsPDotRangeExp) // substring numbering: // 0..................................................... whole matching string // 1................ accession and optional dot version // 2..... optional dot version // 3..... optional gene symbol in ()s // 4... optional gene symbol // 5... original start AA // 6... 1-based start position // 7................. optional range sep and AA+pos // 8... original end AA // 9... 1-based end position // 10..... change description // Pseudo-HGVS in common usage // Sometimes users give an NM_ accession, but a protein change. #define pseudoHgvsNMPDotSubstExp "^" versionedRefSeqNMExp "[: p.]+" hgvsAminoAcidSubstExp // substring numbering: // 0..................................................... whole matching string // 1............... acc & optional dot version // 2........ optional dot version // 3..... optional gene sym in ()s // 4... optional gene symbol // 5..... original sequence // 6...... 1-based position // 7...... replacement sequence #define pseudoHgvsNMPDotRangeExp "^" versionedRefSeqNMExp "[: p.]+" hgvsAaRangeExp // substring numbering: // 0..................................................... whole matching string // 1............... acc & optional dot version // 2........ optional dot version // 3..... optional gene sym in ()s // 4... optional gene symbol // 5... original start AA // 6... 1-based start position // 7.......... optional range sep and AA+pos // 8... original end AA // 9... 1-based end position // 10.... change description // Common: gene symbol followed by space and/or punctuation followed by protein change #define pseudoHgvsGeneSymbolProtSubstExp "^" geneSymbolExp "[: p.]+" hgvsAminoAcidSubstExp // 0..................................................... whole matching string // 1................... gene symbol // 2..... original sequence // 3...... 1-based position // 4...... replacement sequence #define pseudoHgvsGeneSymbolProtRangeExp "^" geneSymbolExp "[: p.]+" hgvsAaRangeExp // 0..................................................... whole matching string // 1................... gene symbol // 2... original start AA // 3... 1-based start position // 4................ optional range sep and AA+pos // 5... original end AA // 6... 1-based end position // 7..... change description // As above but omitting the protein change #define pseudoHgvsGeneSymbolProtPosExp "^" geneSymbolExp "[: p.]+" posIntExp // 0.......................... whole matching string // 1................... gene symbol // 2..... 1-based position // Gene symbol, maybe punctuation, and a clear "c." position (and possibly change) #define pseudoHgvsGeneSympolCDotPosExp "^" geneSymbolExp "[: ]+" hgvsCDotPosExp // 0..................................................... whole matching string // 1................... gene symbol // 2..... optional beginning of position exp // 3..... beginning of position exp static struct hgvsVariant *hgvsParseGDotPos(char *term) /* If term is parseable as an HGVS NC_...g. term, return the parsed representation, else NULL. */ { struct hgvsVariant *hgvs = NULL; regmatch_t substrs[17]; if (regexMatchSubstr(term, hgvsRefSeqNCGDotExp, substrs, ArraySize(substrs))) { int accIx = 1; int startPosIx = 5; int endPosIx = 7; int changeIx = 8; AllocVar(hgvs); hgvs->type = hgvstGenomic; hgvs->seqAcc = regexSubstringClone(term, substrs[accIx]); hgvs->start1 = regexSubstringInt(term, substrs[startPosIx]); if (regexSubstrMatched(substrs[endPosIx])) hgvs->end = regexSubstringInt(term, substrs[endPosIx]); else hgvs->end = hgvs->start1; hgvs->change = regexSubstringClone(term, substrs[changeIx]); } return hgvs; } static void extractComplexNum(char *term, regmatch_t *substrs, int substrOffset, boolean *retIsUtr3, int *retPos, int *retOffset) /* Extract matches from substrs starting at substrOffset to parse a complex number * description into anchor type, 1-based position and optional offset. */ { int anchorIx = substrOffset; int posIx = substrOffset + 1; int offsetOpIx = substrOffset + 3; int offsetIx = substrOffset + 4; // Determine what startPos is relative to, based on optional anchor and optional offset char anchor[16]; // string length 0 or 1 regexSubstringCopy(term, substrs[anchorIx], anchor, sizeof(anchor)); char offsetOp[16]; // string length 0 or 1 regexSubstringCopy(term, substrs[offsetOpIx], offsetOp, sizeof(offsetOp)); *retIsUtr3 = (anchor[0] == '*'); *retPos = regexSubstringInt(term, substrs[posIx]); // Is pos negative? if (anchor[0] == '-') *retPos = -*retPos; // Grab offset, if there is one. if (isNotEmpty(offsetOp)) { *retOffset = regexSubstringInt(term, substrs[offsetIx]); if (offsetOp[0] == '-') *retOffset = -*retOffset; } } static struct hgvsVariant *hgvsParseCDotPos(char *term) /* If term is parseable as an HGVS CDS term, return the parsed representation, otherwise NULL. */ { struct hgvsVariant *hgvs = NULL; boolean matches = FALSE; int accIx = 1; int startAnchorIx = 2; int endAnchorIx = 8; int endPosIx = 9; int changeIx = 13; // The LRG accession regex has only one substring but the RefSeq acc regex has 4, so that // affects all substring offsets after the accession. int refSeqExtra = 3; int geneSymbolIx = -1; regmatch_t substrs[17]; if (regexMatchSubstr(term, hgvsLrgCDotExp, substrs, ArraySize(substrs))) { matches = TRUE; } else if (regexMatchSubstr(term, hgvsRefSeqNMCDotExp, substrs, ArraySize(substrs))) { matches = TRUE; geneSymbolIx = 4; startAnchorIx += refSeqExtra; endAnchorIx += refSeqExtra; endPosIx += refSeqExtra; changeIx += refSeqExtra; } if (matches) { AllocVar(hgvs); hgvs->type = hgvstCoding; hgvs->seqAcc = regexSubstringClone(term, substrs[accIx]); extractComplexNum(term, substrs, startAnchorIx, &hgvs->startIsUtr3, &hgvs->start1, &hgvs->startOffset); if (geneSymbolIx >= 0 && regexSubstrMatched(substrs[geneSymbolIx])) hgvs->seqGeneSymbol = regexSubstringClone(term, substrs[geneSymbolIx]); if (regexSubstrMatched(substrs[endPosIx])) { extractComplexNum(term, substrs, endAnchorIx, &hgvs->endIsUtr3, &hgvs->end, &hgvs->endOffset); } else { hgvs->end = hgvs->start1; hgvs->endIsUtr3 = hgvs->startIsUtr3; hgvs->endOffset = hgvs->startOffset; } hgvs->change = regexSubstringClone(term, substrs[changeIx]); } return hgvs; } static struct hgvsVariant *hgvsParsePDotSubst(char *term) /* If term is parseable as an HGVS protein substitution, return the parsed representation, * otherwise NULL. */ { struct hgvsVariant *hgvs = NULL; regmatch_t substrs[8]; if (regexMatchSubstr(term, hgvsRefSeqNPPDotSubstExp, substrs, ArraySize(substrs))) { int accIx = 1; int geneSymbolIx = 4; int refIx = 5; int posIx = 6; int altIx = 7; AllocVar(hgvs); hgvs->type = hgvstProtein; hgvs->seqAcc = regexSubstringClone(term, substrs[accIx]); int changeStart = substrs[refIx].rm_so; int changeEnd = substrs[altIx].rm_eo; int len = changeEnd - changeStart; char change[len+1]; safencpy(change, sizeof(change), term+changeStart, len); hgvs->change = cloneString(change); if (regexSubstrMatched(substrs[geneSymbolIx])) hgvs->seqGeneSymbol = regexSubstringClone(term, substrs[geneSymbolIx]); hgvs->start1 = regexSubstringInt(term, substrs[posIx]); hgvs->end = hgvs->start1; } return hgvs; } static struct hgvsVariant *hgvsParsePDotRange(char *term) /* If term is parseable as an HGVS protein range change, return the parsed representation, * otherwise NULL. */ { struct hgvsVariant *hgvs = NULL; regmatch_t substrs[11]; if (regexMatchSubstr(term, hgvsRefSeqNPPDotRangeExp, substrs, ArraySize(substrs))) { int accIx = 1; int geneSymbolIx = 4; int startRefIx = 5; int startPosIx = 6; int endPosIx = 9; int changeIx = 10; AllocVar(hgvs); hgvs->type = hgvstProtein; hgvs->seqAcc = regexSubstringClone(term, substrs[accIx]); int changeStart = substrs[startRefIx].rm_so; int changeEnd = substrs[changeIx].rm_eo; int len = changeEnd - changeStart; char change[len+1]; safencpy(change, sizeof(change), term+changeStart, len); hgvs->change = cloneString(change); if (regexSubstrMatched(substrs[geneSymbolIx])) hgvs->seqGeneSymbol = regexSubstringClone(term, substrs[geneSymbolIx]); hgvs->start1 = regexSubstringInt(term, substrs[startPosIx]); if (regexSubstrMatched(substrs[endPosIx])) hgvs->end = regexSubstringInt(term, substrs[endPosIx]); else hgvs->end = hgvs->start1; } return hgvs; } struct hgvsVariant *hgvsParseTerm(char *term) /* If term is a parseable form of HGVS, return the parsed representation, otherwise NULL. * This does not check validity of accessions, coordinates or alleles. */ { struct hgvsVariant *hgvs = hgvsParseCDotPos(term); if (hgvs == NULL) hgvs = hgvsParsePDotSubst(term); if (hgvs == NULL) hgvs = hgvsParsePDotRange(term); if (hgvs == NULL) hgvs = hgvsParseGDotPos(term); return hgvs; } static boolean dbHasNcbiRefSeq(char *db) /* Test whether NCBI's RefSeq alignments are available in db. */ { // hTableExists() caches results so this shouldn't make for loads of new SQL queries if called // more than once. return (hTableExists(db, "ncbiRefSeq") && hTableExists(db, "ncbiRefSeqPsl") && hTableExists(db, "ncbiRefSeqCds") && hTableExists(db, "ncbiRefSeqLink") && hTableExists(db, "ncbiRefSeqPepTable") && hTableExists(db, "seqNcbiRefSeq") && hTableExists(db, "extNcbiRefSeq")); } static char *npForGeneSymbol(char *db, char *geneSymbol) /* Given a gene symbol, look up and return its NP_ accession; if not found return NULL. */ { -struct sqlConnection *conn = hAllocConn(db); char query[2048]; if (dbHasNcbiRefSeq(db)) { sqlSafef(query, sizeof(query), "select protAcc from ncbiRefSeqLink where name = '%s' " "and protAcc != 'n/a' and protAcc != '' " "order by length(protAcc), protAcc", geneSymbol); } -else +else if (hTableExists(db, "refGene")) { // Join with refGene to make sure it's an NP for this species. sqlSafef(query, sizeof(query), "select l.protAcc from %s l, refGene r " "where l.name = '%s' and l.mrnaAcc = r.name " "and l.protAcc != '' order by length(l.protAcc), l.protAcc" , refLinkTable, geneSymbol); } +else + return NULL; +struct sqlConnection *conn = hAllocConn(db); char *npAcc = sqlQuickString(conn, query); hFreeConn(&conn); return npAcc; } static char *nmForGeneSymbol(char *db, char *geneSymbol) /* Given a gene symbol, look up and return its NM_ accession; if not found return NULL. */ { -struct sqlConnection *conn = hAllocConn(db); +if (trackHubDatabase(db)) + return NULL; char query[2048]; -char *geneTable = (dbHasNcbiRefSeq(db) ? "ncbiRefSeq" : "refGene"); +char *geneTable = NULL; +if (dbHasNcbiRefSeq(db)) + geneTable = "ncbiRefSeq"; +else if (hTableExists(db, "refGene")) + geneTable = "refGene"; +if (geneTable == NULL) + return NULL; sqlSafef(query, sizeof(query), "select name from %s where name2 = '%s' " "order by length(name), name", geneTable, geneSymbol); +struct sqlConnection *conn = hAllocConn(db); char *nmAcc = sqlQuickString(conn, query); hFreeConn(&conn); return nmAcc; } static char *npForNm(char *db, char *nmAcc) /* Given an NM_ accession, look up and return its NP_ accession; if not found return NULL. */ { -struct sqlConnection *conn = hAllocConn(db); +if (trackHubDatabase(db)) + return NULL; char query[2048]; if (dbHasNcbiRefSeq(db)) { // ncbiRefSeq tables use versioned NM_ accs, but the user might have passed in a // versionless NM_, so adjust query accordingly: if (strchr(nmAcc, '.')) sqlSafef(query, sizeof(query), "select protAcc from ncbiRefSeqLink where id = '%s'", nmAcc); else sqlSafef(query, sizeof(query), "select protAcc from ncbiRefSeqLink where id like '%s.%%'", nmAcc); } -else +else if (hTableExists(db, "refGene")) { // Trim .version if present since our genbank tables don't use versioned names. char *trimmed = cloneFirstWordByDelimiter(nmAcc, '.'); sqlSafef(query, sizeof(query), "select l.protAcc from %s l, refGene r " "where r.name = '%s' and l.mrnaAcc = r.name " "and l.protAcc != '' order by length(l.protAcc), l.protAcc", refLinkTable, trimmed); } +else return NULL; +struct sqlConnection *conn = hAllocConn(db); char *npAcc = sqlQuickString(conn, query); hFreeConn(&conn); return npAcc; } static char *getProteinSeq(char *db, char *acc) /* Return amino acid sequence for acc, or NULL if not found. */ { +if (trackHubDatabase(db)) + return NULL; char *seq = NULL; if (startsWith("LRG_", acc)) { if (hTableExists(db, "lrgPep")) { char query[2048]; sqlSafef(query, sizeof(query), "select seq from lrgPep where name = '%s'", acc); struct sqlConnection *conn = hAllocConn(db); seq = sqlQuickString(conn, query); hFreeConn(&conn); } } else { if (dbHasNcbiRefSeq(db)) { char query[2048]; sqlSafef(query, sizeof(query), "select seq from ncbiRefSeqPepTable " "where name = '%s'", acc); struct sqlConnection *conn = hAllocConn(db); seq = sqlQuickString(conn, query); hFreeConn(&conn); } else { aaSeq *aaSeq = hGenBankGetPep(db, acc, gbSeqTable); if (aaSeq) seq = aaSeq->dna; } } return seq; } static char refBaseForNp(char *db, char *npAcc, int pos) // Get the amino acid base in NP_'s sequence at 1-based offset pos. { char *seq = getProteinSeq(db, npAcc); -char base = seq[pos-1]; +char base = seq ? seq[pos-1] : '\0'; freeMem(seq); return base; } struct hgvsVariant *hgvsParsePseudoHgvs(char *db, char *term) /* Attempt to parse things that are not strict HGVS, but that people might intend as HGVS: */ // Note: this doesn't support non-coding gene symbol terms (which should have nt alleles) { struct hgvsVariant *hgvs = NULL; regmatch_t substrs[11]; int geneSymbolIx = 1; boolean isSubst; if ((isSubst = regexMatchSubstr(term, pseudoHgvsNMPDotSubstExp, substrs, ArraySize(substrs))) || regexMatchSubstr(term, pseudoHgvsNMPDotRangeExp, substrs, ArraySize(substrs))) { // User gave an NM_ accession but a protein change -- swap in the right NP_. int nmAccIx = 1; int geneSymbolIx = 4; int len = substrs[nmAccIx].rm_eo - substrs[nmAccIx].rm_so; char nmAcc[len+1]; safencpy(nmAcc, sizeof(nmAcc), term, len); char *npAcc = npForNm(db, nmAcc); if (isNotEmpty(npAcc)) { // Make it a real HGVS term with the NP and pass that on to the usual parser. int descStartIx = 5; char *description = term + substrs[descStartIx].rm_so; struct dyString *npTerm; if (regexSubstrMatched(substrs[geneSymbolIx])) { len = substrs[geneSymbolIx].rm_eo - substrs[geneSymbolIx].rm_so; char geneSymbol[len+1]; safencpy(geneSymbol, sizeof(geneSymbol), term, len); npTerm = dyStringCreate("%s(%s):p.%s", npAcc, geneSymbol, description); } else npTerm = dyStringCreate("%s:p.%s", npAcc, description); hgvs = hgvsParseTerm(npTerm->string); dyStringFree(&npTerm); freeMem(npAcc); } } else if ((isSubst = regexMatchSubstr(term, pseudoHgvsGeneSymbolProtSubstExp, substrs, ArraySize(substrs))) || regexMatchSubstr(term, pseudoHgvsGeneSymbolProtRangeExp, substrs, ArraySize(substrs))) { int len = substrs[geneSymbolIx].rm_eo - substrs[geneSymbolIx].rm_so; char geneSymbol[len+1]; safencpy(geneSymbol, sizeof(geneSymbol), term, len); char *npAcc = npForGeneSymbol(db, geneSymbol); if (isNotEmpty(npAcc)) { // Make it a real HGVS term with the NP and pass that on to the usual parser. int descStartIx = 2; char *description = term + substrs[descStartIx].rm_so; struct dyString *npTerm = dyStringCreate("%s(%s):p.%s", npAcc, geneSymbol, description); hgvs = hgvsParseTerm(npTerm->string); dyStringFree(&npTerm); freeMem(npAcc); } } else if (regexMatchSubstr(term, pseudoHgvsGeneSymbolProtPosExp, substrs, ArraySize(substrs))) { int len = substrs[geneSymbolIx].rm_eo - substrs[geneSymbolIx].rm_so; char geneSymbol[len+1]; safencpy(geneSymbol, sizeof(geneSymbol), term, len); char *npAcc = npForGeneSymbol(db, geneSymbol); if (isNotEmpty(npAcc)) { // Only position was provided, no change. Look up ref base and make a synonymous subst // so it's parseable HGVS. int posIx = 2; int pos = regexSubstringInt(term, substrs[posIx]); char refBase = refBaseForNp(db, npAcc, pos); struct dyString *npTerm = dyStringCreate("%s(%s):p.%c%d=", npAcc, geneSymbol, refBase, pos); hgvs = hgvsParseTerm(npTerm->string); dyStringFree(&npTerm); freeMem(npAcc); } } else if (regexMatchSubstr(term, pseudoHgvsGeneSympolCDotPosExp, substrs, ArraySize(substrs))) { int len = substrs[geneSymbolIx].rm_eo - substrs[geneSymbolIx].rm_so; char geneSymbol[len+1]; safencpy(geneSymbol, sizeof(geneSymbol), term, len); char *nmAcc = nmForGeneSymbol(db, geneSymbol); if (isNotEmpty(nmAcc)) { // Make it a real HGVS term with the NM and pass that on to the usual parser. int descStartIx = regexSubstrMatched(substrs[2]) ? 2 : 3; char *description = term + substrs[descStartIx].rm_so; struct dyString *nmTerm = dyStringCreate("%s(%s):c.%s", nmAcc, geneSymbol, description); hgvs = hgvsParseTerm(nmTerm->string); dyStringFree(&nmTerm); freeMem(nmAcc); } } return hgvs; } static char refBaseFromNucSubst(char *change) /* If sequence change is a nucleotide substitution, return the reference base, else null char. */ { if (regexMatchNoCase(change, "^([ACGTU])>")) return toupper(change[0]); return '\0'; } static boolean hgvsValidateGenomic(char *db, struct hgvsVariant *hgvs, char **retFoundAcc, int *retFoundVersion, char **retDiffRefAllele) /* Return TRUE if hgvs->seqAcc can be mapped to a chrom in db and coords are legal for the chrom. * If retFoundAcc is non-NULL, set it to the versionless seqAcc if chrom is found, else NULL. * If retFoundVersion is non-NULL, set it to seqAcc's version if chrom is found, else NULL. * If retDiffRefAllele is non-NULL and hgvs specifies a reference allele then compare it to * the genomic sequence at that location; set *retDiffRefAllele to NULL if they match, or to * genomic sequence if they differ. */ { boolean coordsOK = FALSE; if (retDiffRefAllele) *retDiffRefAllele = NULL; if (retFoundAcc) *retFoundAcc = NULL; if (retFoundVersion) *retFoundVersion = 0; char *chrom = hgOfficialChromName(db, hgvs->seqAcc); if (isNotEmpty(chrom)) { struct chromInfo *ci = hGetChromInfo(db, chrom); if ((hgvs->start1 >= 1 && hgvs->start1 <= ci->size) && (hgvs->end >=1 && hgvs->end <= ci->size)) { coordsOK = TRUE; if (retDiffRefAllele) { char hgvsBase = refBaseFromNucSubst(hgvs->change); if (hgvsBase != '\0') { struct dnaSeq *refBase = hFetchSeq(ci->fileName, chrom, hgvs->start1-1, hgvs->start1); touppers(refBase->dna); if (refBase->dna[0] != hgvsBase) *retDiffRefAllele = cloneString(refBase->dna); dnaSeqFree(&refBase); } } } // Since we found hgvs->seqAcc, split it into versionless and version parts. if (retFoundAcc) *retFoundAcc = cloneFirstWordByDelimiter(hgvs->seqAcc, '.'); if (retFoundVersion) { char *p = strchr(hgvs->seqAcc, '.'); if (p) *retFoundVersion = atoi(p+1); } // Don't free chromInfo, it's cached! } return coordsOK; } static char *getCdnaSeq(char *db, char *acc) /* Return cdna sequence for acc, or NULL if not found. */ { +if (trackHubDatabase(db)) + return NULL; char *seq = NULL; if (startsWith("LRG_", acc)) { if (hTableExists(db, "lrgCdna")) { char query[2048]; sqlSafef(query, sizeof(query), "select seq from lrgCdna where name = '%s'", acc); struct sqlConnection *conn = hAllocConn(db); seq = sqlQuickString(conn, query); hFreeConn(&conn); } } else { struct dnaSeq *cdnaSeq = NULL; if (dbHasNcbiRefSeq(db)) cdnaSeq = hDnaSeqGet(db, acc, "seqNcbiRefSeq", "extNcbiRefSeq"); else cdnaSeq = hGenBankGetMrna(db, acc, gbSeqTable); if (cdnaSeq) seq = cdnaSeq->dna; } return seq; } static boolean getCds(char *db, char *acc, struct genbankCds *retCds) /* Get the CDS info for genbank or LRG acc; return FALSE if not found or not applicable. */ { +if (trackHubDatabase(db)) + return FALSE; boolean gotCds = FALSE; char query[1024]; if (startsWith("LRG_", acc)) sqlSafef(query, sizeof(query), "select cds from lrgCds where id = '%s'", acc); else if (dbHasNcbiRefSeq(db) && // This is a hack to allow us to fall back on refSeqAli if ncbiRefSeqPsl is incomplete: strchr(acc, '.')) sqlSafef(query, sizeof(query), "select cds from ncbiRefSeqCds where id = '%s'", acc); else sqlSafef(query, sizeof(query), "SELECT c.name FROM %s as c, %s as g WHERE (g.acc = '%s') AND " "(g.cds != 0) AND (g.cds = c.id)", cdsTable, gbCdnaInfoTable, acc); struct sqlConnection *conn = hAllocConn(db); char cdsBuf[2048]; char *cdsStr = sqlQuickQuery(conn, query, cdsBuf, sizeof(cdsBuf)); hFreeConn(&conn); if (isNotEmpty(cdsStr)) gotCds = (genbankCdsParse(cdsStr, retCds) && retCds->startComplete && retCds->start != retCds->end); return gotCds; } static char refBaseFromProt(char *change) /* If change starts with an amino acid 3-letter or 1-letter code then return the 1-letter code, * else null char. */ { regmatch_t substrs[1]; if (regexMatchSubstr(change, "^" hgvsAminoAcidExp, substrs, ArraySize(substrs))) { char match[256]; // 1- or 3-letter code regexSubstringCopy(change, substrs[0], match, sizeof(match)); if (strlen(match) == 1) return toupper(match[0]); else return aaAbbrToLetter(match); } return '\0'; } static char *cloneStringFromChar(char c) /* Return a cloned string from a single character. */ { char str[2]; str[0] = c; str[1] = '\0'; return cloneString(str); } static void checkRefAllele(struct hgvsVariant *hgvs, int start1, char *accSeq, char **retDiffRefAllele) /* If hgvs change includes a reference allele, and if accSeq at start1 does not match, * then set retDiffRefAllele to the accSeq version. retDiffRefAllele must be non-NULL. */ { char hgvsRefBase = (hgvs->type == hgvstProtein) ? refBaseFromProt(hgvs->change) : refBaseFromNucSubst(hgvs->change); if (hgvsRefBase != '\0') { char seqRefBase = toupper(accSeq[start1-1]); if (seqRefBase != hgvsRefBase) *retDiffRefAllele = cloneStringFromChar(seqRefBase); } if (hgvs->type == hgvstProtein && *retDiffRefAllele == NULL) { // Protein ranges have a second ref allele base to check char *p = strchr(hgvs->change, '_'); if (p != NULL) { hgvsRefBase = refBaseFromProt(p+1); if (hgvsRefBase != '\0') { int end1 = atoi(skipToNumeric(p+1)); char seqRefBase = toupper(accSeq[end1-1]); if (seqRefBase != hgvsRefBase) *retDiffRefAllele = cloneStringFromChar(seqRefBase); } } } } static int getGbVersion(char *db, char *acc) /* Return the local version that we have for acc. */ { +if (trackHubDatabase(db)) + return 0; char query[2048]; sqlSafef(query, sizeof(query), "select version from %s where acc = '%s'", gbSeqTable, acc); struct sqlConnection *conn = hAllocConn(db); int version = sqlQuickNum(conn, query); hFreeConn(&conn); return version; } static char *normalizeVersion(char *db, char *acc, int *retFoundVersion) /* LRG accessions don't have versions, so just return the same acc and set *retFoundVersion to 0. * The user may give us a RefSeq accession with or without a .version. * If ncbiRefSeq tables are present, return acc with version, looking up version if necessary. * If we look it up and can't find it, this returns NULL. * If instead we're using genbank tables, return acc with no version. * That ensures that acc will be found in our local tables. */ { +if (trackHubDatabase(db)) + return NULL; char *normalizedAcc = NULL; int foundVersion = 0; if (startsWith("LRG_", acc)) { normalizedAcc = cloneString(acc); } else if (dbHasNcbiRefSeq(db)) { // ncbiRefSeq tables need versioned accessions. if (strchr(acc, '.')) normalizedAcc = cloneString(acc); else { char query[2048]; sqlSafef(query, sizeof(query), "select name from ncbiRefSeq where name like '%s.%%'", acc); struct sqlConnection *conn = hAllocConn(db); normalizedAcc = sqlQuickString(conn, query); hFreeConn(&conn); } if (isNotEmpty(normalizedAcc)) { char *p = strchr(normalizedAcc, '.'); assert(p); foundVersion = atoi(p+1); } } else { // genbank tables -- no version normalizedAcc = cloneFirstWordByDelimiter(acc, '.'); foundVersion = getGbVersion(db, normalizedAcc); } if (retFoundVersion) *retFoundVersion = foundVersion; return normalizedAcc; } static boolean hgvsValidateGene(char *db, struct hgvsVariant *hgvs, char **retFoundAcc, int *retFoundVersion, char **retDiffRefAllele) /* Return TRUE if hgvs coords are within the bounds of the sequence for hgvs->seqAcc. * Note: Coding terms may contain coords outside the bounds (upstream, intron, downstream) so * those can't be checked without mapping the term to the genome. * If retFoundAcc is not NULL, set it to our local accession (which may be missing the .version * of hgvs->seqAcc) or NULL if we can't find any match. * If retFoundVersion is not NULL and hgvs->seqAcc has a version number (e.g. NM_005957.4), * set retFoundVersion to our version from latest GenBank, otherwise 0 (no version for LRG). * If coords are OK and retDiffRefAllele is not NULL: if our sequence at the coords * matches hgvs->refAllele then set it to NULL; if mismatch then set it to our sequence. */ { char *acc = normalizeVersion(db, hgvs->seqAcc, retFoundVersion); if (isEmpty(acc)) return FALSE; boolean coordsOK = FALSE; char *accSeq = (hgvs->type == hgvstProtein ? getProteinSeq(db, acc) : getCdnaSeq(db, acc)); if (accSeq) { // By convention, foundAcc is always versionless because it's accompanied by foundVersion. if (retFoundAcc) *retFoundAcc = cloneFirstWordByDelimiter(acc, '.'); int seqLen = strlen(accSeq); if (hgvs->type == hgvstCoding) { // Coding term coords can extend beyond the bounds of the transcript so // we can't check them without mapping to the genome. However, if the coords // are in bounds and a reference allele is provided, we can check that. struct genbankCds cds; coordsOK = getCds(db, acc, &cds); if (coordsOK && retDiffRefAllele) { int start = hgvs->start1 + (hgvs->startIsUtr3 ? cds.end : cds.start); if (hgvs->startOffset == 0 && start >= 1 && start <= seqLen) checkRefAllele(hgvs, start, accSeq, retDiffRefAllele); } } else { if (hgvs->start1 >= 1 && hgvs->start1 <= seqLen && hgvs->end >= 1 && hgvs->end <= seqLen) { coordsOK = TRUE; if (retDiffRefAllele) checkRefAllele(hgvs, hgvs->start1, accSeq, retDiffRefAllele); } } } freeMem(accSeq); freeMem(acc); return coordsOK; } boolean hgvsValidate(char *db, struct hgvsVariant *hgvs, char **retFoundAcc, int *retFoundVersion, char **retDiffRefAllele) /* Return TRUE if hgvs coords are within the bounds of the sequence for hgvs->seqAcc. * Note: Coding terms may contain coords outside the bounds (upstream, intron, downstream) so * those can't be checked without mapping the term to the genome; this returns TRUE if seq is found. * If retFoundAcc is not NULL, set it to our local accession (which may be missing the .version * of hgvs->seqAcc) or NULL if we can't find any match. * If retFoundVersion is not NULL and hgvs->seqAcc has a version number (e.g. NM_005957.4), * set retFoundVersion to our version from latest GenBank, otherwise 0 (no version for LRG). * If coords are OK and retDiffRefAllele is not NULL: if our sequence at the coords * matches hgvs->refAllele then set it to NULL; if mismatch then set it to our sequence. */ { if (hgvs->type == hgvstGenomic || hgvs->type == hgvstMito) return hgvsValidateGenomic(db, hgvs, retFoundAcc, retFoundVersion, retDiffRefAllele); else return hgvsValidateGene(db, hgvs, retFoundAcc, retFoundVersion, retDiffRefAllele); } static struct bed3 *hgvsMapGDotToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable) /* Given an NC_*g. term, if we can map NC_ to chrom then return the region, else NULL. */ { struct bed3 *region = NULL; char *chrom = hgOfficialChromName(db, hgvs->seqAcc); if (isNotEmpty(chrom)) { AllocVar(region); region->chrom = chrom; region->chromStart = hgvs->start1 - 1; region->chromEnd = hgvs->end; } if (retPslTable) *retPslTable = NULL; return region; } static void hgvsCodingToZeroBasedHalfOpen(struct hgvsVariant *hgvs, int maxCoord, struct genbankCds *cds, int *retStart, int *retEnd, int *retUpstreamBases, int *retDownstreamBases) /* Convert a coding HGVS's start1 and end into UCSC coords plus upstream and downstream lengths * for when the coding HGVS has coordinates that extend beyond its sequence boundaries. * ret* args must be non-NULL. */ { // If hgvs->start1 is negative, it is effectively 0-based, so subtract 1 only if positive. int start = hgvs->start1; if (start > 0) start -= 1; // If hgvs->end is negative, it is effectively 0-based, so add 1 to get half-open end. int end = hgvs->end; if (end < 0) end += 1; // If the position follows '*' that means it's relative to cdsEnd; otherwise rel to cdsStart if (hgvs->startIsUtr3) *retStart = cds->end + start; else *retStart = cds->start + start; if (hgvs->endIsUtr3) *retEnd = cds->end + end; else *retEnd = cds->start + end; // Now check for coords that extend beyond the transcript('s alignment to the genome) if (*retStart < 0) { // hgvs->start1 is upstream of coding transcript. *retUpstreamBases = -*retStart; *retStart = 0; } else if (*retStart > maxCoord) { // Even the start coord is downstream of coding transcript -- make a negative "upstream" // for adjusting start. *retUpstreamBases = -(*retStart - maxCoord + 1); *retStart = maxCoord - 1; } else *retUpstreamBases = 0; if (*retEnd > maxCoord) { // hgvs->end is downstream of coding transcript. *retDownstreamBases = *retEnd - maxCoord; *retEnd = maxCoord; } else if (*retEnd < 0) { // Even the end coord is upstream of coding transcript -- make a negative "downstream" // for adjusting end. *retDownstreamBases = *retEnd; *retEnd = 0; } else *retDownstreamBases = 0; } static struct psl *pslFromHgvsNuc(struct hgvsVariant *hgvs, char *acc, int accSize, int accEnd, struct genbankCds *cds, int *retUpstreamBases, int *retDownstreamBases) /* Allocate and return a PSL modeling the variant in nucleotide sequence acc. * The PSL target is the sequence and the query is the changed part of the sequence. * accEnd is given in case accSize includes an unaligned poly-A tail. * If hgvs is coding ("c.") then the caller must pass in a valid cds. * In case the start or end position is outside the bounds of the sequence, set retUpstreamBases * or retDownstreamBases to the number of bases beyond the beginning or end of sequence. * NOTE: coding intron offsets are ignored; the PSL contains the exon anchor point * and the caller will have to map that to the genome and then apply the intron offset. */ { if (hgvs == NULL) return NULL; if (hgvs->type == hgvstProtein) errAbort("pslFromHgvsNuc must be called only on nucleotide HGVSs, not protein."); struct psl *psl; AllocVar(psl); psl->tName = cloneString(acc); safecpy(psl->strand, sizeof(psl->strand), "+"); psl->tSize = accSize; if (hgvs->type != hgvstCoding) { // Sane 1-based fully closed coords. psl->tStart = hgvs->start1 - 1; psl->tEnd = hgvs->end; } else { // Simple or insanely complex CDS-relative coords. hgvsCodingToZeroBasedHalfOpen(hgvs, accEnd, cds, &psl->tStart, &psl->tEnd, retUpstreamBases, retDownstreamBases); } int refLen = psl->tEnd - psl->tStart; // Just use refLen for alt until we parse the sequence change portion of the term: int altLen = refLen; psl->qName = cloneString(hgvs->change); psl->qSize = altLen; psl->qStart = 0; psl->qEnd = altLen; psl->blockCount = 1; AllocArray(psl->blockSizes, psl->blockCount); AllocArray(psl->qStarts, psl->blockCount); AllocArray(psl->tStarts, psl->blockCount); psl->blockSizes[0] = refLen <= altLen ? refLen : altLen; psl->qStarts[0] = psl->qStart; psl->tStarts[0] = psl->tStart; return psl; } static struct psl *mapPsl(char *db, struct hgvsVariant *hgvs, char *pslTable, char *acc, struct genbankCds *cds, int *retUpstream, int *retDownstream) /* If acc is found in pslTable, use pslTransMap to map hgvs onto the genome. */ { struct psl *mappedToGenome = NULL; char query[2048]; sqlSafef(query, sizeof(query), "select * from %s where qName = '%s'", pslTable, acc); struct sqlConnection *conn = hAllocConn(db); struct sqlResult *sr = sqlGetResult(conn, query); char **row; if (sr && (row = sqlNextRow(sr))) { int bin = 1; // All PSL tables used here, and any made in the future, use the bin column. struct psl *txAli = pslLoad(row+bin); // variantPsl contains the anchor if a non-cdsStart anchor is used because // the actual position might be outside the bounds of the transcript sequence (intron/UTR) struct psl *variantPsl = pslFromHgvsNuc(hgvs, acc, txAli->qSize, txAli->qEnd, cds, retUpstream, retDownstream); mappedToGenome = pslTransMap(pslTransMapNoOpts, variantPsl, txAli); pslFree(&variantPsl); pslFree(&txAli); } sqlFreeResult(&sr); hFreeConn(&conn); return mappedToGenome; } static char *pslTableForAcc(char *db, char *acc) /* Based on acc (and whether db has NCBI RefSeq alignments), pick a PSL table where - * acc should be found. Don't free the returned string. */ + * acc should be found (table may or may not exist). Don't free the returned string. */ { char *pslTable = NULL; if (startsWith("LRG_", acc)) pslTable = "lrgTranscriptAli"; else if (startsWith("NM_", acc)) { // Use NCBI's alignments if they are available if (dbHasNcbiRefSeq(db)) pslTable = "ncbiRefSeqPsl"; else pslTable = "refSeqAli"; } return pslTable; } #define limitToRange(val, min, max) { if (val < min) { val = min; } \ if (val > max) { val = max; } } static struct bed3 *pslAndFriendsToRegion(struct psl *psl, struct hgvsVariant *hgvs, int upstream, int downstream) /* If hgvs has any intron offsets and/or upstream/downstream offsets, add those to the * anchor coords in psl and return the variant's region of the genome. */ { struct bed3 *region = bed3New(psl->tName, psl->tStart, psl->tEnd); // If the start and/or end is in an intron, add the intron offset now. boolean revStrand = (psl->strand[0] == '-'); if (hgvs->startOffset != 0) { if (revStrand) region->chromEnd -= hgvs->startOffset; else region->chromStart += hgvs->startOffset; } if (hgvs->endOffset != 0) { if (revStrand) region->chromStart -= hgvs->endOffset; else region->chromEnd += hgvs->endOffset; } // Apply extra up/downstream offsets (usually 0) if (revStrand) { region->chromStart -= downstream; region->chromEnd += upstream; } else { region->chromStart -= upstream; region->chromEnd += downstream; } limitToRange(region->chromStart, 0, psl->tSize); limitToRange(region->chromEnd, 0, psl->tSize); return region; } static struct bed3 *hgvsMapNucToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable) /* Return a bed3 with the variant's span on the genome, or NULL if unable to map. * If successful and retPslTable is not NULL, set it to the name of the PSL table used. */ { char *acc = normalizeVersion(db, hgvs->seqAcc, NULL); if (isEmpty(acc)) return NULL; if (hgvs->type == hgvstGenomic) return hgvsMapGDotToGenome(db, hgvs, retPslTable); struct bed3 *region = NULL; char *pslTable = pslTableForAcc(db, acc); struct genbankCds cds; boolean gotCds = (hgvs->type == hgvstCoding) ? getCds(db, acc, &cds) : FALSE; if (pslTable && (hgvs->type != hgvstCoding || gotCds) && hTableExists(db, pslTable)) { int upstream = 0, downstream = 0; struct psl *mappedToGenome = mapPsl(db, hgvs, pslTable, acc, &cds, &upstream, &downstream); // As of 9/26/16, ncbiRefSeqPsl is missing some items (#13673#note-443) -- so fall back // on UCSC alignments. if (!mappedToGenome && sameString(pslTable, "ncbiRefSeqPsl") && hTableExists(db, "refSeqAli")) { char *accNoVersion = cloneFirstWordByDelimiter(acc, '.'); gotCds = (hgvs->type == hgvstCoding) ? getCds(db, accNoVersion, &cds) : FALSE; if (hgvs->type != hgvstCoding || gotCds) mappedToGenome = mapPsl(db, hgvs, "refSeqAli", accNoVersion, &cds, &upstream, &downstream); if (mappedToGenome) { pslTable = "refSeqAli"; acc = accNoVersion; } } if (mappedToGenome) { region = pslAndFriendsToRegion(mappedToGenome, hgvs, upstream, downstream); pslFree(&mappedToGenome); } } if (region && retPslTable) *retPslTable = cloneString(pslTable); return region; } static struct bed3 *hgvsMapPDotToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable) /* Return a bed3 with the variant's span on the genome, or NULL if unable to map. * If successful and retPslTable is not NULL, set it to the name of the PSL table used. */ { struct bed3 *region = NULL; char *acc = normalizeVersion(db, hgvs->seqAcc, NULL); if (acc && startsWith("NP_", acc)) { // Translate the NP_*:p. to NM_*:c. and map NM_*:c. to the genome. struct sqlConnection *conn = hAllocConn(db); char query[2048]; if (dbHasNcbiRefSeq(db)) sqlSafef(query, sizeof(query), "select mrnaAcc from ncbiRefSeqLink where protAcc = '%s'", acc); - else + else if (hTableExists(db, "refGene")) sqlSafef(query, sizeof(query), "select mrnaAcc from %s l, refGene r " "where l.protAcc = '%s' and r.name = l.mrnaAcc", refLinkTable, acc); + else + return NULL; char *nmAcc = sqlQuickString(conn, query); hFreeConn(&conn); if (nmAcc) { struct hgvsVariant cDot; zeroBytes(&cDot, sizeof(cDot)); cDot.type = hgvstCoding; cDot.seqAcc = nmAcc; cDot.start1 = ((hgvs->start1 - 1) * 3) + 1; cDot.end = ((hgvs->end - 1) * 3) + 3; cDot.change = hgvs->change; region = hgvsMapNucToGenome(db, &cDot, retPslTable); freeMem(nmAcc); } } return region; } struct bed3 *hgvsMapToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable) /* Return a bed3 with the variant's span on the genome, or NULL if unable to map. * If successful and retPslTable is not NULL, set it to the name of the PSL table used. */ { if (hgvs == NULL) return NULL; struct bed3 *region = NULL; if (hgvs->type == hgvstProtein) region = hgvsMapPDotToGenome(db, hgvs, retPslTable); else region = hgvsMapNucToGenome(db, hgvs, retPslTable); return region; }