43db133d50ad5cd505cc1ea853b701b271054e91 angie Mon Sep 26 16:56:04 2016 -0700 Added support for HGVS-like syntax that b0b got from a user: no '.' and NM_ accession with protein change description. Also, if the new ncbiRefSeq* tables are present, use those instead of refSeqAli and Genbank metadata tables. At this point, ncbiRefSeqPsl is missing some transcripts that are present in other ncbiRefSeq* tables, so if an accession is not found there, try again in refSeqAli and Genbank. refs #15071, #13673 note-443 diff --git src/hg/lib/hgHgvs.c src/hg/lib/hgHgvs.c index 3074a77..ff10445 100644 --- src/hg/lib/hgHgvs.c +++ src/hg/lib/hgHgvs.c @@ -1,1033 +1,1235 @@ /* hgHgvs.c - Parse the Human Genome Variation Society (HGVS) nomenclature for variants. */ /* See http://varnomen.hgvs.org/ and https://github.com/mutalyzer/mutalyzer/ */ /* Copyright (C) 2016 The Regents of the University of California * See README in this or parent directory for licensing information. */ #include "common.h" #include "hgHgvs.h" #include "chromInfo.h" #include "genbank.h" #include "hdb.h" #include "pslTransMap.h" #include "regexHelper.h" // Regular expressions for HGVS-recognized sequence accessions: LRG or versioned RefSeq: #define lrgTranscriptExp "(LRG_[0-9]+t[0-9+])" #define lrgRegionExp "(LRG_[0-9+])" // NC = RefSeq reference assembly chromosome // NG = RefSeq incomplete genomic region (e.g. gene locus) // NM = RefSeq curated mRNA // NP = RefSeq curated protein // NR = RefSeq curated (non-coding) RNA #define geneSymbolExp "([A-Za-z0-9./_-]+)" #define optionalGeneSymbolExp "(\\(" geneSymbolExp "\\))?" #define versionedAccPrefixExp(p) "(" p "_[0-9]+(\\.[0-9]+)?)" optionalGeneSymbolExp // ........................ accession and optional dot version // ........... optional dot version // ...... optional gene symbol in ()s // .... optional gene symbol #define versionedRefSeqNCExp versionedAccPrefixExp("NC") #define versionedRefSeqNGExp versionedAccPrefixExp("NG") #define versionedRefSeqNMExp versionedAccPrefixExp("NM") #define versionedRefSeqNPExp versionedAccPrefixExp("NP") #define versionedRefSeqNRExp versionedAccPrefixExp("NR") // Nucleotide position regexes // (c. = CDS, g. = genomic, m. = mitochondrial, n.= non-coding RNA, r. = RNA) #define posIntExp "([0-9]+)" #define hgvsNtPosExp posIntExp "(_" posIntExp ")?" // ...... 1-based start position // ............. optional range separator and end position // ...... 1-based end position // Now for a regex that can handle positions like "26" or "*80" or "-24+75_-24+92"... #define anchorExp "([-*])?" #define offsetExp "([-+])" #define complexNumExp anchorExp posIntExp "(" offsetExp posIntExp ")?" #define hgvsCdsPosExp complexNumExp "(_" complexNumExp ")?" // ... optional anchor "-" or "*" // ... mandatory first number // ....... optional offset separator and offset // ... offset separator // ... offset number // ............... optional range separator and complex num // ... optional anchor "-" or "*" // ... first number // ....... optional offset separator and offset // ... offset separator // ... offset number -#define hgvsCDotPosExp "c\\." hgvsCdsPosExp -#define hgvsGDotPosExp "g\\." hgvsNtPosExp -#define hgvsMDotPosExp "m\\." hgvsNtPosExp -#define hgvsNDotPosExp "n\\." hgvsNtPosExp -#define hgvsRDotPosExp "r\\." hgvsNtPosExp +// It's pretty common for users to omit the '.' so if it's missing but the rest of the regex fits, +// roll with it. +#define hgvsCDotPosExp "c\\.?" hgvsCdsPosExp +#define hgvsGDotPosExp "g\\.?" hgvsNtPosExp +#define hgvsMDotPosExp "m\\.?" hgvsNtPosExp +#define hgvsNDotPosExp "n\\.?" hgvsNtPosExp +#define hgvsRDotPosExp "r\\.?" hgvsNtPosExp // Protein substitution regex #define aa3Exp "Ala|Arg|Asn|Asp|Cys|Gln|Glu|Gly|His|Ile|Leu|Lys|Met|Phe|Pro|Ser|Thr|Trp|Tyr|Val|Ter" #define hgvsAminoAcidExp "[ARNDCQEGHILKMFPSTWYVX*]|" aa3Exp #define hgvsAminoAcidSubstExp "(" hgvsAminoAcidExp ")" posIntExp "(" hgvsAminoAcidExp "|=)" #define hgvsPDotSubstExp "p\\." hgvsAminoAcidSubstExp // ... // original sequence // ...... // 1-based position // ... // replacement sequence // Protein range (or just single pos) regex #define hgvsAaRangeExp "(" hgvsAminoAcidExp ")" posIntExp "(_(" hgvsAminoAcidExp ")" posIntExp ")?(.*)" #define hgvsPDotRangeExp "p\\." hgvsAaRangeExp // original start AA ... // 1-based start position ... // optional range sep and AA+pos ..................................... // original end AA ... // 1-based end position ... // change description ... // Complete HGVS term regexes combining sequence identifiers and change operations #define hgvsFullRegex(seq, op) "^" seq ":" op #define hgvsRefSeqNCGDotPosExp hgvsFullRegex(versionedRefSeqNCExp, hgvsGDotPosExp) #define hgvsRefSeqNCGDotExp hgvsRefSeqNCGDotPosExp "(.*)" // substring numbering: // 0..................................... whole matching string // 1................. accession and optional dot version // 2........ optional dot version // 3...... (n/a) optional gene symbol in ()s // 4.... (n/a) optional gene symbol // 5... 1-based start position // 6.... optional range separator and end position // 7... 1-based end position // 8.... change description #define hgvsLrgCDotPosExp hgvsFullRegex(lrgTranscriptExp, hgvsCDotPosExp) #define hgvsLrgCDotExp hgvsLrgCDotPosExp "(.*)" // substring numbering: // 0..................................................... whole matching string // 1................... LRG transcript // 2... optional anchor "-" or "*" // 3... mandatory first number // 4....... optional offset separator and offset // 5... offset separator // 6... offset number // 7............... optional range sep and complex num // 8... optional anchor "-" or "*" // 9... first number // 10....... optional offset separator and offset // 11... offset separator // 12... offset number // 13........ sequence change #define hgvsRefSeqNMCDotPosExp hgvsFullRegex(versionedRefSeqNMExp, hgvsCDotPosExp) #define hgvsRefSeqNMCDotExp hgvsRefSeqNMCDotPosExp "(.*)" // substring numbering: // 0............................................................... whole matching string // 1............... acc & optional dot version // 2........ optional dot version // 3..... optional gene sym in ()s // 4... optional gene symbol // 5... optional anchor // 6... mandatory first number // 7....... optional offset // 8... offset separator // 9... offset number // 10............... optional range // 11... optional anchor // 12... first number // 13....... optional offset // 14... offset separator // 15... offset number // 16........ sequence change #define hgvsRefSeqNPPDotSubstExp hgvsFullRegex(versionedRefSeqNPExp, hgvsPDotSubstExp) // substring numbering: // 0..................................................... whole matching string // 1................ accession and optional dot version // 2..... optional dot version // 3..... optional gene symbol in ()s // 4... optional gene symbol // 5..... original sequence // 6...... 1-based position // 7...... replacement sequence #define hgvsRefSeqNPPDotRangeExp hgvsFullRegex(versionedRefSeqNPExp, hgvsPDotRangeExp) // substring numbering: // 0..................................................... whole matching string // 1................ accession and optional dot version // 2..... optional dot version // 3..... optional gene symbol in ()s // 4... optional gene symbol // 5... original start AA // 6... 1-based start position // 7................. optional range sep and AA+pos // 8... original end AA // 9... 1-based end position // 10..... change description // Pseudo-HGVS in common usage +// Sometimes users give an NM_ accession, but a protein change. +#define pseudoHgvsNMPDotSubstExp "^" versionedRefSeqNMExp "[: p.]+" hgvsAminoAcidSubstExp +// substring numbering: +// 0..................................................... whole matching string +// 1............... acc & optional dot version +// 2........ optional dot version +// 3..... optional gene sym in ()s +// 4... optional gene symbol +// 5..... original sequence +// 6...... 1-based position +// 7...... replacement sequence + +#define pseudoHgvsNMPDotRangeExp "^" versionedRefSeqNMExp "[: p.]+" hgvsAaRangeExp +// substring numbering: +// 0..................................................... whole matching string +// 1............... acc & optional dot version +// 2........ optional dot version +// 3..... optional gene sym in ()s +// 4... optional gene symbol +// 5... original start AA +// 6... 1-based start position +// 7.......... optional range sep and AA+pos +// 8... original end AA +// 9... 1-based end position +// 10.... change description + +// Common: gene symbol followed by space and/or punctuation followed by protein change #define pseudoHgvsGeneSymbolProtSubstExp "^" geneSymbolExp "[: p.]+" hgvsAminoAcidSubstExp // 0..................................................... whole matching string // 1................... gene symbol // 2..... original sequence // 3...... 1-based position // 4...... replacement sequence #define pseudoHgvsGeneSymbolProtRangeExp "^" geneSymbolExp "[: p.]+" hgvsAaRangeExp // 0..................................................... whole matching string // 1................... gene symbol // 2... original start AA // 3... 1-based start position // 4................ optional range sep and AA+pos // 5... original end AA // 6... 1-based end position // 7..... change description +// As above but omitting the protein change #define pseudoHgvsGeneSymbolProtPosExp "^" geneSymbolExp "[: p.]+" posIntExp // 0.......................... whole matching string // 1................... gene symbol // 2..... 1-based position +// Gene symbol, maybe punctuation, and a clear "c." position (and possibly change) #define pseudoHgvsGeneSympolCDotPosExp "^" geneSymbolExp "[: ]+" hgvsCDotPosExp // 0..................................................... whole matching string // 1................... gene symbol // 2..... optional beginning of position exp // 3..... beginning of position exp static struct hgvsVariant *hgvsParseGDotPos(char *term) /* If term is parseable as an HGVS NC_...g. term, return the parsed representation, else NULL. */ { struct hgvsVariant *hgvs = NULL; regmatch_t substrs[17]; if (regexMatchSubstr(term, hgvsRefSeqNCGDotExp, substrs, ArraySize(substrs))) { int accIx = 1; int startPosIx = 5; int endPosIx = 7; int changeIx = 8; AllocVar(hgvs); hgvs->type = hgvstGenomic; hgvs->seqAcc = regexSubstringClone(term, substrs[accIx]); hgvs->start1 = regexSubstringInt(term, substrs[startPosIx]); if (regexSubstrMatched(substrs[endPosIx])) hgvs->end = regexSubstringInt(term, substrs[endPosIx]); else hgvs->end = hgvs->start1; hgvs->change = regexSubstringClone(term, substrs[changeIx]); } return hgvs; } static void extractComplexNum(char *term, regmatch_t *substrs, int substrOffset, boolean *retIsUtr3, int *retPos, int *retOffset) /* Extract matches from substrs starting at substrOffset to parse a complex number * description into anchor type, 1-based position and optional offset. */ { int anchorIx = substrOffset; int posIx = substrOffset + 1; int offsetOpIx = substrOffset + 3; int offsetIx = substrOffset + 4; // Determine what startPos is relative to, based on optional anchor and optional offset char anchor[16]; // string length 0 or 1 regexSubstringCopy(term, substrs[anchorIx], anchor, sizeof(anchor)); char offsetOp[16]; // string length 0 or 1 regexSubstringCopy(term, substrs[offsetOpIx], offsetOp, sizeof(offsetOp)); *retIsUtr3 = (anchor[0] == '*'); *retPos = regexSubstringInt(term, substrs[posIx]); // Is pos negative? if (anchor[0] == '-') *retPos = -*retPos; // Grab offset, if there is one. if (isNotEmpty(offsetOp)) { *retOffset = regexSubstringInt(term, substrs[offsetIx]); if (offsetOp[0] == '-') *retOffset = -*retOffset; } } static struct hgvsVariant *hgvsParseCDotPos(char *term) /* If term is parseable as an HGVS CDS term, return the parsed representation, otherwise NULL. */ { struct hgvsVariant *hgvs = NULL; boolean matches = FALSE; int accIx = 1; int startAnchorIx = 2; int endAnchorIx = 8; int endPosIx = 9; int changeIx = 13; // The LRG accession regex has only one substring but the RefSeq acc regex has 4, so that // affects all substring offsets after the accession. int refSeqExtra = 3; int geneSymbolIx = -1; regmatch_t substrs[17]; if (regexMatchSubstr(term, hgvsLrgCDotExp, substrs, ArraySize(substrs))) { matches = TRUE; } else if (regexMatchSubstr(term, hgvsRefSeqNMCDotExp, substrs, ArraySize(substrs))) { matches = TRUE; geneSymbolIx = 4; startAnchorIx += refSeqExtra; endAnchorIx += refSeqExtra; endPosIx += refSeqExtra; changeIx += refSeqExtra; } if (matches) { AllocVar(hgvs); hgvs->type = hgvstCoding; hgvs->seqAcc = regexSubstringClone(term, substrs[accIx]); extractComplexNum(term, substrs, startAnchorIx, &hgvs->startIsUtr3, &hgvs->start1, &hgvs->startOffset); if (geneSymbolIx >= 0 && regexSubstrMatched(substrs[geneSymbolIx])) hgvs->seqGeneSymbol = regexSubstringClone(term, substrs[geneSymbolIx]); if (regexSubstrMatched(substrs[endPosIx])) { extractComplexNum(term, substrs, endAnchorIx, &hgvs->endIsUtr3, &hgvs->end, &hgvs->endOffset); } else { hgvs->end = hgvs->start1; hgvs->endIsUtr3 = hgvs->startIsUtr3; hgvs->endOffset = hgvs->startOffset; } hgvs->change = regexSubstringClone(term, substrs[changeIx]); } return hgvs; } static struct hgvsVariant *hgvsParsePDotSubst(char *term) /* If term is parseable as an HGVS protein substitution, return the parsed representation, * otherwise NULL. */ { struct hgvsVariant *hgvs = NULL; regmatch_t substrs[8]; if (regexMatchSubstr(term, hgvsRefSeqNPPDotSubstExp, substrs, ArraySize(substrs))) { int accIx = 1; int geneSymbolIx = 4; int refIx = 5; int posIx = 6; int altIx = 7; AllocVar(hgvs); hgvs->type = hgvstProtein; hgvs->seqAcc = regexSubstringClone(term, substrs[accIx]); int changeStart = substrs[refIx].rm_so; int changeEnd = substrs[altIx].rm_eo; int len = changeEnd - changeStart; char change[len+1]; safencpy(change, sizeof(change), term+changeStart, len); hgvs->change = cloneString(change); if (regexSubstrMatched(substrs[geneSymbolIx])) hgvs->seqGeneSymbol = regexSubstringClone(term, substrs[geneSymbolIx]); hgvs->start1 = regexSubstringInt(term, substrs[posIx]); hgvs->end = hgvs->start1; } return hgvs; } static struct hgvsVariant *hgvsParsePDotRange(char *term) /* If term is parseable as an HGVS protein range change, return the parsed representation, * otherwise NULL. */ { struct hgvsVariant *hgvs = NULL; regmatch_t substrs[11]; if (regexMatchSubstr(term, hgvsRefSeqNPPDotRangeExp, substrs, ArraySize(substrs))) { int accIx = 1; int geneSymbolIx = 4; int startRefIx = 5; int startPosIx = 6; int endPosIx = 9; int changeIx = 10; AllocVar(hgvs); hgvs->type = hgvstProtein; hgvs->seqAcc = regexSubstringClone(term, substrs[accIx]); int changeStart = substrs[startRefIx].rm_so; int changeEnd = substrs[changeIx].rm_eo; int len = changeEnd - changeStart; char change[len+1]; safencpy(change, sizeof(change), term+changeStart, len); hgvs->change = cloneString(change); if (regexSubstrMatched(substrs[geneSymbolIx])) hgvs->seqGeneSymbol = regexSubstringClone(term, substrs[geneSymbolIx]); hgvs->start1 = regexSubstringInt(term, substrs[startPosIx]); if (regexSubstrMatched(substrs[endPosIx])) hgvs->end = regexSubstringInt(term, substrs[endPosIx]); else hgvs->end = hgvs->start1; } return hgvs; } struct hgvsVariant *hgvsParseTerm(char *term) /* If term is a parseable form of HGVS, return the parsed representation, otherwise NULL. * This does not check validity of accessions, coordinates or alleles. */ { struct hgvsVariant *hgvs = hgvsParseCDotPos(term); if (hgvs == NULL) hgvs = hgvsParsePDotSubst(term); if (hgvs == NULL) hgvs = hgvsParsePDotRange(term); if (hgvs == NULL) hgvs = hgvsParseGDotPos(term); return hgvs; } +static boolean dbHasNcbiRefSeq(char *db) +/* Test whether NCBI's RefSeq alignments are available in db. */ +{ +// hTableExists() caches results so this shouldn't make for loads of new SQL queries if called +// more than once. +return (hTableExists(db, "ncbiRefSeq") && hTableExists(db, "ncbiRefSeqPsl") && + hTableExists(db, "ncbiRefSeqCds") && hTableExists(db, "ncbiRefSeqLink") && + hTableExists(db, "ncbiRefSeqPepTable") && + hTableExists(db, "seqNcbiRefSeq") && hTableExists(db, "extNcbiRefSeq")); +} + static char *npForGeneSymbol(char *db, char *geneSymbol) -/* Given a gene symbol, look it up to find its NP_ accession; if not found return NULL. */ +/* Given a gene symbol, look up and return its NP_ accession; if not found return NULL. */ { struct sqlConnection *conn = hAllocConn(db); char query[2048]; -// Use refGene to make sure it's an NP for this species. +if (dbHasNcbiRefSeq(db)) + { + sqlSafef(query, sizeof(query), "select protAcc from ncbiRefSeqLink where name = '%s' " + "and protAcc != 'n/a' and protAcc != '' " + "order by length(protAcc), protAcc", + geneSymbol); + } +else + { + // Join with refGene to make sure it's an NP for this species. sqlSafef(query, sizeof(query), "select l.protAcc from %s l, refGene r " "where l.name = '%s' and l.mrnaAcc = r.name " "and l.protAcc != '' order by length(l.protAcc), l.protAcc" , refLinkTable, geneSymbol); + } char *npAcc = sqlQuickString(conn, query); hFreeConn(&conn); return npAcc; } static char *nmForGeneSymbol(char *db, char *geneSymbol) -/* Given a gene symbol, look it up to find its NM_ accession; if not found return NULL. */ +/* Given a gene symbol, look up and return its NM_ accession; if not found return NULL. */ { struct sqlConnection *conn = hAllocConn(db); char query[2048]; -sqlSafef(query, sizeof(query), "select name from refGene where name2 = '%s' " - "order by length(name), name", geneSymbol); +char *geneTable = (dbHasNcbiRefSeq(db) ? "ncbiRefSeq" : "refGene"); +sqlSafef(query, sizeof(query), "select name from %s where name2 = '%s' " + "order by length(name), name", geneTable, geneSymbol); char *nmAcc = sqlQuickString(conn, query); hFreeConn(&conn); return nmAcc; } -static char *getProteinSeq(char *db, char *acc, char **retFoundAcc) -/* Return amino acid sequence for acc, or NULL if not found. If retFoundAcc is not NULL, - * set it to the accession for which we grabbed seq (might have a .version chopped off). */ +static char *npForNm(char *db, char *nmAcc) +/* Given an NM_ accession, look up and return its NP_ accession; if not found return NULL. */ +{ +struct sqlConnection *conn = hAllocConn(db); +char query[2048]; +if (dbHasNcbiRefSeq(db)) + { + // ncbiRefSeq tables use versioned NM_ accs, but the user might have passed in a + // versionless NM_, so adjust query accordingly: + if (strchr(nmAcc, '.')) + sqlSafef(query, sizeof(query), "select protAcc from ncbiRefSeqLink where id = '%s'", nmAcc); + else + sqlSafef(query, sizeof(query), "select protAcc from ncbiRefSeqLink where id like '%s.%%'", + nmAcc); + } +else + { + // Trim .version if present since our genbank tables don't use versioned names. + char *trimmed = cloneFirstWordByDelimiter(nmAcc, '.'); + sqlSafef(query, sizeof(query), "select l.protAcc from %s l, refGene r " + "where r.name = '%s' and l.mrnaAcc = r.name " + "and l.protAcc != '' order by length(l.protAcc), l.protAcc", + refLinkTable, trimmed); + } +char *npAcc = sqlQuickString(conn, query); +hFreeConn(&conn); +return npAcc; +} + +static char *getProteinSeq(char *db, char *acc) +/* Return amino acid sequence for acc, or NULL if not found. */ { char *seq = NULL; -char *foundAcc = NULL; if (startsWith("LRG_", acc)) { if (hTableExists(db, "lrgPep")) { char query[2048]; sqlSafef(query, sizeof(query), "select seq from lrgPep where name = '%s'", acc); struct sqlConnection *conn = hAllocConn(db); seq = sqlQuickString(conn, query); hFreeConn(&conn); - if (seq) - foundAcc = cloneString(acc); } } else { - char *trimmedAcc = cloneFirstWordByDelimiter(acc, '.'); - aaSeq *aaSeq = hGenBankGetPep(db, trimmedAcc, gbSeqTable); - if (aaSeq) + if (dbHasNcbiRefSeq(db)) { + char query[2048]; + sqlSafef(query, sizeof(query), "select seq from ncbiRefSeqPepTable " + "where name = '%s'", acc); + struct sqlConnection *conn = hAllocConn(db); + seq = sqlQuickString(conn, query); + hFreeConn(&conn); + } + else + { + aaSeq *aaSeq = hGenBankGetPep(db, acc, gbSeqTable); + if (aaSeq) seq = aaSeq->dna; - foundAcc = trimmedAcc; } } -if (retFoundAcc) - *retFoundAcc = foundAcc; + return seq; } static char refBaseForNp(char *db, char *npAcc, int pos) // Get the amino acid base in NP_'s sequence at 1-based offset pos. { -char *seq = getProteinSeq(db, npAcc, NULL); +char *seq = getProteinSeq(db, npAcc); char base = seq[pos-1]; freeMem(seq); return base; } struct hgvsVariant *hgvsParsePseudoHgvs(char *db, char *term) /* Attempt to parse things that are not strict HGVS, but that people might intend as HGVS: */ // Note: this doesn't support non-coding gene symbol terms (which should have nt alleles) { struct hgvsVariant *hgvs = NULL; regmatch_t substrs[11]; int geneSymbolIx = 1; -boolean isSubst = regexMatchSubstr(term, pseudoHgvsGeneSymbolProtSubstExp, - substrs, ArraySize(substrs)); -if (isSubst || +boolean isSubst; +if ((isSubst = regexMatchSubstr(term, pseudoHgvsNMPDotSubstExp, + substrs, ArraySize(substrs))) || + regexMatchSubstr(term, pseudoHgvsNMPDotRangeExp, substrs, ArraySize(substrs))) + { + // User gave an NM_ accession but a protein change -- swap in the right NP_. + int nmAccIx = 1; + int geneSymbolIx = 4; + int len = substrs[nmAccIx].rm_eo - substrs[nmAccIx].rm_so; + char nmAcc[len+1]; + safencpy(nmAcc, sizeof(nmAcc), term, len); + char *npAcc = npForNm(db, nmAcc); + if (isNotEmpty(npAcc)) + { + // Make it a real HGVS term with the NP and pass that on to the usual parser. + int descStartIx = 5; + char *description = term + substrs[descStartIx].rm_so; + struct dyString *npTerm; + if (regexSubstrMatched(substrs[geneSymbolIx])) + { + len = substrs[geneSymbolIx].rm_eo - substrs[geneSymbolIx].rm_so; + char geneSymbol[len+1]; + safencpy(geneSymbol, sizeof(geneSymbol), term, len); + npTerm = dyStringCreate("%s(%s):p.%s", npAcc, geneSymbol, description); + } + else + npTerm = dyStringCreate("%s:p.%s", npAcc, description); + hgvs = hgvsParseTerm(npTerm->string); + dyStringFree(&npTerm); + freeMem(npAcc); + } + } +else if ((isSubst = regexMatchSubstr(term, pseudoHgvsGeneSymbolProtSubstExp, + substrs, ArraySize(substrs))) || regexMatchSubstr(term, pseudoHgvsGeneSymbolProtRangeExp, substrs, ArraySize(substrs))) { int len = substrs[geneSymbolIx].rm_eo - substrs[geneSymbolIx].rm_so; char geneSymbol[len+1]; safencpy(geneSymbol, sizeof(geneSymbol), term, len); char *npAcc = npForGeneSymbol(db, geneSymbol); if (isNotEmpty(npAcc)) { // Make it a real HGVS term with the NP and pass that on to the usual parser. int descStartIx = 2; char *description = term + substrs[descStartIx].rm_so; struct dyString *npTerm = dyStringCreate("%s(%s):p.%s", npAcc, geneSymbol, description); hgvs = hgvsParseTerm(npTerm->string); dyStringFree(&npTerm); + freeMem(npAcc); } } else if (regexMatchSubstr(term, pseudoHgvsGeneSymbolProtPosExp, substrs, ArraySize(substrs))) { int len = substrs[geneSymbolIx].rm_eo - substrs[geneSymbolIx].rm_so; char geneSymbol[len+1]; safencpy(geneSymbol, sizeof(geneSymbol), term, len); char *npAcc = npForGeneSymbol(db, geneSymbol); if (isNotEmpty(npAcc)) { // Only position was provided, no change. Look up ref base and make a synonymous subst // so it's parseable HGVS. int posIx = 2; int pos = regexSubstringInt(term, substrs[posIx]); char refBase = refBaseForNp(db, npAcc, pos); struct dyString *npTerm = dyStringCreate("%s(%s):p.%c%d=", npAcc, geneSymbol, refBase, pos); hgvs = hgvsParseTerm(npTerm->string); dyStringFree(&npTerm); + freeMem(npAcc); } } else if (regexMatchSubstr(term, pseudoHgvsGeneSympolCDotPosExp, substrs, ArraySize(substrs))) { int len = substrs[geneSymbolIx].rm_eo - substrs[geneSymbolIx].rm_so; char geneSymbol[len+1]; safencpy(geneSymbol, sizeof(geneSymbol), term, len); char *nmAcc = nmForGeneSymbol(db, geneSymbol); if (isNotEmpty(nmAcc)) { // Make it a real HGVS term with the NM and pass that on to the usual parser. int descStartIx = regexSubstrMatched(substrs[2]) ? 2 : 3; char *description = term + substrs[descStartIx].rm_so; struct dyString *nmTerm = dyStringCreate("%s(%s):c.%s", nmAcc, geneSymbol, description); hgvs = hgvsParseTerm(nmTerm->string); dyStringFree(&nmTerm); + freeMem(nmAcc); } } return hgvs; } static char refBaseFromNucSubst(char *change) /* If sequence change is a nucleotide substitution, return the reference base, else null char. */ { if (regexMatchNoCase(change, "^([ACGTU])>")) return toupper(change[0]); return '\0'; } static boolean hgvsValidateGenomic(char *db, struct hgvsVariant *hgvs, char **retFoundAcc, int *retFoundVersion, char **retDiffRefAllele) /* Return TRUE if hgvs->seqAcc can be mapped to a chrom in db and coords are legal for the chrom. * If retFoundAcc is non-NULL, set it to the versionless seqAcc if chrom is found, else NULL. * If retFoundVersion is non-NULL, set it to seqAcc's version if chrom is found, else NULL. * If retDiffRefAllele is non-NULL and hgvs specifies a reference allele then compare it to * the genomic sequence at that location; set *retDiffRefAllele to NULL if they match, or to * genomic sequence if they differ. */ { boolean coordsOK = FALSE; if (retDiffRefAllele) *retDiffRefAllele = NULL; if (retFoundAcc) *retFoundAcc = NULL; if (retFoundVersion) *retFoundVersion = 0; char *chrom = hgOfficialChromName(db, hgvs->seqAcc); if (isNotEmpty(chrom)) { struct chromInfo *ci = hGetChromInfo(db, chrom); if ((hgvs->start1 >= 1 && hgvs->start1 <= ci->size) && (hgvs->end >=1 && hgvs->end <= ci->size)) { coordsOK = TRUE; if (retDiffRefAllele) { char hgvsBase = refBaseFromNucSubst(hgvs->change); if (hgvsBase != '\0') { struct dnaSeq *refBase = hFetchSeq(ci->fileName, chrom, hgvs->start1-1, hgvs->start1); touppers(refBase->dna); if (refBase->dna[0] != hgvsBase) *retDiffRefAllele = cloneString(refBase->dna); - freeMem(refBase); + dnaSeqFree(&refBase); } } } // Since we found hgvs->seqAcc, split it into versionless and version parts. if (retFoundAcc) *retFoundAcc = cloneFirstWordByDelimiter(hgvs->seqAcc, '.'); if (retFoundVersion) { char *p = strchr(hgvs->seqAcc, '.'); if (p) *retFoundVersion = atoi(p+1); } // Don't free chromInfo, it's cached! } return coordsOK; } -static char *getCdnaSeq(char *db, char *acc, char **retFoundAcc) -/* Return cdna sequence for acc, or NULL if not found. If retFoundAcc is not NULL, - * set it to the accession for which we grabbed seq (might have a .version chopped off). */ +static char *getCdnaSeq(char *db, char *acc) +/* Return cdna sequence for acc, or NULL if not found. */ { char *seq = NULL; -char *foundAcc = NULL; if (startsWith("LRG_", acc)) { if (hTableExists(db, "lrgCdna")) { char query[2048]; sqlSafef(query, sizeof(query), "select seq from lrgCdna where name = '%s'", acc); struct sqlConnection *conn = hAllocConn(db); seq = sqlQuickString(conn, query); hFreeConn(&conn); - if (seq) - foundAcc = cloneString(acc); } } else { - char *trimmedAcc = cloneFirstWordByDelimiter(acc, '.'); - struct dnaSeq *gbSeq = hGenBankGetMrna(db, trimmedAcc, gbSeqTable); - if (gbSeq) - { - seq = gbSeq->dna; - foundAcc = trimmedAcc; - } + struct dnaSeq *cdnaSeq = NULL; + if (dbHasNcbiRefSeq(db)) + cdnaSeq = hDnaSeqGet(db, acc, "seqNcbiRefSeq", "extNcbiRefSeq"); + else + cdnaSeq = hGenBankGetMrna(db, acc, gbSeqTable); + if (cdnaSeq) + seq = cdnaSeq->dna; } -if (retFoundAcc) - *retFoundAcc = foundAcc; return seq; } static boolean getCds(char *db, char *acc, struct genbankCds *retCds) /* Get the CDS info for genbank or LRG acc; return FALSE if not found or not applicable. */ { boolean gotCds = FALSE; char query[1024]; if (startsWith("LRG_", acc)) sqlSafef(query, sizeof(query), "select cds from lrgCds where id = '%s'", acc); +else if (dbHasNcbiRefSeq(db) && + // This is a hack to allow us to fall back on refSeqAli if ncbiRefSeqPsl is incomplete: + strchr(acc, '.')) + sqlSafef(query, sizeof(query), + "select cds from ncbiRefSeqCds where id = '%s'", acc); else sqlSafef(query, sizeof(query), "SELECT c.name FROM %s as c, %s as g WHERE (g.acc = '%s') AND " "(g.cds != 0) AND (g.cds = c.id)", cdsTable, gbCdnaInfoTable, acc); struct sqlConnection *conn = hAllocConn(db); char cdsBuf[2048]; char *cdsStr = sqlQuickQuery(conn, query, cdsBuf, sizeof(cdsBuf)); hFreeConn(&conn); if (isNotEmpty(cdsStr)) gotCds = (genbankCdsParse(cdsStr, retCds) && retCds->startComplete && retCds->start != retCds->end); return gotCds; } -static int getGbVersion(char *db, char *acc) -/* Return the local version that we have for acc. */ -{ -char query[2048]; -sqlSafef(query, sizeof(query), "select version from %s where acc = '%s'", gbSeqTable, acc); -struct sqlConnection *conn = hAllocConn(db); -int version = sqlQuickNum(conn, query); -hFreeConn(&conn); -return version; -} - static char refBaseFromProt(char *change) /* If change starts with an amino acid 3-letter or 1-letter code then return the 1-letter code, * else null char. */ { regmatch_t substrs[1]; if (regexMatchSubstr(change, "^" hgvsAminoAcidExp, substrs, ArraySize(substrs))) { char match[256]; // 1- or 3-letter code regexSubstringCopy(change, substrs[0], match, sizeof(match)); if (strlen(match) == 1) return toupper(match[0]); else return aaAbbrToLetter(match); } return '\0'; } static char *cloneStringFromChar(char c) /* Return a cloned string from a single character. */ { char str[2]; str[0] = c; str[1] = '\0'; return cloneString(str); } static void checkRefAllele(struct hgvsVariant *hgvs, int start1, char *accSeq, char **retDiffRefAllele) /* If hgvs change includes a reference allele, and if accSeq at start1 does not match, * then set retDiffRefAllele to the accSeq version. retDiffRefAllele must be non-NULL. */ { char hgvsRefBase = (hgvs->type == hgvstProtein) ? refBaseFromProt(hgvs->change) : refBaseFromNucSubst(hgvs->change); if (hgvsRefBase != '\0') { char seqRefBase = toupper(accSeq[start1-1]); if (seqRefBase != hgvsRefBase) *retDiffRefAllele = cloneStringFromChar(seqRefBase); } if (hgvs->type == hgvstProtein && *retDiffRefAllele == NULL) { // Protein ranges have a second ref allele base to check char *p = strchr(hgvs->change, '_'); if (p != NULL) { hgvsRefBase = refBaseFromProt(p+1); if (hgvsRefBase != '\0') { int end1 = atoi(skipToNumeric(p+1)); char seqRefBase = toupper(accSeq[end1-1]); if (seqRefBase != hgvsRefBase) *retDiffRefAllele = cloneStringFromChar(seqRefBase); } } } } +static int getGbVersion(char *db, char *acc) +/* Return the local version that we have for acc. */ +{ +char query[2048]; +sqlSafef(query, sizeof(query), "select version from %s where acc = '%s'", gbSeqTable, acc); +struct sqlConnection *conn = hAllocConn(db); +int version = sqlQuickNum(conn, query); +hFreeConn(&conn); +return version; +} + +static char *normalizeVersion(char *db, char *acc, int *retFoundVersion) +/* LRG accessions don't have versions, so just return the same acc and set *retFoundVersion to 0. + * The user may give us a RefSeq accession with or without a .version. + * If ncbiRefSeq tables are present, return acc with version, looking up version if necessary. + * If we look it up and can't find it, this returns NULL. + * If instead we're using genbank tables, return acc with no version. + * That ensures that acc will be found in our local tables. */ +{ +char *normalizedAcc = NULL; +int foundVersion = 0; +if (startsWith("LRG_", acc)) + { + normalizedAcc = cloneString(acc); + } +else if (dbHasNcbiRefSeq(db)) + { + // ncbiRefSeq tables need versioned accessions. + if (strchr(acc, '.')) + normalizedAcc = cloneString(acc); + else + { + char query[2048]; + sqlSafef(query, sizeof(query), "select name from ncbiRefSeq where name like '%s.%%'", acc); + struct sqlConnection *conn = hAllocConn(db); + normalizedAcc = sqlQuickString(conn, query); + hFreeConn(&conn); + } + if (isNotEmpty(normalizedAcc)) + { + char *p = strchr(normalizedAcc, '.'); + assert(p); + foundVersion = atoi(p+1); + } + } +else + { + // genbank tables -- no version + normalizedAcc = cloneFirstWordByDelimiter(acc, '.'); + foundVersion = getGbVersion(db, normalizedAcc); + } +if (retFoundVersion) + *retFoundVersion = foundVersion; +return normalizedAcc; +} + static boolean hgvsValidateGene(char *db, struct hgvsVariant *hgvs, char **retFoundAcc, int *retFoundVersion, char **retDiffRefAllele) /* Return TRUE if hgvs coords are within the bounds of the sequence for hgvs->seqAcc. * Note: Coding terms may contain coords outside the bounds (upstream, intron, downstream) so * those can't be checked without mapping the term to the genome. * If retFoundAcc is not NULL, set it to our local accession (which may be missing the .version * of hgvs->seqAcc) or NULL if we can't find any match. * If retFoundVersion is not NULL and hgvs->seqAcc has a version number (e.g. NM_005957.4), * set retFoundVersion to our version from latest GenBank, otherwise 0 (no version for LRG). * If coords are OK and retDiffRefAllele is not NULL: if our sequence at the coords * matches hgvs->refAllele then set it to NULL; if mismatch then set it to our sequence. */ { -char *foundAcc = NULL; -int foundVersion = 0; +char *acc = normalizeVersion(db, hgvs->seqAcc, retFoundVersion); +if (isEmpty(acc)) + return FALSE; boolean coordsOK = FALSE; -char *acc = hgvs->seqAcc; -char *accSeq = NULL; -if (hgvs->type == hgvstProtein) - accSeq = getProteinSeq(db, acc, &foundAcc); -else - accSeq = getCdnaSeq(db, acc, &foundAcc); -if (foundAcc && ! startsWith("LRG_", foundAcc)) - foundVersion = getGbVersion(db, foundAcc); -if (foundAcc) +char *accSeq = (hgvs->type == hgvstProtein ? getProteinSeq(db, acc) : getCdnaSeq(db, acc)); +if (accSeq) { + // By convention, foundAcc is always versionless because it's accompanied by foundVersion. + if (retFoundAcc) + *retFoundAcc = cloneFirstWordByDelimiter(acc, '.'); int seqLen = strlen(accSeq); if (hgvs->type == hgvstCoding) { // Coding term coords can extend beyond the bounds of the transcript so // we can't check them without mapping to the genome. However, if the coords // are in bounds and a reference allele is provided, we can check that. struct genbankCds cds; - coordsOK = getCds(db, foundAcc, &cds); + coordsOK = getCds(db, acc, &cds); if (coordsOK && retDiffRefAllele) { int start = hgvs->start1 + (hgvs->startIsUtr3 ? cds.end : cds.start); if (hgvs->startOffset == 0 && start >= 1 && start <= seqLen) checkRefAllele(hgvs, start, accSeq, retDiffRefAllele); } } else { if (hgvs->start1 >= 1 && hgvs->start1 <= seqLen && hgvs->end >= 1 && hgvs->end <= seqLen) { coordsOK = TRUE; if (retDiffRefAllele) checkRefAllele(hgvs, hgvs->start1, accSeq, retDiffRefAllele); } } } -if (retFoundAcc) - *retFoundAcc = foundAcc; -if (retFoundVersion) - *retFoundVersion = foundVersion; +freeMem(accSeq); +freeMem(acc); return coordsOK; } boolean hgvsValidate(char *db, struct hgvsVariant *hgvs, char **retFoundAcc, int *retFoundVersion, char **retDiffRefAllele) /* Return TRUE if hgvs coords are within the bounds of the sequence for hgvs->seqAcc. * Note: Coding terms may contain coords outside the bounds (upstream, intron, downstream) so * those can't be checked without mapping the term to the genome; this returns TRUE if seq is found. * If retFoundAcc is not NULL, set it to our local accession (which may be missing the .version * of hgvs->seqAcc) or NULL if we can't find any match. * If retFoundVersion is not NULL and hgvs->seqAcc has a version number (e.g. NM_005957.4), * set retFoundVersion to our version from latest GenBank, otherwise 0 (no version for LRG). * If coords are OK and retDiffRefAllele is not NULL: if our sequence at the coords * matches hgvs->refAllele then set it to NULL; if mismatch then set it to our sequence. */ { if (hgvs->type == hgvstGenomic || hgvs->type == hgvstMito) return hgvsValidateGenomic(db, hgvs, retFoundAcc, retFoundVersion, retDiffRefAllele); else return hgvsValidateGene(db, hgvs, retFoundAcc, retFoundVersion, retDiffRefAllele); } static struct bed3 *hgvsMapGDotToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable) /* Given an NC_*g. term, if we can map NC_ to chrom then return the region, else NULL. */ { struct bed3 *region = NULL; char *chrom = hgOfficialChromName(db, hgvs->seqAcc); if (isNotEmpty(chrom)) { AllocVar(region); region->chrom = chrom; region->chromStart = hgvs->start1 - 1; region->chromEnd = hgvs->end; } if (retPslTable) *retPslTable = NULL; return region; } static void hgvsCodingToZeroBasedHalfOpen(struct hgvsVariant *hgvs, - int maxCoord, struct genbankCds cds, + int maxCoord, struct genbankCds *cds, int *retStart, int *retEnd, int *retUpstreamBases, int *retDownstreamBases) /* Convert a coding HGVS's start1 and end into UCSC coords plus upstream and downstream lengths * for when the coding HGVS has coordinates that extend beyond its sequence boundaries. * ret* args must be non-NULL. */ { // If hgvs->start1 is negative, it is effectively 0-based, so subtract 1 only if positive. int start = hgvs->start1; if (start > 0) start -= 1; // If hgvs->end is negative, it is effectively 0-based, so add 1 to get half-open end. int end = hgvs->end; if (end < 0) end += 1; // If the position follows '*' that means it's relative to cdsEnd; otherwise rel to cdsStart if (hgvs->startIsUtr3) - *retStart = cds.end + start; + *retStart = cds->end + start; else - *retStart = cds.start + start; + *retStart = cds->start + start; if (hgvs->endIsUtr3) - *retEnd = cds.end + end; + *retEnd = cds->end + end; else - *retEnd = cds.start + end; + *retEnd = cds->start + end; // Now check for coords that extend beyond the transcript('s alignment to the genome) if (*retStart < 0) { // hgvs->start1 is upstream of coding transcript. *retUpstreamBases = -*retStart; *retStart = 0; } else if (*retStart > maxCoord) { // Even the start coord is downstream of coding transcript -- make a negative "upstream" // for adjusting start. *retUpstreamBases = -(*retStart - maxCoord + 1); *retStart = maxCoord - 1; } else *retUpstreamBases = 0; if (*retEnd > maxCoord) { // hgvs->end is downstream of coding transcript. *retDownstreamBases = *retEnd - maxCoord; *retEnd = maxCoord; } else if (*retEnd < 0) { // Even the end coord is upstream of coding transcript -- make a negative "downstream" // for adjusting end. *retDownstreamBases = *retEnd; *retEnd = 0; } else *retDownstreamBases = 0; } -static struct psl *pslFromHgvsNuc(struct hgvsVariant *hgvs, int accSize, int accEnd, - struct genbankCds cds, +static struct psl *pslFromHgvsNuc(struct hgvsVariant *hgvs, char *acc, int accSize, int accEnd, + struct genbankCds *cds, int *retUpstreamBases, int *retDownstreamBases) /* Allocate and return a PSL modeling the variant in nucleotide sequence acc. * The PSL target is the sequence and the query is the changed part of the sequence. * accEnd is given in case accSize includes an unaligned poly-A tail. * If hgvs is coding ("c.") then the caller must pass in a valid cds. * In case the start or end position is outside the bounds of the sequence, set retUpstreamBases * or retDownstreamBases to the number of bases beyond the beginning or end of sequence. * NOTE: coding intron offsets are ignored; the PSL contains the exon anchor point * and the caller will have to map that to the genome and then apply the intron offset. */ { if (hgvs == NULL) return NULL; if (hgvs->type == hgvstProtein) errAbort("pslFromHgvsNuc must be called only on nucleotide HGVSs, not protein."); struct psl *psl; AllocVar(psl); -psl->tName = cloneFirstWordByDelimiter(hgvs->seqAcc, '.'); +psl->tName = cloneString(acc); safecpy(psl->strand, sizeof(psl->strand), "+"); psl->tSize = accSize; if (hgvs->type != hgvstCoding) { // Sane 1-based fully closed coords. psl->tStart = hgvs->start1 - 1; psl->tEnd = hgvs->end; } else { // Simple or insanely complex CDS-relative coords. hgvsCodingToZeroBasedHalfOpen(hgvs, accEnd, cds, &psl->tStart, &psl->tEnd, retUpstreamBases, retDownstreamBases); } int refLen = psl->tEnd - psl->tStart; // Just use refLen for alt until we parse the sequence change portion of the term: int altLen = refLen; psl->qName = cloneString(hgvs->change); psl->qSize = altLen; psl->qStart = 0; psl->qEnd = altLen; psl->blockCount = 1; AllocArray(psl->blockSizes, psl->blockCount); AllocArray(psl->qStarts, psl->blockCount); AllocArray(psl->tStarts, psl->blockCount); psl->blockSizes[0] = refLen <= altLen ? refLen : altLen; psl->qStarts[0] = psl->qStart; psl->tStarts[0] = psl->tStart; return psl; } -#define limitToRange(val, min, max) { if (val < min) { val = min; } \ - if (val > max) { val = max; } } - -static struct bed3 *hgvsMapNucToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable) -/* Return a bed3 with the variant's span on the genome, or NULL if unable to map. - * If successful and retPslTable is not NULL, set it to the name of the PSL table used. */ -{ -struct bed3 *region = NULL; -char *trackTable = NULL; -char *pslTable = NULL; -char *acc = cloneFirstWordByDelimiter(hgvs->seqAcc, '.'); -if (hgvs->type == hgvstGenomic) - return hgvsMapGDotToGenome(db, hgvs, retPslTable); -if (startsWith("NM_", acc)) - { - // TODO: replace these with NCBI's alignments - trackTable = "refGene"; - pslTable = "refSeqAli"; - } -else if (startsWith("LRG_", acc)) - { - trackTable = "lrgTranscriptAli"; - pslTable = "lrgTranscriptAli"; - } -struct genbankCds cds; -boolean gotCds = (hgvs->type == hgvstCoding) ? getCds(db, acc, &cds) : FALSE; -if (trackTable && pslTable && (hgvs->type != hgvstCoding || gotCds) && - hTableExists(db, trackTable) && hTableExists(db, pslTable)) +static struct psl *mapPsl(char *db, struct hgvsVariant *hgvs, char *pslTable, char *acc, + struct genbankCds *cds, int *retUpstream, int *retDownstream) +/* If acc is found in pslTable, use pslTransMap to map hgvs onto the genome. */ { - struct dyString *dyQuery = sqlDyStringCreate("select * from %s where qName = '%s'", - pslTable, acc); +struct psl *mappedToGenome = NULL; +char query[2048]; +sqlSafef(query, sizeof(query), "select * from %s where qName = '%s'", pslTable, acc); struct sqlConnection *conn = hAllocConn(db); - struct sqlResult *sr = sqlGetResult(conn, dyQuery->string); - int bin = hIsBinned(db, pslTable); +struct sqlResult *sr = sqlGetResult(conn, query); char **row; if (sr && (row = sqlNextRow(sr))) { + int bin = 1; // All PSL tables used here, and any made in the future, use the bin column. struct psl *txAli = pslLoad(row+bin); // variantPsl contains the anchor if a non-cdsStart anchor is used because // the actual position might be outside the bounds of the transcript sequence (intron/UTR) - int upstream = 0, downstream = 0; - struct psl *variantPsl = pslFromHgvsNuc(hgvs, txAli->qSize, txAli->qEnd, cds, - &upstream, &downstream); - struct psl *mappedToGenome = pslTransMap(pslTransMapNoOpts, variantPsl, txAli); - if (mappedToGenome) + struct psl *variantPsl = pslFromHgvsNuc(hgvs, acc, txAli->qSize, txAli->qEnd, cds, + retUpstream, retDownstream); + mappedToGenome = pslTransMap(pslTransMapNoOpts, variantPsl, txAli); + pslFree(&variantPsl); + pslFree(&txAli); + } +sqlFreeResult(&sr); +hFreeConn(&conn); +return mappedToGenome; +} + +static char *pslTableForAcc(char *db, char *acc) +/* Based on acc (and whether db has NCBI RefSeq alignments), pick a PSL table where + * acc should be found. Don't free the returned string. */ { - AllocVar(region); - region->chrom = cloneString(mappedToGenome->tName); - region->chromStart = mappedToGenome->tStart; - region->chromEnd = mappedToGenome->tEnd; +char *pslTable = NULL; +if (startsWith("LRG_", acc)) + pslTable = "lrgTranscriptAli"; +else if (startsWith("NM_", acc)) + { + // Use NCBI's alignments if they are available + if (dbHasNcbiRefSeq(db)) + pslTable = "ncbiRefSeqPsl"; + else + pslTable = "refSeqAli"; + } +return pslTable; +} + +#define limitToRange(val, min, max) { if (val < min) { val = min; } \ + if (val > max) { val = max; } } + +static struct bed3 *pslAndFriendsToRegion(struct psl *psl, struct hgvsVariant *hgvs, + int upstream, int downstream) +/* If hgvs has any intron offsets and/or upstream/downstream offsets, add those to the + * anchor coords in psl and return the variant's region of the genome. */ +{ +struct bed3 *region = bed3New(psl->tName, psl->tStart, psl->tEnd); // If the start and/or end is in an intron, add the intron offset now. - boolean revStrand = (mappedToGenome->strand[0] == '-'); +boolean revStrand = (psl->strand[0] == '-'); if (hgvs->startOffset != 0) { if (revStrand) region->chromEnd -= hgvs->startOffset; else region->chromStart += hgvs->startOffset; } if (hgvs->endOffset != 0) { if (revStrand) region->chromStart -= hgvs->endOffset; else region->chromEnd += hgvs->endOffset; } // Apply extra up/downstream offsets (usually 0) if (revStrand) { region->chromStart -= downstream; region->chromEnd += upstream; } else { region->chromStart -= upstream; region->chromEnd += downstream; } - limitToRange(region->chromStart, 0, mappedToGenome->tSize) - limitToRange(region->chromEnd, 0, mappedToGenome->tSize) - freez(&mappedToGenome); +limitToRange(region->chromStart, 0, psl->tSize); +limitToRange(region->chromEnd, 0, psl->tSize); +return region; +} + +static struct bed3 *hgvsMapNucToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable) +/* Return a bed3 with the variant's span on the genome, or NULL if unable to map. + * If successful and retPslTable is not NULL, set it to the name of the PSL table used. */ +{ +char *acc = normalizeVersion(db, hgvs->seqAcc, NULL); +if (isEmpty(acc)) + return NULL; +if (hgvs->type == hgvstGenomic) + return hgvsMapGDotToGenome(db, hgvs, retPslTable); +struct bed3 *region = NULL; +char *pslTable = pslTableForAcc(db, acc); +struct genbankCds cds; +boolean gotCds = (hgvs->type == hgvstCoding) ? getCds(db, acc, &cds) : FALSE; +if (pslTable && (hgvs->type != hgvstCoding || gotCds) && hTableExists(db, pslTable)) + { + int upstream = 0, downstream = 0; + struct psl *mappedToGenome = mapPsl(db, hgvs, pslTable, acc, &cds, &upstream, &downstream); + // As of 9/26/16, ncbiRefSeqPsl is missing some items (#13673#note-443) -- so fall back + // on UCSC alignments. + if (!mappedToGenome && sameString(pslTable, "ncbiRefSeqPsl") && hTableExists(db, "refSeqAli")) + { + char *accNoVersion = cloneFirstWordByDelimiter(acc, '.'); + gotCds = (hgvs->type == hgvstCoding) ? getCds(db, accNoVersion, &cds) : FALSE; + if (hgvs->type != hgvstCoding || gotCds) + mappedToGenome = mapPsl(db, hgvs, "refSeqAli", accNoVersion, &cds, + &upstream, &downstream); + if (mappedToGenome) + { + pslTable = "refSeqAli"; + acc = accNoVersion; + } } + if (mappedToGenome) + { + region = pslAndFriendsToRegion(mappedToGenome, hgvs, upstream, downstream); + pslFree(&mappedToGenome); } - sqlFreeResult(&sr); - hFreeConn(&conn); } if (region && retPslTable) *retPslTable = cloneString(pslTable); return region; } static struct bed3 *hgvsMapPDotToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable) /* Return a bed3 with the variant's span on the genome, or NULL if unable to map. * If successful and retPslTable is not NULL, set it to the name of the PSL table used. */ { struct bed3 *region = NULL; -char *acc = cloneFirstWordByDelimiter(hgvs->seqAcc, '.'); -if (startsWith("NP_", acc)) +char *acc = normalizeVersion(db, hgvs->seqAcc, NULL); +if (acc && startsWith("NP_", acc)) { // Translate the NP_*:p. to NM_*:c. and map NM_*:c. to the genome. struct sqlConnection *conn = hAllocConn(db); char query[2048]; + if (dbHasNcbiRefSeq(db)) + sqlSafef(query, sizeof(query), "select mrnaAcc from ncbiRefSeqLink where protAcc = '%s'", + acc); + else sqlSafef(query, sizeof(query), "select mrnaAcc from %s l, refGene r " "where l.protAcc = '%s' and r.name = l.mrnaAcc", refLinkTable, acc); char *nmAcc = sqlQuickString(conn, query); hFreeConn(&conn); if (nmAcc) { struct hgvsVariant cDot; zeroBytes(&cDot, sizeof(cDot)); cDot.type = hgvstCoding; cDot.seqAcc = nmAcc; cDot.start1 = ((hgvs->start1 - 1) * 3) + 1; cDot.end = ((hgvs->end - 1) * 3) + 3; cDot.change = hgvs->change; region = hgvsMapNucToGenome(db, &cDot, retPslTable); + freeMem(nmAcc); } } return region; } struct bed3 *hgvsMapToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable) /* Return a bed3 with the variant's span on the genome, or NULL if unable to map. * If successful and retPslTable is not NULL, set it to the name of the PSL table used. */ { if (hgvs == NULL) return NULL; struct bed3 *region = NULL; if (hgvs->type == hgvstProtein) region = hgvsMapPDotToGenome(db, hgvs, retPslTable); else region = hgvsMapNucToGenome(db, hgvs, retPslTable); return region; }