a1c329068ad18aa4093972b501e6324f8bcdb398 angie Fri Sep 16 17:19:30 2016 -0700 Parse HGVS position ranges and coding terms' UTR and intron coordinates. When the chromAlias table is present, this also now supports NC_*:g. terms. The sequence change part is no longer parsed -- it's not necessary for mapping to genomic ranges, although it will be necessary in order for hgVai to take HGVS input. This does not yet support ranges-of-ranges but neither does Mutalyzer. This also doesn't support uncertain positions ('?'). Like complex sequence changes, those can wait until we have a sophisticated parser. We also support some new not-quite-HGVS forms: geneSymbol and protein pos only, or geneSymbol and c.. refs #15071 diff --git src/hg/lib/hgHgvs.c src/hg/lib/hgHgvs.c index 38ed891..3074a77 100644 --- src/hg/lib/hgHgvs.c +++ src/hg/lib/hgHgvs.c @@ -1,531 +1,1033 @@ /* hgHgvs.c - Parse the Human Genome Variation Society (HGVS) nomenclature for variants. */ /* See http://varnomen.hgvs.org/ and https://github.com/mutalyzer/mutalyzer/ */ /* Copyright (C) 2016 The Regents of the University of California * See README in this or parent directory for licensing information. */ #include "common.h" #include "hgHgvs.h" +#include "chromInfo.h" #include "genbank.h" #include "hdb.h" #include "pslTransMap.h" #include "regexHelper.h" // Regular expressions for HGVS-recognized sequence accessions: LRG or versioned RefSeq: #define lrgTranscriptExp "(LRG_[0-9]+t[0-9+])" #define lrgRegionExp "(LRG_[0-9+])" // NC = RefSeq reference assembly chromosome // NG = RefSeq incomplete genomic region (e.g. gene locus) // NM = RefSeq curated mRNA // NP = RefSeq curated protein // NR = RefSeq curated (non-coding) RNA -#define geneSymbolExp "([A-Z0-9./_-]+)" +#define geneSymbolExp "([A-Za-z0-9./_-]+)" #define optionalGeneSymbolExp "(\\(" geneSymbolExp "\\))?" #define versionedAccPrefixExp(p) "(" p "_[0-9]+(\\.[0-9]+)?)" optionalGeneSymbolExp // ........................ accession and optional dot version // ........... optional dot version // ...... optional gene symbol in ()s // .... optional gene symbol #define versionedRefSeqNCExp versionedAccPrefixExp("NC") #define versionedRefSeqNGExp versionedAccPrefixExp("NG") #define versionedRefSeqNMExp versionedAccPrefixExp("NM") #define versionedRefSeqNPExp versionedAccPrefixExp("NP") #define versionedRefSeqNRExp versionedAccPrefixExp("NR") -// Nucleotide substitution regexes +// Nucleotide position regexes // (c. = CDS, g. = genomic, m. = mitochondrial, n.= non-coding RNA, r. = RNA) -#define hgvsNtSubstExp "([0-9]+)([ACGT]+)>([ACGT]+)" -// ...... 1-based position -// ....... original sequence -// ....... replacement sequence -#define hgvsCDotSubstExp "c\\." hgvsNtSubstExp -#define hgvsGDotSubstExp "g\\." hgvsNtSubstExp -#define hgvsMDotSubstExp "m\\." hgvsNtSubstExp -#define hgvsNDotSubstExp "n\\." hgvsNtSubstExp -#define hgvsRDotSubstExp "r\\." hgvsNtSubstExp +#define posIntExp "([0-9]+)" +#define hgvsNtPosExp posIntExp "(_" posIntExp ")?" +// ...... 1-based start position +// ............. optional range separator and end position +// ...... 1-based end position +// Now for a regex that can handle positions like "26" or "*80" or "-24+75_-24+92"... +#define anchorExp "([-*])?" +#define offsetExp "([-+])" +#define complexNumExp anchorExp posIntExp "(" offsetExp posIntExp ")?" +#define hgvsCdsPosExp complexNumExp "(_" complexNumExp ")?" +// ... optional anchor "-" or "*" +// ... mandatory first number +// ....... optional offset separator and offset +// ... offset separator +// ... offset number +// ............... optional range separator and complex num +// ... optional anchor "-" or "*" +// ... first number +// ....... optional offset separator and offset +// ... offset separator +// ... offset number + +#define hgvsCDotPosExp "c\\." hgvsCdsPosExp +#define hgvsGDotPosExp "g\\." hgvsNtPosExp +#define hgvsMDotPosExp "m\\." hgvsNtPosExp +#define hgvsNDotPosExp "n\\." hgvsNtPosExp +#define hgvsRDotPosExp "r\\." hgvsNtPosExp // Protein substitution regex -#define AA3 "Ala|Arg|Asn|Asp|Cys|Gln|Glu|Gly|His|Ile|Leu|Lys|Met|Phe|Pro|Ser|Thr|Trp|Tyr|Val|Ter" -#define hgvsAminoAcidExp "([ARNDCQEGHILKMFPSTWYVX*]|" AA3 ")" -#define hgvsAminoAcidSubstExp hgvsAminoAcidExp "([0-9]+)" hgvsAminoAcidExp +#define aa3Exp "Ala|Arg|Asn|Asp|Cys|Gln|Glu|Gly|His|Ile|Leu|Lys|Met|Phe|Pro|Ser|Thr|Trp|Tyr|Val|Ter" +#define hgvsAminoAcidExp "[ARNDCQEGHILKMFPSTWYVX*]|" aa3Exp +#define hgvsAminoAcidSubstExp "(" hgvsAminoAcidExp ")" posIntExp "(" hgvsAminoAcidExp "|=)" #define hgvsPDotSubstExp "p\\." hgvsAminoAcidSubstExp // ... // original sequence // ...... // 1-based position // ... // replacement sequence +// Protein range (or just single pos) regex +#define hgvsAaRangeExp "(" hgvsAminoAcidExp ")" posIntExp "(_(" hgvsAminoAcidExp ")" posIntExp ")?(.*)" +#define hgvsPDotRangeExp "p\\." hgvsAaRangeExp +// original start AA ... +// 1-based start position ... +// optional range sep and AA+pos ..................................... +// original end AA ... +// 1-based end position ... +// change description ... // Complete HGVS term regexes combining sequence identifiers and change operations -#define hgvsFullRegex(seq, op) "^" seq ":" op "$" +#define hgvsFullRegex(seq, op) "^" seq ":" op -#define hgvsLrgCDotSubstExp hgvsFullRegex(lrgTranscriptExp, hgvsCDotSubstExp) +#define hgvsRefSeqNCGDotPosExp hgvsFullRegex(versionedRefSeqNCExp, hgvsGDotPosExp) +#define hgvsRefSeqNCGDotExp hgvsRefSeqNCGDotPosExp "(.*)" +// substring numbering: +// 0..................................... whole matching string +// 1................. accession and optional dot version +// 2........ optional dot version +// 3...... (n/a) optional gene symbol in ()s +// 4.... (n/a) optional gene symbol +// 5... 1-based start position +// 6.... optional range separator and end position +// 7... 1-based end position +// 8.... change description + +#define hgvsLrgCDotPosExp hgvsFullRegex(lrgTranscriptExp, hgvsCDotPosExp) +#define hgvsLrgCDotExp hgvsLrgCDotPosExp "(.*)" // substring numbering: // 0..................................................... whole matching string // 1................... LRG transcript -// 2..... 1-based position -// 3...... original sequence -// 4...... replacement sequence +// 2... optional anchor "-" or "*" +// 3... mandatory first number +// 4....... optional offset separator and offset +// 5... offset separator +// 6... offset number +// 7............... optional range sep and complex num +// 8... optional anchor "-" or "*" +// 9... first number +// 10....... optional offset separator and offset +// 11... offset separator +// 12... offset number +// 13........ sequence change -#define hgvsRefSeqNMCDotSubstExp hgvsFullRegex(versionedRefSeqNMExp, hgvsCDotSubstExp) +#define hgvsRefSeqNMCDotPosExp hgvsFullRegex(versionedRefSeqNMExp, hgvsCDotPosExp) +#define hgvsRefSeqNMCDotExp hgvsRefSeqNMCDotPosExp "(.*)" // substring numbering: -// 0..................................................... whole matching string -// 1............... accession and optional dot version +// 0............................................................... whole matching string +// 1............... acc & optional dot version // 2........ optional dot version -// 3..... optional gene symbol in ()s +// 3..... optional gene sym in ()s // 4... optional gene symbol -// 5..... 1-based position -// 6...... original sequence -// 7...... replacement sequence +// 5... optional anchor +// 6... mandatory first number +// 7....... optional offset +// 8... offset separator +// 9... offset number +// 10............... optional range +// 11... optional anchor +// 12... first number +// 13....... optional offset +// 14... offset separator +// 15... offset number +// 16........ sequence change #define hgvsRefSeqNPPDotSubstExp hgvsFullRegex(versionedRefSeqNPExp, hgvsPDotSubstExp) // substring numbering: // 0..................................................... whole matching string // 1................ accession and optional dot version // 2..... optional dot version // 3..... optional gene symbol in ()s // 4... optional gene symbol // 5..... original sequence // 6...... 1-based position // 7...... replacement sequence +#define hgvsRefSeqNPPDotRangeExp hgvsFullRegex(versionedRefSeqNPExp, hgvsPDotRangeExp) +// substring numbering: +// 0..................................................... whole matching string +// 1................ accession and optional dot version +// 2..... optional dot version +// 3..... optional gene symbol in ()s +// 4... optional gene symbol +// 5... original start AA +// 6... 1-based start position +// 7................. optional range sep and AA+pos +// 8... original end AA +// 9... 1-based end position +// 10..... change description + // Pseudo-HGVS in common usage -#define pseudoHgvsGeneSymbolSubstExp "^" geneSymbolExp "[: p.]+" hgvsAminoAcidSubstExp +#define pseudoHgvsGeneSymbolProtSubstExp "^" geneSymbolExp "[: p.]+" hgvsAminoAcidSubstExp // 0..................................................... whole matching string // 1................... gene symbol // 2..... original sequence // 3...... 1-based position // 4...... replacement sequence -static struct hgvsVariant *hgvsParseCDotSubst(char *term) -/* If term is parseable as an HGVS CDS substitution, return the parsed representation, - * otherwise NULL. */ +#define pseudoHgvsGeneSymbolProtRangeExp "^" geneSymbolExp "[: p.]+" hgvsAaRangeExp +// 0..................................................... whole matching string +// 1................... gene symbol +// 2... original start AA +// 3... 1-based start position +// 4................ optional range sep and AA+pos +// 5... original end AA +// 6... 1-based end position +// 7..... change description + +#define pseudoHgvsGeneSymbolProtPosExp "^" geneSymbolExp "[: p.]+" posIntExp +// 0.......................... whole matching string +// 1................... gene symbol +// 2..... 1-based position + + +#define pseudoHgvsGeneSympolCDotPosExp "^" geneSymbolExp "[: ]+" hgvsCDotPosExp +// 0..................................................... whole matching string +// 1................... gene symbol +// 2..... optional beginning of position exp +// 3..... beginning of position exp + + +static struct hgvsVariant *hgvsParseGDotPos(char *term) +/* If term is parseable as an HGVS NC_...g. term, return the parsed representation, else NULL. */ +{ +struct hgvsVariant *hgvs = NULL; +regmatch_t substrs[17]; +if (regexMatchSubstr(term, hgvsRefSeqNCGDotExp, substrs, ArraySize(substrs))) + { + int accIx = 1; + int startPosIx = 5; + int endPosIx = 7; + int changeIx = 8; + AllocVar(hgvs); + hgvs->type = hgvstGenomic; + hgvs->seqAcc = regexSubstringClone(term, substrs[accIx]); + hgvs->start1 = regexSubstringInt(term, substrs[startPosIx]); + if (regexSubstrMatched(substrs[endPosIx])) + hgvs->end = regexSubstringInt(term, substrs[endPosIx]); + else + hgvs->end = hgvs->start1; + hgvs->change = regexSubstringClone(term, substrs[changeIx]); + } +return hgvs; +} + +static void extractComplexNum(char *term, regmatch_t *substrs, int substrOffset, + boolean *retIsUtr3, int *retPos, int *retOffset) +/* Extract matches from substrs starting at substrOffset to parse a complex number + * description into anchor type, 1-based position and optional offset. */ +{ +int anchorIx = substrOffset; +int posIx = substrOffset + 1; +int offsetOpIx = substrOffset + 3; +int offsetIx = substrOffset + 4; +// Determine what startPos is relative to, based on optional anchor and optional offset +char anchor[16]; // string length 0 or 1 +regexSubstringCopy(term, substrs[anchorIx], anchor, sizeof(anchor)); +char offsetOp[16]; // string length 0 or 1 +regexSubstringCopy(term, substrs[offsetOpIx], offsetOp, sizeof(offsetOp)); +*retIsUtr3 = (anchor[0] == '*'); +*retPos = regexSubstringInt(term, substrs[posIx]); +// Is pos negative? +if (anchor[0] == '-') + *retPos = -*retPos; +// Grab offset, if there is one. +if (isNotEmpty(offsetOp)) + { + *retOffset = regexSubstringInt(term, substrs[offsetIx]); + if (offsetOp[0] == '-') + *retOffset = -*retOffset; + } +} + +static struct hgvsVariant *hgvsParseCDotPos(char *term) +/* If term is parseable as an HGVS CDS term, return the parsed representation, otherwise NULL. */ { struct hgvsVariant *hgvs = NULL; boolean matches = FALSE; int accIx = 1; -int posIx = 0; -int refIx = 0; -int altIx = 0; +int startAnchorIx = 2; +int endAnchorIx = 8; +int endPosIx = 9; +int changeIx = 13; +// The LRG accession regex has only one substring but the RefSeq acc regex has 4, so that +// affects all substring offsets after the accession. +int refSeqExtra = 3; int geneSymbolIx = -1; -regmatch_t substrs[8]; -if (regexMatchSubstrNoCase(term, hgvsLrgCDotSubstExp, substrs, ArraySize(substrs))) +regmatch_t substrs[17]; +if (regexMatchSubstr(term, hgvsLrgCDotExp, substrs, ArraySize(substrs))) { matches = TRUE; - posIx = 2; - refIx = 3; - altIx = 4; } -else if (regexMatchSubstrNoCase(term, hgvsRefSeqNMCDotSubstExp, substrs, ArraySize(substrs))) +else if (regexMatchSubstr(term, hgvsRefSeqNMCDotExp, substrs, ArraySize(substrs))) { matches = TRUE; geneSymbolIx = 4; - posIx = 5; - refIx = 6; - altIx = 7; + startAnchorIx += refSeqExtra; + endAnchorIx += refSeqExtra; + endPosIx += refSeqExtra; + changeIx += refSeqExtra; } if (matches) { AllocVar(hgvs); hgvs->type = hgvstCoding; - hgvs->changeSymbol = cloneString(">"); hgvs->seqAcc = regexSubstringClone(term, substrs[accIx]); - hgvs->start1 = regexSubstringInt(term, substrs[posIx]); - hgvs->refAllele = regexSubstringClone(term, substrs[refIx]); - hgvs->altAllele = regexSubstringClone(term, substrs[altIx]); + extractComplexNum(term, substrs, startAnchorIx, + &hgvs->startIsUtr3, &hgvs->start1, &hgvs->startOffset); if (geneSymbolIx >= 0 && regexSubstrMatched(substrs[geneSymbolIx])) hgvs->seqGeneSymbol = regexSubstringClone(term, substrs[geneSymbolIx]); - hgvs->end = hgvs->start1 - 1 + strlen(hgvs->refAllele); - } -return hgvs; -} - -static char *aaAbbrsToLetters(char *aaStr) -/* If the input string is composed of AA abbreviations like "Ala", "Asp" etc., convert them to - * single letter AAs like "A", "D" etc. Otherwise return a copy of aaStr. */ -{ -char *letters = NULL; -if (regexMatch(aaStr, "^(" AA3 ")+$")) + if (regexSubstrMatched(substrs[endPosIx])) { - int len = strlen(aaStr) / 3; - letters = needMem(len + 1); - int i; - for (i = 0; i < len; i++) - letters[i] = aaAbbrToLetter(&aaStr[i*3]); + extractComplexNum(term, substrs, endAnchorIx, + &hgvs->endIsUtr3, &hgvs->end, &hgvs->endOffset); } else - letters = cloneString(aaStr); -return letters; + { + hgvs->end = hgvs->start1; + hgvs->endIsUtr3 = hgvs->startIsUtr3; + hgvs->endOffset = hgvs->startOffset; + } + hgvs->change = regexSubstringClone(term, substrs[changeIx]); + } +return hgvs; } static struct hgvsVariant *hgvsParsePDotSubst(char *term) /* If term is parseable as an HGVS protein substitution, return the parsed representation, * otherwise NULL. */ { struct hgvsVariant *hgvs = NULL; regmatch_t substrs[8]; -if (regexMatchSubstrNoCase(term, hgvsRefSeqNPPDotSubstExp, substrs, ArraySize(substrs))) +if (regexMatchSubstr(term, hgvsRefSeqNPPDotSubstExp, substrs, ArraySize(substrs))) { int accIx = 1; int geneSymbolIx = 4; int refIx = 5; int posIx = 6; int altIx = 7; AllocVar(hgvs); hgvs->type = hgvstProtein; hgvs->seqAcc = regexSubstringClone(term, substrs[accIx]); + int changeStart = substrs[refIx].rm_so; + int changeEnd = substrs[altIx].rm_eo; + int len = changeEnd - changeStart; + char change[len+1]; + safencpy(change, sizeof(change), term+changeStart, len); + hgvs->change = cloneString(change); if (regexSubstrMatched(substrs[geneSymbolIx])) hgvs->seqGeneSymbol = regexSubstringClone(term, substrs[geneSymbolIx]); hgvs->start1 = regexSubstringInt(term, substrs[posIx]); - // Convert 3-letter abbreviations to single-letter codes like "Ala" -> "A", - // so strlen(allele) is the number of amino acids. - char buf[2048]; - regexSubstringCopy(term, substrs[refIx], buf, sizeof(buf)); - hgvs->refAllele = aaAbbrsToLetters(buf); - regexSubstringCopy(term, substrs[altIx], buf, sizeof(buf)); - hgvs->altAllele = aaAbbrsToLetters(buf); - hgvs->end = hgvs->start1 - 1 + strlen(hgvs->refAllele); + hgvs->end = hgvs->start1; + } +return hgvs; +} + +static struct hgvsVariant *hgvsParsePDotRange(char *term) +/* If term is parseable as an HGVS protein range change, return the parsed representation, + * otherwise NULL. */ +{ +struct hgvsVariant *hgvs = NULL; +regmatch_t substrs[11]; +if (regexMatchSubstr(term, hgvsRefSeqNPPDotRangeExp, substrs, ArraySize(substrs))) + { + int accIx = 1; + int geneSymbolIx = 4; + int startRefIx = 5; + int startPosIx = 6; + int endPosIx = 9; + int changeIx = 10; + AllocVar(hgvs); + hgvs->type = hgvstProtein; + hgvs->seqAcc = regexSubstringClone(term, substrs[accIx]); + int changeStart = substrs[startRefIx].rm_so; + int changeEnd = substrs[changeIx].rm_eo; + int len = changeEnd - changeStart; + char change[len+1]; + safencpy(change, sizeof(change), term+changeStart, len); + hgvs->change = cloneString(change); + if (regexSubstrMatched(substrs[geneSymbolIx])) + hgvs->seqGeneSymbol = regexSubstringClone(term, substrs[geneSymbolIx]); + hgvs->start1 = regexSubstringInt(term, substrs[startPosIx]); + if (regexSubstrMatched(substrs[endPosIx])) + hgvs->end = regexSubstringInt(term, substrs[endPosIx]); + else + hgvs->end = hgvs->start1; } return hgvs; } struct hgvsVariant *hgvsParseTerm(char *term) /* If term is a parseable form of HGVS, return the parsed representation, otherwise NULL. - * This does not check validity of accessions or alleles. */ + * This does not check validity of accessions, coordinates or alleles. */ { -struct hgvsVariant *hgvs = hgvsParseCDotSubst(term); +struct hgvsVariant *hgvs = hgvsParseCDotPos(term); if (hgvs == NULL) hgvs = hgvsParsePDotSubst(term); +if (hgvs == NULL) + hgvs = hgvsParsePDotRange(term); +if (hgvs == NULL) + hgvs = hgvsParseGDotPos(term); return hgvs; } -struct hgvsVariant *hgvsParsePseudoHgvs(char *db, char *term) -/* Attempt to parse things that are not strict HGVS, but that people might intend as HGVS: */ +static char *npForGeneSymbol(char *db, char *geneSymbol) +/* Given a gene symbol, look it up to find its NP_ accession; if not found return NULL. */ { -struct hgvsVariant *hgvs = NULL; -regmatch_t substrs[5]; -if (regexMatchSubstrNoCase(term, pseudoHgvsGeneSymbolSubstExp, substrs, ArraySize(substrs))) - { - int geneSymbolIx = 1; - int refIx = 2; - int posIx = 3; - int altIx = 4; - char *geneSymbol = regexSubstringClone(term, substrs[geneSymbolIx]); - // Try to find an NP_ for geneSymbol, using refGene to make sure it's an NP for this species. struct sqlConnection *conn = hAllocConn(db); char query[2048]; +// Use refGene to make sure it's an NP for this species. sqlSafef(query, sizeof(query), "select l.protAcc from %s l, refGene r " "where l.name = '%s' and l.mrnaAcc = r.name " - "and l.protAcc != '' order by l.protAcc" + "and l.protAcc != '' order by length(l.protAcc), l.protAcc" , refLinkTable, geneSymbol); char *npAcc = sqlQuickString(conn, query); hFreeConn(&conn); - if (isNotEmpty(npAcc)) - { - AllocVar(hgvs); - hgvs->type = hgvstProtein; - hgvs->seqAcc = npAcc; - hgvs->seqGeneSymbol = geneSymbol; - hgvs->start1 = regexSubstringInt(term, substrs[posIx]); - // Convert 3-letter abbreviations to single-letter codes like "Ala" -> "A", - // so strlen(allele) is the number of amino acids. - char buf[2048]; - regexSubstringCopy(term, substrs[refIx], buf, sizeof(buf)); - hgvs->refAllele = aaAbbrsToLetters(buf); - regexSubstringCopy(term, substrs[altIx], buf, sizeof(buf)); - hgvs->altAllele = aaAbbrsToLetters(buf); - int refLen = strlen(hgvs->refAllele); - hgvs->end = hgvs->start1 - 1 + refLen; - } - // Note: this doesn't support non-coding gene symbol substs (which should have nt alleles) +return npAcc; } -return hgvs; + +static char *nmForGeneSymbol(char *db, char *geneSymbol) +/* Given a gene symbol, look it up to find its NM_ accession; if not found return NULL. */ +{ +struct sqlConnection *conn = hAllocConn(db); +char query[2048]; +sqlSafef(query, sizeof(query), "select name from refGene where name2 = '%s' " + "order by length(name), name", geneSymbol); +char *nmAcc = sqlQuickString(conn, query); +hFreeConn(&conn); +return nmAcc; } -static char *getCdnaSeq(char *db, char *acc, char **retFoundAcc) -/* Return cdna sequence for acc, or NULL if not found. If retFoundAcc is not NULL, +static char *getProteinSeq(char *db, char *acc, char **retFoundAcc) +/* Return amino acid sequence for acc, or NULL if not found. If retFoundAcc is not NULL, * set it to the accession for which we grabbed seq (might have a .version chopped off). */ { char *seq = NULL; char *foundAcc = NULL; if (startsWith("LRG_", acc)) { - if (hTableExists(db, "lrgCdna")) + if (hTableExists(db, "lrgPep")) { char query[2048]; - sqlSafef(query, sizeof(query), "select seq from lrgCdna where name = '%s'", acc); + sqlSafef(query, sizeof(query), "select seq from lrgPep where name = '%s'", acc); struct sqlConnection *conn = hAllocConn(db); seq = sqlQuickString(conn, query); hFreeConn(&conn); if (seq) foundAcc = cloneString(acc); } } else { char *trimmedAcc = cloneFirstWordByDelimiter(acc, '.'); - struct dnaSeq *gbSeq = hGenBankGetMrna(db, trimmedAcc, gbSeqTable); - if (gbSeq) + aaSeq *aaSeq = hGenBankGetPep(db, trimmedAcc, gbSeqTable); + if (aaSeq) { - seq = gbSeq->dna; + seq = aaSeq->dna; foundAcc = trimmedAcc; } } if (retFoundAcc) *retFoundAcc = foundAcc; return seq; } -static char *getProteinSeq(char *db, char *acc, char **retFoundAcc) -/* Return amino acid sequence for acc, or NULL if not found. If retFoundAcc is not NULL, +static char refBaseForNp(char *db, char *npAcc, int pos) +// Get the amino acid base in NP_'s sequence at 1-based offset pos. +{ +char *seq = getProteinSeq(db, npAcc, NULL); +char base = seq[pos-1]; +freeMem(seq); +return base; +} + +struct hgvsVariant *hgvsParsePseudoHgvs(char *db, char *term) +/* Attempt to parse things that are not strict HGVS, but that people might intend as HGVS: */ +// Note: this doesn't support non-coding gene symbol terms (which should have nt alleles) +{ +struct hgvsVariant *hgvs = NULL; +regmatch_t substrs[11]; +int geneSymbolIx = 1; +boolean isSubst = regexMatchSubstr(term, pseudoHgvsGeneSymbolProtSubstExp, + substrs, ArraySize(substrs)); +if (isSubst || + regexMatchSubstr(term, pseudoHgvsGeneSymbolProtRangeExp, substrs, ArraySize(substrs))) + { + int len = substrs[geneSymbolIx].rm_eo - substrs[geneSymbolIx].rm_so; + char geneSymbol[len+1]; + safencpy(geneSymbol, sizeof(geneSymbol), term, len); + char *npAcc = npForGeneSymbol(db, geneSymbol); + if (isNotEmpty(npAcc)) + { + // Make it a real HGVS term with the NP and pass that on to the usual parser. + int descStartIx = 2; + char *description = term + substrs[descStartIx].rm_so; + struct dyString *npTerm = dyStringCreate("%s(%s):p.%s", + npAcc, geneSymbol, description); + hgvs = hgvsParseTerm(npTerm->string); + dyStringFree(&npTerm); + } + } +else if (regexMatchSubstr(term, pseudoHgvsGeneSymbolProtPosExp, substrs, ArraySize(substrs))) + { + int len = substrs[geneSymbolIx].rm_eo - substrs[geneSymbolIx].rm_so; + char geneSymbol[len+1]; + safencpy(geneSymbol, sizeof(geneSymbol), term, len); + char *npAcc = npForGeneSymbol(db, geneSymbol); + if (isNotEmpty(npAcc)) + { + // Only position was provided, no change. Look up ref base and make a synonymous subst + // so it's parseable HGVS. + int posIx = 2; + int pos = regexSubstringInt(term, substrs[posIx]); + char refBase = refBaseForNp(db, npAcc, pos); + struct dyString *npTerm = dyStringCreate("%s(%s):p.%c%d=", + npAcc, geneSymbol, refBase, pos); + hgvs = hgvsParseTerm(npTerm->string); + dyStringFree(&npTerm); + } + } +else if (regexMatchSubstr(term, pseudoHgvsGeneSympolCDotPosExp, substrs, ArraySize(substrs))) + { + int len = substrs[geneSymbolIx].rm_eo - substrs[geneSymbolIx].rm_so; + char geneSymbol[len+1]; + safencpy(geneSymbol, sizeof(geneSymbol), term, len); + char *nmAcc = nmForGeneSymbol(db, geneSymbol); + if (isNotEmpty(nmAcc)) + { + // Make it a real HGVS term with the NM and pass that on to the usual parser. + int descStartIx = regexSubstrMatched(substrs[2]) ? 2 : 3; + char *description = term + substrs[descStartIx].rm_so; + struct dyString *nmTerm = dyStringCreate("%s(%s):c.%s", + nmAcc, geneSymbol, description); + hgvs = hgvsParseTerm(nmTerm->string); + dyStringFree(&nmTerm); + } + } +return hgvs; +} + +static char refBaseFromNucSubst(char *change) +/* If sequence change is a nucleotide substitution, return the reference base, else null char. */ +{ +if (regexMatchNoCase(change, "^([ACGTU])>")) + return toupper(change[0]); +return '\0'; +} + +static boolean hgvsValidateGenomic(char *db, struct hgvsVariant *hgvs, + char **retFoundAcc, int *retFoundVersion, + char **retDiffRefAllele) +/* Return TRUE if hgvs->seqAcc can be mapped to a chrom in db and coords are legal for the chrom. + * If retFoundAcc is non-NULL, set it to the versionless seqAcc if chrom is found, else NULL. + * If retFoundVersion is non-NULL, set it to seqAcc's version if chrom is found, else NULL. + * If retDiffRefAllele is non-NULL and hgvs specifies a reference allele then compare it to + * the genomic sequence at that location; set *retDiffRefAllele to NULL if they match, or to + * genomic sequence if they differ. */ +{ +boolean coordsOK = FALSE; +if (retDiffRefAllele) + *retDiffRefAllele = NULL; +if (retFoundAcc) + *retFoundAcc = NULL; +if (retFoundVersion) + *retFoundVersion = 0; +char *chrom = hgOfficialChromName(db, hgvs->seqAcc); +if (isNotEmpty(chrom)) + { + struct chromInfo *ci = hGetChromInfo(db, chrom); + if ((hgvs->start1 >= 1 && hgvs->start1 <= ci->size) && + (hgvs->end >=1 && hgvs->end <= ci->size)) + { + coordsOK = TRUE; + if (retDiffRefAllele) + { + char hgvsBase = refBaseFromNucSubst(hgvs->change); + if (hgvsBase != '\0') + { + struct dnaSeq *refBase = hFetchSeq(ci->fileName, chrom, + hgvs->start1-1, hgvs->start1); + touppers(refBase->dna); + if (refBase->dna[0] != hgvsBase) + *retDiffRefAllele = cloneString(refBase->dna); + freeMem(refBase); + } + } + } + // Since we found hgvs->seqAcc, split it into versionless and version parts. + if (retFoundAcc) + *retFoundAcc = cloneFirstWordByDelimiter(hgvs->seqAcc, '.'); + if (retFoundVersion) + { + char *p = strchr(hgvs->seqAcc, '.'); + if (p) + *retFoundVersion = atoi(p+1); + } + // Don't free chromInfo, it's cached! + } +return coordsOK; +} + +static char *getCdnaSeq(char *db, char *acc, char **retFoundAcc) +/* Return cdna sequence for acc, or NULL if not found. If retFoundAcc is not NULL, * set it to the accession for which we grabbed seq (might have a .version chopped off). */ { char *seq = NULL; char *foundAcc = NULL; if (startsWith("LRG_", acc)) { - if (hTableExists(db, "lrgPep")) + if (hTableExists(db, "lrgCdna")) { char query[2048]; - sqlSafef(query, sizeof(query), "select seq from lrgPep where name = '%s'", acc); + sqlSafef(query, sizeof(query), "select seq from lrgCdna where name = '%s'", acc); struct sqlConnection *conn = hAllocConn(db); seq = sqlQuickString(conn, query); hFreeConn(&conn); if (seq) foundAcc = cloneString(acc); } } else { char *trimmedAcc = cloneFirstWordByDelimiter(acc, '.'); - aaSeq *aaSeq = hGenBankGetPep(db, trimmedAcc, gbSeqTable); - if (aaSeq) + struct dnaSeq *gbSeq = hGenBankGetMrna(db, trimmedAcc, gbSeqTable); + if (gbSeq) { - seq = aaSeq->dna; + seq = gbSeq->dna; foundAcc = trimmedAcc; } } if (retFoundAcc) *retFoundAcc = foundAcc; return seq; } -static int getCdsStart(char *db, char *acc) -/* Return the start coordinate for CDS in genbank or LRG acc, or -1 if not found. */ +static boolean getCds(char *db, char *acc, struct genbankCds *retCds) +/* Get the CDS info for genbank or LRG acc; return FALSE if not found or not applicable. */ { -int cdsStart = -1; +boolean gotCds = FALSE; char query[1024]; if (startsWith("LRG_", acc)) sqlSafef(query, sizeof(query), "select cds from lrgCds where id = '%s'", acc); else sqlSafef(query, sizeof(query), "SELECT c.name FROM %s as c, %s as g WHERE (g.acc = '%s') AND " "(g.cds != 0) AND (g.cds = c.id)", cdsTable, gbCdnaInfoTable, acc); struct sqlConnection *conn = hAllocConn(db); char cdsBuf[2048]; char *cdsStr = sqlQuickQuery(conn, query, cdsBuf, sizeof(cdsBuf)); hFreeConn(&conn); if (isNotEmpty(cdsStr)) - { - struct genbankCds cds; - if (genbankCdsParse(cdsStr, &cds) && cds.startComplete && cds.start != cds.end) - cdsStart = cds.start; - } -return cdsStart; + gotCds = (genbankCdsParse(cdsStr, retCds) && + retCds->startComplete && retCds->start != retCds->end); +return gotCds; } static int getGbVersion(char *db, char *acc) /* Return the local version that we have for acc. */ { char query[2048]; sqlSafef(query, sizeof(query), "select version from %s where acc = '%s'", gbSeqTable, acc); struct sqlConnection *conn = hAllocConn(db); int version = sqlQuickNum(conn, query); hFreeConn(&conn); return version; } -boolean hgvsValidate(char *db, struct hgvsVariant *hgvs, char **retFoundAcc, int *retFoundVersion, +static char refBaseFromProt(char *change) +/* If change starts with an amino acid 3-letter or 1-letter code then return the 1-letter code, + * else null char. */ +{ +regmatch_t substrs[1]; +if (regexMatchSubstr(change, "^" hgvsAminoAcidExp, substrs, ArraySize(substrs))) + { + char match[256]; // 1- or 3-letter code + regexSubstringCopy(change, substrs[0], match, sizeof(match)); + if (strlen(match) == 1) + return toupper(match[0]); + else + return aaAbbrToLetter(match); + } +return '\0'; +} + +static char *cloneStringFromChar(char c) +/* Return a cloned string from a single character. */ +{ +char str[2]; +str[0] = c; +str[1] = '\0'; +return cloneString(str); +} + +static void checkRefAllele(struct hgvsVariant *hgvs, int start1, char *accSeq, char **retDiffRefAllele) +/* If hgvs change includes a reference allele, and if accSeq at start1 does not match, + * then set retDiffRefAllele to the accSeq version. retDiffRefAllele must be non-NULL. */ +{ +char hgvsRefBase = (hgvs->type == hgvstProtein) ? refBaseFromProt(hgvs->change) : + refBaseFromNucSubst(hgvs->change); +if (hgvsRefBase != '\0') + { + char seqRefBase = toupper(accSeq[start1-1]); + if (seqRefBase != hgvsRefBase) + *retDiffRefAllele = cloneStringFromChar(seqRefBase); + } +if (hgvs->type == hgvstProtein && *retDiffRefAllele == NULL) + { + // Protein ranges have a second ref allele base to check + char *p = strchr(hgvs->change, '_'); + if (p != NULL) + { + hgvsRefBase = refBaseFromProt(p+1); + if (hgvsRefBase != '\0') + { + int end1 = atoi(skipToNumeric(p+1)); + char seqRefBase = toupper(accSeq[end1-1]); + if (seqRefBase != hgvsRefBase) + *retDiffRefAllele = cloneStringFromChar(seqRefBase); + } + } + } +} + +static boolean hgvsValidateGene(char *db, struct hgvsVariant *hgvs, + char **retFoundAcc, int *retFoundVersion, char **retDiffRefAllele) /* Return TRUE if hgvs coords are within the bounds of the sequence for hgvs->seqAcc. + * Note: Coding terms may contain coords outside the bounds (upstream, intron, downstream) so + * those can't be checked without mapping the term to the genome. * If retFoundAcc is not NULL, set it to our local accession (which may be missing the .version * of hgvs->seqAcc) or NULL if we can't find any match. * If retFoundVersion is not NULL and hgvs->seqAcc has a version number (e.g. NM_005957.4), * set retFoundVersion to our version from latest GenBank, otherwise 0 (no version for LRG). * If coords are OK and retDiffRefAllele is not NULL: if our sequence at the coords * matches hgvs->refAllele then set it to NULL; if mismatch then set it to our sequence. */ { +char *foundAcc = NULL; +int foundVersion = 0; boolean coordsOK = FALSE; char *acc = hgvs->seqAcc; -char *foundAcc = NULL; char *accSeq = NULL; -int cdsOffset = 0; -int refLen = strlen(hgvs->refAllele); if (hgvs->type == hgvstProtein) accSeq = getProteinSeq(db, acc, &foundAcc); else accSeq = getCdnaSeq(db, acc, &foundAcc); -if (retFoundVersion) - { - *retFoundVersion = 0; if (foundAcc && ! startsWith("LRG_", foundAcc)) - *retFoundVersion = getGbVersion(db, foundAcc); - } -if (hgvs->type == hgvstCoding && foundAcc) - cdsOffset = getCdsStart(db, foundAcc); -if (accSeq && cdsOffset >= 0 && - (cdsOffset + hgvs->start1 - 1 + refLen) <= strlen(accSeq)) + foundVersion = getGbVersion(db, foundAcc); +if (foundAcc) { - coordsOK = TRUE; - } + int seqLen = strlen(accSeq); + if (hgvs->type == hgvstCoding) + { + // Coding term coords can extend beyond the bounds of the transcript so + // we can't check them without mapping to the genome. However, if the coords + // are in bounds and a reference allele is provided, we can check that. + struct genbankCds cds; + coordsOK = getCds(db, foundAcc, &cds); if (coordsOK && retDiffRefAllele) { - char *ourSeq = cloneStringZ(accSeq + cdsOffset + hgvs->start1 - 1, refLen); - toUpperN(ourSeq, refLen); - if (sameString(hgvs->refAllele, ourSeq)) - *retDiffRefAllele = NULL; + int start = hgvs->start1 + (hgvs->startIsUtr3 ? cds.end : cds.start); + if (hgvs->startOffset == 0 && start >= 1 && start <= seqLen) + checkRefAllele(hgvs, start, accSeq, retDiffRefAllele); + } + } else - *retDiffRefAllele = ourSeq; + { + if (hgvs->start1 >= 1 && hgvs->start1 <= seqLen && + hgvs->end >= 1 && hgvs->end <= seqLen) + { + coordsOK = TRUE; + if (retDiffRefAllele) + checkRefAllele(hgvs, hgvs->start1, accSeq, retDiffRefAllele); + } + } } if (retFoundAcc) *retFoundAcc = foundAcc; +if (retFoundVersion) + *retFoundVersion = foundVersion; return coordsOK; } -static struct psl *pslFromHgvs(struct hgvsVariant *hgvs, int accSize, int cdsStart) -/* Allocate and return a PSL modeling the variant in transcript named acc. */ +boolean hgvsValidate(char *db, struct hgvsVariant *hgvs, + char **retFoundAcc, int *retFoundVersion, char **retDiffRefAllele) +/* Return TRUE if hgvs coords are within the bounds of the sequence for hgvs->seqAcc. + * Note: Coding terms may contain coords outside the bounds (upstream, intron, downstream) so + * those can't be checked without mapping the term to the genome; this returns TRUE if seq is found. + * If retFoundAcc is not NULL, set it to our local accession (which may be missing the .version + * of hgvs->seqAcc) or NULL if we can't find any match. + * If retFoundVersion is not NULL and hgvs->seqAcc has a version number (e.g. NM_005957.4), + * set retFoundVersion to our version from latest GenBank, otherwise 0 (no version for LRG). + * If coords are OK and retDiffRefAllele is not NULL: if our sequence at the coords + * matches hgvs->refAllele then set it to NULL; if mismatch then set it to our sequence. */ +{ +if (hgvs->type == hgvstGenomic || hgvs->type == hgvstMito) + return hgvsValidateGenomic(db, hgvs, retFoundAcc, retFoundVersion, retDiffRefAllele); +else + return hgvsValidateGene(db, hgvs, retFoundAcc, retFoundVersion, retDiffRefAllele); +} + +static struct bed3 *hgvsMapGDotToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable) +/* Given an NC_*g. term, if we can map NC_ to chrom then return the region, else NULL. */ +{ +struct bed3 *region = NULL; +char *chrom = hgOfficialChromName(db, hgvs->seqAcc); +if (isNotEmpty(chrom)) + { + AllocVar(region); + region->chrom = chrom; + region->chromStart = hgvs->start1 - 1; + region->chromEnd = hgvs->end; + } +if (retPslTable) + *retPslTable = NULL; +return region; +} + +static void hgvsCodingToZeroBasedHalfOpen(struct hgvsVariant *hgvs, + int maxCoord, struct genbankCds cds, + int *retStart, int *retEnd, + int *retUpstreamBases, int *retDownstreamBases) +/* Convert a coding HGVS's start1 and end into UCSC coords plus upstream and downstream lengths + * for when the coding HGVS has coordinates that extend beyond its sequence boundaries. + * ret* args must be non-NULL. */ +{ +// If hgvs->start1 is negative, it is effectively 0-based, so subtract 1 only if positive. +int start = hgvs->start1; +if (start > 0) + start -= 1; +// If hgvs->end is negative, it is effectively 0-based, so add 1 to get half-open end. +int end = hgvs->end; +if (end < 0) + end += 1; +// If the position follows '*' that means it's relative to cdsEnd; otherwise rel to cdsStart +if (hgvs->startIsUtr3) + *retStart = cds.end + start; +else + *retStart = cds.start + start; +if (hgvs->endIsUtr3) + *retEnd = cds.end + end; +else + *retEnd = cds.start + end; +// Now check for coords that extend beyond the transcript('s alignment to the genome) +if (*retStart < 0) + { + // hgvs->start1 is upstream of coding transcript. + *retUpstreamBases = -*retStart; + *retStart = 0; + } +else if (*retStart > maxCoord) + { + // Even the start coord is downstream of coding transcript -- make a negative "upstream" + // for adjusting start. + *retUpstreamBases = -(*retStart - maxCoord + 1); + *retStart = maxCoord - 1; + } +else + *retUpstreamBases = 0; +if (*retEnd > maxCoord) + { + // hgvs->end is downstream of coding transcript. + *retDownstreamBases = *retEnd - maxCoord; + *retEnd = maxCoord; + } +else if (*retEnd < 0) + { + // Even the end coord is upstream of coding transcript -- make a negative "downstream" + // for adjusting end. + *retDownstreamBases = *retEnd; + *retEnd = 0; + } +else + *retDownstreamBases = 0; +} + +static struct psl *pslFromHgvsNuc(struct hgvsVariant *hgvs, int accSize, int accEnd, + struct genbankCds cds, + int *retUpstreamBases, int *retDownstreamBases) +/* Allocate and return a PSL modeling the variant in nucleotide sequence acc. + * The PSL target is the sequence and the query is the changed part of the sequence. + * accEnd is given in case accSize includes an unaligned poly-A tail. + * If hgvs is coding ("c.") then the caller must pass in a valid cds. + * In case the start or end position is outside the bounds of the sequence, set retUpstreamBases + * or retDownstreamBases to the number of bases beyond the beginning or end of sequence. + * NOTE: coding intron offsets are ignored; the PSL contains the exon anchor point + * and the caller will have to map that to the genome and then apply the intron offset. */ { if (hgvs == NULL) return NULL; +if (hgvs->type == hgvstProtein) + errAbort("pslFromHgvsNuc must be called only on nucleotide HGVSs, not protein."); struct psl *psl; AllocVar(psl); -int refLen = strlen(hgvs->refAllele); -int altLen = strlen(hgvs->altAllele); -struct dyString *dy = dyStringCreate("%s%s%s", - hgvs->refAllele, hgvs->changeSymbol, hgvs->altAllele); -psl->qName = dyStringCannibalize(&dy); +psl->tName = cloneFirstWordByDelimiter(hgvs->seqAcc, '.'); +safecpy(psl->strand, sizeof(psl->strand), "+"); +psl->tSize = accSize; +if (hgvs->type != hgvstCoding) + { + // Sane 1-based fully closed coords. + psl->tStart = hgvs->start1 - 1; + psl->tEnd = hgvs->end; + } +else + { + // Simple or insanely complex CDS-relative coords. + hgvsCodingToZeroBasedHalfOpen(hgvs, accEnd, cds, &psl->tStart, &psl->tEnd, + retUpstreamBases, retDownstreamBases); + } +int refLen = psl->tEnd - psl->tStart; +// Just use refLen for alt until we parse the sequence change portion of the term: +int altLen = refLen; +psl->qName = cloneString(hgvs->change); psl->qSize = altLen; psl->qStart = 0; psl->qEnd = altLen; -psl->tName = cloneFirstWordByDelimiter(hgvs->seqAcc, '.'); -psl->tSize = accSize; -safecpy(psl->strand, sizeof(psl->strand), "+"); -psl->tStart = cdsStart + hgvs->start1 - 1; -psl->tEnd = psl->tStart + refLen; psl->blockCount = 1; AllocArray(psl->blockSizes, psl->blockCount); AllocArray(psl->qStarts, psl->blockCount); AllocArray(psl->tStarts, psl->blockCount); psl->blockSizes[0] = refLen <= altLen ? refLen : altLen; psl->qStarts[0] = psl->qStart; psl->tStarts[0] = psl->tStart; return psl; } -static struct psl *hgvsMapCDotToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable) -/* Return a psl with target coords from db, mapping the variant to the genome. - * Return NULL if unable to map. +#define limitToRange(val, min, max) { if (val < min) { val = min; } \ + if (val > max) { val = max; } } + +static struct bed3 *hgvsMapNucToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable) +/* Return a bed3 with the variant's span on the genome, or NULL if unable to map. * If successful and retPslTable is not NULL, set it to the name of the PSL table used. */ { -struct psl *mappedToGenome = NULL; +struct bed3 *region = NULL; char *trackTable = NULL; char *pslTable = NULL; char *acc = cloneFirstWordByDelimiter(hgvs->seqAcc, '.'); -int cdsStart = -1; +if (hgvs->type == hgvstGenomic) + return hgvsMapGDotToGenome(db, hgvs, retPslTable); if (startsWith("NM_", acc)) { // TODO: replace these with NCBI's alignments trackTable = "refGene"; pslTable = "refSeqAli"; } else if (startsWith("LRG_", acc)) { trackTable = "lrgTranscriptAli"; pslTable = "lrgTranscriptAli"; } -cdsStart = getCdsStart(db, acc); -if (trackTable && pslTable && cdsStart >= 0 && +struct genbankCds cds; +boolean gotCds = (hgvs->type == hgvstCoding) ? getCds(db, acc, &cds) : FALSE; +if (trackTable && pslTable && (hgvs->type != hgvstCoding || gotCds) && hTableExists(db, trackTable) && hTableExists(db, pslTable)) { struct dyString *dyQuery = sqlDyStringCreate("select * from %s where qName = '%s'", pslTable, acc); struct sqlConnection *conn = hAllocConn(db); struct sqlResult *sr = sqlGetResult(conn, dyQuery->string); int bin = hIsBinned(db, pslTable); char **row; if (sr && (row = sqlNextRow(sr))) { struct psl *txAli = pslLoad(row+bin); - struct psl *variantPsl = pslFromHgvs(hgvs, txAli->qSize, cdsStart); - mappedToGenome = pslTransMap(pslTransMapNoOpts, variantPsl, txAli); + // variantPsl contains the anchor if a non-cdsStart anchor is used because + // the actual position might be outside the bounds of the transcript sequence (intron/UTR) + int upstream = 0, downstream = 0; + struct psl *variantPsl = pslFromHgvsNuc(hgvs, txAli->qSize, txAli->qEnd, cds, + &upstream, &downstream); + struct psl *mappedToGenome = pslTransMap(pslTransMapNoOpts, variantPsl, txAli); + if (mappedToGenome) + { + AllocVar(region); + region->chrom = cloneString(mappedToGenome->tName); + region->chromStart = mappedToGenome->tStart; + region->chromEnd = mappedToGenome->tEnd; + // If the start and/or end is in an intron, add the intron offset now. + boolean revStrand = (mappedToGenome->strand[0] == '-'); + if (hgvs->startOffset != 0) + { + if (revStrand) + region->chromEnd -= hgvs->startOffset; + else + region->chromStart += hgvs->startOffset; + } + if (hgvs->endOffset != 0) + { + if (revStrand) + region->chromStart -= hgvs->endOffset; + else + region->chromEnd += hgvs->endOffset; + } + // Apply extra up/downstream offsets (usually 0) + if (revStrand) + { + region->chromStart -= downstream; + region->chromEnd += upstream; + } + else + { + region->chromStart -= upstream; + region->chromEnd += downstream; + } + limitToRange(region->chromStart, 0, mappedToGenome->tSize) + limitToRange(region->chromEnd, 0, mappedToGenome->tSize) + freez(&mappedToGenome); + } } sqlFreeResult(&sr); hFreeConn(&conn); } -if (mappedToGenome && retPslTable) +if (region && retPslTable) *retPslTable = cloneString(pslTable); -return mappedToGenome; -} - -static char *aaToArbitraryCodons(char *aaStr) -/* Translate amino acid string into codon string where we just take the first codon we - * find for each amino. */ -{ -int aaLen = strlen(aaStr); -char *codonStr = needMem(aaLen*3 + 1); -int i; -for (i = 0; i < aaLen; i++) - aaToArbitraryCodon(aaStr[i], &codonStr[i*3]); -return codonStr; +return region; } -static struct psl *hgvsMapPDotToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable) -/* Return a psl with target coords from db, mapping the variant to the genome. - * Return NULL if unable to map. +static struct bed3 *hgvsMapPDotToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable) +/* Return a bed3 with the variant's span on the genome, or NULL if unable to map. * If successful and retPslTable is not NULL, set it to the name of the PSL table used. */ { -struct psl *mappedToGenome = NULL; +struct bed3 *region = NULL; char *acc = cloneFirstWordByDelimiter(hgvs->seqAcc, '.'); if (startsWith("NP_", acc)) { // Translate the NP_*:p. to NM_*:c. and map NM_*:c. to the genome. struct sqlConnection *conn = hAllocConn(db); char query[2048]; sqlSafef(query, sizeof(query), "select mrnaAcc from %s l, refGene r " "where l.protAcc = '%s' and r.name = l.mrnaAcc", refLinkTable, acc); char *nmAcc = sqlQuickString(conn, query); hFreeConn(&conn); if (nmAcc) { struct hgvsVariant cDot; zeroBytes(&cDot, sizeof(cDot)); cDot.type = hgvstCoding; cDot.seqAcc = nmAcc; - cDot.changeSymbol = ">"; - // These aren't necessarily right since AA doesn't have as much info as codon. - // We could get the refAllele right using the reference sequence -- but altAllele - // could still be ambiguous. In the meantime we're not really using the alleles - // for anything besides length so far... - cDot.refAllele = aaToArbitraryCodons(hgvs->refAllele); - cDot.altAllele = aaToArbitraryCodons(hgvs->altAllele); cDot.start1 = ((hgvs->start1 - 1) * 3) + 1; cDot.end = ((hgvs->end - 1) * 3) + 3; - mappedToGenome = hgvsMapCDotToGenome(db, &cDot, retPslTable); + cDot.change = hgvs->change; + region = hgvsMapNucToGenome(db, &cDot, retPslTable); } } -return mappedToGenome; +return region; } -struct psl *hgvsMapToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable) -/* Return a psl with target coords from db, mapping the variant to the genome. - * Return NULL if unable to map. +struct bed3 *hgvsMapToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable) +/* Return a bed3 with the variant's span on the genome, or NULL if unable to map. * If successful and retPslTable is not NULL, set it to the name of the PSL table used. */ { -if (hgvs == NULL || !hgvsValidate(db, hgvs, NULL, NULL, NULL)) +if (hgvs == NULL) return NULL; -struct psl *mappedToGenome = NULL; -if (hgvs->type == hgvstCoding) - mappedToGenome = hgvsMapCDotToGenome(db, hgvs, retPslTable); -else if (hgvs->type == hgvstProtein) - mappedToGenome = hgvsMapPDotToGenome(db, hgvs, retPslTable); -return mappedToGenome; +struct bed3 *region = NULL; +if (hgvs->type == hgvstProtein) + region = hgvsMapPDotToGenome(db, hgvs, retPslTable); +else + region = hgvsMapNucToGenome(db, hgvs, retPslTable); +return region; }