a1c329068ad18aa4093972b501e6324f8bcdb398
angie
Fri Sep 16 17:19:30 2016 -0700
Parse HGVS position ranges and coding terms' UTR and intron coordinates.
When the chromAlias table is present, this also now supports NC_*:g. terms.
The sequence change part is no longer parsed -- it's not necessary for mapping
to genomic ranges, although it will be necessary in order for hgVai to take
HGVS input.
This does not yet support ranges-of-ranges but neither does Mutalyzer. This
also doesn't support uncertain positions ('?'). Like complex sequence changes,
those can wait until we have a sophisticated parser.
We also support some new not-quite-HGVS forms: geneSymbol and protein pos only,
or geneSymbol and c.<valid position range>.
refs #15071
diff --git src/hg/lib/hgHgvs.c src/hg/lib/hgHgvs.c
index 38ed891..3074a77 100644
--- src/hg/lib/hgHgvs.c
+++ src/hg/lib/hgHgvs.c
@@ -1,531 +1,1033 @@
/* hgHgvs.c - Parse the Human Genome Variation Society (HGVS) nomenclature for variants. */
/* See http://varnomen.hgvs.org/ and https://github.com/mutalyzer/mutalyzer/ */
/* Copyright (C) 2016 The Regents of the University of California
* See README in this or parent directory for licensing information. */
#include "common.h"
#include "hgHgvs.h"
+#include "chromInfo.h"
#include "genbank.h"
#include "hdb.h"
#include "pslTransMap.h"
#include "regexHelper.h"
// Regular expressions for HGVS-recognized sequence accessions: LRG or versioned RefSeq:
#define lrgTranscriptExp "(LRG_[0-9]+t[0-9+])"
#define lrgRegionExp "(LRG_[0-9+])"
// NC = RefSeq reference assembly chromosome
// NG = RefSeq incomplete genomic region (e.g. gene locus)
// NM = RefSeq curated mRNA
// NP = RefSeq curated protein
// NR = RefSeq curated (non-coding) RNA
-#define geneSymbolExp "([A-Z0-9./_-]+)"
+#define geneSymbolExp "([A-Za-z0-9./_-]+)"
#define optionalGeneSymbolExp "(\\(" geneSymbolExp "\\))?"
#define versionedAccPrefixExp(p) "(" p "_[0-9]+(\\.[0-9]+)?)" optionalGeneSymbolExp
// ........................ accession and optional dot version
// ........... optional dot version
// ...... optional gene symbol in ()s
// .... optional gene symbol
#define versionedRefSeqNCExp versionedAccPrefixExp("NC")
#define versionedRefSeqNGExp versionedAccPrefixExp("NG")
#define versionedRefSeqNMExp versionedAccPrefixExp("NM")
#define versionedRefSeqNPExp versionedAccPrefixExp("NP")
#define versionedRefSeqNRExp versionedAccPrefixExp("NR")
-// Nucleotide substitution regexes
+// Nucleotide position regexes
// (c. = CDS, g. = genomic, m. = mitochondrial, n.= non-coding RNA, r. = RNA)
-#define hgvsNtSubstExp "([0-9]+)([ACGT]+)>([ACGT]+)"
-// ...... 1-based position
-// ....... original sequence
-// ....... replacement sequence
-#define hgvsCDotSubstExp "c\\." hgvsNtSubstExp
-#define hgvsGDotSubstExp "g\\." hgvsNtSubstExp
-#define hgvsMDotSubstExp "m\\." hgvsNtSubstExp
-#define hgvsNDotSubstExp "n\\." hgvsNtSubstExp
-#define hgvsRDotSubstExp "r\\." hgvsNtSubstExp
+#define posIntExp "([0-9]+)"
+#define hgvsNtPosExp posIntExp "(_" posIntExp ")?"
+// ...... 1-based start position
+// ............. optional range separator and end position
+// ...... 1-based end position
+// Now for a regex that can handle positions like "26" or "*80" or "-24+75_-24+92"...
+#define anchorExp "([-*])?"
+#define offsetExp "([-+])"
+#define complexNumExp anchorExp posIntExp "(" offsetExp posIntExp ")?"
+#define hgvsCdsPosExp complexNumExp "(_" complexNumExp ")?"
+// ... optional anchor "-" or "*"
+// ... mandatory first number
+// ....... optional offset separator and offset
+// ... offset separator
+// ... offset number
+// ............... optional range separator and complex num
+// ... optional anchor "-" or "*"
+// ... first number
+// ....... optional offset separator and offset
+// ... offset separator
+// ... offset number
+
+#define hgvsCDotPosExp "c\\." hgvsCdsPosExp
+#define hgvsGDotPosExp "g\\." hgvsNtPosExp
+#define hgvsMDotPosExp "m\\." hgvsNtPosExp
+#define hgvsNDotPosExp "n\\." hgvsNtPosExp
+#define hgvsRDotPosExp "r\\." hgvsNtPosExp
// Protein substitution regex
-#define AA3 "Ala|Arg|Asn|Asp|Cys|Gln|Glu|Gly|His|Ile|Leu|Lys|Met|Phe|Pro|Ser|Thr|Trp|Tyr|Val|Ter"
-#define hgvsAminoAcidExp "([ARNDCQEGHILKMFPSTWYVX*]|" AA3 ")"
-#define hgvsAminoAcidSubstExp hgvsAminoAcidExp "([0-9]+)" hgvsAminoAcidExp
+#define aa3Exp "Ala|Arg|Asn|Asp|Cys|Gln|Glu|Gly|His|Ile|Leu|Lys|Met|Phe|Pro|Ser|Thr|Trp|Tyr|Val|Ter"
+#define hgvsAminoAcidExp "[ARNDCQEGHILKMFPSTWYVX*]|" aa3Exp
+#define hgvsAminoAcidSubstExp "(" hgvsAminoAcidExp ")" posIntExp "(" hgvsAminoAcidExp "|=)"
#define hgvsPDotSubstExp "p\\." hgvsAminoAcidSubstExp
// ... // original sequence
// ...... // 1-based position
// ... // replacement sequence
+// Protein range (or just single pos) regex
+#define hgvsAaRangeExp "(" hgvsAminoAcidExp ")" posIntExp "(_(" hgvsAminoAcidExp ")" posIntExp ")?(.*)"
+#define hgvsPDotRangeExp "p\\." hgvsAaRangeExp
+// original start AA ...
+// 1-based start position ...
+// optional range sep and AA+pos .....................................
+// original end AA ...
+// 1-based end position ...
+// change description ...
// Complete HGVS term regexes combining sequence identifiers and change operations
-#define hgvsFullRegex(seq, op) "^" seq ":" op "$"
+#define hgvsFullRegex(seq, op) "^" seq ":" op
-#define hgvsLrgCDotSubstExp hgvsFullRegex(lrgTranscriptExp, hgvsCDotSubstExp)
+#define hgvsRefSeqNCGDotPosExp hgvsFullRegex(versionedRefSeqNCExp, hgvsGDotPosExp)
+#define hgvsRefSeqNCGDotExp hgvsRefSeqNCGDotPosExp "(.*)"
+// substring numbering:
+// 0..................................... whole matching string
+// 1................. accession and optional dot version
+// 2........ optional dot version
+// 3...... (n/a) optional gene symbol in ()s
+// 4.... (n/a) optional gene symbol
+// 5... 1-based start position
+// 6.... optional range separator and end position
+// 7... 1-based end position
+// 8.... change description
+
+#define hgvsLrgCDotPosExp hgvsFullRegex(lrgTranscriptExp, hgvsCDotPosExp)
+#define hgvsLrgCDotExp hgvsLrgCDotPosExp "(.*)"
// substring numbering:
// 0..................................................... whole matching string
// 1................... LRG transcript
-// 2..... 1-based position
-// 3...... original sequence
-// 4...... replacement sequence
+// 2... optional anchor "-" or "*"
+// 3... mandatory first number
+// 4....... optional offset separator and offset
+// 5... offset separator
+// 6... offset number
+// 7............... optional range sep and complex num
+// 8... optional anchor "-" or "*"
+// 9... first number
+// 10....... optional offset separator and offset
+// 11... offset separator
+// 12... offset number
+// 13........ sequence change
-#define hgvsRefSeqNMCDotSubstExp hgvsFullRegex(versionedRefSeqNMExp, hgvsCDotSubstExp)
+#define hgvsRefSeqNMCDotPosExp hgvsFullRegex(versionedRefSeqNMExp, hgvsCDotPosExp)
+#define hgvsRefSeqNMCDotExp hgvsRefSeqNMCDotPosExp "(.*)"
// substring numbering:
-// 0..................................................... whole matching string
-// 1............... accession and optional dot version
+// 0............................................................... whole matching string
+// 1............... acc & optional dot version
// 2........ optional dot version
-// 3..... optional gene symbol in ()s
+// 3..... optional gene sym in ()s
// 4... optional gene symbol
-// 5..... 1-based position
-// 6...... original sequence
-// 7...... replacement sequence
+// 5... optional anchor
+// 6... mandatory first number
+// 7....... optional offset
+// 8... offset separator
+// 9... offset number
+// 10............... optional range
+// 11... optional anchor
+// 12... first number
+// 13....... optional offset
+// 14... offset separator
+// 15... offset number
+// 16........ sequence change
#define hgvsRefSeqNPPDotSubstExp hgvsFullRegex(versionedRefSeqNPExp, hgvsPDotSubstExp)
// substring numbering:
// 0..................................................... whole matching string
// 1................ accession and optional dot version
// 2..... optional dot version
// 3..... optional gene symbol in ()s
// 4... optional gene symbol
// 5..... original sequence
// 6...... 1-based position
// 7...... replacement sequence
+#define hgvsRefSeqNPPDotRangeExp hgvsFullRegex(versionedRefSeqNPExp, hgvsPDotRangeExp)
+// substring numbering:
+// 0..................................................... whole matching string
+// 1................ accession and optional dot version
+// 2..... optional dot version
+// 3..... optional gene symbol in ()s
+// 4... optional gene symbol
+// 5... original start AA
+// 6... 1-based start position
+// 7................. optional range sep and AA+pos
+// 8... original end AA
+// 9... 1-based end position
+// 10..... change description
+
// Pseudo-HGVS in common usage
-#define pseudoHgvsGeneSymbolSubstExp "^" geneSymbolExp "[: p.]+" hgvsAminoAcidSubstExp
+#define pseudoHgvsGeneSymbolProtSubstExp "^" geneSymbolExp "[: p.]+" hgvsAminoAcidSubstExp
// 0..................................................... whole matching string
// 1................... gene symbol
// 2..... original sequence
// 3...... 1-based position
// 4...... replacement sequence
-static struct hgvsVariant *hgvsParseCDotSubst(char *term)
-/* If term is parseable as an HGVS CDS substitution, return the parsed representation,
- * otherwise NULL. */
+#define pseudoHgvsGeneSymbolProtRangeExp "^" geneSymbolExp "[: p.]+" hgvsAaRangeExp
+// 0..................................................... whole matching string
+// 1................... gene symbol
+// 2... original start AA
+// 3... 1-based start position
+// 4................ optional range sep and AA+pos
+// 5... original end AA
+// 6... 1-based end position
+// 7..... change description
+
+#define pseudoHgvsGeneSymbolProtPosExp "^" geneSymbolExp "[: p.]+" posIntExp
+// 0.......................... whole matching string
+// 1................... gene symbol
+// 2..... 1-based position
+
+
+#define pseudoHgvsGeneSympolCDotPosExp "^" geneSymbolExp "[: ]+" hgvsCDotPosExp
+// 0..................................................... whole matching string
+// 1................... gene symbol
+// 2..... optional beginning of position exp
+// 3..... beginning of position exp
+
+
+static struct hgvsVariant *hgvsParseGDotPos(char *term)
+/* If term is parseable as an HGVS NC_...g. term, return the parsed representation, else NULL. */
+{
+struct hgvsVariant *hgvs = NULL;
+regmatch_t substrs[17];
+if (regexMatchSubstr(term, hgvsRefSeqNCGDotExp, substrs, ArraySize(substrs)))
+ {
+ int accIx = 1;
+ int startPosIx = 5;
+ int endPosIx = 7;
+ int changeIx = 8;
+ AllocVar(hgvs);
+ hgvs->type = hgvstGenomic;
+ hgvs->seqAcc = regexSubstringClone(term, substrs[accIx]);
+ hgvs->start1 = regexSubstringInt(term, substrs[startPosIx]);
+ if (regexSubstrMatched(substrs[endPosIx]))
+ hgvs->end = regexSubstringInt(term, substrs[endPosIx]);
+ else
+ hgvs->end = hgvs->start1;
+ hgvs->change = regexSubstringClone(term, substrs[changeIx]);
+ }
+return hgvs;
+}
+
+static void extractComplexNum(char *term, regmatch_t *substrs, int substrOffset,
+ boolean *retIsUtr3, int *retPos, int *retOffset)
+/* Extract matches from substrs starting at substrOffset to parse a complex number
+ * description into anchor type, 1-based position and optional offset. */
+{
+int anchorIx = substrOffset;
+int posIx = substrOffset + 1;
+int offsetOpIx = substrOffset + 3;
+int offsetIx = substrOffset + 4;
+// Determine what startPos is relative to, based on optional anchor and optional offset
+char anchor[16]; // string length 0 or 1
+regexSubstringCopy(term, substrs[anchorIx], anchor, sizeof(anchor));
+char offsetOp[16]; // string length 0 or 1
+regexSubstringCopy(term, substrs[offsetOpIx], offsetOp, sizeof(offsetOp));
+*retIsUtr3 = (anchor[0] == '*');
+*retPos = regexSubstringInt(term, substrs[posIx]);
+// Is pos negative?
+if (anchor[0] == '-')
+ *retPos = -*retPos;
+// Grab offset, if there is one.
+if (isNotEmpty(offsetOp))
+ {
+ *retOffset = regexSubstringInt(term, substrs[offsetIx]);
+ if (offsetOp[0] == '-')
+ *retOffset = -*retOffset;
+ }
+}
+
+static struct hgvsVariant *hgvsParseCDotPos(char *term)
+/* If term is parseable as an HGVS CDS term, return the parsed representation, otherwise NULL. */
{
struct hgvsVariant *hgvs = NULL;
boolean matches = FALSE;
int accIx = 1;
-int posIx = 0;
-int refIx = 0;
-int altIx = 0;
+int startAnchorIx = 2;
+int endAnchorIx = 8;
+int endPosIx = 9;
+int changeIx = 13;
+// The LRG accession regex has only one substring but the RefSeq acc regex has 4, so that
+// affects all substring offsets after the accession.
+int refSeqExtra = 3;
int geneSymbolIx = -1;
-regmatch_t substrs[8];
-if (regexMatchSubstrNoCase(term, hgvsLrgCDotSubstExp, substrs, ArraySize(substrs)))
+regmatch_t substrs[17];
+if (regexMatchSubstr(term, hgvsLrgCDotExp, substrs, ArraySize(substrs)))
{
matches = TRUE;
- posIx = 2;
- refIx = 3;
- altIx = 4;
}
-else if (regexMatchSubstrNoCase(term, hgvsRefSeqNMCDotSubstExp, substrs, ArraySize(substrs)))
+else if (regexMatchSubstr(term, hgvsRefSeqNMCDotExp, substrs, ArraySize(substrs)))
{
matches = TRUE;
geneSymbolIx = 4;
- posIx = 5;
- refIx = 6;
- altIx = 7;
+ startAnchorIx += refSeqExtra;
+ endAnchorIx += refSeqExtra;
+ endPosIx += refSeqExtra;
+ changeIx += refSeqExtra;
}
if (matches)
{
AllocVar(hgvs);
hgvs->type = hgvstCoding;
- hgvs->changeSymbol = cloneString(">");
hgvs->seqAcc = regexSubstringClone(term, substrs[accIx]);
- hgvs->start1 = regexSubstringInt(term, substrs[posIx]);
- hgvs->refAllele = regexSubstringClone(term, substrs[refIx]);
- hgvs->altAllele = regexSubstringClone(term, substrs[altIx]);
+ extractComplexNum(term, substrs, startAnchorIx,
+ &hgvs->startIsUtr3, &hgvs->start1, &hgvs->startOffset);
if (geneSymbolIx >= 0 && regexSubstrMatched(substrs[geneSymbolIx]))
hgvs->seqGeneSymbol = regexSubstringClone(term, substrs[geneSymbolIx]);
- hgvs->end = hgvs->start1 - 1 + strlen(hgvs->refAllele);
- }
-return hgvs;
-}
-
-static char *aaAbbrsToLetters(char *aaStr)
-/* If the input string is composed of AA abbreviations like "Ala", "Asp" etc., convert them to
- * single letter AAs like "A", "D" etc. Otherwise return a copy of aaStr. */
-{
-char *letters = NULL;
-if (regexMatch(aaStr, "^(" AA3 ")+$"))
+ if (regexSubstrMatched(substrs[endPosIx]))
{
- int len = strlen(aaStr) / 3;
- letters = needMem(len + 1);
- int i;
- for (i = 0; i < len; i++)
- letters[i] = aaAbbrToLetter(&aaStr[i*3]);
+ extractComplexNum(term, substrs, endAnchorIx,
+ &hgvs->endIsUtr3, &hgvs->end, &hgvs->endOffset);
}
else
- letters = cloneString(aaStr);
-return letters;
+ {
+ hgvs->end = hgvs->start1;
+ hgvs->endIsUtr3 = hgvs->startIsUtr3;
+ hgvs->endOffset = hgvs->startOffset;
+ }
+ hgvs->change = regexSubstringClone(term, substrs[changeIx]);
+ }
+return hgvs;
}
static struct hgvsVariant *hgvsParsePDotSubst(char *term)
/* If term is parseable as an HGVS protein substitution, return the parsed representation,
* otherwise NULL. */
{
struct hgvsVariant *hgvs = NULL;
regmatch_t substrs[8];
-if (regexMatchSubstrNoCase(term, hgvsRefSeqNPPDotSubstExp, substrs, ArraySize(substrs)))
+if (regexMatchSubstr(term, hgvsRefSeqNPPDotSubstExp, substrs, ArraySize(substrs)))
{
int accIx = 1;
int geneSymbolIx = 4;
int refIx = 5;
int posIx = 6;
int altIx = 7;
AllocVar(hgvs);
hgvs->type = hgvstProtein;
hgvs->seqAcc = regexSubstringClone(term, substrs[accIx]);
+ int changeStart = substrs[refIx].rm_so;
+ int changeEnd = substrs[altIx].rm_eo;
+ int len = changeEnd - changeStart;
+ char change[len+1];
+ safencpy(change, sizeof(change), term+changeStart, len);
+ hgvs->change = cloneString(change);
if (regexSubstrMatched(substrs[geneSymbolIx]))
hgvs->seqGeneSymbol = regexSubstringClone(term, substrs[geneSymbolIx]);
hgvs->start1 = regexSubstringInt(term, substrs[posIx]);
- // Convert 3-letter abbreviations to single-letter codes like "Ala" -> "A",
- // so strlen(allele) is the number of amino acids.
- char buf[2048];
- regexSubstringCopy(term, substrs[refIx], buf, sizeof(buf));
- hgvs->refAllele = aaAbbrsToLetters(buf);
- regexSubstringCopy(term, substrs[altIx], buf, sizeof(buf));
- hgvs->altAllele = aaAbbrsToLetters(buf);
- hgvs->end = hgvs->start1 - 1 + strlen(hgvs->refAllele);
+ hgvs->end = hgvs->start1;
+ }
+return hgvs;
+}
+
+static struct hgvsVariant *hgvsParsePDotRange(char *term)
+/* If term is parseable as an HGVS protein range change, return the parsed representation,
+ * otherwise NULL. */
+{
+struct hgvsVariant *hgvs = NULL;
+regmatch_t substrs[11];
+if (regexMatchSubstr(term, hgvsRefSeqNPPDotRangeExp, substrs, ArraySize(substrs)))
+ {
+ int accIx = 1;
+ int geneSymbolIx = 4;
+ int startRefIx = 5;
+ int startPosIx = 6;
+ int endPosIx = 9;
+ int changeIx = 10;
+ AllocVar(hgvs);
+ hgvs->type = hgvstProtein;
+ hgvs->seqAcc = regexSubstringClone(term, substrs[accIx]);
+ int changeStart = substrs[startRefIx].rm_so;
+ int changeEnd = substrs[changeIx].rm_eo;
+ int len = changeEnd - changeStart;
+ char change[len+1];
+ safencpy(change, sizeof(change), term+changeStart, len);
+ hgvs->change = cloneString(change);
+ if (regexSubstrMatched(substrs[geneSymbolIx]))
+ hgvs->seqGeneSymbol = regexSubstringClone(term, substrs[geneSymbolIx]);
+ hgvs->start1 = regexSubstringInt(term, substrs[startPosIx]);
+ if (regexSubstrMatched(substrs[endPosIx]))
+ hgvs->end = regexSubstringInt(term, substrs[endPosIx]);
+ else
+ hgvs->end = hgvs->start1;
}
return hgvs;
}
struct hgvsVariant *hgvsParseTerm(char *term)
/* If term is a parseable form of HGVS, return the parsed representation, otherwise NULL.
- * This does not check validity of accessions or alleles. */
+ * This does not check validity of accessions, coordinates or alleles. */
{
-struct hgvsVariant *hgvs = hgvsParseCDotSubst(term);
+struct hgvsVariant *hgvs = hgvsParseCDotPos(term);
if (hgvs == NULL)
hgvs = hgvsParsePDotSubst(term);
+if (hgvs == NULL)
+ hgvs = hgvsParsePDotRange(term);
+if (hgvs == NULL)
+ hgvs = hgvsParseGDotPos(term);
return hgvs;
}
-struct hgvsVariant *hgvsParsePseudoHgvs(char *db, char *term)
-/* Attempt to parse things that are not strict HGVS, but that people might intend as HGVS: */
+static char *npForGeneSymbol(char *db, char *geneSymbol)
+/* Given a gene symbol, look it up to find its NP_ accession; if not found return NULL. */
{
-struct hgvsVariant *hgvs = NULL;
-regmatch_t substrs[5];
-if (regexMatchSubstrNoCase(term, pseudoHgvsGeneSymbolSubstExp, substrs, ArraySize(substrs)))
- {
- int geneSymbolIx = 1;
- int refIx = 2;
- int posIx = 3;
- int altIx = 4;
- char *geneSymbol = regexSubstringClone(term, substrs[geneSymbolIx]);
- // Try to find an NP_ for geneSymbol, using refGene to make sure it's an NP for this species.
struct sqlConnection *conn = hAllocConn(db);
char query[2048];
+// Use refGene to make sure it's an NP for this species.
sqlSafef(query, sizeof(query), "select l.protAcc from %s l, refGene r "
"where l.name = '%s' and l.mrnaAcc = r.name "
- "and l.protAcc != '' order by l.protAcc"
+ "and l.protAcc != '' order by length(l.protAcc), l.protAcc"
, refLinkTable, geneSymbol);
char *npAcc = sqlQuickString(conn, query);
hFreeConn(&conn);
- if (isNotEmpty(npAcc))
- {
- AllocVar(hgvs);
- hgvs->type = hgvstProtein;
- hgvs->seqAcc = npAcc;
- hgvs->seqGeneSymbol = geneSymbol;
- hgvs->start1 = regexSubstringInt(term, substrs[posIx]);
- // Convert 3-letter abbreviations to single-letter codes like "Ala" -> "A",
- // so strlen(allele) is the number of amino acids.
- char buf[2048];
- regexSubstringCopy(term, substrs[refIx], buf, sizeof(buf));
- hgvs->refAllele = aaAbbrsToLetters(buf);
- regexSubstringCopy(term, substrs[altIx], buf, sizeof(buf));
- hgvs->altAllele = aaAbbrsToLetters(buf);
- int refLen = strlen(hgvs->refAllele);
- hgvs->end = hgvs->start1 - 1 + refLen;
- }
- // Note: this doesn't support non-coding gene symbol substs (which should have nt alleles)
+return npAcc;
}
-return hgvs;
+
+static char *nmForGeneSymbol(char *db, char *geneSymbol)
+/* Given a gene symbol, look it up to find its NM_ accession; if not found return NULL. */
+{
+struct sqlConnection *conn = hAllocConn(db);
+char query[2048];
+sqlSafef(query, sizeof(query), "select name from refGene where name2 = '%s' "
+ "order by length(name), name", geneSymbol);
+char *nmAcc = sqlQuickString(conn, query);
+hFreeConn(&conn);
+return nmAcc;
}
-static char *getCdnaSeq(char *db, char *acc, char **retFoundAcc)
-/* Return cdna sequence for acc, or NULL if not found. If retFoundAcc is not NULL,
+static char *getProteinSeq(char *db, char *acc, char **retFoundAcc)
+/* Return amino acid sequence for acc, or NULL if not found. If retFoundAcc is not NULL,
* set it to the accession for which we grabbed seq (might have a .version chopped off). */
{
char *seq = NULL;
char *foundAcc = NULL;
if (startsWith("LRG_", acc))
{
- if (hTableExists(db, "lrgCdna"))
+ if (hTableExists(db, "lrgPep"))
{
char query[2048];
- sqlSafef(query, sizeof(query), "select seq from lrgCdna where name = '%s'", acc);
+ sqlSafef(query, sizeof(query), "select seq from lrgPep where name = '%s'", acc);
struct sqlConnection *conn = hAllocConn(db);
seq = sqlQuickString(conn, query);
hFreeConn(&conn);
if (seq)
foundAcc = cloneString(acc);
}
}
else
{
char *trimmedAcc = cloneFirstWordByDelimiter(acc, '.');
- struct dnaSeq *gbSeq = hGenBankGetMrna(db, trimmedAcc, gbSeqTable);
- if (gbSeq)
+ aaSeq *aaSeq = hGenBankGetPep(db, trimmedAcc, gbSeqTable);
+ if (aaSeq)
{
- seq = gbSeq->dna;
+ seq = aaSeq->dna;
foundAcc = trimmedAcc;
}
}
if (retFoundAcc)
*retFoundAcc = foundAcc;
return seq;
}
-static char *getProteinSeq(char *db, char *acc, char **retFoundAcc)
-/* Return amino acid sequence for acc, or NULL if not found. If retFoundAcc is not NULL,
+static char refBaseForNp(char *db, char *npAcc, int pos)
+// Get the amino acid base in NP_'s sequence at 1-based offset pos.
+{
+char *seq = getProteinSeq(db, npAcc, NULL);
+char base = seq[pos-1];
+freeMem(seq);
+return base;
+}
+
+struct hgvsVariant *hgvsParsePseudoHgvs(char *db, char *term)
+/* Attempt to parse things that are not strict HGVS, but that people might intend as HGVS: */
+// Note: this doesn't support non-coding gene symbol terms (which should have nt alleles)
+{
+struct hgvsVariant *hgvs = NULL;
+regmatch_t substrs[11];
+int geneSymbolIx = 1;
+boolean isSubst = regexMatchSubstr(term, pseudoHgvsGeneSymbolProtSubstExp,
+ substrs, ArraySize(substrs));
+if (isSubst ||
+ regexMatchSubstr(term, pseudoHgvsGeneSymbolProtRangeExp, substrs, ArraySize(substrs)))
+ {
+ int len = substrs[geneSymbolIx].rm_eo - substrs[geneSymbolIx].rm_so;
+ char geneSymbol[len+1];
+ safencpy(geneSymbol, sizeof(geneSymbol), term, len);
+ char *npAcc = npForGeneSymbol(db, geneSymbol);
+ if (isNotEmpty(npAcc))
+ {
+ // Make it a real HGVS term with the NP and pass that on to the usual parser.
+ int descStartIx = 2;
+ char *description = term + substrs[descStartIx].rm_so;
+ struct dyString *npTerm = dyStringCreate("%s(%s):p.%s",
+ npAcc, geneSymbol, description);
+ hgvs = hgvsParseTerm(npTerm->string);
+ dyStringFree(&npTerm);
+ }
+ }
+else if (regexMatchSubstr(term, pseudoHgvsGeneSymbolProtPosExp, substrs, ArraySize(substrs)))
+ {
+ int len = substrs[geneSymbolIx].rm_eo - substrs[geneSymbolIx].rm_so;
+ char geneSymbol[len+1];
+ safencpy(geneSymbol, sizeof(geneSymbol), term, len);
+ char *npAcc = npForGeneSymbol(db, geneSymbol);
+ if (isNotEmpty(npAcc))
+ {
+ // Only position was provided, no change. Look up ref base and make a synonymous subst
+ // so it's parseable HGVS.
+ int posIx = 2;
+ int pos = regexSubstringInt(term, substrs[posIx]);
+ char refBase = refBaseForNp(db, npAcc, pos);
+ struct dyString *npTerm = dyStringCreate("%s(%s):p.%c%d=",
+ npAcc, geneSymbol, refBase, pos);
+ hgvs = hgvsParseTerm(npTerm->string);
+ dyStringFree(&npTerm);
+ }
+ }
+else if (regexMatchSubstr(term, pseudoHgvsGeneSympolCDotPosExp, substrs, ArraySize(substrs)))
+ {
+ int len = substrs[geneSymbolIx].rm_eo - substrs[geneSymbolIx].rm_so;
+ char geneSymbol[len+1];
+ safencpy(geneSymbol, sizeof(geneSymbol), term, len);
+ char *nmAcc = nmForGeneSymbol(db, geneSymbol);
+ if (isNotEmpty(nmAcc))
+ {
+ // Make it a real HGVS term with the NM and pass that on to the usual parser.
+ int descStartIx = regexSubstrMatched(substrs[2]) ? 2 : 3;
+ char *description = term + substrs[descStartIx].rm_so;
+ struct dyString *nmTerm = dyStringCreate("%s(%s):c.%s",
+ nmAcc, geneSymbol, description);
+ hgvs = hgvsParseTerm(nmTerm->string);
+ dyStringFree(&nmTerm);
+ }
+ }
+return hgvs;
+}
+
+static char refBaseFromNucSubst(char *change)
+/* If sequence change is a nucleotide substitution, return the reference base, else null char. */
+{
+if (regexMatchNoCase(change, "^([ACGTU])>"))
+ return toupper(change[0]);
+return '\0';
+}
+
+static boolean hgvsValidateGenomic(char *db, struct hgvsVariant *hgvs,
+ char **retFoundAcc, int *retFoundVersion,
+ char **retDiffRefAllele)
+/* Return TRUE if hgvs->seqAcc can be mapped to a chrom in db and coords are legal for the chrom.
+ * If retFoundAcc is non-NULL, set it to the versionless seqAcc if chrom is found, else NULL.
+ * If retFoundVersion is non-NULL, set it to seqAcc's version if chrom is found, else NULL.
+ * If retDiffRefAllele is non-NULL and hgvs specifies a reference allele then compare it to
+ * the genomic sequence at that location; set *retDiffRefAllele to NULL if they match, or to
+ * genomic sequence if they differ. */
+{
+boolean coordsOK = FALSE;
+if (retDiffRefAllele)
+ *retDiffRefAllele = NULL;
+if (retFoundAcc)
+ *retFoundAcc = NULL;
+if (retFoundVersion)
+ *retFoundVersion = 0;
+char *chrom = hgOfficialChromName(db, hgvs->seqAcc);
+if (isNotEmpty(chrom))
+ {
+ struct chromInfo *ci = hGetChromInfo(db, chrom);
+ if ((hgvs->start1 >= 1 && hgvs->start1 <= ci->size) &&
+ (hgvs->end >=1 && hgvs->end <= ci->size))
+ {
+ coordsOK = TRUE;
+ if (retDiffRefAllele)
+ {
+ char hgvsBase = refBaseFromNucSubst(hgvs->change);
+ if (hgvsBase != '\0')
+ {
+ struct dnaSeq *refBase = hFetchSeq(ci->fileName, chrom,
+ hgvs->start1-1, hgvs->start1);
+ touppers(refBase->dna);
+ if (refBase->dna[0] != hgvsBase)
+ *retDiffRefAllele = cloneString(refBase->dna);
+ freeMem(refBase);
+ }
+ }
+ }
+ // Since we found hgvs->seqAcc, split it into versionless and version parts.
+ if (retFoundAcc)
+ *retFoundAcc = cloneFirstWordByDelimiter(hgvs->seqAcc, '.');
+ if (retFoundVersion)
+ {
+ char *p = strchr(hgvs->seqAcc, '.');
+ if (p)
+ *retFoundVersion = atoi(p+1);
+ }
+ // Don't free chromInfo, it's cached!
+ }
+return coordsOK;
+}
+
+static char *getCdnaSeq(char *db, char *acc, char **retFoundAcc)
+/* Return cdna sequence for acc, or NULL if not found. If retFoundAcc is not NULL,
* set it to the accession for which we grabbed seq (might have a .version chopped off). */
{
char *seq = NULL;
char *foundAcc = NULL;
if (startsWith("LRG_", acc))
{
- if (hTableExists(db, "lrgPep"))
+ if (hTableExists(db, "lrgCdna"))
{
char query[2048];
- sqlSafef(query, sizeof(query), "select seq from lrgPep where name = '%s'", acc);
+ sqlSafef(query, sizeof(query), "select seq from lrgCdna where name = '%s'", acc);
struct sqlConnection *conn = hAllocConn(db);
seq = sqlQuickString(conn, query);
hFreeConn(&conn);
if (seq)
foundAcc = cloneString(acc);
}
}
else
{
char *trimmedAcc = cloneFirstWordByDelimiter(acc, '.');
- aaSeq *aaSeq = hGenBankGetPep(db, trimmedAcc, gbSeqTable);
- if (aaSeq)
+ struct dnaSeq *gbSeq = hGenBankGetMrna(db, trimmedAcc, gbSeqTable);
+ if (gbSeq)
{
- seq = aaSeq->dna;
+ seq = gbSeq->dna;
foundAcc = trimmedAcc;
}
}
if (retFoundAcc)
*retFoundAcc = foundAcc;
return seq;
}
-static int getCdsStart(char *db, char *acc)
-/* Return the start coordinate for CDS in genbank or LRG acc, or -1 if not found. */
+static boolean getCds(char *db, char *acc, struct genbankCds *retCds)
+/* Get the CDS info for genbank or LRG acc; return FALSE if not found or not applicable. */
{
-int cdsStart = -1;
+boolean gotCds = FALSE;
char query[1024];
if (startsWith("LRG_", acc))
sqlSafef(query, sizeof(query),
"select cds from lrgCds where id = '%s'", acc);
else
sqlSafef(query, sizeof(query),
"SELECT c.name FROM %s as c, %s as g WHERE (g.acc = '%s') AND "
"(g.cds != 0) AND (g.cds = c.id)", cdsTable, gbCdnaInfoTable, acc);
struct sqlConnection *conn = hAllocConn(db);
char cdsBuf[2048];
char *cdsStr = sqlQuickQuery(conn, query, cdsBuf, sizeof(cdsBuf));
hFreeConn(&conn);
if (isNotEmpty(cdsStr))
- {
- struct genbankCds cds;
- if (genbankCdsParse(cdsStr, &cds) && cds.startComplete && cds.start != cds.end)
- cdsStart = cds.start;
- }
-return cdsStart;
+ gotCds = (genbankCdsParse(cdsStr, retCds) &&
+ retCds->startComplete && retCds->start != retCds->end);
+return gotCds;
}
static int getGbVersion(char *db, char *acc)
/* Return the local version that we have for acc. */
{
char query[2048];
sqlSafef(query, sizeof(query), "select version from %s where acc = '%s'", gbSeqTable, acc);
struct sqlConnection *conn = hAllocConn(db);
int version = sqlQuickNum(conn, query);
hFreeConn(&conn);
return version;
}
-boolean hgvsValidate(char *db, struct hgvsVariant *hgvs, char **retFoundAcc, int *retFoundVersion,
+static char refBaseFromProt(char *change)
+/* If change starts with an amino acid 3-letter or 1-letter code then return the 1-letter code,
+ * else null char. */
+{
+regmatch_t substrs[1];
+if (regexMatchSubstr(change, "^" hgvsAminoAcidExp, substrs, ArraySize(substrs)))
+ {
+ char match[256]; // 1- or 3-letter code
+ regexSubstringCopy(change, substrs[0], match, sizeof(match));
+ if (strlen(match) == 1)
+ return toupper(match[0]);
+ else
+ return aaAbbrToLetter(match);
+ }
+return '\0';
+}
+
+static char *cloneStringFromChar(char c)
+/* Return a cloned string from a single character. */
+{
+char str[2];
+str[0] = c;
+str[1] = '\0';
+return cloneString(str);
+}
+
+static void checkRefAllele(struct hgvsVariant *hgvs, int start1, char *accSeq,
char **retDiffRefAllele)
+/* If hgvs change includes a reference allele, and if accSeq at start1 does not match,
+ * then set retDiffRefAllele to the accSeq version. retDiffRefAllele must be non-NULL. */
+{
+char hgvsRefBase = (hgvs->type == hgvstProtein) ? refBaseFromProt(hgvs->change) :
+ refBaseFromNucSubst(hgvs->change);
+if (hgvsRefBase != '\0')
+ {
+ char seqRefBase = toupper(accSeq[start1-1]);
+ if (seqRefBase != hgvsRefBase)
+ *retDiffRefAllele = cloneStringFromChar(seqRefBase);
+ }
+if (hgvs->type == hgvstProtein && *retDiffRefAllele == NULL)
+ {
+ // Protein ranges have a second ref allele base to check
+ char *p = strchr(hgvs->change, '_');
+ if (p != NULL)
+ {
+ hgvsRefBase = refBaseFromProt(p+1);
+ if (hgvsRefBase != '\0')
+ {
+ int end1 = atoi(skipToNumeric(p+1));
+ char seqRefBase = toupper(accSeq[end1-1]);
+ if (seqRefBase != hgvsRefBase)
+ *retDiffRefAllele = cloneStringFromChar(seqRefBase);
+ }
+ }
+ }
+}
+
+static boolean hgvsValidateGene(char *db, struct hgvsVariant *hgvs,
+ char **retFoundAcc, int *retFoundVersion, char **retDiffRefAllele)
/* Return TRUE if hgvs coords are within the bounds of the sequence for hgvs->seqAcc.
+ * Note: Coding terms may contain coords outside the bounds (upstream, intron, downstream) so
+ * those can't be checked without mapping the term to the genome.
* If retFoundAcc is not NULL, set it to our local accession (which may be missing the .version
* of hgvs->seqAcc) or NULL if we can't find any match.
* If retFoundVersion is not NULL and hgvs->seqAcc has a version number (e.g. NM_005957.4),
* set retFoundVersion to our version from latest GenBank, otherwise 0 (no version for LRG).
* If coords are OK and retDiffRefAllele is not NULL: if our sequence at the coords
* matches hgvs->refAllele then set it to NULL; if mismatch then set it to our sequence. */
{
+char *foundAcc = NULL;
+int foundVersion = 0;
boolean coordsOK = FALSE;
char *acc = hgvs->seqAcc;
-char *foundAcc = NULL;
char *accSeq = NULL;
-int cdsOffset = 0;
-int refLen = strlen(hgvs->refAllele);
if (hgvs->type == hgvstProtein)
accSeq = getProteinSeq(db, acc, &foundAcc);
else
accSeq = getCdnaSeq(db, acc, &foundAcc);
-if (retFoundVersion)
- {
- *retFoundVersion = 0;
if (foundAcc && ! startsWith("LRG_", foundAcc))
- *retFoundVersion = getGbVersion(db, foundAcc);
- }
-if (hgvs->type == hgvstCoding && foundAcc)
- cdsOffset = getCdsStart(db, foundAcc);
-if (accSeq && cdsOffset >= 0 &&
- (cdsOffset + hgvs->start1 - 1 + refLen) <= strlen(accSeq))
+ foundVersion = getGbVersion(db, foundAcc);
+if (foundAcc)
{
- coordsOK = TRUE;
- }
+ int seqLen = strlen(accSeq);
+ if (hgvs->type == hgvstCoding)
+ {
+ // Coding term coords can extend beyond the bounds of the transcript so
+ // we can't check them without mapping to the genome. However, if the coords
+ // are in bounds and a reference allele is provided, we can check that.
+ struct genbankCds cds;
+ coordsOK = getCds(db, foundAcc, &cds);
if (coordsOK && retDiffRefAllele)
{
- char *ourSeq = cloneStringZ(accSeq + cdsOffset + hgvs->start1 - 1, refLen);
- toUpperN(ourSeq, refLen);
- if (sameString(hgvs->refAllele, ourSeq))
- *retDiffRefAllele = NULL;
+ int start = hgvs->start1 + (hgvs->startIsUtr3 ? cds.end : cds.start);
+ if (hgvs->startOffset == 0 && start >= 1 && start <= seqLen)
+ checkRefAllele(hgvs, start, accSeq, retDiffRefAllele);
+ }
+ }
else
- *retDiffRefAllele = ourSeq;
+ {
+ if (hgvs->start1 >= 1 && hgvs->start1 <= seqLen &&
+ hgvs->end >= 1 && hgvs->end <= seqLen)
+ {
+ coordsOK = TRUE;
+ if (retDiffRefAllele)
+ checkRefAllele(hgvs, hgvs->start1, accSeq, retDiffRefAllele);
+ }
+ }
}
if (retFoundAcc)
*retFoundAcc = foundAcc;
+if (retFoundVersion)
+ *retFoundVersion = foundVersion;
return coordsOK;
}
-static struct psl *pslFromHgvs(struct hgvsVariant *hgvs, int accSize, int cdsStart)
-/* Allocate and return a PSL modeling the variant in transcript named acc. */
+boolean hgvsValidate(char *db, struct hgvsVariant *hgvs,
+ char **retFoundAcc, int *retFoundVersion, char **retDiffRefAllele)
+/* Return TRUE if hgvs coords are within the bounds of the sequence for hgvs->seqAcc.
+ * Note: Coding terms may contain coords outside the bounds (upstream, intron, downstream) so
+ * those can't be checked without mapping the term to the genome; this returns TRUE if seq is found.
+ * If retFoundAcc is not NULL, set it to our local accession (which may be missing the .version
+ * of hgvs->seqAcc) or NULL if we can't find any match.
+ * If retFoundVersion is not NULL and hgvs->seqAcc has a version number (e.g. NM_005957.4),
+ * set retFoundVersion to our version from latest GenBank, otherwise 0 (no version for LRG).
+ * If coords are OK and retDiffRefAllele is not NULL: if our sequence at the coords
+ * matches hgvs->refAllele then set it to NULL; if mismatch then set it to our sequence. */
+{
+if (hgvs->type == hgvstGenomic || hgvs->type == hgvstMito)
+ return hgvsValidateGenomic(db, hgvs, retFoundAcc, retFoundVersion, retDiffRefAllele);
+else
+ return hgvsValidateGene(db, hgvs, retFoundAcc, retFoundVersion, retDiffRefAllele);
+}
+
+static struct bed3 *hgvsMapGDotToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable)
+/* Given an NC_*g. term, if we can map NC_ to chrom then return the region, else NULL. */
+{
+struct bed3 *region = NULL;
+char *chrom = hgOfficialChromName(db, hgvs->seqAcc);
+if (isNotEmpty(chrom))
+ {
+ AllocVar(region);
+ region->chrom = chrom;
+ region->chromStart = hgvs->start1 - 1;
+ region->chromEnd = hgvs->end;
+ }
+if (retPslTable)
+ *retPslTable = NULL;
+return region;
+}
+
+static void hgvsCodingToZeroBasedHalfOpen(struct hgvsVariant *hgvs,
+ int maxCoord, struct genbankCds cds,
+ int *retStart, int *retEnd,
+ int *retUpstreamBases, int *retDownstreamBases)
+/* Convert a coding HGVS's start1 and end into UCSC coords plus upstream and downstream lengths
+ * for when the coding HGVS has coordinates that extend beyond its sequence boundaries.
+ * ret* args must be non-NULL. */
+{
+// If hgvs->start1 is negative, it is effectively 0-based, so subtract 1 only if positive.
+int start = hgvs->start1;
+if (start > 0)
+ start -= 1;
+// If hgvs->end is negative, it is effectively 0-based, so add 1 to get half-open end.
+int end = hgvs->end;
+if (end < 0)
+ end += 1;
+// If the position follows '*' that means it's relative to cdsEnd; otherwise rel to cdsStart
+if (hgvs->startIsUtr3)
+ *retStart = cds.end + start;
+else
+ *retStart = cds.start + start;
+if (hgvs->endIsUtr3)
+ *retEnd = cds.end + end;
+else
+ *retEnd = cds.start + end;
+// Now check for coords that extend beyond the transcript('s alignment to the genome)
+if (*retStart < 0)
+ {
+ // hgvs->start1 is upstream of coding transcript.
+ *retUpstreamBases = -*retStart;
+ *retStart = 0;
+ }
+else if (*retStart > maxCoord)
+ {
+ // Even the start coord is downstream of coding transcript -- make a negative "upstream"
+ // for adjusting start.
+ *retUpstreamBases = -(*retStart - maxCoord + 1);
+ *retStart = maxCoord - 1;
+ }
+else
+ *retUpstreamBases = 0;
+if (*retEnd > maxCoord)
+ {
+ // hgvs->end is downstream of coding transcript.
+ *retDownstreamBases = *retEnd - maxCoord;
+ *retEnd = maxCoord;
+ }
+else if (*retEnd < 0)
+ {
+ // Even the end coord is upstream of coding transcript -- make a negative "downstream"
+ // for adjusting end.
+ *retDownstreamBases = *retEnd;
+ *retEnd = 0;
+ }
+else
+ *retDownstreamBases = 0;
+}
+
+static struct psl *pslFromHgvsNuc(struct hgvsVariant *hgvs, int accSize, int accEnd,
+ struct genbankCds cds,
+ int *retUpstreamBases, int *retDownstreamBases)
+/* Allocate and return a PSL modeling the variant in nucleotide sequence acc.
+ * The PSL target is the sequence and the query is the changed part of the sequence.
+ * accEnd is given in case accSize includes an unaligned poly-A tail.
+ * If hgvs is coding ("c.") then the caller must pass in a valid cds.
+ * In case the start or end position is outside the bounds of the sequence, set retUpstreamBases
+ * or retDownstreamBases to the number of bases beyond the beginning or end of sequence.
+ * NOTE: coding intron offsets are ignored; the PSL contains the exon anchor point
+ * and the caller will have to map that to the genome and then apply the intron offset. */
{
if (hgvs == NULL)
return NULL;
+if (hgvs->type == hgvstProtein)
+ errAbort("pslFromHgvsNuc must be called only on nucleotide HGVSs, not protein.");
struct psl *psl;
AllocVar(psl);
-int refLen = strlen(hgvs->refAllele);
-int altLen = strlen(hgvs->altAllele);
-struct dyString *dy = dyStringCreate("%s%s%s",
- hgvs->refAllele, hgvs->changeSymbol, hgvs->altAllele);
-psl->qName = dyStringCannibalize(&dy);
+psl->tName = cloneFirstWordByDelimiter(hgvs->seqAcc, '.');
+safecpy(psl->strand, sizeof(psl->strand), "+");
+psl->tSize = accSize;
+if (hgvs->type != hgvstCoding)
+ {
+ // Sane 1-based fully closed coords.
+ psl->tStart = hgvs->start1 - 1;
+ psl->tEnd = hgvs->end;
+ }
+else
+ {
+ // Simple or insanely complex CDS-relative coords.
+ hgvsCodingToZeroBasedHalfOpen(hgvs, accEnd, cds, &psl->tStart, &psl->tEnd,
+ retUpstreamBases, retDownstreamBases);
+ }
+int refLen = psl->tEnd - psl->tStart;
+// Just use refLen for alt until we parse the sequence change portion of the term:
+int altLen = refLen;
+psl->qName = cloneString(hgvs->change);
psl->qSize = altLen;
psl->qStart = 0;
psl->qEnd = altLen;
-psl->tName = cloneFirstWordByDelimiter(hgvs->seqAcc, '.');
-psl->tSize = accSize;
-safecpy(psl->strand, sizeof(psl->strand), "+");
-psl->tStart = cdsStart + hgvs->start1 - 1;
-psl->tEnd = psl->tStart + refLen;
psl->blockCount = 1;
AllocArray(psl->blockSizes, psl->blockCount);
AllocArray(psl->qStarts, psl->blockCount);
AllocArray(psl->tStarts, psl->blockCount);
psl->blockSizes[0] = refLen <= altLen ? refLen : altLen;
psl->qStarts[0] = psl->qStart;
psl->tStarts[0] = psl->tStart;
return psl;
}
-static struct psl *hgvsMapCDotToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable)
-/* Return a psl with target coords from db, mapping the variant to the genome.
- * Return NULL if unable to map.
+#define limitToRange(val, min, max) { if (val < min) { val = min; } \
+ if (val > max) { val = max; } }
+
+static struct bed3 *hgvsMapNucToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable)
+/* Return a bed3 with the variant's span on the genome, or NULL if unable to map.
* If successful and retPslTable is not NULL, set it to the name of the PSL table used. */
{
-struct psl *mappedToGenome = NULL;
+struct bed3 *region = NULL;
char *trackTable = NULL;
char *pslTable = NULL;
char *acc = cloneFirstWordByDelimiter(hgvs->seqAcc, '.');
-int cdsStart = -1;
+if (hgvs->type == hgvstGenomic)
+ return hgvsMapGDotToGenome(db, hgvs, retPslTable);
if (startsWith("NM_", acc))
{
// TODO: replace these with NCBI's alignments
trackTable = "refGene";
pslTable = "refSeqAli";
}
else if (startsWith("LRG_", acc))
{
trackTable = "lrgTranscriptAli";
pslTable = "lrgTranscriptAli";
}
-cdsStart = getCdsStart(db, acc);
-if (trackTable && pslTable && cdsStart >= 0 &&
+struct genbankCds cds;
+boolean gotCds = (hgvs->type == hgvstCoding) ? getCds(db, acc, &cds) : FALSE;
+if (trackTable && pslTable && (hgvs->type != hgvstCoding || gotCds) &&
hTableExists(db, trackTable) && hTableExists(db, pslTable))
{
struct dyString *dyQuery = sqlDyStringCreate("select * from %s where qName = '%s'",
pslTable, acc);
struct sqlConnection *conn = hAllocConn(db);
struct sqlResult *sr = sqlGetResult(conn, dyQuery->string);
int bin = hIsBinned(db, pslTable);
char **row;
if (sr && (row = sqlNextRow(sr)))
{
struct psl *txAli = pslLoad(row+bin);
- struct psl *variantPsl = pslFromHgvs(hgvs, txAli->qSize, cdsStart);
- mappedToGenome = pslTransMap(pslTransMapNoOpts, variantPsl, txAli);
+ // variantPsl contains the anchor if a non-cdsStart anchor is used because
+ // the actual position might be outside the bounds of the transcript sequence (intron/UTR)
+ int upstream = 0, downstream = 0;
+ struct psl *variantPsl = pslFromHgvsNuc(hgvs, txAli->qSize, txAli->qEnd, cds,
+ &upstream, &downstream);
+ struct psl *mappedToGenome = pslTransMap(pslTransMapNoOpts, variantPsl, txAli);
+ if (mappedToGenome)
+ {
+ AllocVar(region);
+ region->chrom = cloneString(mappedToGenome->tName);
+ region->chromStart = mappedToGenome->tStart;
+ region->chromEnd = mappedToGenome->tEnd;
+ // If the start and/or end is in an intron, add the intron offset now.
+ boolean revStrand = (mappedToGenome->strand[0] == '-');
+ if (hgvs->startOffset != 0)
+ {
+ if (revStrand)
+ region->chromEnd -= hgvs->startOffset;
+ else
+ region->chromStart += hgvs->startOffset;
+ }
+ if (hgvs->endOffset != 0)
+ {
+ if (revStrand)
+ region->chromStart -= hgvs->endOffset;
+ else
+ region->chromEnd += hgvs->endOffset;
+ }
+ // Apply extra up/downstream offsets (usually 0)
+ if (revStrand)
+ {
+ region->chromStart -= downstream;
+ region->chromEnd += upstream;
+ }
+ else
+ {
+ region->chromStart -= upstream;
+ region->chromEnd += downstream;
+ }
+ limitToRange(region->chromStart, 0, mappedToGenome->tSize)
+ limitToRange(region->chromEnd, 0, mappedToGenome->tSize)
+ freez(&mappedToGenome);
+ }
}
sqlFreeResult(&sr);
hFreeConn(&conn);
}
-if (mappedToGenome && retPslTable)
+if (region && retPslTable)
*retPslTable = cloneString(pslTable);
-return mappedToGenome;
-}
-
-static char *aaToArbitraryCodons(char *aaStr)
-/* Translate amino acid string into codon string where we just take the first codon we
- * find for each amino. */
-{
-int aaLen = strlen(aaStr);
-char *codonStr = needMem(aaLen*3 + 1);
-int i;
-for (i = 0; i < aaLen; i++)
- aaToArbitraryCodon(aaStr[i], &codonStr[i*3]);
-return codonStr;
+return region;
}
-static struct psl *hgvsMapPDotToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable)
-/* Return a psl with target coords from db, mapping the variant to the genome.
- * Return NULL if unable to map.
+static struct bed3 *hgvsMapPDotToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable)
+/* Return a bed3 with the variant's span on the genome, or NULL if unable to map.
* If successful and retPslTable is not NULL, set it to the name of the PSL table used. */
{
-struct psl *mappedToGenome = NULL;
+struct bed3 *region = NULL;
char *acc = cloneFirstWordByDelimiter(hgvs->seqAcc, '.');
if (startsWith("NP_", acc))
{
// Translate the NP_*:p. to NM_*:c. and map NM_*:c. to the genome.
struct sqlConnection *conn = hAllocConn(db);
char query[2048];
sqlSafef(query, sizeof(query), "select mrnaAcc from %s l, refGene r "
"where l.protAcc = '%s' and r.name = l.mrnaAcc", refLinkTable, acc);
char *nmAcc = sqlQuickString(conn, query);
hFreeConn(&conn);
if (nmAcc)
{
struct hgvsVariant cDot;
zeroBytes(&cDot, sizeof(cDot));
cDot.type = hgvstCoding;
cDot.seqAcc = nmAcc;
- cDot.changeSymbol = ">";
- // These aren't necessarily right since AA doesn't have as much info as codon.
- // We could get the refAllele right using the reference sequence -- but altAllele
- // could still be ambiguous. In the meantime we're not really using the alleles
- // for anything besides length so far...
- cDot.refAllele = aaToArbitraryCodons(hgvs->refAllele);
- cDot.altAllele = aaToArbitraryCodons(hgvs->altAllele);
cDot.start1 = ((hgvs->start1 - 1) * 3) + 1;
cDot.end = ((hgvs->end - 1) * 3) + 3;
- mappedToGenome = hgvsMapCDotToGenome(db, &cDot, retPslTable);
+ cDot.change = hgvs->change;
+ region = hgvsMapNucToGenome(db, &cDot, retPslTable);
}
}
-return mappedToGenome;
+return region;
}
-struct psl *hgvsMapToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable)
-/* Return a psl with target coords from db, mapping the variant to the genome.
- * Return NULL if unable to map.
+struct bed3 *hgvsMapToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable)
+/* Return a bed3 with the variant's span on the genome, or NULL if unable to map.
* If successful and retPslTable is not NULL, set it to the name of the PSL table used. */
{
-if (hgvs == NULL || !hgvsValidate(db, hgvs, NULL, NULL, NULL))
+if (hgvs == NULL)
return NULL;
-struct psl *mappedToGenome = NULL;
-if (hgvs->type == hgvstCoding)
- mappedToGenome = hgvsMapCDotToGenome(db, hgvs, retPslTable);
-else if (hgvs->type == hgvstProtein)
- mappedToGenome = hgvsMapPDotToGenome(db, hgvs, retPslTable);
-return mappedToGenome;
+struct bed3 *region = NULL;
+if (hgvs->type == hgvstProtein)
+ region = hgvsMapPDotToGenome(db, hgvs, retPslTable);
+else
+ region = hgvsMapNucToGenome(db, hgvs, retPslTable);
+return region;
}