b5b259b8314a0253ed8c26811bb8f9167f10829d
angie
Mon Aug 28 14:11:56 2017 -0700
hgvs{C,N}FromVpTx: when variant spans exon/intron boundary, use genomic ref instead of transcript ref so that intron bases aren't omitted. Thanks ChrisL for example in #19968-note17. refs #19968
diff --git src/hg/lib/hgHgvs.c src/hg/lib/hgHgvs.c
index 2bed75a..f2f3b2d 100644
--- src/hg/lib/hgHgvs.c
+++ src/hg/lib/hgHgvs.c
@@ -1,2647 +1,2658 @@
/* hgHgvs.c - Parse the Human Genome Variation Society (HGVS) nomenclature for variants. */
/* See http://varnomen.hgvs.org/ and https://github.com/mutalyzer/mutalyzer/ */
/* Copyright (C) 2016 The Regents of the University of California
* See README in this or parent directory for licensing information. */
#include "common.h"
#include "hgHgvs.h"
#include "chromInfo.h"
#include "genbank.h"
#include "hdb.h"
#include "indelShift.h"
#include "pslTransMap.h"
#include "regexHelper.h"
#include "trackHub.h"
void hgvsVariantFree(struct hgvsVariant **pHgvs)
// Free *pHgvs and its contents, and set *pHgvs to NULL.
{
if (pHgvs && *pHgvs)
{
struct hgvsVariant *hgvs = *pHgvs;
freez(&hgvs->seqAcc);
freez(&hgvs->seqGeneSymbol);
freez(&hgvs->changes);
freez(pHgvs);
}
}
// Regular expressions for HGVS-recognized sequence accessions: LRG or versioned RefSeq:
#define lrgTranscriptExp "(LRG_[0-9]+t[0-9+])"
#define lrgRegionExp "(LRG_[0-9+])"
// NC = RefSeq reference assembly chromosome
// NG = RefSeq incomplete genomic region (e.g. gene locus)
// NM = RefSeq curated mRNA
// NP = RefSeq curated protein
// NR = RefSeq curated (non-coding) RNA
#define geneSymbolExp "([A-Za-z0-9./_-]+)"
#define optionalGeneSymbolExp "(\\(" geneSymbolExp "\\))?"
#define versionedAccPrefixExp(p) "(" p "_[0-9]+(\\.[0-9]+)?)" optionalGeneSymbolExp
// ........................ accession and optional dot version
// ........... optional dot version
// ...... optional gene symbol in ()s
// .... optional gene symbol
#define versionedRefSeqNCExp versionedAccPrefixExp("[NX]C")
#define versionedRefSeqNGExp versionedAccPrefixExp("[NX]G")
#define versionedRefSeqNMExp versionedAccPrefixExp("[NX]M")
#define versionedRefSeqNPExp versionedAccPrefixExp("[NX]P")
#define versionedRefSeqNMRExp versionedAccPrefixExp("[NX][MR]")
// Nucleotide position regexes
// (c. = CDS, g. = genomic, m. = mitochondrial, n.= non-coding RNA, r. = RNA)
#define posIntExp "([0-9]+)"
#define hgvsGenoPosExp posIntExp "(_" posIntExp ")?"
// ...... 1-based start position
// ............. optional range separator and end position
// ...... 1-based end position
// n. terms can have exonic anchor base and intron offset for both start and end:
#define offsetExp "([-+])"
// c. terms may also have a UTR indicator before the anchor base (- for UTR5, * for UTR3)
#define anchorExp "([-*])?"
#define complexNumExp anchorExp posIntExp "(" offsetExp posIntExp ")?"
#define hgvsCdsPosExp complexNumExp "(_" complexNumExp ")?"
// ... optional UTR anchor "-" or "*"
// ... mandatory 1-based start anchor base offset
// ....... optional offset separator and offset
// ... intron offset separator
// ... intron offset number
// ............... optional range separator and complex end
// ... optional UTR anchor "-" or "*"
// ... 1-based end anchor base offset
// ....... optional offset separator and offset
// ... intron offset separator
// ... intron offset number
// It's pretty common for users to omit the '.' so if it's missing but the rest of the regex fits,
// roll with it.
#define hgvsCDotPosExp "c\\.?" hgvsCdsPosExp
#define hgvsGMDotPosExp "([gm])\\.?" hgvsGenoPosExp
#define hgvsNDotPosExp "n\\.?" hgvsCdsPosExp
// Not supporting RDot at this point because r. terms may use either n. or c. numbering!
// #define hgvsRDotPosExp "r\\.?" hgvsCdsPosExp
// Protein substitution regex
#define aa3Exp "Ala|Arg|Asn|Asp|Cys|Gln|Glu|Gly|His|Ile|Leu|Lys|Met|Phe|Pro|Ser|Thr|Trp|Tyr|Val|Ter"
#define hgvsAminoAcidExp "[ARNDCQEGHILKMFPSTWYVX*]|" aa3Exp
#define hgvsAminoAcidSubstExp "(" hgvsAminoAcidExp ")" posIntExp "(" hgvsAminoAcidExp "|=)"
#define hgvsPDotSubstExp "p\\.\\(?" hgvsAminoAcidSubstExp "\\)?"
// ... // original sequence
// ...... // 1-based position
// ... // replacement sequence
// Protein range (or just single pos) regex
#define hgvsAaRangeExp "(" hgvsAminoAcidExp ")" posIntExp "(_(" hgvsAminoAcidExp ")" posIntExp ")?(.*)"
#define hgvsPDotRangeExp "p\\.\\(?" hgvsAaRangeExp "\\)?"
// original start AA ...
// 1-based start position ...
// optional range sep and AA+pos .....................................
// original end AA ...
// 1-based end position ...
// change description ...
// Complete HGVS term regexes combining sequence identifiers and change operations
#define hgvsFullRegex(seq, op) "^" seq "[ :]+" op
#define hgvsRefSeqNCGDotPosExp hgvsFullRegex(versionedRefSeqNCExp, hgvsGMDotPosExp)
#define hgvsRefSeqNCGDotExp hgvsRefSeqNCGDotPosExp "(.*)"
// substring numbering:
// 0..................................... whole matching string
// 1................. accession and optional dot version
// 2........ optional dot version
// 3...... (n/a) optional gene symbol in ()s
// 4.... (n/a) optional gene symbol
// 5. g or m
// 6.. 1-based start position
// 7... optional range separator and end position
// 8.. 1-based end position
// 9... change description
#define hgvsLrgCDotPosExp hgvsFullRegex(lrgTranscriptExp, hgvsCDotPosExp)
#define hgvsLrgCDotExp hgvsLrgCDotPosExp "(.*)"
// substring numbering:
// 0..................................................... whole matching string
// 1................... LRG transcript
// 2... optional UTR anchor "-" or "*"
// 3... mandatory first number
// 4....... optional offset separator and offset
// 5... offset separator
// 6... offset number
// 7............... optional range sep and complex num
// 8... optional UTR anchor "-" or "*"
// 9... first number
// 10....... optional offset separator and offset
// 11... offset separator
// 12... offset number
// 13........ sequence change
#define hgvsRefSeqNMCDotPosExp hgvsFullRegex(versionedRefSeqNMExp, hgvsCDotPosExp)
#define hgvsRefSeqNMCDotExp hgvsRefSeqNMCDotPosExp "(.*)"
// substring numbering:
// 0............................................................... whole matching string
// 1............... acc & optional dot version
// 2........ optional dot version
// 3..... optional gene sym in ()s
// 4... optional gene symbol
// 5... optional UTR anchor
// 6... mandatory first number
// 7....... optional offset
// 8... offset separator
// 9... offset number
// 10............... optional range
// 11... optional UTR anchor
// 12... first number
// 13....... optional offset
// 14... offset separator
// 15... offset number
// 16........ sequence change
#define hgvsLrgNDotExp hgvsFullRegex(lrgTranscriptExp, hgvsNDotPosExp) "(.*)"
// substring numbering:
// 0..................................................... whole matching string
// 1................... LRG transcript
// 2... n/a 4 n.: UTR anchor "-" or "*"
// 3... mandatory first number
// 4....... optional offset separator and offset
// 5... offset separator
// 6... offset number
// 7............... optional range sep and complex num
// 8... n/a 4 n.: UTR anchor "-" or "*"
// 9... first number
// 10....... optional offset separator and offset
// 11... offset separator
// 12... offset number
// 13........ sequence change
#define hgvsRefSeqNMRNDotExp hgvsFullRegex(versionedRefSeqNMRExp, hgvsNDotPosExp) "(.*)"
// substring numbering:
// 0............................................................... whole matching string
// 1............... acc & optional dot version
// 2........ optional dot version
// 3..... optional gene sym in ()s
// 4... optional gene symbol
// 5... n/a 4 n.: UTR anchor
// 6... mandatory first number
// 7....... optional offset
// 8... offset separator
// 9... offset number
// 10............... optional range
// 11... n/a 4 n.: UTR anchor
// 12... first number
// 13....... optional offset
// 14... offset separator
// 15... offset number
// 16........ sequence change
#define hgvsRefSeqNPPDotSubstExp hgvsFullRegex(versionedRefSeqNPExp, hgvsPDotSubstExp)
// substring numbering:
// 0..................................................... whole matching string
// 1................ accession and optional dot version
// 2..... optional dot version
// 3..... optional gene symbol in ()s
// 4... optional gene symbol
// 5..... original sequence
// 6...... 1-based position
// 7...... replacement sequence
#define hgvsRefSeqNPPDotRangeExp hgvsFullRegex(versionedRefSeqNPExp, hgvsPDotRangeExp)
// substring numbering:
// 0..................................................... whole matching string
// 1................ accession and optional dot version
// 2..... optional dot version
// 3..... optional gene symbol in ()s
// 4... optional gene symbol
// 5... original start AA
// 6... 1-based start position
// 7................. optional range sep and AA+pos
// 8... original end AA
// 9... 1-based end position
// 10..... change description
// Pseudo-HGVS in common usage
// g. with "chr" ID:
#define pseudoHgvsChrGDotExp hgvsFullRegex("(chr[0-9A-Za-z_]+)", hgvsGMDotPosExp) "(.*)"
// substring numbering:
// 0..................................... whole matching string
// 1........... chr...
// 2. g or m
// 3...... 1-based start position
// 4....... optional range separator and end position
// 5..... 1-based end position
// 6.... change description
// Sometimes users give an NM_ accession, but a protein change.
#define maybePDot "[ :]+p?\\.?\\(?"
#define pseudoHgvsNMPDotSubstExp "^" versionedRefSeqNMExp maybePDot hgvsAminoAcidSubstExp "\\)?"
// substring numbering:
// 0..................................................... whole matching string
// 1............... acc & optional dot version
// 2........ optional dot version
// 3..... optional gene sym in ()s
// 4... optional gene symbol
// 5..... original sequence
// 6...... 1-based position
// 7...... replacement sequence
#define pseudoHgvsNMPDotRangeExp "^" versionedRefSeqNMExp maybePDot hgvsAaRangeExp "\\)?"
// substring numbering:
// 0..................................................... whole matching string
// 1............... acc & optional dot version
// 2........ optional dot version
// 3..... optional gene sym in ()s
// 4... optional gene symbol
// 5... original start AA
// 6... 1-based start position
// 7.......... optional range sep and AA+pos
// 8... original end AA
// 9... 1-based end position
// 10.... change description
// Common: gene symbol followed by space and/or punctuation followed by protein change
#define pseudoHgvsGeneSymbolProtSubstExp "^" geneSymbolExp maybePDot hgvsAminoAcidSubstExp "\\)?"
// 0..................................................... whole matching string
// 1................... gene symbol
// 2..... original sequence
// 3...... 1-based position
// 4...... replacement sequence
#define pseudoHgvsGeneSymbolProtRangeExp "^" geneSymbolExp maybePDot hgvsAaRangeExp "\\)?"
// 0..................................................... whole matching string
// 1................... gene symbol
// 2... original start AA
// 3... 1-based start position
// 4................ optional range sep and AA+pos
// 5... original end AA
// 6... 1-based end position
// 7..... change description
// As above but omitting the protein change
#define pseudoHgvsGeneSymbolProtPosExp "^" geneSymbolExp maybePDot posIntExp "\\)?"
// 0.......................... whole matching string
// 1................... gene symbol
// 2..... 1-based position
// Gene symbol, maybe punctuation, and a clear "c." position (and possibly change)
#define pseudoHgvsGeneSympolCDotPosExp "^" geneSymbolExp "[: ]+" hgvsCDotPosExp
// 0..................................................... whole matching string
// 1................... gene symbol
// 2..... optional beginning of position exp
// 3..... beginning of position exp
static struct hgvsVariant *hgvsParseGDotPos(char *term)
/* If term is parseable as an HGVS NC_...g. term, return the parsed representation, else NULL. */
{
struct hgvsVariant *hgvs = NULL;
regmatch_t substrs[17];
if (regexMatchSubstr(term, hgvsRefSeqNCGDotExp, substrs, ArraySize(substrs)))
{
int accIx = 1;
int startPosIx = 6;
int endPosIx = 8;
int changeIx = 9;
AllocVar(hgvs);
// HGVS recommendation May 2017: replace m. with g. since mitochondrion is genomic too
hgvs->type = hgvstGenomic;
hgvs->seqAcc = regexSubstringClone(term, substrs[accIx]);
hgvs->start1 = regexSubstringInt(term, substrs[startPosIx]);
if (regexSubstrMatched(substrs[endPosIx]))
hgvs->end = regexSubstringInt(term, substrs[endPosIx]);
else
hgvs->end = hgvs->start1;
hgvs->changes = regexSubstringClone(term, substrs[changeIx]);
}
return hgvs;
}
static void extractComplexNum(char *term, regmatch_t *substrs, int substrOffset,
boolean *retIsUtr3, int *retPos, int *retOffset)
/* Extract matches from substrs starting at substrOffset to parse a complex number
* description into anchor type, 1-based position and optional offset. */
{
int anchorIx = substrOffset;
int posIx = substrOffset + 1;
int offsetOpIx = substrOffset + 3;
int offsetIx = substrOffset + 4;
// Determine what startPos is relative to, based on optional anchor and optional offset
char anchor[16]; // string length 0 or 1
regexSubstringCopy(term, substrs[anchorIx], anchor, sizeof(anchor));
char offsetOp[16]; // string length 0 or 1
regexSubstringCopy(term, substrs[offsetOpIx], offsetOp, sizeof(offsetOp));
*retIsUtr3 = (anchor[0] == '*');
*retPos = regexSubstringInt(term, substrs[posIx]);
// Is pos negative?
if (anchor[0] == '-')
*retPos = -*retPos;
// Grab offset, if there is one.
if (isNotEmpty(offsetOp))
{
*retOffset = regexSubstringInt(term, substrs[offsetIx]);
if (offsetOp[0] == '-')
*retOffset = -*retOffset;
}
}
static struct hgvsVariant *hgvsParseCNDotPos(char *term)
/* If term is parseable as an HGVS CDS term, return the parsed representation, otherwise NULL. */
{
struct hgvsVariant *hgvs = NULL;
boolean isNoncoding = FALSE;
boolean matches = FALSE;
int accIx = 1;
int startAnchorIx = 2;
int endAnchorIx = 8;
int endPosIx = 9;
int changeIx = 13;
// The LRG accession regex has only one substring but the RefSeq acc regex has 4, so that
// affects all substring offsets after the accession.
int refSeqExtra = 3;
int geneSymbolIx = -1;
regmatch_t substrs[17];
if (regexMatchSubstr(term, hgvsLrgCDotExp, substrs, ArraySize(substrs)) ||
(isNoncoding = regexMatchSubstr(term, hgvsLrgNDotExp, substrs, ArraySize(substrs))))
{
matches = TRUE;
}
else if (regexMatchSubstr(term, hgvsRefSeqNMCDotExp, substrs, ArraySize(substrs)) ||
(isNoncoding = regexMatchSubstr(term, hgvsRefSeqNMRNDotExp, substrs, ArraySize(substrs))))
{
matches = TRUE;
geneSymbolIx = 4;
startAnchorIx += refSeqExtra;
endAnchorIx += refSeqExtra;
endPosIx += refSeqExtra;
changeIx += refSeqExtra;
}
if (matches)
{
AllocVar(hgvs);
hgvs->type = isNoncoding ? hgvstNoncoding : hgvstCoding;
hgvs->seqAcc = regexSubstringClone(term, substrs[accIx]);
extractComplexNum(term, substrs, startAnchorIx,
&hgvs->startIsUtr3, &hgvs->start1, &hgvs->startOffset);
if (isNoncoding && hgvs->startIsUtr3)
{
warn("hgvsParseCNDotPos: noncoding term '%s' appears to start in UTR3 (*), "
"not applicable for noncoding", term);
hgvs->startIsUtr3 = FALSE;
}
if (geneSymbolIx >= 0 && regexSubstrMatched(substrs[geneSymbolIx]))
hgvs->seqGeneSymbol = regexSubstringClone(term, substrs[geneSymbolIx]);
if (regexSubstrMatched(substrs[endPosIx]))
{
extractComplexNum(term, substrs, endAnchorIx,
&hgvs->endIsUtr3, &hgvs->end, &hgvs->endOffset);
if (isNoncoding && hgvs->endIsUtr3)
{
warn("hgvsParseCNDotPos: noncoding term '%s' appears to end in UTR3 (*), "
"not applicable for noncoding", term);
hgvs->endIsUtr3 = FALSE;
}
}
else
{
hgvs->end = hgvs->start1;
hgvs->endIsUtr3 = hgvs->startIsUtr3;
hgvs->endOffset = hgvs->startOffset;
}
hgvs->changes = regexSubstringClone(term, substrs[changeIx]);
}
return hgvs;
}
static struct hgvsVariant *hgvsParsePDotSubst(char *term)
/* If term is parseable as an HGVS protein substitution, return the parsed representation,
* otherwise NULL. */
{
struct hgvsVariant *hgvs = NULL;
regmatch_t substrs[8];
if (regexMatchSubstr(term, hgvsRefSeqNPPDotSubstExp, substrs, ArraySize(substrs)))
{
int accIx = 1;
int geneSymbolIx = 4;
int refIx = 5;
int posIx = 6;
int altIx = 7;
AllocVar(hgvs);
hgvs->type = hgvstProtein;
hgvs->seqAcc = regexSubstringClone(term, substrs[accIx]);
int changeStart = substrs[refIx].rm_so;
int changeEnd = substrs[altIx].rm_eo;
int len = changeEnd - changeStart;
char change[len+1];
safencpy(change, sizeof(change), term+changeStart, len);
hgvs->changes = cloneString(change);
if (regexSubstrMatched(substrs[geneSymbolIx]))
hgvs->seqGeneSymbol = regexSubstringClone(term, substrs[geneSymbolIx]);
hgvs->start1 = regexSubstringInt(term, substrs[posIx]);
hgvs->end = hgvs->start1;
}
return hgvs;
}
static struct hgvsVariant *hgvsParsePDotRange(char *term)
/* If term is parseable as an HGVS protein range change, return the parsed representation,
* otherwise NULL. */
{
struct hgvsVariant *hgvs = NULL;
regmatch_t substrs[11];
if (regexMatchSubstr(term, hgvsRefSeqNPPDotRangeExp, substrs, ArraySize(substrs)))
{
int accIx = 1;
int geneSymbolIx = 4;
int startRefIx = 5;
int startPosIx = 6;
int endPosIx = 9;
int changeIx = 10;
AllocVar(hgvs);
hgvs->type = hgvstProtein;
hgvs->seqAcc = regexSubstringClone(term, substrs[accIx]);
int changeStart = substrs[startRefIx].rm_so;
int changeEnd = substrs[changeIx].rm_eo;
int len = changeEnd - changeStart;
char change[len+1];
safencpy(change, sizeof(change), term+changeStart, len);
hgvs->changes = cloneString(change);
if (regexSubstrMatched(substrs[geneSymbolIx]))
hgvs->seqGeneSymbol = regexSubstringClone(term, substrs[geneSymbolIx]);
hgvs->start1 = regexSubstringInt(term, substrs[startPosIx]);
if (regexSubstrMatched(substrs[endPosIx]))
hgvs->end = regexSubstringInt(term, substrs[endPosIx]);
else
hgvs->end = hgvs->start1;
}
return hgvs;
}
struct hgvsVariant *hgvsParseTerm(char *term)
/* If term is a parseable form of HGVS, return the parsed representation, otherwise NULL.
* This does not check validity of accessions, coordinates or alleles. */
{
struct hgvsVariant *hgvs = hgvsParseCNDotPos(term);
if (hgvs == NULL)
hgvs = hgvsParsePDotSubst(term);
if (hgvs == NULL)
hgvs = hgvsParsePDotRange(term);
if (hgvs == NULL)
hgvs = hgvsParseGDotPos(term);
return hgvs;
}
static char *npForGeneSymbol(char *db, char *geneSymbol)
/* Given a gene symbol, look up and return its NP_ accession; if not found return NULL. */
{
char query[2048];
if (hDbHasNcbiRefSeq(db))
{
sqlSafef(query, sizeof(query), "select protAcc from ncbiRefSeqLink where name = '%s' "
"and protAcc != 'n/a' and protAcc != '' "
"order by length(protAcc), protAcc",
geneSymbol);
}
else if (hTableExists(db, "refGene"))
{
// Join with refGene to make sure it's an NP for this species.
sqlSafef(query, sizeof(query), "select l.protAcc from %s l, refGene r "
"where l.name = '%s' and l.mrnaAcc = r.name "
"and l.protAcc != '' order by length(l.protAcc), l.protAcc"
, refLinkTable, geneSymbol);
}
else
return NULL;
struct sqlConnection *conn = hAllocConn(db);
char *npAcc = sqlQuickString(conn, query);
hFreeConn(&conn);
return npAcc;
}
static char *nmForGeneSymbol(char *db, char *geneSymbol)
/* Given a gene symbol, look up and return its NM_ accession; if not found return NULL. */
{
if (trackHubDatabase(db))
return NULL;
char query[2048];
char *geneTable = NULL;
if (hDbHasNcbiRefSeq(db))
geneTable = "ncbiRefSeq";
else if (hTableExists(db, "refGene"))
geneTable = "refGene";
if (geneTable == NULL)
return NULL;
sqlSafef(query, sizeof(query), "select name from %s where name2 = '%s' "
"order by length(name), name", geneTable, geneSymbol);
struct sqlConnection *conn = hAllocConn(db);
char *nmAcc = sqlQuickString(conn, query);
hFreeConn(&conn);
return nmAcc;
}
static char *npForNm(char *db, char *nmAcc)
/* Given an NM_ accession, look up and return its NP_ accession; if not found return NULL. */
{
if (trackHubDatabase(db))
return NULL;
char query[2048];
if (hDbHasNcbiRefSeq(db))
{
// ncbiRefSeq tables use versioned NM_ accs, but the user might have passed in a
// versionless NM_, so adjust query accordingly:
if (strchr(nmAcc, '.'))
sqlSafef(query, sizeof(query), "select protAcc from ncbiRefSeqLink where id = '%s'", nmAcc);
else
sqlSafef(query, sizeof(query), "select protAcc from ncbiRefSeqLink where id like '%s.%%'",
nmAcc);
}
else if (hTableExists(db, "refGene"))
{
// Trim .version if present since our genbank tables don't use versioned names.
char *trimmed = cloneFirstWordByDelimiter(nmAcc, '.');
sqlSafef(query, sizeof(query), "select l.protAcc from %s l, refGene r "
"where r.name = '%s' and l.mrnaAcc = r.name "
"and l.protAcc != '' order by length(l.protAcc), l.protAcc",
refLinkTable, trimmed);
}
else return NULL;
struct sqlConnection *conn = hAllocConn(db);
char *npAcc = sqlQuickString(conn, query);
hFreeConn(&conn);
return npAcc;
}
static char *getProteinSeq(char *db, char *acc)
/* Return amino acid sequence for acc, or NULL if not found. */
{
if (trackHubDatabase(db))
return NULL;
char *seq = NULL;
if (startsWith("LRG_", acc))
{
if (hTableExists(db, "lrgPep"))
{
char query[2048];
sqlSafef(query, sizeof(query), "select seq from lrgPep where name = '%s'", acc);
struct sqlConnection *conn = hAllocConn(db);
seq = sqlQuickString(conn, query);
hFreeConn(&conn);
}
}
else
{
if (hDbHasNcbiRefSeq(db))
{
char query[2048];
sqlSafef(query, sizeof(query), "select seq from ncbiRefSeqPepTable "
"where name = '%s'", acc);
struct sqlConnection *conn = hAllocConn(db);
seq = sqlQuickString(conn, query);
hFreeConn(&conn);
}
else
{
aaSeq *aaSeq = hGenBankGetPep(db, acc, NULL);
if (aaSeq)
seq = aaSeq->dna;
}
}
return seq;
}
static char refBaseForNp(char *db, char *npAcc, int pos)
// Get the amino acid base in NP_'s sequence at 1-based offset pos.
{
char *seq = getProteinSeq(db, npAcc);
char base = seq ? seq[pos-1] : '\0';
freeMem(seq);
return base;
}
struct hgvsVariant *hgvsParsePseudoHgvs(char *db, char *term)
/* Attempt to parse things that are not strict HGVS, but that people might intend as HGVS: */
// Note: this doesn't support non-coding gene symbol terms (which should have nt alleles)
{
struct hgvsVariant *hgvs = NULL;
regmatch_t substrs[11];
int geneSymbolIx = 1;
boolean isSubst;
if ((isSubst = regexMatchSubstr(term, pseudoHgvsNMPDotSubstExp,
substrs, ArraySize(substrs))) ||
regexMatchSubstr(term, pseudoHgvsNMPDotRangeExp, substrs, ArraySize(substrs)))
{
// User gave an NM_ accession but a protein change -- swap in the right NP_.
int nmAccIx = 1;
int geneSymbolIx = 4;
int len = substrs[nmAccIx].rm_eo - substrs[nmAccIx].rm_so;
char nmAcc[len+1];
safencpy(nmAcc, sizeof(nmAcc), term, len);
char *npAcc = npForNm(db, nmAcc);
if (isNotEmpty(npAcc))
{
// Make it a real HGVS term with the NP and pass that on to the usual parser.
int descStartIx = 5;
char *description = term + substrs[descStartIx].rm_so;
struct dyString *npTerm;
if (regexSubstrMatched(substrs[geneSymbolIx]))
{
len = substrs[geneSymbolIx].rm_eo - substrs[geneSymbolIx].rm_so;
char geneSymbol[len+1];
safencpy(geneSymbol, sizeof(geneSymbol), term, len);
npTerm = dyStringCreate("%s(%s):p.%s", npAcc, geneSymbol, description);
}
else
npTerm = dyStringCreate("%s:p.%s", npAcc, description);
hgvs = hgvsParseTerm(npTerm->string);
dyStringFree(&npTerm);
freeMem(npAcc);
}
}
else if ((isSubst = regexMatchSubstr(term, pseudoHgvsGeneSymbolProtSubstExp,
substrs, ArraySize(substrs))) ||
regexMatchSubstr(term, pseudoHgvsGeneSymbolProtRangeExp, substrs, ArraySize(substrs)))
{
int len = substrs[geneSymbolIx].rm_eo - substrs[geneSymbolIx].rm_so;
char geneSymbol[len+1];
safencpy(geneSymbol, sizeof(geneSymbol), term, len);
char *npAcc = npForGeneSymbol(db, geneSymbol);
if (isNotEmpty(npAcc))
{
// Make it a real HGVS term with the NP and pass that on to the usual parser.
int descStartIx = 2;
char *description = term + substrs[descStartIx].rm_so;
struct dyString *npTerm = dyStringCreate("%s(%s):p.%s",
npAcc, geneSymbol, description);
hgvs = hgvsParseTerm(npTerm->string);
dyStringFree(&npTerm);
freeMem(npAcc);
}
}
else if (regexMatchSubstr(term, pseudoHgvsGeneSymbolProtPosExp, substrs, ArraySize(substrs)))
{
int len = substrs[geneSymbolIx].rm_eo - substrs[geneSymbolIx].rm_so;
char geneSymbol[len+1];
safencpy(geneSymbol, sizeof(geneSymbol), term, len);
char *npAcc = npForGeneSymbol(db, geneSymbol);
if (isNotEmpty(npAcc))
{
// Only position was provided, no change. Look up ref base and make a synonymous subst
// so it's parseable HGVS.
int posIx = 2;
int pos = regexSubstringInt(term, substrs[posIx]);
char refBase = refBaseForNp(db, npAcc, pos);
struct dyString *npTerm = dyStringCreate("%s(%s):p.%c%d=",
npAcc, geneSymbol, refBase, pos);
hgvs = hgvsParseTerm(npTerm->string);
dyStringFree(&npTerm);
freeMem(npAcc);
}
}
else if (regexMatchSubstr(term, pseudoHgvsGeneSympolCDotPosExp, substrs, ArraySize(substrs)))
{
int len = substrs[geneSymbolIx].rm_eo - substrs[geneSymbolIx].rm_so;
char geneSymbol[len+1];
safencpy(geneSymbol, sizeof(geneSymbol), term, len);
char *nmAcc = nmForGeneSymbol(db, geneSymbol);
if (isNotEmpty(nmAcc))
{
// Make it a real HGVS term with the NM and pass that on to the usual parser.
int descStartIx = regexSubstrMatched(substrs[2]) ? 2 : 3;
char *description = term + substrs[descStartIx].rm_so;
struct dyString *nmTerm = dyStringCreate("%s(%s):c.%s",
nmAcc, geneSymbol, description);
hgvs = hgvsParseTerm(nmTerm->string);
dyStringFree(&nmTerm);
freeMem(nmAcc);
}
}
else if (regexMatchSubstr(term, pseudoHgvsChrGDotExp, substrs, ArraySize(substrs)))
{
int chrIx = 1;
int startPosIx = 3;
int endPosIx = 5;
int changeIx = 6;
AllocVar(hgvs);
hgvs->type = hgvstGenomic;
hgvs->seqAcc = regexSubstringClone(term, substrs[chrIx]);
hgvs->start1 = regexSubstringInt(term, substrs[startPosIx]);
if (regexSubstrMatched(substrs[endPosIx]))
hgvs->end = regexSubstringInt(term, substrs[endPosIx]);
else
hgvs->end = hgvs->start1;
hgvs->changes = regexSubstringClone(term, substrs[changeIx]);
}
return hgvs;
}
static char refBaseFromNucSubst(char *change)
/* If sequence change is a nucleotide substitution, return the reference base, else null char. */
{
if (regexMatchNoCase(change, "^([ACGTU])>"))
return toupper(change[0]);
return '\0';
}
static void hgvsStartEndToZeroBasedHalfOpen(struct hgvsVariant *hgvs, int *retStart, int *retEnd)
/* Convert HGVS's fully closed, 1-based-unless-negative start and end to UCSC start and end. */
{
// If hgvs->start1 is negative, it is effectively 0-based, so subtract 1 only if positive.
int start = hgvs->start1;
if (start > 0)
start--;
// If hgvs->end is negative, it is effectively 0-based, so add 1 to get half-open end.
int end = hgvs->end;
if (end < 0)
end++;
if (retStart)
*retStart = start;
if (retEnd)
*retEnd = end;
}
static boolean hgvsValidateGenomic(char *db, struct hgvsVariant *hgvs,
char **retFoundAcc, int *retFoundVersion,
char **retDiffRefAllele)
/* Return TRUE if hgvs->seqAcc can be mapped to a chrom in db and coords are legal for the chrom.
* If retFoundAcc is non-NULL, set it to the versionless seqAcc if chrom is found, else NULL.
* If retFoundVersion is non-NULL, set it to seqAcc's version if chrom is found, else NULL.
* If retDiffRefAllele is non-NULL and hgvs specifies a reference allele then compare it to
* the genomic sequence at that location; set *retDiffRefAllele to NULL if they match, or to
* genomic sequence if they differ. */
{
boolean coordsOK = FALSE;
if (retDiffRefAllele)
*retDiffRefAllele = NULL;
if (retFoundAcc)
*retFoundAcc = NULL;
if (retFoundVersion)
*retFoundVersion = 0;
char *chrom = hgOfficialChromName(db, hgvs->seqAcc);
if (isNotEmpty(chrom))
{
struct chromInfo *ci = hGetChromInfo(db, chrom);
int start, end;
hgvsStartEndToZeroBasedHalfOpen(hgvs, &start, &end);
if (start >= 0 && start < ci->size && end > 0 && end < ci->size)
{
coordsOK = TRUE;
if (retDiffRefAllele)
{
char hgvsBase = refBaseFromNucSubst(hgvs->changes);
if (hgvsBase != '\0')
{
struct dnaSeq *refBase = hFetchSeq(ci->fileName, chrom, start, end);
touppers(refBase->dna);
if (refBase->dna[0] != hgvsBase)
*retDiffRefAllele = cloneString(refBase->dna);
dnaSeqFree(&refBase);
}
}
}
// Since we found hgvs->seqAcc, split it into versionless and version parts.
if (retFoundAcc)
*retFoundAcc = cloneFirstWordByDelimiter(hgvs->seqAcc, '.');
if (retFoundVersion)
{
char *p = strchr(hgvs->seqAcc, '.');
if (p)
*retFoundVersion = atoi(p+1);
}
// Don't free chromInfo, it's cached!
}
return coordsOK;
}
static char *getCdnaSeq(char *db, char *acc)
/* Return cdna sequence for acc, or NULL if not found. */
{
if (trackHubDatabase(db))
return NULL;
char *seq = NULL;
if (startsWith("LRG_", acc))
{
if (hTableExists(db, "lrgCdna"))
{
char query[2048];
sqlSafef(query, sizeof(query), "select seq from lrgCdna where name = '%s'", acc);
struct sqlConnection *conn = hAllocConn(db);
seq = sqlQuickString(conn, query);
hFreeConn(&conn);
}
}
else
{
struct dnaSeq *cdnaSeq = NULL;
if (hDbHasNcbiRefSeq(db))
cdnaSeq = hDnaSeqGet(db, acc, "seqNcbiRefSeq", "extNcbiRefSeq");
else
cdnaSeq = hGenBankGetMrna(db, acc, NULL);
if (cdnaSeq)
seq = dnaSeqCannibalize(&cdnaSeq);
}
return seq;
}
static boolean getCds(char *db, char *acc, struct genbankCds *retCds)
/* Get the CDS info for genbank or LRG acc; return FALSE if not found or not applicable. */
{
if (trackHubDatabase(db))
return FALSE;
boolean gotCds = FALSE;
char query[1024];
if (startsWith("LRG_", acc))
sqlSafef(query, sizeof(query),
"select cds from lrgCds where id = '%s'", acc);
else if (hDbHasNcbiRefSeq(db) &&
// This is a hack to allow us to fall back on refSeqAli if ncbiRefSeqPsl is incomplete:
strchr(acc, '.'))
sqlSafef(query, sizeof(query),
"select cds from ncbiRefSeqCds where id = '%s'", acc);
else
sqlSafef(query, sizeof(query),
"SELECT c.name FROM %s as c, %s as g WHERE (g.acc = '%s') AND "
"(g.cds != 0) AND (g.cds = c.id)", cdsTable, gbCdnaInfoTable, acc);
struct sqlConnection *conn = hAllocConn(db);
char cdsBuf[2048];
char *cdsStr = sqlQuickQuery(conn, query, cdsBuf, sizeof(cdsBuf));
hFreeConn(&conn);
if (isNotEmpty(cdsStr))
gotCds = (genbankCdsParse(cdsStr, retCds) &&
retCds->startComplete && retCds->start != retCds->end);
return gotCds;
}
static char refBaseFromProt(char *change)
/* If change starts with an amino acid 3-letter or 1-letter code then return the 1-letter code,
* else null char. */
{
regmatch_t substrs[1];
if (regexMatchSubstr(change, "^" hgvsAminoAcidExp, substrs, ArraySize(substrs)))
{
char match[256]; // 1- or 3-letter code
regexSubstringCopy(change, substrs[0], match, sizeof(match));
if (strlen(match) == 1)
return toupper(match[0]);
else
return aaAbbrToLetter(match);
}
return '\0';
}
static char *cloneStringFromChar(char c)
/* Return a cloned string from a single character. */
{
char str[2];
str[0] = c;
str[1] = '\0';
return cloneString(str);
}
static void checkRefAllele(struct hgvsVariant *hgvs, int start, char *accSeq,
char **retDiffRefAllele)
/* If hgvs change includes a reference allele, and if accSeq at start does not match,
* then set retDiffRefAllele to the accSeq version. retDiffRefAllele must be non-NULL. */
{
char hgvsRefBase = (hgvs->type == hgvstProtein) ? refBaseFromProt(hgvs->changes) :
refBaseFromNucSubst(hgvs->changes);
if (hgvsRefBase != '\0')
{
char seqRefBase = toupper(accSeq[start]);
if (seqRefBase != hgvsRefBase)
*retDiffRefAllele = cloneStringFromChar(seqRefBase);
}
if (hgvs->type == hgvstProtein && *retDiffRefAllele == NULL)
{
// Protein ranges have a second ref allele base to check
char *p = strchr(hgvs->changes, '_');
if (p != NULL)
{
hgvsRefBase = refBaseFromProt(p+1);
if (hgvsRefBase != '\0')
{
int end1 = atoi(skipToNumeric(p+1));
char seqRefBase = toupper(accSeq[end1-1]);
if (seqRefBase != hgvsRefBase)
*retDiffRefAllele = cloneStringFromChar(seqRefBase);
}
}
}
}
static int getGbVersion(char *db, char *acc)
/* Return the local version that we have for acc. */
{
if (trackHubDatabase(db))
return 0;
char query[2048];
sqlSafef(query, sizeof(query), "select version from %s where acc = '%s'", gbSeqTable, acc);
struct sqlConnection *conn = hAllocConn(db);
int version = sqlQuickNum(conn, query);
hFreeConn(&conn);
return version;
}
static char *normalizeVersion(char *db, char *acc, int *retFoundVersion)
/* LRG accessions don't have versions, so just return the same acc and set *retFoundVersion to 0.
* The user may give us a RefSeq accession with or without a .version.
* If ncbiRefSeq tables are present, return acc with version, looking up version if necessary.
* If we look it up and can't find it, this returns NULL.
* If instead we're using genbank tables, return acc with no version.
* That ensures that acc will be found in our local tables. */
{
if (trackHubDatabase(db))
return NULL;
char *normalizedAcc = NULL;
int foundVersion = 0;
if (startsWith("LRG_", acc))
{
normalizedAcc = cloneString(acc);
}
else if (hDbHasNcbiRefSeq(db))
{
// ncbiRefSeq tables need versioned accessions.
if (strchr(acc, '.'))
normalizedAcc = cloneString(acc);
else
{
char query[2048];
sqlSafef(query, sizeof(query), "select name from ncbiRefSeq where name like '%s.%%'", acc);
struct sqlConnection *conn = hAllocConn(db);
normalizedAcc = sqlQuickString(conn, query);
hFreeConn(&conn);
}
if (isNotEmpty(normalizedAcc))
{
char *p = strchr(normalizedAcc, '.');
assert(p);
foundVersion = atoi(p+1);
}
}
else
{
// genbank tables -- no version
normalizedAcc = cloneFirstWordByDelimiter(acc, '.');
foundVersion = getGbVersion(db, normalizedAcc);
}
if (retFoundVersion)
*retFoundVersion = foundVersion;
return normalizedAcc;
}
static boolean hgvsValidateGene(char *db, struct hgvsVariant *hgvs,
char **retFoundAcc, int *retFoundVersion, char **retDiffRefAllele)
/* Return TRUE if hgvs coords are within the bounds of the sequence for hgvs->seqAcc.
* Note: Transcript terms may contain coords outside the bounds (upstream, intron, downstream) so
* those can't be checked without mapping the term to the genome.
* If retFoundAcc is not NULL, set it to our local accession (which may be missing the .version
* of hgvs->seqAcc) or NULL if we can't find any match.
* If retFoundVersion is not NULL and hgvs->seqAcc has a version number (e.g. NM_005957.4),
* set retFoundVersion to our version from latest GenBank, otherwise 0 (no version for LRG).
* If coords are OK and retDiffRefAllele is not NULL: if our sequence at the coords
* matches hgvs->refAllele then set it to NULL; if mismatch then set it to our sequence. */
{
char *acc = normalizeVersion(db, hgvs->seqAcc, retFoundVersion);
if (isEmpty(acc))
return FALSE;
boolean coordsOK = FALSE;
char *accSeq = (hgvs->type == hgvstProtein ? getProteinSeq(db, acc) : getCdnaSeq(db, acc));
if (accSeq)
{
// By convention, foundAcc is always versionless because it's accompanied by foundVersion.
if (retFoundAcc)
*retFoundAcc = cloneFirstWordByDelimiter(acc, '.');
int seqLen = strlen(accSeq);
int start, end;
hgvsStartEndToZeroBasedHalfOpen(hgvs, &start, &end);
if (hgvs->type == hgvstCoding)
{
struct genbankCds cds;
coordsOK = getCds(db, acc, &cds);
if (coordsOK)
{
start += (hgvs->startIsUtr3 ? cds.end : cds.start);
end += (hgvs->endIsUtr3 ? cds.end : cds.start);
if (start > end)
coordsOK = FALSE;
else if (retDiffRefAllele && hgvs->startOffset == 0 && start >= 0 && start <= seqLen)
checkRefAllele(hgvs, start, accSeq, retDiffRefAllele);
}
}
else
{
if (start <= end)
{
coordsOK = TRUE;
if (retDiffRefAllele && hgvs->startOffset == 0 && start >= 0 && start < seqLen)
checkRefAllele(hgvs, start, accSeq, retDiffRefAllele);
}
}
}
freeMem(accSeq);
freeMem(acc);
return coordsOK;
}
boolean hgvsValidate(char *db, struct hgvsVariant *hgvs,
char **retFoundAcc, int *retFoundVersion, char **retDiffRefAllele)
/* Return TRUE if hgvs coords are within the bounds of the sequence for hgvs->seqAcc.
* Note: Transcript terms may contain coords outside the bounds (upstream, intron, downstream) so
* those can't be checked without mapping the term to the genome; this returns TRUE if seq is found.
* If retFoundAcc is not NULL, set it to our local accession (which may be missing the .version
* of hgvs->seqAcc) or NULL if we can't find any match.
* If retFoundVersion is not NULL and hgvs->seqAcc has a version number (e.g. NM_005957.4),
* set retFoundVersion to our version from latest GenBank, otherwise 0 (no version for LRG).
* If coords are OK and retDiffRefAllele is not NULL: if our sequence at the coords
* matches hgvs->refAllele then set it to NULL; if mismatch then set it to our sequence. */
{
if (hgvs->type == hgvstGenomic || hgvs->type == hgvstMito)
return hgvsValidateGenomic(db, hgvs, retFoundAcc, retFoundVersion, retDiffRefAllele);
else
return hgvsValidateGene(db, hgvs, retFoundAcc, retFoundVersion, retDiffRefAllele);
}
static struct bed *bed6New(char *chrom, unsigned chromStart, unsigned chromEnd, char *name,
int score, char strand)
/* Alloc & return a new bed with only the first 6 fields populated. */
{
struct bed *bed6;
AllocVar(bed6);
bed6->chrom = cloneString(chrom);
bed6->chromStart = chromStart;
bed6->chromEnd = chromEnd;
bed6->name = cloneString(name);
bed6->score = score;
bed6->strand[0] = strand;
return bed6;
}
boolean hgvsIsInsertion(struct hgvsVariant *hgvs)
/* Return TRUE if hgvs is an insertion (end == start1+1, change starts with "ins"). */
{
return (hgvs->end == hgvs->start1+1 && startsWith("ins", hgvs->changes));
}
static void adjustInsStartEnd(struct hgvsVariant *hgvs, uint *retStart0, uint *retEnd)
/* HGVS insertion coordinates include the base before and the base after the insertion point,
* while we treat it as a zero-length point. So if this is insertion, adjust the start and end. */
{
if (hgvsIsInsertion(hgvs))
{
*retStart0 = *retStart0 + 1;
*retEnd = *retEnd - 1;
}
}
static struct bed *hgvsMapGDotToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable)
/* Given an NC_*g. term, if we can map NC_ to chrom then return the region, else NULL. */
{
struct bed *region = NULL;
char *chrom = hgOfficialChromName(db, hgvs->seqAcc);
if (isNotEmpty(chrom) && hgvs->start1 > 0)
{
region = bed6New(chrom, hgvs->start1 - 1, hgvs->end, "", 0, '+');
adjustInsStartEnd(hgvs, ®ion->chromStart, ®ion->chromEnd);
}
if (retPslTable)
*retPslTable = NULL;
return region;
}
static void hgvsTranscriptToZeroBasedHalfOpen(struct hgvsVariant *hgvs,
int maxCoord, struct genbankCds *cds,
int *retStart, int *retEnd,
int *retUpstreamBases, int *retDownstreamBases)
/* Convert a transcript HGVS's start1 and end into UCSC coords plus upstream and downstream lengths
* for when the transcript HGVS has coordinates that extend beyond its sequence boundaries.
* ret* args must be non-NULL. */
{
hgvsStartEndToZeroBasedHalfOpen(hgvs, retStart, retEnd);
if (hgvs->type == hgvstCoding)
{
// If the position follows '*' that means it's relative to cdsEnd; otherwise rel to cdsStart
*retStart += (hgvs->startIsUtr3 ? cds->end : cds->start);
*retEnd += (hgvs->endIsUtr3 ? cds->end : cds->start);
}
// Now check for coords that extend beyond the transcript('s alignment to the genome)
if (*retStart < 0)
{
// hgvs->start1 is upstream of transcript.
*retUpstreamBases = -*retStart;
*retStart = 0;
}
else if (*retStart >= maxCoord)
{
// Even the start coord is downstream of transcript -- make a negative "upstream"
// for adjusting start.
*retUpstreamBases = -(*retStart - maxCoord + 1);
*retStart = maxCoord - 1;
}
else
*retUpstreamBases = 0;
if (*retEnd > maxCoord)
{
// hgvs->end is downstream of transcript.
*retDownstreamBases = *retEnd - maxCoord;
*retEnd = maxCoord;
}
else if (*retEnd <= 0)
{
// Even the end coord is upstream of transcript -- make a negative "downstream"
// for adjusting end.
*retEnd += *retUpstreamBases;
*retDownstreamBases = -*retUpstreamBases;
}
else
*retDownstreamBases = 0;
}
static struct psl *pslFromHgvsNuc(struct hgvsVariant *hgvs, char *acc, int accSize, int accEnd,
struct genbankCds *cds,
int *retUpstreamBases, int *retDownstreamBases)
/* Allocate and return a PSL modeling the variant in nucleotide sequence acc.
* The PSL target is the sequence and the query is the changed part of the sequence.
* accEnd is given in case accSize includes an unaligned poly-A tail.
* If hgvs is coding ("c.") then the caller must pass in a valid cds.
* In case the start or end position is outside the bounds of the sequence, set retUpstreamBases
* or retDownstreamBases to the number of bases beyond the beginning or end of sequence.
* NOTE: intron offsets are ignored; the PSL contains the exon anchor point
* and the caller will have to map that to the genome and then apply the intron offset. */
{
if (hgvs == NULL)
return NULL;
if (hgvs->type == hgvstProtein)
errAbort("pslFromHgvsNuc must be called only on nucleotide HGVSs, not protein.");
struct psl *psl;
AllocVar(psl);
psl->tName = cloneString(acc);
safecpy(psl->strand, sizeof(psl->strand), "+");
psl->tSize = accSize;
if (hgvs->type == hgvstGenomic || hgvs->type == hgvstMito)
{
// Sane 1-based fully closed coords.
hgvsStartEndToZeroBasedHalfOpen(hgvs, &psl->tStart, &psl->tEnd);
}
else
{
// Simple or insanely complex CDS-relative coords.
hgvsTranscriptToZeroBasedHalfOpen(hgvs, accEnd, cds, &psl->tStart, &psl->tEnd,
retUpstreamBases, retDownstreamBases);
}
int refLen = psl->tEnd - psl->tStart;
// Just use refLen for alt until we parse the sequence change portion of the term:
int altLen = refLen;
psl->qName = cloneString(hgvs->changes);
psl->qSize = altLen;
psl->qStart = 0;
psl->qEnd = altLen;
psl->blockCount = 1;
AllocArray(psl->blockSizes, psl->blockCount);
AllocArray(psl->qStarts, psl->blockCount);
AllocArray(psl->tStarts, psl->blockCount);
psl->blockSizes[0] = refLen <= altLen ? refLen : altLen;
psl->qStarts[0] = psl->qStart;
psl->tStarts[0] = psl->tStart;
return psl;
}
static struct psl *pslDelFromCoord(struct psl *txAli, int tStart, struct psl *variantPsl)
/* Return a PSL with same target and query as txAli, but as a deletion at offset tStart:
* two zero-length blocks surrounding no target and query = variantPsl's target coords. */
{
struct psl *del = pslNew(txAli->qName, txAli->qSize, variantPsl->tStart, variantPsl->tEnd,
txAli->tName, txAli->tSize, tStart, tStart,
txAli->strand, 2, 0);
// I wonder if zero-length blockSizes would trigger crashes somewhere...
del->blockSizes[0] = del->blockSizes[1] = 0;
del->tStarts[0] = del->tStarts[1] = del->tStart;
del->qStarts[0] = del->qStart;
del->qStarts[1] = del->qEnd;
return del;
}
struct psl *pslFromGap(struct psl *txAli, int blkIx, struct psl *variantPsl)
/* Return a PSL with same target and query as txAli, but as a potentially double-sided gap between
* two zero-length blocks surrounding skipped bases in target and/or query. */
{
int qGapStart = txAli->qStarts[blkIx] + txAli->blockSizes[blkIx];
int qGapEnd = txAli->qStarts[blkIx+1];
int tGapStart = txAli->tStarts[blkIx] + txAli->blockSizes[blkIx];
int tGapEnd = txAli->tStarts[blkIx+1];
struct psl *gapPsl = pslNew(txAli->qName, txAli->qSize, qGapStart, qGapEnd,
txAli->tName, txAli->tSize, tGapStart, tGapEnd,
txAli->strand, 2, 0);
int qBlockStart = txAli->qStarts[blkIx];
if (variantPsl->tStart > qBlockStart && variantPsl->tStart < qGapStart)
{
// keep non-zero overlapping part of preceding block, if any
int overlapSize = qGapStart - variantPsl->tStart;
gapPsl->blockSizes[0] = overlapSize;
gapPsl->tStart = gapPsl->tStarts[0] = tGapStart - overlapSize;
gapPsl->qStart = gapPsl->qStarts[0] = variantPsl->tStart;
}
else
{
// zero-length block at beginning of gap
gapPsl->blockSizes[0] = 0;
gapPsl->tStarts[0] = tGapStart;
gapPsl->qStarts[0] = qGapStart;
}
int qNextBlkEnd = txAli->qStarts[blkIx+1] + txAli->blockSizes[blkIx+1];
if (variantPsl->tEnd > qGapEnd && variantPsl->tEnd <= qNextBlkEnd)
{
// keep non-zero overlapping part of next block, if any
int overlapSize = variantPsl->tEnd - qGapEnd;
gapPsl->blockSizes[1] = overlapSize;
gapPsl->tStarts[1] = tGapEnd;
gapPsl->tEnd = tGapEnd + overlapSize;
gapPsl->qStarts[1] = qGapEnd;
gapPsl->qEnd = variantPsl->tEnd;
}
else
{
// zero-length block at end of gap
gapPsl->blockSizes[1] = 0;
gapPsl->tStarts[1] = tGapEnd;
gapPsl->qStarts[1] = qGapEnd;
}
return gapPsl;
}
static struct psl *mapToDeletion(struct psl *variantPsl, struct psl *txAli)
/* If the variant falls on a transcript base that is deleted in the reference genome,
* (or upstream/downstream mapped to a zero-length point),
* return the deletion coords (pslTransMap returns NULL), otherwise return NULL. */
{
// variant start and end coords, in transcript coords:
int vStart = variantPsl->tStart;
int vEnd = variantPsl->tEnd;
boolean isRc = (pslQStrand(txAli) == '-');
if (isRc)
{
vStart = variantPsl->tSize - variantPsl->tEnd;
vEnd = variantPsl->tSize - variantPsl->tStart;
}
if (vEnd < txAli->qStart)
return pslDelFromCoord(txAli, isRc ? txAli->tEnd : txAli->tStart, variantPsl);
else if (vStart > txAli->qEnd)
return pslDelFromCoord(txAli, isRc ? txAli->tStart : txAli->tEnd, variantPsl);
int i;
for (i = 0; i < txAli->blockCount - 1; i++)
{
int qBlockEnd = txAli->qStarts[i] + txAli->blockSizes[i];
int qNextBlockStart = txAli->qStarts[i+1];
int tBlockEnd = txAli->tStarts[i] + txAli->blockSizes[i];
int tNextBlockStart = txAli->tStarts[i+1];
if (vStart >= qBlockEnd && vEnd <= qNextBlockStart)
{
if (tBlockEnd == tNextBlockStart)
return pslDelFromCoord(txAli, tBlockEnd, variantPsl);
else
return pslFromGap(txAli, i, variantPsl);
}
}
// Not contained in a deletion from reference genome (txAli target) -- return NULL.
return NULL;
}
static struct psl *mapPsl(char *db, struct hgvsVariant *hgvs, char *pslTable, char *acc,
struct genbankCds *cds, int *retUpstream, int *retDownstream)
/* If acc is found in pslTable, use pslTransMap to map hgvs onto the genome. */
{
struct psl *mappedToGenome = NULL;
char query[2048];
sqlSafef(query, sizeof(query), "select * from %s where qName = '%s'", pslTable, acc);
struct sqlConnection *conn = hAllocConn(db);
struct sqlResult *sr = sqlGetResult(conn, query);
char **row;
if (sr && (row = sqlNextRow(sr)))
{
int bin = 1; // All PSL tables used here, and any made in the future, use the bin column.
struct psl *txAli = pslLoad(row+bin);
// variantPsl contains the anchor if a non-cdsStart anchor is used because
// the actual position might be outside the bounds of the transcript sequence (intron/UTR)
struct psl *variantPsl = pslFromHgvsNuc(hgvs, acc, txAli->qSize, txAli->qEnd, cds,
retUpstream, retDownstream);
mappedToGenome = pslTransMap(pslTransMapNoOpts, variantPsl, txAli);
// If there is a deletion in the genome / insertion in the transcript then pslTransMap cannot
// map those bases to the genome. In that case take a harder look and return the deletion
// coords.
if (mappedToGenome == NULL)
mappedToGenome = mapToDeletion(variantPsl, txAli);
pslFree(&variantPsl);
pslFree(&txAli);
}
sqlFreeResult(&sr);
hFreeConn(&conn);
return mappedToGenome;
}
static char *pslTableForAcc(char *db, char *acc)
/* Based on acc (and whether db has NCBI RefSeq alignments), pick a PSL table where
* acc should be found (table may or may not exist). Don't free the returned string. */
{
char *pslTable = NULL;
if (startsWith("LRG_", acc))
pslTable = "lrgTranscriptAli";
else if (startsWith("NM_", acc) || startsWith("NR_", acc) ||
startsWith("XM_", acc) || startsWith("XR_", acc))
{
// Use NCBI's alignments if they are available
if (hDbHasNcbiRefSeq(db))
pslTable = "ncbiRefSeqPsl";
else
pslTable = "refSeqAli";
}
return pslTable;
}
#define limitToRange(val, min, max) { if (val < min) { val = min; } \
if (val > max) { val = max; } }
static struct bed *pslAndFriendsToRegion(struct psl *psl, struct hgvsVariant *hgvs,
int upstream, int downstream)
/* If hgvs has any intron offsets and/or upstream/downstream offsets, add those to the
* anchor coords in psl and return the variant's region of the genome. */
{
struct bed *region = bed6New(psl->tName, psl->tStart, psl->tEnd, psl->qName, 0, psl->strand[0]);
// If the start and/or end is in an intron, add the intron offset now.
boolean revStrand = (psl->strand[0] == '-');
if (hgvs->startOffset != 0)
{
if (revStrand)
region->chromEnd -= hgvs->startOffset;
else
region->chromStart += hgvs->startOffset;
}
if (hgvs->endOffset != 0)
{
if (revStrand)
region->chromStart -= hgvs->endOffset;
else
region->chromEnd += hgvs->endOffset;
}
// Apply extra up/downstream offsets (usually 0)
if (revStrand)
{
region->chromStart -= downstream;
region->chromEnd += upstream;
}
else
{
region->chromStart -= upstream;
region->chromEnd += downstream;
}
limitToRange(region->chromStart, 0, psl->tSize);
limitToRange(region->chromEnd, 0, psl->tSize);
return region;
}
static struct bed *hgvsMapNucToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable)
/* Return a bed6 with the variant's span on the genome and strand, or NULL if unable to map.
* If successful and retPslTable is not NULL, set it to the name of the PSL table used. */
{
if (hgvs->type == hgvstGenomic)
return hgvsMapGDotToGenome(db, hgvs, retPslTable);
char *acc = normalizeVersion(db, hgvs->seqAcc, NULL);
if (isEmpty(acc))
return NULL;
struct bed *region = NULL;
char *pslTable = pslTableForAcc(db, acc);
struct genbankCds cds;
boolean gotCds = (hgvs->type == hgvstCoding) ? getCds(db, acc, &cds) : FALSE;
if (pslTable && (hgvs->type != hgvstCoding || gotCds) && hTableExists(db, pslTable))
{
int upstream = 0, downstream = 0;
struct psl *mappedToGenome = mapPsl(db, hgvs, pslTable, acc, &cds, &upstream, &downstream);
// As of 9/26/16, ncbiRefSeqPsl is missing some items (#13673#note-443) -- so fall back
// on UCSC alignments.
if (!mappedToGenome && sameString(pslTable, "ncbiRefSeqPsl") && hTableExists(db, "refSeqAli"))
{
char *accNoVersion = cloneFirstWordByDelimiter(acc, '.');
gotCds = (hgvs->type == hgvstCoding) ? getCds(db, accNoVersion, &cds) : FALSE;
if (hgvs->type != hgvstCoding || gotCds)
mappedToGenome = mapPsl(db, hgvs, "refSeqAli", accNoVersion, &cds,
&upstream, &downstream);
if (mappedToGenome)
{
pslTable = "refSeqAli";
acc = accNoVersion;
}
}
if (mappedToGenome)
{
region = pslAndFriendsToRegion(mappedToGenome, hgvs, upstream, downstream);
pslFree(&mappedToGenome);
}
}
if (region)
{
adjustInsStartEnd(hgvs, ®ion->chromStart, ®ion->chromEnd);
if (retPslTable)
*retPslTable = cloneString(pslTable);
}
return region;
}
static struct bed *hgvsMapPDotToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable)
/* Return a bed6 with the variant's span on the genome and strand, or NULL if unable to map.
* If successful and retPslTable is not NULL, set it to the name of the PSL table used. */
{
struct bed *region = NULL;
char *acc = normalizeVersion(db, hgvs->seqAcc, NULL);
if (acc && (startsWith("NP_", acc) || startsWith("XP_", acc)))
{
// Translate the NP_*:p. to NM_*:c. and map NM_*:c. to the genome.
struct sqlConnection *conn = hAllocConn(db);
char query[2048];
if (hDbHasNcbiRefSeq(db))
sqlSafef(query, sizeof(query), "select mrnaAcc from ncbiRefSeqLink where protAcc = '%s'",
acc);
else if (hTableExists(db, "refGene"))
sqlSafef(query, sizeof(query), "select mrnaAcc from %s l, refGene r "
"where l.protAcc = '%s' and r.name = l.mrnaAcc", refLinkTable, acc);
else
return NULL;
char *nmAcc = sqlQuickString(conn, query);
hFreeConn(&conn);
if (nmAcc)
{
struct hgvsVariant cDot;
zeroBytes(&cDot, sizeof(cDot));
cDot.type = hgvstCoding;
cDot.seqAcc = nmAcc;
cDot.start1 = ((hgvs->start1 - 1) * 3) + 1;
cDot.end = ((hgvs->end - 1) * 3) + 3;
cDot.changes = hgvs->changes;
region = hgvsMapNucToGenome(db, &cDot, retPslTable);
freeMem(nmAcc);
}
}
return region;
}
struct bed *hgvsMapToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable)
/* Return a bed6 with the variant's span on the genome and strand, or NULL if unable to map.
* If successful and retPslTable is not NULL, set it to the name of the PSL table used. */
{
if (hgvs == NULL)
return NULL;
struct bed *region = NULL;
if (hgvs->type == hgvstProtein)
region = hgvsMapPDotToGenome(db, hgvs, retPslTable);
else
region = hgvsMapNucToGenome(db, hgvs, retPslTable);
return region;
}
static int getDotVersion(char *acc)
/* If acc ends in .<number> then return number; otherwise return -1. */
{
char *p = strrchr(acc, '.');
if (p && isdigit(p[1]))
return atoi(p+1);
return -1;
}
struct bed *hgvsValidateAndMap(struct hgvsVariant *hgvs, char *db, char *term,
struct dyString *dyWarn, char **retPslTable)
/* If we have sequence for hgvs->seqAcc and the hgvs coords are within the bounds
* of that sequence, and we are able to map the coords in seqAcc to genomic coords in
* db, return a bed6 with those coords and strand. If unsuccessful, or if the reference
* sequence differs at the mapped location, then write a warning message to dyWarn.
* If mapping is successful and retPslTable is not NULL, set it to the psl table
* used for mapping. */
{
struct bed *mapping = NULL;
char *pslTable = NULL;
char *foundAcc = NULL, *diffRefAllele = NULL;
int foundVersion = 0;
boolean coordsOk = hgvsValidate(db, hgvs, &foundAcc, &foundVersion, &diffRefAllele);
if (foundAcc == NULL)
dyStringPrintf(dyWarn, "Can't find sequence for accession '%s'", hgvs->seqAcc);
else
{
int hgvsVersion = getDotVersion(hgvs->seqAcc);
char foundAccWithV[strlen(foundAcc)+20];
if (foundVersion)
safef(foundAccWithV, sizeof(foundAccWithV), "%s.%d", foundAcc, foundVersion);
else
safecpy(foundAccWithV, sizeof(foundAccWithV), foundAcc);
if (hgvsVersion >= 0 && hgvsVersion != foundVersion)
dyStringPrintf(dyWarn, "HGVS term '%s' is based on %s but UCSC has version %s",
term, hgvs->seqAcc, foundAccWithV);
if (! coordsOk)
dyStringPrintf(dyWarn, "HGVS term '%s' has coordinates outside the bounds of %s",
term, foundAccWithV);
else if (diffRefAllele != NULL)
dyStringPrintf(dyWarn, "HGVS term '%s' reference value does not match %s value '%s'",
term, foundAccWithV, diffRefAllele);
if (coordsOk)
mapping = hgvsMapToGenome(db, hgvs, &pslTable);
}
if (retPslTable)
*retPslTable = cloneString(pslTable);
return mapping;
}
char *hgvsChangeGetAssertedRef(struct hgvsChange *change)
/* If change asserts a reference sequence value, return that, otherwise return NULL. */
{
if (change->type == hgvsctSubst || change->type == hgvsctDel || change->type == hgvsctDup ||
change->type == hgvsctInv || change->type == hgvsctNoChange || change->type == hgvsctIns ||
change->type == hgvsctCon)
{
return cloneString(change->value.refAlt.refSequence);
}
// When change->type is hgvsctRepeat, the actual reference sequence usually spans a larger region
// than the repeating unit. Without applying a fuzzy* tandem-repeat search to the reference
// assembly in the neighborhood of the HGVS coordinates, we can't tell what the true reference
// coords and sequence are.
// *fuzzy: ClinVar sometimes gives repeat counts that imply a few mismatches from
// the strict repetition of the consensus sequence.
// HGVS repeat notation is inherently flaky because the reference's repeat count may differ
// between assemblies, and is subject to how many mismatches are considered tolerable.
// So don't support it.
return NULL;
}
static struct dnaSeq *getGenbankSequenceRange(char *db, char *acc, int start, int end)
/* Look up acc in our local GenBank data; if found, and if both start and end are in bounds,
* return sequence from start to end; otherwise return NULL. */
{
struct dnaSeq *seq = NULL;
int accVersion = getDotVersion(acc);
int foundVersion = 0;
char *normalizedAcc = normalizeVersion(db, acc, &foundVersion);
if (accVersion < 0 || accVersion == foundVersion)
{
char *wholeSeq = getCdnaSeq(db, normalizedAcc);
if (wholeSeq && start >= 0 && end <= strlen(wholeSeq))
{
int size = end - start;
char *name = ""; // ignored
// newDnaSeq does not clone its dna input! So allocate here.
char *dna = cloneStringZ(wholeSeq+start, size);
touppers(dna);
seq = newDnaSeq(dna, size, name);
}
freez(&wholeSeq);
}
return seq;
}
static boolean hgvsToTxCoords(struct hgvsVariant *hgvs, char *db, uint *retStart, uint *retEnd)
/* If hgvs is not coding then set *retStart to hgvs->start1-1 and *retEnd to hgvs->end & ret TRUE.
* If coding and we have complete CDS info then apply cdsStart and/or cdsEnd offset to start & end.
* If start or end is intronic, or start is negative (genomic upstream of tx) then return FALSE.
* Note: at this point we don't know the length of the sequence so can't tell if start and/or end
* fall past the end of the transcript (genomic downstream). */
{
boolean coordsOk = TRUE;
int start, end;
hgvsStartEndToZeroBasedHalfOpen(hgvs, &start, &end);
if (hgvs->type == hgvstCoding)
{
struct genbankCds cds;
if (getCds(db, hgvs->seqAcc, &cds))
{
// Determine whether to use cdsStart or cdsEnd for hgvs start and hgvs end.
// Make sure the cds start and end are marked as complete.
if (hgvs->startIsUtr3 && cds.endComplete)
start += cds.end;
else if (!hgvs->startIsUtr3 && cds.startComplete)
start += cds.start;
else
coordsOk = FALSE;
if (hgvs->endIsUtr3 && cds.endComplete)
end += cds.end;
else if (!hgvs->endIsUtr3 && cds.startComplete)
end += cds.start;
else
coordsOk = FALSE;
}
else
coordsOk = FALSE;
}
if (hgvs->startOffset != 0 || hgvs->endOffset != 0)
// intronic start and/or end -- not in transcript bounds
coordsOk = FALSE;
if (start < 0)
coordsOk = FALSE;
// Unfortunately we don't know if start and end are past the end of transcript without
// doing another database query...
if (retStart)
*retStart = start;
if (retEnd)
*retEnd = end;
return coordsOk;
}
static char *getHgvsSequenceRange(struct hgvsVariant *hgvs, char *db)
/* If we have the sequence specified in hgvs, return the portion covered by hgvs. */
{
char *seq = NULL;
char *acc = hgvs->seqAcc;
// First see if acc can be translated to an internal chrom name.
// Note: an NC_ version from a different assembly might be used. To be really fancy
// we could try hgOfficialChromName in other dbs... but wait for users to request it!
char *chrom = hgOfficialChromName(db, acc);
uint start = hgvs->start1-1;
uint end = hgvs->end;
if (chrom)
{
// Simple sequence range
struct dnaSeq *dnaSeq = hChromSeq(db, chrom, start, end);
touppers(dnaSeq->dna);
seq = dnaSeqCannibalize(&dnaSeq);
}
else if (hgvsToTxCoords(hgvs, db, &start, &end))
{
// Leave it up to getGenbankSequenceRange to reject out-of-bounds coords because
// it will look up the sequence size.
struct dnaSeq *dnaSeq = getGenbankSequenceRange(db, hgvs->seqAcc, start, end);
seq = dnaSeqCannibalize(&dnaSeq);
}
return seq;
}
char *hgvsGetRef(struct hgvsVariant *hgvs, char *db)
/* Return the sequence from the HGVS accession at the HGVS coords, or NULL if HGVS coords are
* out of bounds (e.g. up/downstream or intronic) or otherwise unsupported. Use "" as insertion
* reference sequence. */
{
return getHgvsSequenceRange(hgvs, db);
}
static char *altFromNestedTerm(struct hgvsVariant *nestedTerm, char *db)
/* If we have sequence for the accession */
{
char *nestedRefSeq = getHgvsSequenceRange(nestedTerm, db);
if (nestedRefSeq && sameOk(nestedTerm->changes, "inv"))
{
// If nested term ended with "inv" then reverse-complement the sequence.
reverseComplement(nestedRefSeq, strlen(nestedRefSeq));
}
return nestedRefSeq;
}
static void copyMaybeRc(char *buf, size_t bufSize, char *seq, boolean isRc)
/* Copy seq into buf; if isRc, reverse-complement the copied sequence in buf. */
{
int seqLen = strlen(seq);
safencpy(buf, bufSize, seq, seqLen);
if (isRc)
reverseComplement(buf, seqLen);
}
char *hgvsChangeGetAlt(struct hgvsChange *changeList, char *hgvsSeqRef, char *db)
/* Unpack changeList and return the alternate (changed from hgvsSeqRef) sequence, or NULL
* if unable. */
{
struct dyString *alt = dyStringNew(0);
boolean success = TRUE;
if (hgvsSeqRef == NULL)
errAbort("hgvsChangeGetAlt: must provide non-NULL hgvsSeqRef");
struct hgvsChange *change;
for (change = changeList; change != NULL && success; change = change->next)
{
if (change->type == hgvsctRepeat)
// Unsupported -- abort and return NULL.
success = FALSE;
if (change->type == hgvsctNoChange)
dyStringAppend(alt, hgvsSeqRef);
else if (change->type == hgvsctDup)
{
dyStringAppend(alt, hgvsSeqRef);
dyStringAppend(alt, hgvsSeqRef);
}
else if (change->type == hgvsctDel)
// alt is the empty string
;
else if (change->type == hgvsctInv)
{
int refLen = strlen(hgvsSeqRef);
char refInv[refLen+1];
copyMaybeRc(refInv, sizeof(refInv), hgvsSeqRef, TRUE);
dyStringAppend(alt, refInv);
}
else if (change->type == hgvsctSubst || change->type == hgvsctIns || change->type == hgvsctCon)
{
struct hgvsChangeRefAlt *refAlt = &change->value.refAlt;
if (refAlt->altType == hgvsstSimple)
{
dyStringAppend(alt, refAlt->altValue.seq);
}
else if (refAlt->altType == hgvsstLength)
{
int i;
for (i = 0; i < refAlt->altValue.length; i++)
dyStringAppendC(alt, 'N');
}
else if (refAlt->altType == hgvsstNestedTerm)
{
char *nestedTermSeq = altFromNestedTerm(&refAlt->altValue.term, db);
if (nestedTermSeq)
dyStringAppend(alt, nestedTermSeq);
else
success = FALSE;
}
else
{
success = FALSE;
}
}
}
if (success)
return dyStringCannibalize(&alt);
else
{
dyStringFree(&alt);
return NULL;
}
}
static boolean doDupToIns(boolean isRc, int *pLeftShiftOffset,
char *genomicRef, char *genomicRefFwd, char *hgvsSeqRef, char *hgvsSeqAlt,
char *pLeftBase, struct bed *mapping)
/* HGVS dup is TMI for VCF, so simplify to an insertion -- this can modify all of its inputs!
* We need to maintain HVGS right-shifting before we do VCF left-shifting, so if this is a dup
* then change chromStart to chromEnd (vice versa if isRc), update leftBase, trim duplicate seq
* from start of alt, and set ref to "". */
{
boolean dupToIns = FALSE;
int refLen = mapping->chromEnd - mapping->chromStart;
int altLen = strlen(hgvsSeqAlt);
if (refLen > 0 && altLen >= 2*refLen &&
sameString(genomicRef, hgvsSeqRef) &&
startsWith(genomicRef, hgvsSeqAlt))
{
dupToIns = TRUE;
if (isRc || sameString(genomicRef, hgvsSeqAlt+refLen))
{
// '-' strand or pure dup ==> insertion at chromStart, no change to leftBase
mapping->chromEnd = mapping->chromStart;
}
else
{
// '+' strand and extra seq after dup ==> insertion at chromEnd, update leftBase & offset
*pLeftShiftOffset += refLen;
*pLeftBase = genomicRef[refLen-1];
mapping->chromStart = mapping->chromEnd;
}
memmove(hgvsSeqAlt, hgvsSeqAlt+refLen, altLen-refLen+1);
genomicRef[0] = genomicRefFwd[0] = hgvsSeqRef[0] = '\0';
}
return dupToIns;
}
static char *makeVcfAlt(char *genomicRef, char *hgvsSeqRef, char *hgvsSeqAlt, struct bed *mapping,
boolean isRc, char leftBase, boolean leftBaseRight, boolean *retIsIndel)
/* Based on comparing the three possibly differing alleles (genomic, hgvs ref, hgvs alt),
* determine which sequence(s) will go in alt. Determine whether this variant is an indel
* (alleles not all the same length), and if so prepend leftBase to alt allele(s). */
{
int hgvsSeqRefLen = strlen(hgvsSeqRef);
int hgvsSeqAltLen = strlen(hgvsSeqAlt);
// VCF alleles are always on '+' (Fwd) strand of genome.
char hgvsSeqRefFwd[hgvsSeqRefLen+1];
copyMaybeRc(hgvsSeqRefFwd, sizeof(hgvsSeqRefFwd), hgvsSeqRef, isRc);
char hgvsSeqAltFwd[hgvsSeqAltLen+1];
copyMaybeRc(hgvsSeqAltFwd, sizeof(hgvsSeqAltFwd), hgvsSeqAlt, isRc);
int genomicRefLen = strlen(genomicRef);
// The alt allele string will contain 0 to 2 sequences, maybe a comma, and 0 to 2 leftBases:
int altMaxLen = genomicRefLen + hgvsSeqRefLen + hgvsSeqAltLen + 16;
char vcfAlt[altMaxLen];
// If the HGVS change is "=" (no change) then vcfAlt will be "."
boolean noChange = sameString(hgvsSeqRef, hgvsSeqAlt);
// VCF indels have start coord one base to the left of the actual indel point.
// HGVS treats multi-base substitutions as indels but VCF does not, so look for
// differing length of ref vs. each alt sequence in order to call it a VCF indel.
boolean isIndel = (genomicRefLen != hgvsSeqRefLen || genomicRefLen != hgvsSeqAltLen ||
genomicRefLen == 0);
if (sameString(hgvsSeqRef, genomicRef))
{
// VCF alt is HGVS alt
if (noChange)
safecpy(vcfAlt, sizeof(vcfAlt), ".");
else if (isIndel)
{
if (leftBaseRight)
safef(vcfAlt, sizeof(vcfAlt), "%s%c", hgvsSeqAltFwd, leftBase);
else
safef(vcfAlt, sizeof(vcfAlt), "%c%s", leftBase, hgvsSeqAltFwd);
}
else
safecpy(vcfAlt, sizeof(vcfAlt), hgvsSeqAltFwd);
}
else if (sameString(genomicRef, hgvsSeqAlt))
{
// Genomic reference allele is HGVS alt allele; VCF alt is HGVS ref allele
if (noChange)
safecpy(vcfAlt, sizeof(vcfAlt), ".");
else if (isIndel)
{
if (leftBaseRight)
safef(vcfAlt, sizeof(vcfAlt), "%s%c", hgvsSeqRefFwd, leftBase);
else
safef(vcfAlt, sizeof(vcfAlt), "%c%s", leftBase, hgvsSeqRefFwd);
}
else
safecpy(vcfAlt, sizeof(vcfAlt), hgvsSeqRefFwd);
}
else
{
// Both HGVS ref and HGVS alt differ from genomicRef, so both are VCF alts (unless noChange)
if (noChange)
{
if (isIndel)
{
if (leftBaseRight)
safef(vcfAlt, sizeof(vcfAlt), "%s%c", hgvsSeqRefFwd, leftBase);
else
safef(vcfAlt, sizeof(vcfAlt), "%c%s", leftBase, hgvsSeqRefFwd);
}
else
safef(vcfAlt, sizeof(vcfAlt), "%s", hgvsSeqRefFwd);
}
else if (isIndel)
{
if (leftBaseRight)
safef(vcfAlt, sizeof(vcfAlt), "%s%c,%s%c",
hgvsSeqRefFwd, leftBase, hgvsSeqAltFwd, leftBase);
else
safef(vcfAlt, sizeof(vcfAlt), "%c%s,%c%s",
leftBase, hgvsSeqRefFwd, leftBase, hgvsSeqAltFwd);
}
else
safef(vcfAlt, sizeof(vcfAlt), "%s,%s", hgvsSeqRefFwd, hgvsSeqAltFwd);
}
if (retIsIndel)
*retIsIndel = isIndel;
return cloneString(vcfAlt);
}
static char *makeHgvsToVcfInfo(boolean dupToIns, int basesShifted, boolean isIndel, int end)
/* Make a semicolon-separated list of any applicable interesting things. */
{
struct dyString *info = dyStringNew(0);
if (dupToIns)
dyStringAppend(info, "DupToIns");
if (basesShifted)
{
dyStringAppendSep(info, ";");
dyStringPrintf(info, "BasesShifted=%d", basesShifted);
}
if (isIndel)
{
dyStringAppendSep(info, ";");
// yet another special case for indel at beginning of chrom
if (end == 0)
end = 1;
dyStringPrintf(info, "END=%d", end);
}
if (dyStringIsEmpty(info))
dyStringAppendC(info, '.');
return dyStringCannibalize(&info);
}
int vcfRowCmp(const void *va, const void *vb)
/* Compare for sorting based on chrom,pos. */
{
const struct vcfRow *a = *((struct vcfRow **)va);
const struct vcfRow *b = *((struct vcfRow **)vb);
int dif;
dif = strcmp(a->chrom, b->chrom);
if (dif == 0)
dif = a->pos - b->pos;
return dif;
}
void vcfRowTabOutList(FILE *f, struct vcfRow *rowList)
/* Write out each row of VCF to f. */
{
struct vcfRow *row;
for (row = rowList; row != NULL; row = row->next)
fprintf(f, "%s\t%d\t%s\t%s\t%s\t.\t%s\t%s\n",
row->chrom, row->pos, row->id, row->ref, row->alt, row->filter, row->info);
}
static char *makeHgvsToVcfFilter(char *genomicRef, char *hgvsSeqRef, char *hgvsTermRef)
/* Check for sequence disagreements that we can report in the FILTER column */
{
struct dyString *filter = dyStringNew(0);
if (differentString(hgvsSeqRef, genomicRef))
dyStringAppend(filter, "HgvsRefGenomicMismatch");
if (hgvsTermRef != NULL && differentString(hgvsTermRef, hgvsSeqRef))
{
dyStringAppendSep(filter, ";");
dyStringAppend(filter, "HgvsRefAssertedMismatch");
}
if (dyStringIsEmpty(filter))
dyStringAppend(filter, "PASS");
return dyStringCannibalize(&filter);
}
static struct vcfRow *vcfFromHgvs(char *db, char *term, struct bed *mapping,
struct hgvsVariant *hgvs, struct hgvsChange *changeList,
boolean doLeftShift, struct dyString *dyError)
/* Determine ref and alt sequence, move start left one base if indel, and return vcfRow.
* If doLeftShift, also shift items with ambiguous start as far left on the genome as
* possible. That is the VCF convention but it is inconvenient for variant annotation;
* the HGVS standard is to shift right as far as possible on the tx strand, so consequences
* are reported at the point where the sequence has changed. */
{
// Include some genomic sequence to the left (if possible) in case we need to left-shift
// an ambiguous mapping and/or include the base to the left of an indel.
int genomicRefLen = mapping->chromEnd - mapping->chromStart;
uint winStart, winEnd;
indelShiftRangeForVariant(mapping->chromStart, genomicRefLen, strlen(hgvs->changes),
&winStart, &winEnd);
struct seqWindow *seqWin = chromSeqWindowNew(db, mapping->chrom, winStart, winEnd);
int varWinStart = mapping->chromStart - seqWin->start;
int indelOffset = (mapping->chromStart == 0) ? 0 : 1;
char leftBase = seqWin->seq[varWinStart-indelOffset];
char genomicRefFwd[genomicRefLen+1];
safencpy(genomicRefFwd, sizeof(genomicRefFwd), seqWin->seq + varWinStart, genomicRefLen);
// VCF is always on '+' (Fwd) strand of reference. HGVS sequences may be on the '-' strand.
// Make a version of the genome assembly allele for comparison to HGVS sequences.
boolean isRc = (mapping->strand[0] == '-');
char genomicRef[genomicRefLen+1];
copyMaybeRc(genomicRef, sizeof(genomicRef), genomicRefFwd, isRc);
// Get HGVS ref and alt alleles
char *hgvsSeqRef = hgvsGetRef(hgvs, db);
if (hgvsSeqRef == NULL)
// hgvsSeqRef can be NULL when sequence coords are out of transcript bounds e.g. intron.
// Use genomic reference sequence (rev-complemented if mapped to '-' strand)
hgvsSeqRef = cloneString(genomicRef);
char *hgvsTermRef = hgvsChangeGetAssertedRef(changeList);
char *hgvsSeqAlt = hgvsChangeGetAlt(changeList, hgvsSeqRef, db);
if (hgvsIsInsertion(hgvs))
// HGVS insertions are two bases (before and after), but ours are zero-length points,
// so adjust hgvsSeqRef accordingly.
hgvsSeqRef[0] = '\0';
if (hgvsSeqAlt == NULL)
{
dyStringAppend(dyError, "Unable to get alternate sequence");
return NULL;
}
char *filter = makeHgvsToVcfFilter(genomicRef, hgvsSeqRef, hgvsTermRef);
// doDupToIns and indelShift may modify the contents of their arguments:
boolean dupToIns = doDupToIns(isRc, &varWinStart, genomicRef, genomicRefFwd,
hgvsSeqRef, hgvsSeqAlt, &leftBase, mapping);
if (dupToIns)
genomicRefLen = strlen(genomicRef);
int basesShifted = 0;
if (doLeftShift &&
sameString(genomicRef, hgvsSeqRef))
{
int altLen = strlen(hgvsSeqAlt);
char hgvsSeqAltFwd[altLen+1];
copyMaybeRc(hgvsSeqAltFwd, sizeof(hgvsSeqAltFwd), hgvsSeqAlt, isRc);
basesShifted = indelShift(seqWin, &mapping->chromStart, &mapping->chromEnd,
hgvsSeqAltFwd, INDEL_SHIFT_NO_MAX, isdLeft);
if (basesShifted)
{
varWinStart = mapping->chromStart - seqWin->start;
int leftBaseOffset = max(0, varWinStart - 1);
leftBase = seqWin->seq[leftBaseOffset];
copyMaybeRc(hgvsSeqAlt, altLen+1, hgvsSeqAltFwd, isRc);
safencpy(genomicRefFwd, sizeof(genomicRefFwd), seqWin->seq+varWinStart, genomicRefLen);
copyMaybeRc(genomicRef, sizeof(genomicRef), genomicRefFwd, isRc);
safecpy(hgvsSeqRef, genomicRefLen+1, genomicRef);
}
}
// VCF special case for indel at beginning of chromosome: there is no left base, so instead
// put the first base of the chromosome to the right!
// See https://samtools.github.io/hts-specs/VCFv4.2.pdf 1.4.1.4 REF and discussion at
// https://sourceforge.net/p/vcftools/mailman/vcftools-spec/thread/508CAD45.8010301%40broadinstitute.org/
boolean leftBaseRight = (mapping->chromStart == 0);
// Make REF and ALT strings ('+' strand, leftBase for indels) and write VCF
boolean isIndel = FALSE;
char *vcfAlt = makeVcfAlt(genomicRef, hgvsSeqRef, hgvsSeqAlt, mapping, isRc,
leftBase, leftBaseRight, &isIndel);
int pos = mapping->chromStart + 1;
if (isIndel && !leftBaseRight)
pos--;
char vcfRef[genomicRefLen+1+1];
if (isIndel)
{
if (leftBaseRight)
safef(vcfRef, sizeof(vcfRef), "%s%c", genomicRefFwd, leftBase);
else
safef(vcfRef, sizeof(vcfRef), "%c%s", leftBase, genomicRefFwd);
}
else
safecpy(vcfRef, sizeof(vcfRef), genomicRefFwd);
char *info = makeHgvsToVcfInfo(dupToIns, basesShifted, isIndel, mapping->chromEnd);
struct vcfRow *row;
AllocVar(row);
row->chrom = cloneString(mapping->chrom);
row->pos = pos;
row->id = cloneString(term);
row->ref = cloneString(vcfRef);
row->alt = vcfAlt;
row->filter = filter;
row->info = info;
freeMem(hgvsSeqRef);
freeMem(hgvsSeqAlt);
chromSeqWindowFree(&seqWin);
return row;
}
struct vcfRow *hgvsToVcfRow(char *db, char *term, boolean doLeftShift, struct dyString *dyError)
/* Convert HGVS to a row of VCF suitable for sorting & printing. If unable, return NULL and
* put the reason in dyError. Protein terms are ambiguous at the nucleotide level so they are
* not supported at this point. */
{
struct vcfRow *row = NULL;
struct hgvsVariant *hgvs = hgvsParseTerm(term);
if (!hgvs)
{
dyStringPrintf(dyError, "Unable to parse '%s' as HGVS", term);
return NULL;
}
else if (hgvs->type == hgvstProtein)
{
dyStringPrintf(dyError, "HGVS protein changes such as '%s' are ambiguous at the nucleotide "
"level, so not supported", term);
return NULL;
}
struct dyString *dyWarn = dyStringNew(0);
char *pslTable = NULL;
struct bed *mapping = hgvsValidateAndMap(hgvs, db, term, dyWarn, &pslTable);
if (!mapping)
{
dyStringPrintf(dyError, "Unable to map '%s' to reference %s: %s",
term, db, dyStringContents(dyWarn));
}
else
{
if (dyStringIsNotEmpty(dyWarn))
{
dyStringAppend(dyError, dyStringContents(dyWarn));
dyStringClear(dyWarn);
}
struct hgvsChange *changeList = hgvsParseNucleotideChange(hgvs->changes, hgvs->type,
dyWarn);
if (changeList == NULL)
{
if (isEmpty(hgvs->changes))
dyStringPrintf(dyError, "HGVS term '%s' does not specify any sequence changes", term);
else if (dyStringIsNotEmpty(dyWarn))
dyStringPrintf(dyError, "Unable to parse HGVS description in '%s': %s", term,
dyStringContents(dyWarn));
else
dyStringPrintf(dyError, "Unable to parse HGVS description in '%s'", term);
}
else if (dyStringIsNotEmpty(dyWarn))
{
dyStringPrintf(dyError, "Unable to parse HGVS description in '%s': %s",
term, dyStringContents(dyWarn));
}
else if (changeList->type == hgvsctRepeat)
{
dyStringPrintf(dyError,
"Can't convert '%s' to VCF: HGVS repeat notation is not supported.", term);
}
else
{
row = vcfFromHgvs(db, term, mapping, hgvs, changeList, doLeftShift, dyWarn);
if (dyStringIsNotEmpty(dyWarn))
{
dyStringPrintf(dyError, "Can't convert '%s' to VCF: %s",
term, dyStringContents(dyWarn));
}
}
}
bedFree(&mapping);
dyStringFree(&dyWarn);
return row;
}
static int allNCount(char *seq)
/* If seq is entirely N's, return its length, otherwise 0. */
{
int nLen = 0;
for (nLen = 0; seq[nLen] != '\0'; nLen++)
if (seq[nLen] != 'N')
return 0;
return nLen;
}
// HGVS allows terms to specify deleted or duplicated bases if there are "several bases".
// Search for "NOTE: it is allowed" here:
// http://varnomen.hgvs.org/recommendations/DNA/variant/deletion/
// http://varnomen.hgvs.org/recommendations/DNA/variant/duplication/
// It does not say how many "several" can be. It doesn't specify for inv, but common
// practice is to include bases and I don't see why it should be different from del or dup.
// If somebody complains about hardcoding then I guess we could make it a parameter of
// hgvsAppendChangesFromNucRefAlt.
#define HGVS_SEVERAL 30
static void hgvsAppendChangesFromNucRefAlt(struct dyString *dy, char *ref, char *alt, int dupLen,
boolean breakDelIns)
/* Translate reference allele and alternate allele into an HGVS change description, append to dy.
* If breakDelIns, then show deleted bases (e.g. show 'delAGinsTT' instead of 'delinsTT').
* Note: this forces ref and alt to uppercase.
* No support for con or repeats at this point, just {=, >, del, dup, ins, inv}. */
{
touppers(ref);
touppers(alt);
if (sameString(ref, alt))
dyStringPrintf(dy, "%s=", ref);
else
{
int refLen = strlen(ref);
int altLen = strlen(alt);
char refInv[refLen+1];
safencpy(refInv, sizeof(refInv), ref, refLen);
reverseComplement(refInv, refLen);
if (refLen == 1 && altLen == 1)
dyStringPrintf(dy, "%c>%c", ref[0], alt[0]);
else if (dupLen > 0)
{
dyStringAppend(dy, "dup");
if (dupLen <= HGVS_SEVERAL)
dyStringAppendN(dy, alt, dupLen);
// Could be a pure duplication followed by insertion:
if (altLen > dupLen)
dyStringPrintf(dy, "ins%s", alt+dupLen);
}
else if (refLen == 0)
{
int allNLen = allNCount(alt);
if (allNLen)
dyStringPrintf(dy, "ins%d", allNLen);
else
dyStringPrintf(dy, "ins%s", alt);
}
else if (altLen == 0)
{
dyStringAppend(dy, "del");
if (refLen <= HGVS_SEVERAL)
dyStringAppendN(dy, ref, refLen);
}
else if (refLen == altLen && refLen > 1 && sameString(refInv, alt))
{
dyStringAppend(dy, "inv");
if (refLen <= HGVS_SEVERAL)
dyStringAppendN(dy, ref, refLen);
}
else
{
dyStringAppend(dy, "del");
if (breakDelIns && refLen <= HGVS_SEVERAL)
dyStringAppendN(dy, ref, refLen);
int allNLen = allNCount(alt);
if (allNLen)
dyStringPrintf(dy, "ins%d", allNLen);
else
dyStringPrintf(dy, "ins%s", alt);
}
}
}
void hgvsAppendChangesFromPepRefAlt(struct dyString *dy, char *ref, char *alt, int dupLen)
/* Translate reference allele and alternate allele into an HGVS change description, append to dy.
* ref and alt must be sequences of single-letter IUPAC amino acid codes.
* Note: this forces ref and alt to uppercase. */
{
touppers(ref);
touppers(alt);
if (sameString(ref, alt))
dyStringAppendC(dy, '=');
else
{
int refLen = strlen(ref);
int altLen = strlen(alt);
char altAbbr[3*altLen+1];
int ix;
for (ix = 0; ix < altLen; ix++)
aaToAbbr(alt[ix], altAbbr+3*ix, sizeof(altAbbr)-3*ix);
if (refLen == 1 && altLen == 1)
// Simple substitution
dyStringAppend(dy, altAbbr);
else if (dupLen > 0)
{
dyStringAppend(dy, "dup");
// Could be a pure duplication followed by insertion:
if (altLen > dupLen)
dyStringPrintf(dy, "ins%s", altAbbr+(dupLen*3));
}
else if (refLen == 0)
dyStringPrintf(dy, "ins%s", altAbbr);
else if (altLen == 0)
{
dyStringAppend(dy, "del");
}
else
{
dyStringPrintf(dy, "delins%s", altAbbr);
}
}
}
char *hgvsGFromVariant(struct seqWindow *gSeqWin, struct bed3 *variantBed, char *alt, char *acc,
boolean breakDelIns)
/* Return an HGVS g. string representing the genomic variant at the position of variantBed with
* reference allele from gSeqWin and alternate allele alt. 3'-shift indels if applicable.
* If acc is non-NULL it is used instead of variantBed->chrom.
* If breakDelIns, then show deleted bases (eg show 'delAGinsTT' instead of 'delinsTT'). */
{
struct dyString *dy = dyStringCreate("%s:g.", acc ? acc : variantBed->chrom);
uint vStart = variantBed->chromStart, vEnd = variantBed->chromEnd;
gSeqWin->fetch(gSeqWin, variantBed->chrom, vStart, vEnd);
int refLen = vEnd - vStart;
char ref[refLen+1];
seqWindowCopy(gSeqWin, vStart, refLen, ref, sizeof(ref));
int altLen = strlen(alt);
char altCpy[altLen+1];
safecpy(altCpy, sizeof(altCpy), alt);
if (differentString(ref, altCpy))
// Trim identical bases from start and end -- unless this is an assertion that there is
// no change, in which case it's good to keep the range on which that assertion was made.
trimRefAlt(ref, altCpy, &vStart, &vEnd, &refLen, &altLen);
if (indelShiftIsApplicable(refLen, altLen) &&
indelShift(gSeqWin, &vStart, &vEnd, altCpy, INDEL_SHIFT_NO_MAX, isdRight))
// update ref
seqWindowCopy(gSeqWin, vStart, refLen, ref, sizeof(ref));
int dupLen = 0;
if (refLen == 1)
// Single base: single 1-based coordinate
dyStringPrintf(dy, "%d", vStart+1);
else if (refLen == 0)
{
// Insertion or duplication
if (altLen > 0 && altLen <= vStart)
{
// Detect duplicated sequence
uint seqStart = vStart - altLen;
char precedingRef[altLen+1];
seqWindowCopy(gSeqWin, seqStart, altLen, precedingRef, sizeof(precedingRef));
if (sameString(altCpy, precedingRef))
dupLen = altLen;
}
if (dupLen > 0)
{
if (dupLen == 1)
// Single-base duplication
dyStringPrintf(dy, "%d", vStart); // - dupLen + 1 cancel each other out
else
// Multi-base duplication: range is the dupLen bases preceding vStart
dyStringPrintf(dy, "%d_%d", vStart - dupLen + 1, vStart);
}
else
// Insertion: two-base range enclosing zero-base insertion point
dyStringPrintf(dy, "%d_%d", vStart, vEnd+1);
}
else
// Deletion or MNV
dyStringPrintf(dy, "%d_%d", vStart+1, vEnd);
hgvsAppendChangesFromNucRefAlt(dy, ref, altCpy, dupLen, breakDelIns);
return dyStringCannibalize(&dy);
}
static int findDup(char *alt, struct seqWindow *seqWin, uint refPoint, boolean isRc)
/* Given that the variant is an insertion of alt at refPoint (minding isRc), if alt looks
* like a duplicate of the sequence before refPoint, return the length of duplicated sequence. */
{
int altLen = strlen(alt);
// Don't underflow the sequence:
if (!isRc && altLen > refPoint)
return 0;
uint seqStart = refPoint - (isRc ? 0 : altLen);
uint minEnd = seqStart + altLen;
if (seqWin->start > seqStart || seqWin->end < minEnd)
seqWin->fetch(seqWin, seqWin->seqName,
min(seqStart, seqWin->start),
max(seqWin->end, minEnd));
// Get sequence, reverse-complement if necessary
char precedingRef[altLen+1];
seqWindowCopy(seqWin, seqStart, altLen, precedingRef, sizeof(precedingRef));
if (isRc)
reverseComplement(precedingRef, altLen);
if (sameString(alt, precedingRef))
return altLen;
else
{
// Detect an insertion plus a few slop bases at the end, like "dupinsTAT" where
// alt starts with precedingRef[3..] followed by new "TAT". Then dupLen is altLen-3.
// Unfortunately, indelShift can actually shift far enough to prevent a perfect match. :(
// Example: ClinVar NM_000179.2(MSH6):c.1573_3439-429dupinsTAT -- the insertion is right-
// shifted by 1 and then there's no longer a perfect dup. :b We would have to do dup-aware
// indel shifting!!
int searchLimit = 5;
int offset;
for (offset = 1; offset < searchLimit; offset++)
{
if (sameStringN(alt, precedingRef+offset, altLen-offset))
return altLen-offset;
}
}
return 0;
}
static int tweakInsDup(struct vpTxPosition *startPos, struct vpTxPosition *endPos, char *alt,
struct seqWindow *gSeqWin, struct psl *txAli, struct dnaSeq *txSeq)
/* If this variant is an insertion, HGVS needs the 2-base range surrounding the insertion point.
* However, if an inserted sequence happens to be identical to the preceding sequence then
* HGVS calls it a dup on the range of preceding sequence. If this is a dup, return the length of
* duplicated sequence (else return 0).
* This modifies startPos and endPos if the variant is an insertion/dup.
* For non-exonic insertions, the preceding reference sequence is from the genome not tx.
* Note: this doesn't yet detect "dupins", where most of the inserted sequence (starting from the
* beginning of the inserted sequence) matches the previous ref sequence */
{
int dupLen = 0;
int isIns = vpTxPosIsInsertion(startPos, endPos);
if (isIns)
{
// Yes, this is a zero-length insertion point -- see if it happens to be a duplication.
// "insTCA" is preferable to "dupTinsCA"... where to draw the line for when dup is worth it??
// For now, just say half of altLen... but an argument could be made for some small constant too
int minDup = strlen(alt) / 2;
// Fetch preceding sequence from tx if exonic, otherwise from genome.
if (startPos->region == vpExon && endPos->region == vpExon)
{
struct seqWindow *txSeqWin = memSeqWindowNew(txSeq->name, txSeq->dna);
dupLen = findDup(alt, txSeqWin, startPos->txOffset, FALSE);
if (dupLen > minDup)
{
// Since txSeqWin was used to find dup, the new startPos must also be exonic.
// In case startPos was looking forward from the boundary between an exon and an intron
// or downstream, adjust its other fields to be exonic now:
startPos->region = vpExon;
startPos->txOffset -= dupLen;
startPos->gDistance = startPos->intron3TxOffset = startPos->intron3Distance = 0;
}
else
dupLen = 0;
memSeqWindowFree(&txSeqWin);
}
else
{
boolean isRc = (pslQStrand(txAli) == '-');
dupLen = findDup(alt, gSeqWin, startPos->gOffset, isRc);
if (dupLen > minDup)
{
uint newGOffset = startPos->gOffset + (isRc ? dupLen : -dupLen);
vpPosGenoToTx(newGOffset, txAli, startPos, FALSE);
}
else
dupLen = 0;
}
}
if (dupLen == 0 && isIns)
{
// Expand insertion point to 2-base region around insertion point for HGVS.
// startPos = base to left of endPos whose region looks left/5';
// endPos = base to right of startPos whose region looks right/3'.
struct vpTxPosition newStart = *endPos;
vpTxPosSlideInSameRegion(&newStart, -1);
struct vpTxPosition newEnd = *startPos;
vpTxPosSlideInSameRegion(&newEnd, 1);
*startPos = newStart;
*endPos = newEnd;
}
return dupLen;
}
static uint hgvsTxToCds(uint txOffset, struct genbankCds *cds, boolean isStart, char pPrefix[2])
/* Return the cds-relative HGVS coord and prefix corresponding to 0-based txOffset & cds. */
{
// Open end adjustment for determining CDS/UTR region:
int endCmp = isStart ? 0 : 1;
// For adjusting non-negative coords to HGVS's positive 1-based closed:
int oneBased = isStart ? 1 : 0;
// For adjusting negative coords to HGVS's negative 0-based closed:
int closedEnd = isStart ? 0 : 1;
pPrefix[1] = '\0';
uint cdsCoord = 0;
if (txOffset - endCmp < cds->start)
{
// 5'UTR: coord is negative distance from cdsStart
pPrefix[0] = '-';
cdsCoord = cds->start - txOffset + closedEnd;
}
else if (txOffset - endCmp < cds->end)
{
// CDS: coord is positive distance from cdsStart
pPrefix[0] = '\0';
cdsCoord = txOffset - cds->start + oneBased;
}
else
{
// 3'UTR: coord is positive distance from cdsEnd
pPrefix[0] = '*';
cdsCoord = txOffset - cds->end + oneBased;
}
return cdsCoord;
}
static void appendHgvsNucPos(struct dyString *dy, struct vpTxPosition *txPos, boolean isStart,
struct genbankCds *cds)
/* Translate txPos (start or end) into an HGVS position (coding if cds is non-NULL). */
{
// Open end adjustment for determining region:
int endCmp = isStart ? 0 : 1;
// For adjusting non-negative coords to HGVS's positive 1-based closed:
int oneBased = isStart ? 1 : 0;
// For adjusting negative coords to HGVS's negative 0-based closed:
int closedEnd = isStart ? 0 : 1;
if (txPos->region == vpUpstream)
{
uint distance = txPos->gDistance;
if (cds)
distance += cds->start;
dyStringPrintf(dy, "-%u", distance + closedEnd);
}
else if (txPos->region == vpDownstream)
{
uint distance = txPos->txOffset + txPos->gDistance;
if (cds)
dyStringPrintf(dy, "*%u", distance - cds->end + oneBased);
else
dyStringPrintf(dy, "%u", distance + oneBased);
}
else if (txPos->region == vpExon)
{
char cdsPrefix[2];
cdsPrefix[0] = '\0';
uint hgvsCoord = cds ? hgvsTxToCds(txPos->txOffset, cds, isStart, cdsPrefix) :
txPos->txOffset + oneBased;
dyStringPrintf(dy, "%s%u", cdsPrefix, hgvsCoord);
}
else if (txPos->region == vpIntron)
{
// If intron length is odd, bias toward picking the 5' exon (middle base has positive offset).
char cdsPrefix[2];
cdsPrefix[0] = '\0';
char direction;
uint exonAnchor, intronOffset;
boolean anchorIsStart;
if (txPos->gDistance - endCmp < txPos->intron3Distance)
{
exonAnchor = txPos->txOffset;
direction = '+';
intronOffset = txPos->gDistance + oneBased;
anchorIsStart = FALSE;
}
else
{
exonAnchor = txPos->intron3TxOffset;
direction = '-';
intronOffset = txPos->intron3Distance + closedEnd;
anchorIsStart = TRUE;
}
exonAnchor = cds ? hgvsTxToCds(exonAnchor, cds, anchorIsStart, cdsPrefix) :
exonAnchor + (anchorIsStart ? oneBased : 0);
dyStringPrintf(dy, "%s%u%c%u", cdsPrefix, exonAnchor, direction, intronOffset);
}
else
errAbort("appendHgvsNucPos: unrecognized vpTxRegion value %d", txPos->region);
}
+static char *refFromVpTx(struct vpTx *vpTx)
+/* If vpTx->txRef is non-NULL and both start & end are exonic, return txRef;
+ * otherwise return genomic. For example, if a deletion spans exon/intron boundary, use genomic
+ * ref because it includes the intron bases. Do not free the returned value. */
+{
+if (vpTx->txRef != NULL &&
+ vpTx->start.region == vpExon && vpTx->end.region == vpExon)
+ return vpTx->txRef;
+return vpTx->gRef;
+}
+
char *hgvsNFromVpTx(struct vpTx *vpTx, struct seqWindow *gSeqWin, struct psl *txAli,
struct dnaSeq *txSeq, boolean breakDelIns)
/* Return an HGVS n. (noncoding transcript) term for a variant projected onto a transcript.
* gSeqWin must already have at least the correct seqName if not the surrounding sequence.
* If breakDelIns, then show deleted bases (eg show 'delAGinsTT' instead of 'delinsTT'). */
{
struct dyString *dy = dyStringCreate("%s:n.", vpTx->txName);
// Make local copies of vpTx->{start,end} -- we may need to modify them for HGVS ins/dup.
struct vpTxPosition startPos = vpTx->start, endPos = vpTx->end;
int dupLen = tweakInsDup(&startPos, &endPos, vpTx->txAlt, gSeqWin, txAli, txSeq);
appendHgvsNucPos(dy, &startPos, TRUE, NULL);
if (!vpTxPosRangeIsSingleBase(&startPos, &endPos))
{
dyStringAppendC(dy, '_');
appendHgvsNucPos(dy, &endPos, FALSE, NULL);
}
-char *ref = vpTx->txRef ? vpTx->txRef : vpTx->gRef;
+char *ref = refFromVpTx(vpTx);
hgvsAppendChangesFromNucRefAlt(dy, ref, vpTx->txAlt, dupLen, breakDelIns);
return dyStringCannibalize(&dy);
}
char *hgvsCFromVpTx(struct vpTx *vpTx, struct seqWindow *gSeqWin, struct psl *txAli,
struct genbankCds *cds, struct dnaSeq *txSeq, boolean breakDelIns)
/* Return an HGVS c. (coding transcript) term for a variant projected onto a transcript w/cds.
* gSeqWin must already have at least the correct seqName if not the surrounding sequence.
* If breakDelIns, then show deleted bases (eg show 'delAGinsTT' instead of 'delinsTT'). */
{
struct dyString *dy = dyStringCreate("%s:c.", vpTx->txName);
// Make local copies of vpTx->{start,end} -- we may need to modify them for HGVS ins/dup.
struct vpTxPosition startPos = vpTx->start, endPos = vpTx->end;
int dupLen = tweakInsDup(&startPos, &endPos, vpTx->txAlt, gSeqWin, txAli, txSeq);
appendHgvsNucPos(dy, &startPos, TRUE, cds);
if (!vpTxPosRangeIsSingleBase(&startPos, &endPos))
{
dyStringAppendC(dy, '_');
appendHgvsNucPos(dy, &endPos, FALSE, cds);
}
-char *ref = vpTx->txRef ? vpTx->txRef : vpTx->gRef;
+char *ref = refFromVpTx(vpTx);
hgvsAppendChangesFromNucRefAlt(dy, ref, vpTx->txAlt, dupLen, breakDelIns);
return dyStringCannibalize(&dy);
}
char *hgvsPFromVpPep(struct vpPep *vpPep, struct dnaSeq *protSeq, boolean addParens)
/* Return an HGVS p. (protein) term for a variant projected into protein space.
* Strict HGVS compliance requires parentheses around predicted protein changes, but
* nobody seems to do that in practice.
* Return NULL if an input is NULL. */
{
if (vpPep == NULL || protSeq == NULL)
return NULL;
struct dyString *dy = dyStringCreate("%s:p.", vpPep->name);
if (addParens)
dyStringAppendC(dy, '(');
int refLen = vpPep->end - vpPep->start;
// When predicting frameshift/extension, the length of ref may be different from refLen
int refExtLen = vpPep->ref ? strlen(vpPep->ref) : refLen;
int altLen = vpPep->alt ? strlen(vpPep->alt) : 0;
char refStartAbbr[4];
if (vpPep->ref)
aaToAbbr(vpPep->ref[0], refStartAbbr, sizeof(refStartAbbr));
else
// If ref is null then we should be writing just '=' or '?' but prevent garbage just in case:
safecpy(refStartAbbr, sizeof(refStartAbbr), "?");
// protSeq may or may not end with X, so treat protSeq->size accordingly
boolean hitsStopCodon = (vpPep->end > protSeq->size ||
((protSeq->dna[protSeq->size-1] == 'X') && vpPep->end == protSeq->size));
if (vpPep->likelyNoChange)
dyStringAppend(dy, "=");
else if (vpPep->cantPredict || vpPep->spansUtrCds)
dyStringAppend(dy, "?");
else if (vpPep->frameshift)
{
dyStringPrintf(dy, "%s%d", refStartAbbr, vpPep->start+1);
if (altLen == 1)
dyStringAppend(dy, "Ter");
else
{
char altStartAbbr[4];
aaToAbbr(vpPep->alt[0], altStartAbbr, sizeof(altStartAbbr));
// For stop-loss extension, make it "ext*"
if (hitsStopCodon && altLen > refExtLen)
dyStringPrintf(dy, "%sext*%d", altStartAbbr, altLen - refExtLen);
else
dyStringPrintf(dy, "%sfsTer%d", altStartAbbr, altLen);
}
}
else if (hitsStopCodon && altLen > refExtLen)
{
// Stop loss extension from something that doesn't disrupt frame
char altStartAbbr[4];
aaToAbbr(vpPep->alt[0], altStartAbbr, sizeof(altStartAbbr));
dyStringPrintf(dy, "%s%d%sext*%d", refStartAbbr, vpPep->start+1,
altStartAbbr, altLen - refExtLen);
}
else
{
int dupLen = 0;
if (refLen == 0 && altLen > 0)
{
// It's an insertion; is it a duplication?
struct seqWindow *pSeqWin = memSeqWindowNew(protSeq->name, protSeq->dna);
dupLen = findDup(vpPep->alt, pSeqWin, vpPep->start, FALSE);
memSeqWindowFree(&pSeqWin);
}
if (refLen == 1)
dyStringPrintf(dy, "%s%d", refStartAbbr, vpPep->start+1);
else
{
int rangeStart = vpPep->start, rangeEnd = vpPep->end;
char *pSeq = protSeq->dna;
if (dupLen > 0)
{
// Duplication; position range changes to preceding bases.
rangeEnd = rangeStart;
rangeStart -= dupLen;
aaToAbbr(pSeq[rangeStart], refStartAbbr, sizeof(refStartAbbr));
}
else if (refLen == 0)
{
// Insertion; expand to two-AA range around insertion point
rangeStart--;
aaToAbbr(pSeq[rangeStart], refStartAbbr, sizeof(refStartAbbr));
rangeEnd++;
}
hitsStopCodon = (rangeEnd > protSeq->size ||
((protSeq->dna[protSeq->size-1] == 'X') && rangeEnd == protSeq->size));
char refLastAbbr[4];
if (hitsStopCodon)
aaToAbbr('X', refLastAbbr, sizeof(refLastAbbr));
else
aaToAbbr(pSeq[rangeEnd-1], refLastAbbr, sizeof(refLastAbbr));
dyStringPrintf(dy, "%s%d_%s%d", refStartAbbr, rangeStart+1, refLastAbbr, rangeEnd);
}
hgvsAppendChangesFromPepRefAlt(dy, vpPep->ref, vpPep->alt, dupLen);
}
if (addParens)
dyStringAppendC(dy, ')');
return dyStringCannibalize(&dy);
}