de29d745d32fb04a5187fb5998cc94bbb9bf41a4 angie Wed Aug 9 13:39:59 2017 -0700 Add optional HGVS output to annoGratorGpVar and hgVai. Since annoGratorGpVar is genePred-based, it can't yet take advantage of variantProjector's full PSL+CDS+sequence support, so when transcripts don't align cleanly to genome, HGVS c./n./p. output may be incorrect. refs #19968 diff --git src/hg/inc/hgHgvs.h src/hg/inc/hgHgvs.h index 6be2bb1..070e0ae 100644 --- src/hg/inc/hgHgvs.h +++ src/hg/inc/hgHgvs.h @@ -1,26 +1,29 @@ /* hgHgvs - support for a subset of the Human Genome Variation Society's (HGVS) nomenclature * for describing variants with respect to a specific gene sequence (or genome assembly). * See http://www.hgvs.org/mutnomen/ */ /* Copyright (C) 2016 The Regents of the University of California * See README in this or parent directory for licensing information. */ #ifndef HGHGVS_H #define HGHGVS_H #include "bed.h" +#include "dnaseq.h" +#include "seqWindow.h" +#include "variantProjector.h" /* The full nomenclature is extremely complicated, able to encode things such as gene fusions and * advanced clinical info (e.g. "=/" for somatic mosaicism, "=//" for chimerism). UCSC supports * substitutions, insertions, deletions, duplications and inversions. Conversions are parsed out * of HGVS terms but not detected in genomic variants when generating HGVS terms. * UCSC does not fully support repeated sequences because in practice they seem to be frequently * incorrect and inherently error-prone. * * At the same time, since the spec has repeatedly changed, we will need to be flexible in our * parsing in order to support previously published HGVS (or HGVS-like) terms. 
*/ enum hgvsSeqType // HGVS describes changes relative to several types of sequence: genomic, coding, protein, etc { hgvstUndefined, // the usual int default value means we haven't actually checked @@ -141,30 +144,45 @@ struct hgvsChangeFrameshift fs; // Optional info about early termination of translation struct hgvsChangeRepeat repeat; // Optional repeated sequence and min/max observed counts. int count; // Repeat unit count in alternate sequence (ref count is 1) }; struct hgvsChange // Usually this contains a sequence of bases from the reference and a sequence of bases from // the alternate allele, but can contain nested HGVS terms. For deletions, reference bases // are usually not specified, except for protein which includes the first and last amino acid. { struct hgvsChange *next; // One HGVS term can specify a sequence of changes enum hgvsChangeType type; // HGVS operator: >, del, ins, fs etc. union hgvsChangeValue value; // the actual sequences changed (possibly complex) }; +// +// HGVS output option bit flags +// +// Output an HGVS genomic (g.) term: +#define HGVS_OUT_G 0x01 +// Output either an HGVS coding (c.) term if applicable, otherwise noncoding (n.) term: +#define HGVS_OUT_CN 0x02 +// Output an HGVS protein (p.) term if applicable: +#define HGVS_OUT_P 0x04 +// Add parentheses around predicted protein (p.) changes e.g. p.(Arg159del): +#define HGVS_OUT_P_ADD_PARENS 0x10 +// Add deleted sequence to delins changes (e.g. show 'delAGinsTT' instead of 'delinsTT'): +#define HGVS_OUT_BREAK_DELINS 0x20 + + void hgvsVariantFree(struct hgvsVariant **pHgvs); // Free *pHgvs and its contents, and set *pHgvs to NULL. struct hgvsVariant *hgvsParseTerm(char *term); /* If term is a parseable form of HGVS, return the parsed representation, otherwise NULL. * This does not check validity of accessions or alleles. */ struct hgvsVariant *hgvsParsePseudoHgvs(char *db, char *term); /* Attempt to parse things that are not strict HGVS, but that people might intend as HGVS. 
*/ boolean hgvsValidate(char *db, struct hgvsVariant *hgvs, char **retFoundAcc, int *retFoundVersion, char **retDiffRefAllele); /* Return TRUE if hgvs coords are within the bounds of the sequence for hgvs->seqAcc. * Note: Transcript terms may contain coords outside the bounds (upstream, intron, downstream) so * those can't be checked without mapping the term to the genome; this returns TRUE if seq is found. @@ -222,16 +240,41 @@ "Description=\"Asserted reference sequence in HGVS term does not match actual " \ "reference sequence\">\n" \ "##FILTER=\n" \ "##INFO=\n" \ "##INFO=\n" struct vcfRow *hgvsToVcfRow(char *db, char *term, boolean doLeftShift, struct dyString *dyError); /* Convert HGVS to a row of VCF suitable for sorting & printing. If unable, return NULL and * put the reason in dyError. Protein terms are ambiguous at the nucleotide level so they are * not supported at this point. */ +char *hgvsGFromVariant(struct seqWindow *gSeqWin, struct bed3 *variantBed, char *alt, char *acc, + boolean breakDelIns); +/* Return an HGVS g. string representing the genomic variant at the position of variantBed with + * reference allele from gSeqWin and alternate allele alt. If acc is non-NULL it is used + * instead of variantBed->chrom. + * If breakDelIns, then show deleted bases (e.g. show 'delAGinsTT' instead of 'delinsTT'). */ + +char *hgvsNFromVpTx(struct vpTx *vpTx, struct seqWindow *gSeqWin, struct psl *txAli, + struct dnaSeq *txSeq, boolean breakDelIns); +/* Return an HGVS n. (noncoding transcript) term for a variant projected onto a transcript. + * gSeqWin must already have at least the correct seqName if not the surrounding sequence. + * If breakDelIns, then show deleted bases (e.g. show 'delAGinsTT' instead of 'delinsTT'). */ + +char *hgvsCFromVpTx(struct vpTx *vpTx, struct seqWindow *gSeqWin, struct psl *txAli, + struct genbankCds *cds, struct dnaSeq *txSeq, boolean breakDelIns); +/* Return an HGVS c. 
(coding transcript) term for a variant projected onto a transcript with a CDS. + * gSeqWin must already have at least the correct seqName (chrom) if not the surrounding sequence. + * If breakDelIns, then show deleted bases (e.g. show 'delAGinsTT' instead of 'delinsTT'). */ + +char *hgvsPFromVpPep(struct vpPep *vpPep, struct dnaSeq *protSeq, boolean addParens); +/* Return an HGVS p. (protein) term for a variant projected into protein space. + * Strict HGVS compliance requires parentheses around predicted protein changes (addParens=TRUE), + * but nobody seems to do that in practice. + * Return NULL if an input is NULL. */ + #endif /* HGHGVS_H */