aaf72102b545c05c42f66b7a3fc22d65b1ecf4fe angie Mon Aug 8 14:12:39 2016 -0700 Added recognition of a small subset of HGVS terms: coding (c.) SNVs relative to RefSeq NM_ or LRG transcript IDs, and protein (p.) simple substitutions relative to NP_. Also accepted (not HGVS but similar and popular): geneSymbol and abbreviated protein subst like "ALK G1494E". hgFind will map terms to the current genome if possible, and will display warnings about unrecognized accessions, out-of-bounds coordinates and mismatching reference alleles. refs #15071, #15554 diff --git src/hg/inc/hgHgvs.h src/hg/inc/hgHgvs.h new file mode 100644 index 0000000..2b461f4 --- /dev/null +++ src/hg/inc/hgHgvs.h @@ -0,0 +1,74 @@ +/* hgHgvs - support for a subset of the Human Genome Variation Society's (HGVS) nomenclature + * for describing variants with respect to a specific gene sequence (or genome assembly). + * See http://www.hgvs.org/mutnomen/ */ + +/* Copyright (C) 2016 The Regents of the University of California + * See README in this or parent directory for licensing information. */ + +#ifndef HGHGVS_H +#define HGHGVS_H + +#include "psl.h" + +/* The full nomenclature is extremely complicated, able to encode things such as gene fusions and + * advanced clinical info (e.g. "=/" for somatic mosaicism, "=//" for chimerism). I am starting + * simple with single-base substitutions, which should cover the majority of use cases, and will + * work up from there. + * + * At the same time, since the spec has repeatedly changed, we will need to be flexible in our + * parsing in order to support previously published HGVS (or HGVS-like) terms. */ + +enum hgvsTermType +/* HGVS describes changes relative to several types of sequence: genomic, coding, protein, etc */ + { + hgvstUndefined, // the usual int default value means we haven't actually checked + hgvstCoding, // "c.": Coding DNA sequence only. Beware: complex coords for intron & UTR. 
+ hgvstGenomic, // "g.": Genomic DNA + hgvstMito, // "m.": Mitochondrial DNA + hgvstNoncoding, // "n.": non-coding RNA + hgvstRna, // "r.": RNA (like DNA but lowercase and 'u' instead of 'T') + hgvstProtein, // "p.": Protein + }; + +struct hgvsVariant +/* Components that we are able to parse out of an HGVS term. (there's more to HGVS than just this) */ +{ + struct hgvsVariant *next; + char *seqAcc; // The reference sequence for the variant (optional -- may be NULL!) + char *seqGeneSymbol; // Usually NULL, but DNA/RNA can have HGNC gene symbol in ()'s. + char *changeSymbol; // ">" for subst; "ins", "del", "delins", "inv", "con", "=", ... + char *refAllele; // Reference allele base (poss. NULL, not given for multi-base changes) + char *altAllele; // Alternate allele base (poss. NULL, not always given) + int start1; // 1-based start of the variant in reference seq: can be negative! + int end; // end of the variant in the reference seq; can be negative! + enum hgvsTermType type; // type of sequence: genomic, coding, protein, etc + // The following fields apply only to hgvstCoding ("c.") terms: + int intronStart1; // 1-based, 0 means N/A; can be negative! + int intronEnd; // 0 means N/A; can be negative! + boolean startIsUtr3; // TRUE if start is relative to *end* of coding sequence + boolean endIsUtr3; // TRUE if end is relative to *end* of coding sequence +}; + +struct hgvsVariant *hgvsParseTerm(char *term); +/* If term is a parseable form of HGVS, return the parsed representation, otherwise NULL. + * This does not check validity of accessions or alleles. */ + +struct hgvsVariant *hgvsParsePseudoHgvs(char *db, char *term); +/* Attempt to parse things that are not strict HGVS, but that people might intend as HGVS. */ + +boolean hgvsValidate(char *db, struct hgvsVariant *hgvs, char **retFoundAcc, int *retFoundVersion, + char **retDiffRefAllele); +/* Return TRUE if hgvs coords are within the bounds of the sequence for hgvs->seqAcc. 
+ * If retFoundAcc is not NULL, set it to our local accession (which may be missing the .version + * of hgvs->seqAcc) or NULL if we can't find any match. + * If retFoundVersion is not NULL and hgvs->seqAcc has a version number (e.g. NM_005957.4), + * set retFoundVersion to our version from latest GenBank, otherwise 0 (no version for LRG). + * If coords are OK and retDiffRefAllele is not NULL: if our sequence at the coords + * matches hgvs->refAllele then set it to NULL; if mismatch then set it to our sequence. */ + +struct psl *hgvsMapToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable); +/* Return a psl with target coords from db, mapping the variant to the genome. + * Return NULL if unable to map. + * If successful and retPslTable is not NULL, set it to the name of the PSL table used. */ + +#endif /* HGHGVS_H */