31dffd2604b7ba4ab85a2dc0d45246ef58838907 angie Wed Mar 15 13:59:23 2017 -0700 Better warning message for HGVS protein terms pasted into hgVai. refs #11460 notes 30, 31. diff --git src/hg/inc/hgHgvs.h src/hg/inc/hgHgvs.h index 95c4b74..17c96c7 100644 --- src/hg/inc/hgHgvs.h +++ src/hg/inc/hgHgvs.h @@ -1,234 +1,235 @@ /* hgHgvs - support for a subset of the Human Genome Variation Society's (HGVS) nomenclature * for describing variants with respect to a specific gene sequence (or genome assembly). * See http://www.hgvs.org/mutnomen/ */ /* Copyright (C) 2016 The Regents of the University of California * See README in this or parent directory for licensing information. */ #ifndef HGHGVS_H #define HGHGVS_H #include "bed.h" /* The full nomenclature is extremely complicated, able to encode things such as gene fusions and * advanced clinical info (e.g. "=/" for somatic mosaicism, "=//" for chimerism). I am starting * simple with single-base substitutions, which should cover the majority of use cases, and will * work up from there. * * At the same time, since the spec has repeatedly changed, we will need to be flexible in our * parsing in order to support previously published HGVS (or HGVS-like) terms. */ enum hgvsSeqType // HGVS describes changes relative to several types of sequence: genomic, coding, protein, etc { hgvstUndefined, // the usual int default value means we haven't actually checked hgvstCoding, // "c.": Coding DNA sequence only. Beware: complex coords for intron & UTR. hgvstGenomic, // "g.": Genomic DNA hgvstMito, // "m.": Mitochondrial DNA hgvstNoncoding, // "n.": non-coding RNA hgvstRna, // "r.": RNA (like DNA but lowercase and 'u' instead of 'T') hgvstProtein, // "p.": Protein }; struct hgvsVariant // Components that we are able parse out of an HGVS term. (there's more to HGVS than just this) { struct hgvsVariant *next; char *seqAcc; // The reference sequence for the variant (optional -- may be NULL!) char *seqGeneSymbol; // Usually NULL, but DNA/RNA can have HGNC gene symbol in ()'s. char *changes; // The sequence change(s) (following the position in nucleotide terms, // with embedded position in protein terms); can be parsed into // syntax tree by hgvsParseNucleotideChange (below) int start1; // 1-based start of the variant in reference seq: can be negative ! // For coding terms with complex intron or UTR coords, this is the // anchor coord (nearest CDS exon boundary) int end; // End of the variant in the reference seq; can be negative ! // For a range, this is the second anchor coord; else same as start // These two fields apply only to hgvstCoding ("c.") terms and Rna ("r.") for coding RNAs: int startOffset; // offset into intron; 1-based, 0 means N/A; can be negative! int endOffset; // offset into intron; 0 means N/A; can be negative! // For a range, this is the second offset; else same as startOffset enum hgvsSeqType type; // type of sequence: genomic, coding, protein, etc boolean startIsUtr3; // TRUE if start is relative to *end* of coding sequence boolean endIsUtr3; // TRUE if end is relative to *end* of coding sequence }; // // HGVS sequence change(s) syntax tree // // An HGVS description contains two parts: position and sequence change. // Positions may be simple 1-based, nested ranges, or relative to various CDS boundaries. // Currently positions are parsed using regular expressions in hgHgvs.c // Sequence changes may contain nested HGVS terms (insertion from some other sequence). // They are tokenized and parsed in hgHgvsParse.c. enum hgvsChangeType // There may be no change, or a change that specifies ref and/or alt bases, or a change // in which the sequence is all implied by the reference sequence. { hgvsctUndefined, hgvsctNoChange, // No change; ref sequence is implied, alt sequence is the same. hgvsctSubst, // Substitution: ref sequence and (possibly complex) alternate sequence hgvsctDel, // Deletion; ref sequence is implied (or maybe asserted). hgvsctDup, // Duplication; ref sequence is implied (or maybe asserted) hgvsctInv, // Inversion; ref sequence is implied (or maybe asserted) hgvsctIns, // Insertion: ref is implied; (possibly complex) inserted sequence follows hgvsctCon, // Conversion: basically Deletion followed by Insertion hgvsctRepeat, // Ref sequence is implied (first unit); alt sequence is some # of copies hgvsctFrameshift, // First alt amino acid and possibly offset of early stop }; // Sometimes instead of a sequence (or in addition to a del, dup etc), a length is given, // sometimes not. Use int's sign to specify whether length was included or not. #define HGVS_LENGTH_UNSPECIFIED -1 enum hgvsSeqChangeType // A sequence is usually just a sequence, but it might come from a nested expression and/or // be subjected to yet another change. { hgvsstUndefined, hgvsstSimple, // IUPAC single-letter codes + '?' if some unspecified hgvsstLength, // Only the length of the sequence is provided hgvsstNestedChange, // Simple sequence... and another change that's applied to this one. hgvsstNestedTerm, // HGVS term for sequence copied from some other reference sequence. }; union hgvsSeqValue // A sequence string, nested change, or nested term (external sequence) { char *seq; // IUPAC single-letter codes + '?' if some unspecified int length; // Sequence length (content unspecified) struct hgvsVariant term; // Nested HGVS term: acc, pos, possibly change }; struct hgvsChangeRefAlt // 'Before and after' (reference and alternate) sequence representations { char *refSequence; // NULL if the reference sequence is implied by the position, // or if only length is asserted; otherwise it's asserted // reference sequence in IUPAC single-letter codes // and when some but not all bases are included in HGVS term, // '?' for unspecified bases. int refLength; // Sometimes, (redundant) length is given after del, dup, etc. enum hgvsSeqChangeType altType; // Type of alternate sequence -- can be complex. union hgvsSeqValue altValue; // IUPAC single-letter sequence, or a sequence of nested changes }; struct hgvsChangeFrameshift // Info that may or may not be given along with a frameshift that disrupts the protein sequence { int offset; // The number of AA bases downstream of a new termination boolean offsetValid; // FALSE if offset was not provided boolean elongation; // TRUE if offset appears after '*', i.e. after old termination char *altSequence; // Sequence of IUPAC single-letter AA codes or NULL if not given char refBase; // IUPAC single-letter AA code (or '\0' if not given) }; struct hgvsChangeRepeat // Repeat count (range) and optional repeated sequence { char *seq; // The sequence that is repeated, may be NULL (implied from ref) int min; // Minimum repeat count specified int max; // Max repeat count specified }; union hgvsChangeValue // Depending on type of change, ref & alt sequences, frameshift description or repeat count { struct hgvsChangeRefAlt refAlt; // A description of reference and alternate sequence struct hgvsChangeFrameshift fs; // Optional info about early termination of translation struct hgvsChangeRepeat repeat; // Optional repeated sequence and min/max observed counts. int count; // Repeat unit count in alternate sequence (ref count is 1) }; struct hgvsChange // Usually this contains a sequence of bases from the reference and a sequence of bases from // the alternate allele, but can contain nested HGVS terms. For deletions, reference bases // are usually not specified, except for protein which includes the first and last amino acid. { struct hgvsChange *next; // One HGVS term can specify a sequence of changes enum hgvsChangeType type; // HGVS operator: >, del, ins, fs etc. union hgvsChangeValue value; // the actual sequences changed (possibly complex) }; void hgvsVariantFree(struct hgvsVariant **pHgvs); // Free *pHgvs and its contents, and set *pHgvs to NULL. struct hgvsVariant *hgvsParseTerm(char *term); /* If term is a parseable form of HGVS, return the parsed representation, otherwise NULL. * This does not check validity of accessions or alleles. */ struct hgvsVariant *hgvsParsePseudoHgvs(char *db, char *term); /* Attempt to parse things that are not strict HGVS, but that people might intend as HGVS. */ boolean hgvsValidate(char *db, struct hgvsVariant *hgvs, char **retFoundAcc, int *retFoundVersion, char **retDiffRefAllele); /* Return TRUE if hgvs coords are within the bounds of the sequence for hgvs->seqAcc. * Note: Coding terms may contain coords outside the bounds (upstream, intron, downstream) so * those can't be checked without mapping the term to the genome; this returns TRUE if seq is found. * If retFoundAcc is not NULL, set it to our local accession (which may be missing the .version * of hgvs->seqAcc) or NULL if we can't find any match. * If retFoundVersion is not NULL and hgvs->seqAcc has a version number (e.g. NM_005957.4), * set retFoundVersion to our version from latest GenBank, otherwise 0 (no version for LRG). * If coords are OK and retDiffRefAllele is not NULL: if our sequence at the coords * matches hgvs->refAllele then set it to NULL; if mismatch then set it to our sequence. */ struct bed *hgvsMapToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable); /* Return a bed6 with the variant's span on the genome and strand, or NULL if unable to map. * If successful and retPslTable is not NULL, set it to the name of the PSL table used. */ struct bed *hgvsValidateAndMap(struct hgvsVariant *hgvs, char *db, char *term, struct dyString *dyWarn, char **retPslTable); /* If we have sequence for hgvs->seqAcc and the hgvs coords are within the bounds * of that sequence, and we are able to map the coords in seqAcc to genomic coords in * db, return a bed6 with those coords and strand. If unsuccessful, or if the reference * sequence differs at the mapped location, then write a warning message to dyWarn. * If mapping is successful and retPslTable is not NULL, set it to the psl table * used for mapping. */ struct hgvsChange *hgvsParseNucleotideChange(char *changeStr, enum hgvsSeqType type, struct dyString *dyError); /* Return a parse tree of an HGVS nt sequence change (the part after the position range) * possibly followed by additional changes. If there are any parse errors, they will be * appended to dyError. */ struct vcfRow /* char* fields of VCF row (no genotypes) except POS (int for sorting) and QUAL (omitted). */ // See src/inc/vcf.h and VCF spec for definitions of fields. // Currently this is used only in the context of HGVS. If it turns out to be useful elsewhere // then it could be libified on its own or perhaps into vcf.h. { struct vcfRow *next; char *chrom; int pos; char *id; char *ref; char *alt; char *filter; char *info; }; int vcfRowCmp(const void *va, const void *vb); /* Compare for sorting based on chrom,pos. */ void vcfRowTabOutList(FILE *f, struct vcfRow *rowList); /* Write out each row of VCF to f. */ // VCF header definitions of FILTER and INFO terms created by hgvsToVcfRow: #define HGVS_VCF_HEADER_DEFS \ "##FILTER=\n" \ "##FILTER=\n" \ "##INFO=\n" \ "##INFO=\n" struct vcfRow *hgvsToVcfRow(char *db, char *term, boolean doLeftShift, struct dyString *dyError); /* Convert HGVS to a row of VCF suitable for sorting & printing. If unable, return NULL and - * put the reason in dyError. */ + * put the reason in dyError. Protein terms are ambiguous at the nucleotide level so they are + * not supported at this point. */ #endif /* HGHGVS_H */