b91a917e39c8d58d0de5e6325606f098ef0fb60b angie Wed Mar 21 09:22:21 2018 -0700 New util pslMismatchGapToBed searches for sequence mismatches and indels between reference genome and transcripts. Five output BED+ files are created, with corresponding hg/lib/txAli*.as autoSql files for generating bigBed. This could be used to create 'Anomalies' subtracks for the NCBI RefSeq track. refs #21079 diff --git src/hg/inc/hgHgvs.h src/hg/inc/hgHgvs.h index 070e0ae..2314781 100644 --- src/hg/inc/hgHgvs.h +++ src/hg/inc/hgHgvs.h @@ -1,280 +1,283 @@ /* hgHgvs - support for a subset of the Human Genome Variation Society's (HGVS) nomenclature * for describing variants with respect to a specific gene sequence (or genome assembly). * See http://www.hgvs.org/mutnomen/ */ /* Copyright (C) 2016 The Regents of the University of California * See README in this or parent directory for licensing information. */ #ifndef HGHGVS_H #define HGHGVS_H #include "bed.h" #include "dnaseq.h" #include "seqWindow.h" #include "variantProjector.h" /* The full nomenclature is extremely complicated, able to encode things such as gene fusions and * advanced clinical info (e.g. "=/" for somatic mosaicism, "=//" for chimerism). UCSC supports * substitutions, insertions, deletions, duplications and inversions. Conversions are parsed out * of HGVS terms but not detected in genomic variants when generating HGVS terms. * UCSC does not fully support repeated sequences because in practice they seem to be frequently * incorrect and inherently error-prone. * * At the same time, since the spec has repeatedly changed, we will need to be flexible in our * parsing in order to support previously published HGVS (or HGVS-like) terms. */ enum hgvsSeqType // HGVS describes changes relative to several types of sequence: genomic, coding, protein, etc { hgvstUndefined, // the usual int default value means we haven't actually checked hgvstCoding, // "c.": Coding cDNA sequence only. Beware: complex coords for intron & UTR. hgvstGenomic, // "g.": Genomic DNA hgvstMito, // "m.": Mitochondrial DNA hgvstNoncoding, // "n.": non-coding cDNA w/complex coords for intron hgvstRna, // "r.": RNA (like DNA but lowercase and 'u' instead of 'T') hgvstProtein, // "p.": Protein }; struct hgvsVariant // Components that we are able parse out of an HGVS term. (there's more to HGVS than just this) { struct hgvsVariant *next; char *seqAcc; // The reference sequence for the variant (optional -- may be NULL!) char *seqGeneSymbol; // Usually NULL, but DNA/RNA can have HGNC gene symbol in ()'s. char *changes; // The sequence change(s) (following the position in nucleotide terms, // with embedded position in protein terms); can be parsed into // syntax tree by hgvsParseNucleotideChange (below) int start1; // 1-based start of the variant in reference seq: can be negative ! // For [cnr]. terms with complex intron coords, this is the // anchor coord (nearest exon boundary) int end; // End of the variant in the reference seq; can be negative ! // For a range, this is the second anchor coord; else same as start int startOffset; // offset into intron; 1-based, 0 means N/A; can be negative! int endOffset; // offset into intron; 0 means N/A; can be negative! // For a range, this is the second offset; else same as startOffset enum hgvsSeqType type; // type of sequence: genomic/mito, coding, noncoding, protein // These two fields apply only to hgvstCoding ("c.") terms and Rna ("r.") for coding RNAs: boolean startIsUtr3; // TRUE if start is relative to *end* of coding sequence boolean endIsUtr3; // TRUE if end is relative to *end* of coding sequence }; // // HGVS sequence change(s) syntax tree // // An HGVS description contains two parts: position and sequence change. // Positions may be simple 1-based, nested ranges, or relative to various CDS boundaries. // Currently positions are parsed using regular expressions in hgHgvs.c // Sequence changes may contain nested HGVS terms (insertion from some other sequence). // They are tokenized and parsed in hgHgvsParse.c. enum hgvsChangeType // There may be no change, or a change that specifies ref and/or alt bases, or a change // in which the sequence is all implied by the reference sequence. { hgvsctUndefined, hgvsctNoChange, // No change; ref sequence is implied, alt sequence is the same. hgvsctSubst, // Substitution: ref sequence and (possibly complex) alternate sequence hgvsctDel, // Deletion; ref sequence is implied (or maybe asserted). hgvsctDup, // Duplication; ref sequence is implied (or maybe asserted) hgvsctInv, // Inversion; ref sequence is implied (or maybe asserted) hgvsctIns, // Insertion: ref is implied; (possibly complex) inserted sequence follows hgvsctCon, // Conversion: basically Deletion followed by Insertion hgvsctRepeat, // Ref sequence is implied (first unit); alt sequence is some # of copies hgvsctFrameshift, // First alt amino acid and possibly offset of early stop }; // Sometimes instead of a sequence (or in addition to a del, dup etc), a length is given, // sometimes not. Use int's sign to specify whether length was included or not. #define HGVS_LENGTH_UNSPECIFIED -1 enum hgvsSeqChangeType // A sequence is usually just a sequence, but it might come from a nested expression and/or // be subjected to yet another change. { hgvsstUndefined, hgvsstSimple, // IUPAC single-letter codes + '?' if some unspecified hgvsstLength, // Only the length of the sequence is provided hgvsstNestedChange, // Simple sequence... and another change that's applied to this one. hgvsstNestedTerm, // HGVS term for sequence copied from some other reference sequence. }; union hgvsSeqValue // A sequence string, nested change, or nested term (external sequence) { char *seq; // IUPAC single-letter codes + '?' if some unspecified int length; // Sequence length (content unspecified) struct hgvsVariant term; // Nested HGVS term: acc, pos, possibly change }; struct hgvsChangeRefAlt // 'Before and after' (reference and alternate) sequence representations { char *refSequence; // NULL if the reference sequence is implied by the position, // or if only length is asserted; otherwise it's asserted // reference sequence in IUPAC single-letter codes // and when some but not all bases are included in HGVS term, // '?' for unspecified bases. int refLength; // Sometimes, (redundant) length is given after del, dup, etc. enum hgvsSeqChangeType altType; // Type of alternate sequence -- can be complex. union hgvsSeqValue altValue; // IUPAC single-letter sequence, or a sequence of nested changes }; struct hgvsChangeFrameshift // Info that may or may not be given along with a frameshift that disrupts the protein sequence { int offset; // The number of AA bases downstream of a new termination boolean offsetValid; // FALSE if offset was not provided boolean elongation; // TRUE if offset appears after '*', i.e. after old termination char *altSequence; // Sequence of IUPAC single-letter AA codes or NULL if not given char refBase; // IUPAC single-letter AA code (or '\0' if not given) }; struct hgvsChangeRepeat // Repeat count (range) and optional repeated sequence { char *seq; // The sequence that is repeated, may be NULL (implied from ref) int min; // Minimum repeat count specified int max; // Max repeat count specified }; union hgvsChangeValue // Depending on type of change, ref & alt sequences, frameshift description or repeat count { struct hgvsChangeRefAlt refAlt; // A description of reference and alternate sequence struct hgvsChangeFrameshift fs; // Optional info about early termination of translation struct hgvsChangeRepeat repeat; // Optional repeated sequence and min/max observed counts. int count; // Repeat unit count in alternate sequence (ref count is 1) }; struct hgvsChange // Usually this contains a sequence of bases from the reference and a sequence of bases from // the alternate allele, but can contain nested HGVS terms. For deletions, reference bases // are usually not specified, except for protein which includes the first and last amino acid. { struct hgvsChange *next; // One HGVS term can specify a sequence of changes enum hgvsChangeType type; // HGVS operator: >, del, ins, fs etc. union hgvsChangeValue value; // the actual sequences changed (possibly complex) }; // // HGVS output option bit flags // // Output an HGVS genomic (g.) term: #define HGVS_OUT_G 0x01 // Output either an HGVS coding (c.) term if applicable, otherwise noncoding (n.) term: #define HGVS_OUT_CN 0x02 // Output an HGVS protein (p.) term if applicable: #define HGVS_OUT_P 0x04 // Add parentheses around predicted protein (p.) changes e.g. p.(Arg159del): #define HGVS_OUT_P_ADD_PARENS 0x10 // Add deleted sequence to delins changes (e.g. show 'delAGinsTT' instead of 'delinsTT'): #define HGVS_OUT_BREAK_DELINS 0x20 void hgvsVariantFree(struct hgvsVariant **pHgvs); // Free *pHgvs and its contents, and set *pHgvs to NULL. struct hgvsVariant *hgvsParseTerm(char *term); /* If term is a parseable form of HGVS, return the parsed representation, otherwise NULL. * This does not check validity of accessions or alleles. */ struct hgvsVariant *hgvsParsePseudoHgvs(char *db, char *term); /* Attempt to parse things that are not strict HGVS, but that people might intend as HGVS. */ boolean hgvsValidate(char *db, struct hgvsVariant *hgvs, char **retFoundAcc, int *retFoundVersion, char **retDiffRefAllele); /* Return TRUE if hgvs coords are within the bounds of the sequence for hgvs->seqAcc. * Note: Transcript terms may contain coords outside the bounds (upstream, intron, downstream) so * those can't be checked without mapping the term to the genome; this returns TRUE if seq is found. * If retFoundAcc is not NULL, set it to our local accession (which may be missing the .version * of hgvs->seqAcc) or NULL if we can't find any match. * If retFoundVersion is not NULL and hgvs->seqAcc has a version number (e.g. NM_005957.4), * set retFoundVersion to our version from latest GenBank, otherwise 0 (no version for LRG). * If coords are OK and retDiffRefAllele is not NULL: if our sequence at the coords * matches hgvs->refAllele then set it to NULL; if mismatch then set it to our sequence. */ struct bed *hgvsMapToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable); /* Return a bed6 with the variant's span on the genome and strand, or NULL if unable to map. * If successful and retPslTable is not NULL, set it to the name of the PSL table used. */ struct bed *hgvsValidateAndMap(struct hgvsVariant *hgvs, char *db, char *term, struct dyString *dyWarn, char **retPslTable); /* If we have sequence for hgvs->seqAcc and the hgvs coords are within the bounds * of that sequence, and we are able to map the coords in seqAcc to genomic coords in * db, return a bed6 with those coords and strand. If unsuccessful, or if the reference * sequence differs at the mapped location, then write a warning message to dyWarn. * If mapping is successful and retPslTable is not NULL, set it to the psl table * used for mapping. */ struct hgvsChange *hgvsParseNucleotideChange(char *changeStr, enum hgvsSeqType type, struct dyString *dyError); /* Return a parse tree of an HGVS nt sequence change (the part after the position range) * possibly followed by additional changes. If there are any parse errors, they will be * appended to dyError. */ struct vcfRow /* char* fields of VCF row (no genotypes) except POS (int for sorting) and QUAL (omitted). */ // See src/inc/vcf.h and VCF spec for definitions of fields. // Currently this is used only in the context of HGVS. If it turns out to be useful elsewhere // then it could be libified on its own or perhaps into vcf.h. { struct vcfRow *next; char *chrom; int pos; char *id; char *ref; char *alt; char *filter; char *info; }; int vcfRowCmp(const void *va, const void *vb); /* Compare for sorting based on chrom,pos. */ void vcfRowTabOutList(FILE *f, struct vcfRow *rowList); /* Write out each row of VCF to f. */ // VCF header definitions of FILTER and INFO terms created by hgvsToVcfRow: #define HGVS_VCF_HEADER_DEFS \ "##FILTER=<ID=HgvsRefAssertedMismatch," \ "Description=\"Asserted reference sequence in HGVS term does not match actual " \ "reference sequence\">\n" \ "##FILTER=<ID=HgvsRefGenomicMismatch," \ "Description=\"HGVS reference sequence does not match genomic sequence; " \ "HGVS reference sequence is included in ALT\">\n" \ "##INFO=<ID=DupToIns,Number=0,Type=Flag," \ "Description=\"HGVS dup (duplication) was converted to insertion\">\n" \ "##INFO=<ID=BasesShifted,Number=1,Type=Integer," \ "Description=\"Position of HGVS variant was shifted this number of bases to the left\">\n" struct vcfRow *hgvsToVcfRow(char *db, char *term, boolean doLeftShift, struct dyString *dyError); /* Convert HGVS to a row of VCF suitable for sorting & printing. If unable, return NULL and * put the reason in dyError. Protein terms are ambiguous at the nucleotide level so they are * not supported at this point. */ +uint hgvsTxToCds(uint txOffset, struct genbankCds *cds, boolean isStart, char pPrefix[2]); +/* Return the cds-relative HGVS coord and prefix corresponding to 0-based txOffset & cds. */ + char *hgvsGFromVariant(struct seqWindow *gSeqWin, struct bed3 *variantBed, char *alt, char *acc, boolean breakDelIns); /* Return an HGVS g. string representing the genomic variant at the position of variantBed with * reference allele from gSeqWin and alternate allele alt. If acc is non-NULL it is used * instead of variantBed->chrom. * If breakDelIns, then show deleted bases (eg show 'delAGinsTT' instead of 'delinsTT'). */ char *hgvsNFromVpTx(struct vpTx *vpTx, struct seqWindow *gSeqWin, struct psl *txAli, struct dnaSeq *txSeq, boolean breakDelIns); /* Return an HGVS n. (noncoding transcript) term for a variant projected onto a transcript. * gSeqWin must already have at least the correct seqName if not the surrounding sequence. * If breakDelIns, then show deleted bases (eg show 'delAGinsTT' instead of 'delinsTT'). */ char *hgvsCFromVpTx(struct vpTx *vpTx, struct seqWindow *gSeqWin, struct psl *txAli, struct genbankCds *cds, struct dnaSeq *txSeq, boolean breakDelIns); /* Return an HGVS c. (coding transcript) term for a variant projected onto a transcript w/cds. * gSeqWin must already have at least the correct seqName (chrom) if not the surrounding sequence. * If breakDelIns, then show deleted bases (eg show 'delAGinsTT' instead of 'delinsTT'). */ char *hgvsPFromVpPep(struct vpPep *vpPep, struct dnaSeq *protSeq, boolean addParens); /* Return an HGVS p. (protein) term for a variant projected into protein space. * Strict HGVS compliance requires parentheses around predicted protein changes (addParens=TRUE), * but nobody seems to do that in practice. * Return NULL if an input is NULL. */ #endif /* HGHGVS_H */