b534e5e167df93880881df0478ecec0225fdf136 angie Wed Aug 31 17:01:24 2016 -0700 This commit adds the capability to pick apart complex HGVS sequence change descriptions, and apply those changes to reference sequence, in order to translate HGVS nucleotide terms into a variant representation suitable for functional prediction in hgVai. VCF was chosen since it is easy to integrate into hgVai. refs #11460 Changes to existing code: * hgvsMapToGenome maps to BED6 instead of BED3 because we need to know strand in order to convert transcript changes into VCF forward-strand genomic changes. * hgvsMapToGenome maps insertions to zero-length points instead of 2-base ranges as in HGVS. New file hgHgvsParse.c contains a tokenizer and parser for HGVS sequence change descriptions; top-level interface is hgvsParseNucleotideChange. hgHgvs.c has new code to translate parsed HGVS nucleotide change(s) into VCF, optionally left-shifting ambiguous alignments (VCF convention, at odds with HGVS right-shifting convention); top-level interface is hgvsToVcfRow. New hgvsToVcf utility enables testing of corner cases and may come in handy as a command-line util. HGVS terms for testing have been taken from ClinVar and do not reflect the diversity of terms in the wild, nor do they cover the full HGVS spec. For example, the HGVS repeat notation can be parsed but not mapped to the genome because all of the ClinVar repeat terms that I looked at looked wonky to me and I believe the HGVS repeat notation is inherently error-prone. The repeat notation is supposed to use the position of the first repeat unit and to specify the number of repeated copies starting at that point (right-shifted if ambiguous). However, in ClinVar, sometimes the given repeat unit sequence did not match the reference sequence at the given position; sometimes the number of repeats made sense only if they were not perfect repeats (some differing bases); sometimes ranges of repeat numbers were given. 
Also, the reference assembly's number of repeats can change from one assembly to the next. So it is hard given an HGVS repeat term to determine 1) whether it makes sense in relation to the reference assembly with/without fuzzy matching and 2) what the exact change is relative to the reference assembly. Insertions of inverted sequence from elsewhere in the same reference have not yet been tested. http://varnomen.hgvs.org/recommendations/DNA/variant/inversion/ gives some complicated examples like "g.122_123ins213_234invinsAins123_211inv" but I have not yet seen terms like that in the wild. diff --git src/inc/dystring.h src/inc/dystring.h index c022ccc..c060cf1 100644 --- src/inc/dystring.h +++ src/inc/dystring.h @@ -1,102 +1,116 @@ /* dystring - dynamically resizing string. * * This file is copyright 2002 Jim Kent, but license is hereby * granted for all use - public, private or commercial. */ #ifndef DYSTRING_H /* Wrapper to avoid including this twice. */ #define DYSTRING_H #include "common.h" struct dyString /* Dynamically resizable string that you can do formatted * output to. */ { struct dyString *next; /* Next in list. */ char *string; /* Current buffer. */ int bufSize; /* Size of buffer. */ int stringSize; /* Size of string. */ }; struct dyString *newDyString(int initialBufSize); /* Allocate dynamic string with initial buffer size. (Pass zero for default) */ #define dyStringNew newDyString void freeDyString(struct dyString **pDs); /* Free up dynamic string. */ #define dyStringFree(a) freeDyString(a); void freeDyStringList(struct dyString **pDs); /* Free up a list of dynamic strings */ #define dyStringFreeList(a) freeDyStringList(a); void dyStringAppend(struct dyString *ds, char *string); /* Append zero terminated string to end of dyString. */ void dyStringAppendN(struct dyString *ds, char *string, int stringSize); /* Append string of given size to end of string. */ char dyStringAppendC(struct dyString *ds, char c); /* Append char to end of string. 
*/ void dyStringAppendMultiC(struct dyString *ds, char c, int n); /* Append N copies of char to end of string. */ void dyStringAppendEscapeQuotes(struct dyString *dy, char *string, char quot, char esc); /* Append escaped-for-quotation version of string to dy. */ #define dyStringWriteOne(dy, var) dyStringAppendN(dy, (char *)(&var), sizeof(var)) /* Write one variable (binary!) to dyString - for cases when want to treat string like * a file stream. */ void dyStringVaPrintf(struct dyString *ds, char *format, va_list args); /* VarArgs Printf to end of dyString. */ void dyStringPrintf(struct dyString *ds, char *format, ...) /* Printf to end of dyString. */ #ifdef __GNUC__ __attribute__((format(printf, 2, 3))) #endif ; struct dyString *dyStringCreate(char *format, ...) /* Create a dyString with a printf style initial content */ #if defined(__GNUC__) __attribute__((format(printf, 1, 2))) #endif ; #define dyStringClear(ds) (ds->string[0] = ds->stringSize = 0) /* Clear string. */ struct dyString * dyStringSub(char *orig, char *in, char *out); /* Make up a duplicate of orig with all occurences of in substituted * with out. */ void dyStringBumpBufSize(struct dyString *ds, int size); /* Force dyString buffer to be at least given size. */ char *dyStringCannibalize(struct dyString **pDy); /* Kill dyString, but return the string it is wrapping * (formerly dy->string). This should be free'd at your * convenience. */ #define dyStringContents(ds) (ds)->string /* return raw string. */ #define dyStringLen(ds) ds->stringSize /* return raw string length. */ +#define dyStringIsEmpty(ds) (ds->stringSize == 0) +/* Return TRUE if dyString is empty, i.e. its stringSize is 0. The ds argument is expanded without parentheses, so pass a plain pointer variable. */ + +#define dyStringIsNotEmpty(ds) (ds->stringSize > 0) +/* Return TRUE if dyString is not empty, i.e. its stringSize is greater than 0. The ds argument is expanded without parentheses, so pass a plain pointer variable. 
*/ + void dyStringResize(struct dyString *ds, int newSize); /* resize a string, if the string expands, blanks are appended */ void dyStringQuoteString(struct dyString *dy, char quotChar, char *text); /* Append quotChar-quoted text (with any internal occurrences of quotChar * \-escaped) onto end of dy. */ +INLINE void dyStringAppendSep(struct dyString *dy, char *sep) +/* If dy is not empty then append sep; otherwise leave it empty. For building up lists without + * a separator at the end: call this before appending each item so that sep lands only between items. sep is passed to dyStringAppend, so it must be a zero-terminated string. */ +{ +if (dyStringIsNotEmpty(dy)) + dyStringAppend(dy, sep); +} + #endif /* DYSTRING_H */