aaf72102b545c05c42f66b7a3fc22d65b1ecf4fe angie Mon Aug 8 14:12:39 2016 -0700 Added recognition of a small subset of HGVS terms: coding (c.) SNVs relative to RefSeq NM_ or LRG transcript IDs, and protein (p.) simple substitutions relative to NP_. Also accepted (not HGVS but similar and popular): geneSymbol and abbreviated protein subst like "ALK G1494E". hgFind will map terms to the current genome if possible, and will display warnings about unrecognized accessions, out-of-bounds coordinates and mismatching reference alleles. refs #15071, #15554 diff --git src/lib/regexHelper.c src/lib/regexHelper.c index 04e8ab5..fe2e938 100644 --- src/lib/regexHelper.c +++ src/lib/regexHelper.c @@ -1,93 +1,137 @@ /* regexHelper: easy wrappers on POSIX Extended Regular Expressions (man 7 regex, man 3 regex) */ /* Copyright (C) 2012 The Regents of the University of California * See README in this or parent directory for licensing information. */ #include "regexHelper.h" #include "hash.h" const regex_t *regexCompile(const char *exp, const char *description, int compileFlags) /* Compile exp (or die with an informative-as-possible error message). * Cache pre-compiled regex's internally (so don't free result after use). */ { static struct hash *reHash = NULL; struct hashEl *hel = NULL; char key[512]; safef(key, sizeof(key), "%d.%s", compileFlags, exp); if (reHash == NULL) reHash = newHash(10); hel = hashLookup(reHash, key); if (hel != NULL) return((regex_t *)hel->val); else { regex_t *compiledExp = NULL; int errNum = 0; AllocVar(compiledExp); errNum = regcomp(compiledExp, exp, compileFlags); if (errNum != 0) { char errBuf[512]; regerror(errNum, compiledExp, errBuf, sizeof(errBuf)); errAbort("%s \"%s\" got regular expression compilation error %d:\n%s\n", description, exp, errNum, errBuf); } hashAdd(reHash, key, compiledExp); return(compiledExp); } } static boolean regexMatchSubstrMaybeCase(const char *string, const char *exp, regmatch_t substrArr[], size_t substrArrSize, boolean isCaseInsensitive) /* Return TRUE if string matches regular expression exp; * regexec fills in substrArr with substring offsets. */ { if (string == NULL) return FALSE; int compileFlags = REG_EXTENDED; char desc[256]; safecpy(desc, sizeof(desc), "Regular expression"); if (isCaseInsensitive) { compileFlags |= REG_ICASE; safecat(desc, sizeof(desc), " (case insensitive)"); } if (substrArr == NULL) compileFlags |= REG_NOSUB; else safecat(desc, sizeof(desc), " with substrings"); const regex_t *compiledExp = regexCompile(exp, desc, compileFlags); return(regexec(compiledExp, string, substrArrSize, substrArr, 0) == 0); } boolean regexMatch(const char *string, const char *exp) /* Return TRUE if string matches regular expression exp (case sensitive). */ { return regexMatchSubstrMaybeCase(string, exp, NULL, 0, FALSE); } boolean regexMatchNoCase(const char *string, const char *exp) /* Return TRUE if string matches regular expression exp (case insensitive). */ { return regexMatchSubstrMaybeCase(string, exp, NULL, 0, TRUE); } boolean regexMatchSubstr(const char *string, const char *exp, regmatch_t substrArr[], size_t substrArrSize) /* Return TRUE if string matches regular expression exp (case sensitive); * regexec fills in substrArr with substring offsets. */ { return regexMatchSubstrMaybeCase(string, exp, substrArr, substrArrSize, FALSE); } boolean regexMatchSubstrNoCase(const char *string, const char *exp, regmatch_t substrArr[], size_t substrArrSize) /* Return TRUE if string matches regular expression exp (case insensitive); * regexec fills in substrArr with substring offsets. */ { return regexMatchSubstrMaybeCase(string, exp, substrArr, substrArrSize, TRUE); } +void regexSubstringCopy(const char *string, const regmatch_t substr, + char *buf, size_t bufSize) +/* Copy a substring from string into buf using start and end offsets from substr. + * If the substring was not matched then make buf an empty string. */ +{ +if (regexSubstrMatched(substr)) + safencpy(buf, bufSize, string + substr.rm_so, substr.rm_eo - substr.rm_so); +else + *buf = '\0'; +} + +char *regexSubstringClone(const char *string, const regmatch_t substr) +/* Clone and return a substring from string using start and end offsets from substr. + * If the substring was not matched then return a cloned empty string. */ +{ +char *clone = NULL; +if (regexSubstrMatched(substr)) + { + int len = substr.rm_eo - substr.rm_so; + clone = needMem(len + 1); + regexSubstringCopy(string, substr, clone, len + 1); + } +else + clone = cloneString(""); +return clone; +} + +int regexSubstringInt(const char *string, const regmatch_t substr) +/* Return the integer value of the substring specified by substr. + * If substr was not matched, return 0; you can check first with regexSubstrMatched() if + * that's not the desired behavior for unmatched substr. */ +{ +int val = 0; +if (regexSubstrMatched(substr)) + { + int len = substr.rm_eo - substr.rm_so; + char buf[len+1]; + regexSubstringCopy(string, substr, buf, sizeof(buf)); + val = atoi(buf); + } +else + val = 0; +return val; +}