98aa95938f3af948c60e20044318595f10fee780
braney
  Thu May 13 15:22:00 2021 -0700
ongoing work on cart rewrite system

diff --git src/lib/regexHelper.c src/lib/regexHelper.c
index fe2e938..6d1c9bd 100644
--- src/lib/regexHelper.c
+++ src/lib/regexHelper.c
@@ -1,137 +1,296 @@
 /* regexHelper: easy wrappers on POSIX Extended Regular Expressions (man 7 regex, man 3 regex) */
 
 /* Copyright (C) 2012 The Regents of the University of California 
  * See README in this or parent directory for licensing information. */
 
 #include "regexHelper.h"
 #include "hash.h"
 
 const regex_t *regexCompile(const char *exp, const char *description, int compileFlags)
 /* Return the compiled form of exp for the given compileFlags.  The first request for
  * a particular (flags, expression) pair compiles it; later requests are answered from
  * an internal cache, so the result must not be freed by the caller.
  * If compilation fails, errAbort is called with a message that names description. */
 {
 static struct hash *compiledCache = NULL;
 char cacheKey[512];
 safef(cacheKey, sizeof(cacheKey), "%d.%s", compileFlags, exp);
 
 if (compiledCache == NULL)
     compiledCache = newHash(10);
 
 struct hashEl *cached = hashLookup(compiledCache, cacheKey);
 if (cached != NULL)
     return (regex_t *)cached->val;
 
 /* First time this flags/expression pair is seen: compile it and remember it. */
 regex_t *fresh = NULL;
 AllocVar(fresh);
 int status = regcomp(fresh, exp, compileFlags);
 if (status != 0)
     {
     char message[512];
     regerror(status, fresh, message, sizeof(message));
     errAbort("%s \"%s\" got regular expression compilation error %d:\n%s\n",
              description, exp, status, message);
     }
 hashAdd(compiledCache, cacheKey, fresh);
 return fresh;
 }
 
 static boolean regexMatchSubstrMaybeCase(const char *string, const char *exp,
                                          regmatch_t substrArr[], size_t substrArrSize,
                                          boolean isCaseInsensitive)
 /* Shared worker for the regexMatch* family.  Return TRUE if string matches the
  * extended regular expression exp, FALSE otherwise (including when string is NULL).
  * When substrArr is non-NULL, regexec stores up to substrArrSize substring offsets
  * in it; when it is NULL the expression is compiled with REG_NOSUB. */
 {
 if (string == NULL)
     return FALSE;
 
 boolean wantSubstrs = (substrArr != NULL);
 
 int flags = REG_EXTENDED;
 if (isCaseInsensitive)
     flags |= REG_ICASE;
 if (!wantSubstrs)
     flags |= REG_NOSUB;
 
 /* Description handed to regexCompile for use in its error message. */
 char label[256];
 safecpy(label, sizeof(label), "Regular expression");
 if (isCaseInsensitive)
     safecat(label, sizeof(label), " (case insensitive)");
 if (wantSubstrs)
     safecat(label, sizeof(label), " with substrings");
 
 const regex_t *re = regexCompile(exp, label, flags);
 int status = regexec(re, string, substrArrSize, substrArr, 0);
 return (status == 0);
 }
 
 boolean regexMatch(const char *string, const char *exp)
 /* Case-sensitive test of string against regular expression exp; TRUE on a match.
  * No substring offsets are requested. */
 {
 const boolean ignoreCase = FALSE;
 return regexMatchSubstrMaybeCase(string, exp, NULL, 0, ignoreCase);
 }
 
 boolean regexMatchNoCase(const char *string, const char *exp)
 /* Case-insensitive test of string against regular expression exp; TRUE on a match.
  * No substring offsets are requested. */
 {
 const boolean ignoreCase = TRUE;
 return regexMatchSubstrMaybeCase(string, exp, NULL, 0, ignoreCase);
 }
 
 boolean regexMatchSubstr(const char *string, const char *exp,
                          regmatch_t substrArr[], size_t substrArrSize)
 /* Case-sensitive test of string against regular expression exp; TRUE on a match.
  * regexec records substring offsets in substrArr (substrArrSize entries). */
 {
 const boolean ignoreCase = FALSE;
 return regexMatchSubstrMaybeCase(string, exp, substrArr, substrArrSize, ignoreCase);
 }
 
 boolean regexMatchSubstrNoCase(const char *string, const char *exp,
                                regmatch_t substrArr[], size_t substrArrSize)
 /* Case-insensitive test of string against regular expression exp; TRUE on a match.
  * regexec records substring offsets in substrArr (substrArrSize entries). */
 {
 const boolean ignoreCase = TRUE;
 return regexMatchSubstrMaybeCase(string, exp, substrArr, substrArrSize, ignoreCase);
 }
 
 void regexSubstringCopy(const char *string, const regmatch_t substr,
                         char *buf, size_t bufSize)
 /* Fill buf (of size bufSize) with the part of string delimited by the start and end
  * offsets in substr.  When substr was not matched, buf becomes the empty string. */
 {
 if (!regexSubstrMatched(substr))
     {
     *buf = '\0';
     return;
     }
 
 const char *start = string + substr.rm_so;
 safencpy(buf, bufSize, start, substr.rm_eo - substr.rm_so);
 }
 
 char *regexSubstringClone(const char *string, const regmatch_t substr)
 /* Return a newly allocated copy of the part of string delimited by the offsets in
  * substr.  When substr was not matched, a cloned empty string is returned instead. */
 {
 if (!regexSubstrMatched(substr))
     return cloneString("");
 
 int len = substr.rm_eo - substr.rm_so;
 char *copy = needMem(len + 1);        /* one extra byte for the terminator */
 regexSubstringCopy(string, substr, copy, len + 1);
 return copy;
 }
 
 int regexSubstringInt(const char *string, const regmatch_t substr)
 /* Convert the part of string delimited by substr to an int with atoi and return it.
  * An unmatched substr yields 0; call regexSubstrMatched() first when 0 must be told
  * apart from "not matched". */
 {
 if (!regexSubstrMatched(substr))
     return 0;
 
 int digitsLen = substr.rm_eo - substr.rm_so;
 char digits[digitsLen+1];
 regexSubstringCopy(string, substr, digits, sizeof(digits));
 return atoi(digits);
 }
+
+static struct regexSnippet *parseSnippets(char *input)
+/* Split a substitution string into a list of snippets that describes how the
+ * parenthetical regular expressions should be substituted in the output.
+ * Each snippet holds the literal text (precursor, precursorLen) found in front of a
+ * \N back-reference, and N itself in num.  Literal text after the last back-reference
+ * becomes a final snippet whose num is 0.  The precursor is always a real string,
+ * possibly empty.  Returns NULL if input is NULL.
+ */
+{
+struct regexSnippet *out = NULL;
+
+if (input == NULL)
+    return NULL;
+
+char *in = input;
+char *prev = input;    // start of the literal text that has not been stored yet
+for(;;)
+    {
+    boolean atEnd = (*in == 0);
+    // isdigit() needs an unsigned char value; a plain char may be negative
+    boolean atReference = (*in == '\\') && isdigit((unsigned char)in[1]);
+    if (!atEnd && !atReference)
+        {
+        in++;
+        continue;
+        }
+
+    struct regexSnippet *snippet;
+    AllocVar(snippet);
+
+    // store the literal text, even when it is empty, so no user sees a NULL precursor
+    int size = in - prev;
+    char *precursor = needMem(size + 1);
+    memcpy(precursor, prev, size);
+    precursor[size] = 0;
+    snippet->precursor = precursor;
+    snippet->precursorLen = size;
+    snippet->num = 0;
+
+    if (atReference)
+        {
+        in++;    // step over the backslash
+        int num = 0;
+        while (isdigit((unsigned char)*in))
+            {
+            // stop accumulating once the number is absurdly large rather than overflow
+            if (num < 100000)
+                num = num * 10 + (*in - '0');
+            in++;
+            }
+        snippet->num = num;
+
+        // the next precursor starts after the back-reference, not at the backslash
+        prev = in;
+        }
+
+    slAddHead(&out, snippet);
+    if (*in == 0)
+        break;
+    }
+
+slReverse(&out);
+
+return out;
+}
+
+static void discardCompiledEdits(struct regexCompiledEdit *edits, unsigned count)
+/* Release the first count entries of edits (compiled expression and snippet list)
+ * and then the array itself. */
+{
+unsigned i;
+for (i = 0; i < count; i++)
+    {
+    struct regexSnippet *snippet = edits[i].snippets;
+    while (snippet != NULL)
+        {
+        struct regexSnippet *next = snippet->next;
+        freeMem(snippet->precursor);
+        freeMem(snippet);
+        snippet = next;
+        }
+    regfree(edits[i].compiledExp);
+    freeMem(edits[i].compiledExp);
+    }
+freeMem(edits);
+}
+
+static struct regexCompiledEdit *compileEdits(struct regexEdit *editArray, unsigned numEdits, boolean quiet)
+/* Compile all the edits.  Returns an array of numEdits compiled edits, in the same
+ * order as editArray.  If a query fails to compile, everything built so far is
+ * released; then NULL is returned when quiet is set, otherwise errAbort is called. */
+{
+struct regexCompiledEdit *compiledEdits;
+unsigned numDone;
+
+AllocArray(compiledEdits, numEdits);
+
+for (numDone = 0; numDone < numEdits; numDone++)
+    {
+    struct regexEdit *edit = &editArray[numDone];
+    regex_t *compiledExp = NULL;
+    int compileFlags = 0;    // no REG_EXTENDED: queries use POSIX basic syntax
+
+    AllocVar(compiledExp);
+    int errNum = regcomp(compiledExp, edit->query, compileFlags);
+
+    if (errNum != 0)
+        {
+        char errBuf[4096];
+        if (!quiet)
+            regerror(errNum, compiledExp, errBuf, sizeof(errBuf));
+
+        // after a failed regcomp there is nothing to regfree, only the allocation
+        freeMem(compiledExp);
+        discardCompiledEdits(compiledEdits, numDone);
+
+        if (quiet)
+            return NULL;
+        errAbort("regular expression compilation error %d: %s", errNum, errBuf);
+        return NULL;
+        }
+
+    compiledEdits[numDone].compiledExp = compiledExp;
+    compiledEdits[numDone].snippets = parseSnippets(edit->substitution);
+    }
+
+return compiledEdits;
+}
+
+static size_t doSubEdits(struct regexSnippet *snippets, regmatch_t *matches, size_t numMatches,
+                         const char *source, char *out)
+/* Do the substitutions on parenthetical expressions for one match.
+ * matches[0] describes the whole match and matches[N] parenthetical expression N
+ * (N < numMatches); all offsets are relative to source.  For each snippet the literal
+ * precursor is emitted, followed by the text of parenthetical expression snippet->num.
+ * A num below 1, a num that is out of range, or an expression that took no part in
+ * the match contributes nothing.
+ * When out is NULL nothing is written, so the return value (the number of bytes the
+ * expansion occupies, with no terminating zero) can be used to size a buffer. */
+{
+size_t total = 0;
+
+for (; snippets != NULL; snippets = snippets->next)
+    {
+    // copy the literal part before the back-reference; a snippet may carry none
+    if ((snippets->precursor != NULL) && (snippets->precursorLen > 0))
+        {
+        if (out != NULL)
+            memcpy(out + total, snippets->precursor, snippets->precursorLen);
+        total += snippets->precursorLen;
+        }
+
+    int num = snippets->num;
+    if ((num < 1) || ((size_t)num >= numMatches))
+        continue;
+    if (matches[num].rm_so < 0)
+        continue;
+
+    // copy in the part that matched the referenced parenthetical expression
+    size_t size = matches[num].rm_eo - matches[num].rm_so;
+    if (out != NULL)
+        memcpy(out + total, source + matches[num].rm_so, size);
+    total += size;
+    }
+return total;
+}
+
+static size_t applyEditPass(struct regexCompiledEdit *edit, const char *input, char *out)
+/* Walk input and replace every match of edit->compiledExp by the expansion of
+ * edit->snippets.  Returns the length of the edited text, not counting a terminator.
+ * When out is non-NULL the edited text is written there (without a terminator);
+ * when it is NULL the text is only measured. */
+{
+regmatch_t matches[1024];
+size_t inPos = 0;     // start of the part of input that has not been scanned yet
+size_t outLen = 0;
+
+for(;;)
+    {
+    const char *source = input + inPos;
+
+    // once the scan has moved on, source is no longer the beginning of the line
+    int execFlags = (inPos > 0) ? REG_NOTBOL : 0;
+
+    /* if there's not a match, we're done. */
+    if (regexec(edit->compiledExp, source, ArraySize(matches), matches, execFlags) != 0)
+        break;
+
+    size_t matchStart = matches[0].rm_so;
+    size_t matchEnd = matches[0].rm_eo;
+
+    // text between the previous match and this one is kept as it is
+    if (out != NULL)
+        memcpy(out + outLen, source, matchStart);
+    outLen += matchStart;
+
+    // the match itself is replaced by the expanded substitution
+    outLen += doSubEdits(edit->snippets, matches, ArraySize(matches), source,
+                         (out != NULL) ? out + outLen : NULL);
+
+    if (matchEnd > matchStart)
+        {
+        // keep looking after the last match
+        inPos += matchEnd;
+        }
+    else
+        {
+        // empty match: pass one input character through so the scan always advances
+        if (source[matchStart] == 0)
+            {
+            inPos += matchStart;
+            break;
+            }
+        if (out != NULL)
+            out[outLen] = source[matchStart];
+        outLen++;
+        inPos += matchStart + 1;
+        }
+    }
+
+// whatever follows the last match is kept as it is
+size_t tailLen = strlen(input + inPos);
+if (out != NULL)
+    memcpy(out + outLen, input + inPos, tailLen);
+outLen += tailLen;
+
+return outLen;
+}
+
+static char *doOneEdit( struct regexCompiledEdit *edit, char *input, boolean quiet)
+/* Perform one edit on the input string and return the result as a newly allocated
+ * string of exactly the needed size.  Returns NULL if input is NULL.
+ * quiet is accepted for symmetry with the other edit routines but is not used here. */
+{
+if (input == NULL)
+    return NULL;
+
+// first pass measures the result, second pass fills a buffer of that size
+size_t length = applyEditPass(edit, input, NULL);
+char *result = needMem(length + 1);
+applyEditPass(edit, input, result);
+result[length] = 0;
+
+return result;
+}
+
+char *regexEdit(struct regexEdit *editArray, unsigned numEdits, char *input, boolean quiet)
+/* Perform a list of edits on a string.  The edits in editArray are applied in order,
+ * each one working on the output of the previous one.  input is not modified.
+ * Returns a newly allocated string that the caller owns, or NULL when input is NULL
+ * or when compileEdits returns NULL (a query failed to compile and quiet is set). */
+{
+if (input == NULL)
+    return NULL;
+
+// nothing to apply: still hand back a copy so the caller always owns the result
+if (numEdits == 0)
+    return cloneString(input);
+
+struct regexCompiledEdit *compiledEdits = compileEdits(editArray, numEdits, quiet);
+
+if (compiledEdits == NULL)
+    return NULL;
+
+char *outString = input;
+unsigned i;
+for (i = 0; (i < numEdits) && (outString != NULL); i++)
+    {
+    char *edited = doOneEdit(&compiledEdits[i], outString, quiet);
+
+    // intermediate results are ours to release; the caller's input is not
+    if (outString != input)
+        freeMem(outString);
+    outString = edited;
+    }
+
+// the compiled edits were built for this call only, so release them again
+for (i = 0; i < numEdits; i++)
+    {
+    struct regexSnippet *snippet = compiledEdits[i].snippets;
+    while (snippet != NULL)
+        {
+        struct regexSnippet *next = snippet->next;
+        freeMem(snippet->precursor);
+        freeMem(snippet);
+        snippet = next;
+        }
+    regfree(compiledEdits[i].compiledExp);
+    freeMem(compiledEdits[i].compiledExp);
+    }
+freeMem(compiledEdits);
+
+return outString;
+}