27f5d7f9b4cc6d337ab34422b25a1c94553623e9
kent
  Fri Aug 16 11:54:34 2019 -0700
Adding a define stanza with variables that get evaluated each line.

diff --git src/tabFile/tabToTabDir/tabToTabDir.c src/tabFile/tabToTabDir/tabToTabDir.c
index 55bf29e..4dee597 100644
--- src/tabFile/tabToTabDir/tabToTabDir.c
+++ src/tabFile/tabToTabDir/tabToTabDir.c
@@ -23,31 +23,37 @@
 "where:\n"
 "   in.tsv is a tab-separated input file.  The first line is the label names and may start with #\n"
 "   spec.txt is a file that says what columns to put into the output, described in more detail below\n"
 "   outDir is a directory that will be populated with tab-separated files\n"
 "The spec.txt file contains one blank line separated stanza per output table.\n"
 "Each stanza should look like:\n"
 "        table tableName    key-column\n"
 "        columnName1	sourceField1\n"
 "        columnName2	sourceField2\n"
 "              ...\n"
 "if the sourceField is missing it is assumed to be a column of the same name in in.tsv\n"
 "The sourceField can either be a column name in the in.tsv, or a string enclosed literal\n"
 "or an @ followed by a table name, in which case it refers to the key of that table.\n"
 "If the source column is in comma-separated-values format then the sourceField can include a\n"
 "constant array index to pick out an item from the csv list.\n"
-"You can also use strex expressions for more complicated situations.  See src/lib/strex.doc\n"
+"You can also use strex expressions for more complicated situations.\n"
+"            See src/lib/strex.doc\n"
+"In addition to the table stanza there can be a 'define' stanza that defines variables\n"
+"that can be used in sourceFields for tables.  This looks like:\n"
+"         define\n"
+"         variable1 sourceField1\n"
+"         variable2 sourceField2\n"
 );
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {NULL, 0},
 };
 
 
 static int firstDifferentIx(char **aa, char **bb, int count)
 /* Return true if first count of strings between aa and bb are the same */
 {
 int i;
 for (i=0; i<count; ++i)
     if (!sameString(aa[i], bb[i]))
@@ -107,131 +113,201 @@
 
 boolean isTotallySimple(char *s)
 /* We are only alphanumerical and dotty things, we even begin with a alnum or _*/
 {
 char c = *s++;
 if (!isalpha(c) && (c != '_'))
     return FALSE;
 while ((c = *s++) != 0)
     {
     if (!(isalnum(c) || (c == '_') || (c == '.')))
 	return FALSE;
     }
 return TRUE;
 }
 
-struct newFieldInfo *parseFieldVal(char *name, struct hash *inFieldHash,
+struct newFieldInfo *parseFieldVal(char *name, 
     char *input, char *fileName, int fileLineNumber, void *symbols, StrexLookup lookup)
 /* return a newFieldInfo based on the contents of input, which are not destroyed */
 {
 /* Make up return structure. */
 
 struct newFieldInfo *fv;
 AllocVar(fv);
 fv->name = cloneString(name);
 
 char *s = skipLeadingSpaces(input);
 if (isEmpty(s))
     {
     fv->type = fvVar;
     fv->val = cloneString(name);
     }
 else
     {
     char c = s[0];
     if (c == '@')
 	{
 	char *val = fv->val = cloneString(skipLeadingSpaces(s+1));
 	trimSpaces(val);
 	if (isEmpty(val))
 	    errAbort("Nothing following %c", c);
 	fv->type = fvLink;
 	}
     else 
         {
-	if (isTotallySimple(s))
+	if (isTotallySimple(s) && lookup(symbols, s) == NULL)  /* NOTE(review): symLookup also returns non-NULL for plain input columns, so this direct-copy path seems reachable only for names that resolve to nothing - confirm intent */
 	    {
 	    fv->val = cloneString(skipLeadingSpaces(s));
 	    eraseTrailingSpaces(fv->val);
 	    fv->type = fvVar;
 	    }
 	else
 	    {
 	    fv->val = cloneString(s);
 	    fv->exp = strexParseString(fv->val, fileName, fileLineNumber-1, symbols, lookup);
 	    fv->type = fvExp;
 	    }
 	}
     }
 return fv;
 }
 
+struct varVal
+/* A variable, what we need to compute it, and its value */
+     {
+     struct varVal *next;   /* Next in list */
+     char *name;	    /* Variable name */
+     struct strexParse *exp;  /* Parsed out expression. */
+     char *val;		    /* Computed value, cached by symLookup. NOTE(review): originally documented as "not owned by us", but symRecSetupPrecomputes freez()s it - confirm ownership. */
+     };
+
+struct varVal *varValNew(char *name, struct strexParse *exp)
+/* Allocate new varVal structure holding a cloned copy of name and the given exp */
+{
+struct varVal *v;
+AllocVar(v);
+v->name = cloneString(name);  /* Private copy, so caller may reuse its name buffer */
+v->exp = exp;  /* Pointer is stored as-is, not copied */
+return v;
+}
+
+
 struct symRec
 /* Something we pass as a record to symLookup */
     {
-    struct hash *hash;	    /* The hash with symbol to row index */
-    char **row;		    /* The row we are working on */
+    struct hash *rowHash;	    /* The hash with symbol to row index */
+    char **tableRow;		    /* The input row we are working on. You own.*/
+    char **varRow;		    /* A slot for each computed variable's result. We own. NOTE(review): not referenced in the code visible here. */
+    struct hash *varHash;	    /* Variables with varVal values */
+    struct varVal *varList;	    /* List of all variables, same info as in hash above. */
+    struct lm *lm;		    /* Local memory to use during eval phase */
     };
 
+struct symRec *symRecNew(struct hash *rowHash, struct hash *varHash)
+/* Return a new symRec. The rowHash is required and contains a hash with
+ * values that are indexes into the table row.  The varHash is described as optional (NOTE(review): symLookup passes it to hashFindVal unguarded),
+ * and if present should have variable names keying struct varVal values. */
+{
+struct symRec *rec;
+AllocVar(rec);
+rec->rowHash = rowHash;
+if (varHash != NULL)
+    {
+    rec->varHash = varHash;
+    }
+return rec;  /* tableRow and varList are not set here; callers fill them in */
+}
+
+static void symRecSetupPrecomputes(struct symRec *symbols)
+/* Clear out any precomputed variable values so that symLookup computes them
+ * afresh - should be executed on each new line of table. */
+{
+/* Clear up any old precomputes - sort of sad these can't currently
+ * be shared between output tables. Probably not enough of a time
+ * bottleneck to be worth fixing though. */
+struct varVal *v;
+for (v = symbols->varList; v != NULL; v = v->next)
+    {
+    freez(&v->val);  /* Presumably frees and NULLs val; symLookup treats a NULL val as "not yet computed" */
+    }
+}
+
 static char *symLookup(void *record, char *key)
 /* Lookup symbol in hash */
 {
 struct symRec *rec = record;
-struct hash *hash = rec->hash;
-char **row = rec->row;
-int rowIx = hashIntValDefault(hash, key, -1);
-if (rowIx < 0)
-    return NULL;
+char *value = NULL;  /* Stays NULL when key is neither a variable nor an input field */
+struct varVal *v = hashFindVal(rec->varHash, key);  /* Variables are checked before input fields. NOTE(review): varHash is not NULL-checked here */
+if (v != NULL)
+    {
+    if (v->val == NULL)  /* No cached value yet */
+       {
+       v->val = strexEvalAsString(v->exp, record, symLookup);  /* Evaluate on first use and cache; the expression resolves its own symbols through this same lookup */
+       }
+    value = v->val;
+    }
 else
-    return row[rowIx];
+    {
+    int rowIx = hashIntValDefault(rec->rowHash, key, -1);  /* Not a variable, so try the input table's fields; -1 means unknown */
+    if (rowIx >= 0)
+	value = rec->tableRow[rowIx];
+    }
+return value;
 }
 
 
-void selectUniqueIntoTable(struct fieldedTable *inTable,  struct hash *inFieldHash,
+void selectUniqueIntoTable(struct fieldedTable *inTable,  struct symRec *symbols,
     char *specFile,  // Just for error reporting
     struct newFieldInfo *fieldList, int keyFieldIx, struct fieldedTable *outTable)
 /* Populate out table with selected rows from newTable */
 {
 struct hash *uniqHash = hashNew(0);
 struct fieldedRow *fr;
 int outFieldCount = outTable->fieldCount;
 char *outRow[outFieldCount];
 
 if (slCount(fieldList) != outFieldCount)	// A little cheap defensive programming on inputs
     internalErr();
 
 struct dyString *csvScratch = dyStringNew(0);
 for (fr = inTable->rowList; fr != NULL; fr = fr->next)
     {
     /* Create new row from a scan through old table */
     char **inRow = fr->row;
     int i;
     struct newFieldInfo *unlinkedFv;
+    boolean firstSymInRow = TRUE;  // Avoid updating symbol table until we have to
+
     for (i=0, unlinkedFv=fieldList; i<outFieldCount && unlinkedFv != NULL; 
 	++i, unlinkedFv = unlinkedFv->next)
 	{
 	/* Skip through links. */
 	struct newFieldInfo *fv = unlinkedFv;
 	while (fv->type == fvLink)
 	    fv = fv->link;
 	
 	if (fv->type == fvVar)
 	    outRow[i] = inRow[fv->oldIx];
 	else if (fv->type == fvExp)
 	    {
-	    struct symRec symRec = {inFieldHash, inRow};
-	    outRow[i] = strexEvalAsString(fv->exp, &symRec, symLookup);
+	    if (firstSymInRow)
+	        {
+		symbols->tableRow = inRow;
+		symRecSetupPrecomputes(symbols);
+		firstSymInRow = FALSE;
+		}
+	    outRow[i] = strexEvalAsString(fv->exp, symbols, symLookup);
 	    verbose(2, "evaluated %s to %s\n", fv->val, outRow[i]);
 	    }
 	}
 
     char *key = outRow[keyFieldIx];
     if (!isEmpty(key))
 	{
 	struct fieldedRow *uniqFr = hashFindVal(uniqHash, key);
 	if (uniqFr == NULL)
 	    {
 	    uniqFr = fieldedTableAdd(outTable, outRow, outFieldCount, 0);
 	    hashAdd(uniqHash, key, uniqFr);
 	    }
 	else    /* Do error checking for true uniqueness of key */
 	    {
@@ -260,67 +336,93 @@
 for (i=0; i<fieldCount; ++i)
    hashAdd(hash, fields[i], intToPt(i));
 return hash;
 }
 
 
 void tabToTabDir(char *inTabFile, char *specFile, char *outDir)
 /* tabToTabDir - Convert a large tab-separated table to a directory full of such tables 
  * according to a specification.. */
 {
 /* Read input table */
 struct fieldedTable *inTable = fieldedTableFromTabFile(inTabFile, inTabFile, NULL, 0);
 verbose(1, "Read %d columns, %d rows from %s\n", inTable->fieldCount, inTable->rowCount,
     inTabFile);
 
-/* Compute info on the fields */
+/* Create what we need for managing strex's symbol table. */
 struct hash *inFieldHash = hashFieldIx(inTable->fields, inTable->fieldCount);
-struct symRec symbols = {inFieldHash, inTable->fields}; // Sym lookup just returns symbol name during parsing
+struct hash *varHash = hashNew(5);
+struct symRec *symbols = symRecNew(inFieldHash, varHash); 
+symbols->tableRow = inTable->fields;   // During parse pass fields will act as proxy for tableRow
+/* Open spec file, check first real line, and maybe start defining variables. */
 
-/* Read in spec file as ra file stanzas that we convert into tableInfos. */
+/* Snoop for a define stanza first that'll hold our variables. */
 struct lineFile *lf = lineFileOpen(specFile, TRUE);
+char *defLine;
+if (!lineFileNextReal(lf, &defLine))
+     errAbort("%s is empty", specFile);
+if (startsWithWord("define",  defLine))  // Whee, we got vars! 
+    {
+    char *varName, *varSpec;
+    while (raNextTagVal(lf, &varName, &varSpec, NULL))
+        {
+	verbose(1, "var %s (%s)\n", varName, varSpec);
+	struct strexParse *exp = strexParseString(varSpec, lf->fileName, lf->lineIx-1, 
+	    symbols, symLookup);
+	strexParseDump(exp, 1, uglyOut);
+	struct varVal *v = varValNew(varName, exp);
+	hashAdd(varHash, varName, v);
+	slAddHead(&symbols->varList, v);
+	}
+    slReverse(&symbols->varList);
+    }
+else
+    lineFileReuse(lf);
+
+
+/* Read in rest of spec file as ra stanzas full of tables more or less */
 struct newTableInfo *newTableList = NULL, *newTable;
 while (raSkipLeadingEmptyLines(lf, NULL))
     {
     /* Read first tag, which we know is there because it's right after raSkipLeadingEmptyLines.
      * Make sure the tag is table, and that there is a following table name and key field name. */
     char *tableString, *tableSpec;
     raNextTagVal(lf, &tableString, &tableSpec, NULL);
     verbose(2, "Processing table %s '%s' line %d of %s\n",  tableString, tableSpec, 
 	lf->lineIx, lf->fileName);
     if (!sameString(tableString, "table"))
         errAbort("stanza that doesn't start with 'table' ending line %d of %s",
 	    lf->lineIx, lf->fileName);
     char *tableName = nextWord(&tableSpec);
     char *keyFieldName = cloneString(nextWord(&tableSpec));
     if (isEmpty(keyFieldName))
        errAbort("No key field for table %s line %d of %s", tableName, lf->lineIx, lf->fileName);
 
     /* Start filling out newTable with these fields */
     AllocVar(newTable);
     newTable->name = cloneString(tableName);
     tableName = newTable->name;  /* Keep this handy variable. */
 
     /* Make up field list out of rest of the stanza */
     struct newFieldInfo *fvList = NULL;
     char *fieldName, *fieldSpec;
     int fieldCount = 0;
     while (raNextTagVal(lf, &fieldName, &fieldSpec, NULL))
         {
 	verbose(2, "  fieldName %s fieldSpec (%s)\n", fieldName, fieldSpec);
-	struct newFieldInfo *fv = parseFieldVal(fieldName, inFieldHash,
-	    fieldSpec, lf->fileName, lf->lineIx, &symbols, symLookup);
+	struct newFieldInfo *fv = parseFieldVal(fieldName, 
+	    fieldSpec, lf->fileName, lf->lineIx, symbols, symLookup);
 	if (fv->type == fvVar)
 	    {
 	    char *oldName = fieldSpec;
 	    if (isEmpty(oldName))
 	       oldName = fieldName;
 	    int oldIx = stringArrayIx(oldName, inTable->fields, inTable->fieldCount);
 	    if (oldIx < 0)
 	       errAbort("%s doesn't exist in the %d fields of %s line %d of %s", 
 		oldName, inTable->fieldCount, inTable->name,
 		    lf->lineIx, lf->fileName);
 	    fv->oldIx = oldIx;
 	    }
 	fv->newIx = fieldCount++;
 	slAddHead(&fvList, fv);
 	}
@@ -361,35 +463,36 @@
     for (field = newTable->fieldList; field != NULL; field = field->next)
       {
       if (field->type == fvLink)
           {
 	  struct newTableInfo *linkedTable = findTable(newTableList, field->val);
 	  if (linkedTable == NULL)
 	     errAbort("@%s doesn't exist", field->name);
 	  field->link = linkedTable->keyField;
 	  }
       }
     }
 
 makeDirsOnPath(outDir);
 
 /* Output tables */
+verbose(1, "Outputting %d tables to %s\n", slCount(newTableList), outDir);
 for (newTable = newTableList; newTable != NULL; newTable = newTable->next)
     {
     /* Populate table */
     struct fieldedTable *outTable = newTable->table;
-    selectUniqueIntoTable(inTable, inFieldHash, specFile,
+    selectUniqueIntoTable(inTable, symbols, specFile,
 	newTable->fieldList, newTable->keyField->newIx, outTable);
 
     /* Create output file name and save file. */
     char outTabName[FILENAME_LEN];
     safef(outTabName, sizeof(outTabName), "%s/%s.tsv", outDir, newTable->name);
     verbose(1, "Writing %s of %d columns %d rows\n",  
 	outTabName, outTable->fieldCount, outTable->rowCount);
     fieldedTableToTabFile(outTable, outTabName);
     }
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);