src/tabFile/tabToTabDir/tabToTabDir.c be4311c07e14feb728abc6425ee606ffaa611a58

be4311c07e14feb728abc6425ee606ffaa611a58
markd
  Fri Jan 22 06:46:58 2021 -0800
merge with master

diff --git src/tabFile/tabToTabDir/tabToTabDir.c src/tabFile/tabToTabDir/tabToTabDir.c
index 2cdfbc9..19d48d7 100644
--- src/tabFile/tabToTabDir/tabToTabDir.c
+++ src/tabFile/tabToTabDir/tabToTabDir.c
@@ -1,643 +1,803 @@
 /* tabToTabDir - Convert a large tab-separated table to a directory full of such tables according 
  * to a specification.. */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "obscure.h"
 #include "sqlNum.h"
 #include "portable.h"
 #include "ra.h"
 #include "csv.h"
 #include "fieldedTable.h"
 #include "strex.h"
 #include "localmem.h"
 
 char *clId = NULL;  // Flag set from command line to add an id column
 int clStartId = 1;  // What number id column should start with
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
 "tabToTabDir - Convert a large tab-separated table to a directory full of such tables according\n"
-"to a specification.\n"
-"command line:\n"
-"   tabToTabDir in.tsv spec.txt outDir\n"
+"to a specification. The program is designed to make it relatively easy to unpack overloaded\n"
+"single fields into multiple fields, and to create normalized less redundant representations.\n"
+"The command line is:\n"
+"   tabToTabDir in.tsv spec.x outDir\n"
 "options:\n"
 "   -id=fieldName - Add a numeric id field of given name that starts at 1 and autoincrements \n"
 "                   for each table\n"
 "   -startId=fieldName - sets starting ID to be something other than 1\n"
 "usage:\n"
 "   in.tsv is a tab-separated input file.  The first line is the label names and may start with #\n"
-"   spec.txt is a file that says what columns to put into the output, described in more detail below\n"
-"   outDir is a directory that will be populated with tab-separated files\n"
-"The spec.txt file contains one blank line separated stanza per output table.\n"
+"   spec.x is a file that says what columns to put into the output, described in more detail below.\n"
+"The spec.x file contains one blank line separated stanza per output table.\n"
 "Each stanza should look like:\n"
 "        table tableName    key-column\n"
-"        columnName1	sourceField1\n"
-"        columnName2	sourceField2\n"
+"        columnName1	sourceExpression1\n"
+"        columnName2	sourceExpression2\n"
 "              ...\n"
-"if the sourceField is missing it is assumed to be a column of the same name in in.tsv\n"
-"The sourceField can either be a column name in the in.tsv, or a string enclosed literal\n"
-"or an @ followed by a table name, in which case it refers to the key of that table.\n"
-"If the source column is in comma-separated-values format then the sourceField can include a\n"
-"constant array index to pick out an item from the csv list.\n"
+"if the sourceExpression is missing it is assumed to be just a field of the same name from in.tsv\n"
+"Otherwise the sourceExpression can be a strex expression involving fields in in.tsv.\n"
 "\n"
-"If there is a '?' in front of the column name it is taken to mean an optional field.\n"
-"if the corresponding source field does not exist then there's no error (and no output)\n"
-"for that column\n"
-"\n"
-"You can also use strex expressions for more complicated situations.\n"
-"            See src/lib/strex.doc\n"
-"In addition to the table stanza there can be a 'define' stanza that defines variables\n"
-"that can be used in sourceFields for tables.  This looks like:\n"
-"         define\n"
-"         variable1 sourceField1\n"
-"         variable2 sourceField2\n"
+"Each output table has duplicate rows merged using the key-column to determine uniqueness.\n"
+"Please see tabToTabDir.doc in the source code for more information on what can go into spec.x.\n"
 );
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {"id", OPTION_STRING},
    {"startId", OPTION_INT},
    {NULL, 0},
 };
 
 
-static int firstDifferentIx(char **aa, char **bb, int count)
-/* Return true if first count of strings between aa and bb are the same */
-{
-int i;
-for (i=0; i<count; ++i)
-    if (!sameString(aa[i], bb[i]))
-        return i;
-return -1;
-}
-
 enum fieldValType
 /* A type */
     {
-    fvVar, fvLink, fvExp,
+    fvVar, fvLink, fvExp, fvCount,  /* Input field, @table link, strex expression, $count */
+    };
+
+enum combineType
+/* How values of a field from input rows sharing the same key get combined */
+    {
+    ctCount, ctUniq, ctStats,  /* Set by $count, $list and $stats in the spec respectively */
     };
 
 struct newFieldInfo
 /* An expression that can define what fits in a field */
     {
     struct newFieldInfo *next;	/* Might want to hang these on a list. */
     char *name;			/* Name of field in new table */
     enum fieldValType type;	/* Constant, link, or variable */
     int oldIx;			/* For variable and link ones where field is in old table */
     int newIx;			/* Where field is in new table. */
     char *val;			/* For constant ones the string value */
     int arrayIx;		/* If it's an array then the value */
     struct newFieldInfo *link;	/* If it's fvLink then pointer to the linked field */
     struct strexParse *exp;	/* A parsed out string expression */
     boolean optional;		/* If true, then skip rather than stop if old field doesn't exist */
+    struct hash *combineHash;     /* Set for $count/$list/$stats: per-key int, uniqValLister or uniqValCounter */
+    enum combineType combineType;  /* How to combine; only meaningful if combineHash is not NULL */
     };
 
 struct newFieldInfo *findField(struct newFieldInfo *list, char *name)
 /* Find named element in list, or NULL if not found. */
 {
 struct newFieldInfo *el;
 for (el = list; el != NULL; el = el->next)
     if (sameString(name, el->name))
         return el;
 return NULL;
 }
 
 struct newTableInfo
 /* Info on a new table we are making */
     {
     struct newTableInfo *next;	/* Next in list */
     char *name;			/* Name of table */
     struct newFieldInfo *keyField;	/* Key field within table */
     struct newFieldInfo *fieldList; /* List of fields */
     struct fieldedTable *table;	    /* Table to fill in. */
     boolean unroll;		    /* If true it's a table we unroll from arrays */
     };
 
 struct newTableInfo *findTable(struct newTableInfo *list, char *name)
 /* Find named element in list, or NULL if not found. */
 {
 struct newTableInfo *el;
 for (el = list; el != NULL; el = el->next)
     if (sameString(name, el->name))
         return el;
 return NULL;
 }
 
 struct varVal
 /* A variable, what we need to compute it, and it's value */
      {
      struct varVal *next;   /* Next in list */
      char *name;	    /* Variable name */
      struct strexParse *exp;  /* Parsed out expression. */
      char *val;		    /* Computed value - not owned by us. */
      };
 
 struct varVal *varValNew(char *name, struct strexParse *exp)
 /* Allocate new varVal structure */
 {
 struct varVal *v;
 AllocVar(v);
 v->name = cloneString(name);
 v->exp = exp;
 return v;
 }
 
 
 struct symRec
 /* Something we pass as a record to symLookup */
     {
     struct hash *rowHash;	    /* The hash with symbol to row index */
     char **tableRow;		    /* The input row we are working on. You own.*/
     struct hash *varHash;	    /* Variables with varVal values */
     struct varVal *varList;	    /* List of all variables, same info as in hash above. */
     struct lm *lm;		    /* Local memory to use during eval phase */
     char *fileName;		    /* File name of big input tab file */
     int lineIx;			    /* Line number of big input tab file */
     };
 
 struct symRec *symRecNew(struct hash *rowHash, struct hash *varHash, char *fileName, int lineIx)
 /* Return a new symRec. The rowHash is required and contains a hash with
  * values that are indexes into the table row.  The varHash is optional,
  * and if present should have variable names keying parseExp values. */
 {
 struct symRec *rec;
 AllocVar(rec);
 rec->rowHash = rowHash;
 if (varHash != NULL)
     {
     rec->varHash = varHash;
     rec->fileName = fileName;
     rec->lineIx = lineIx;
     }
 return rec;
 }
 
 boolean isTotallySimple(char *s)
 /* We are only alphanumerical and dotty things, we even begin with a alnum or _*/
 {
 char c = *s++;
 if (!isalpha(c) && (c != '_'))
     return FALSE;
 while ((c = *s++) != 0)
     {
     if (!(isalnum(c) || (c == '_') || (c == '.')))
 	return FALSE;
     }
 return TRUE;
 }
 
 int gTotalFields = 0, gStrexFields = 0, gLinkFields = 0;
 
 struct newFieldInfo *parseFieldVal(char *name, 
     char *input, char *fileName, int fileLineNumber, struct symRec  *symbols, StrexLookup lookup)
 /* return a newFieldInfo based on the contents of input, which are not destroyed */
 {
 /* Make up return structure. */
 
 struct newFieldInfo *fv;
 AllocVar(fv);
 char c = name[0];
 if (c == '?')
     {
     fv->optional = TRUE;
     name += 1;
     }
 else if (!isalpha(c) && (c != '_'))
     {
     errAbort("Strange character %c starting line %d of %s", c, fileLineNumber, fileName);
     }
 fv->name = cloneString(name);
 
 char *s = trimSpaces(input);
 if (isEmpty(s))
     {
+    s = cloneString(name);
     fv->type = fvVar;
-    s = fv->val = cloneString(name);
     }
 c = s[0];
 if (c == '@')
     {
     char *val = fv->val = cloneString(skipLeadingSpaces(s+1));
     if (isEmpty(val))
-	errAbort("Nothing following %c", c);
+	errAbort("Nothing following %c line %d of %s", c, fileLineNumber, fileName);
     fv->type = fvLink;
     ++gLinkFields;
     }
 else 
     {
+    if (c == '$')
+	{
+	char *command = skipLeadingSpaces(s+1);
+	s = skipToSpaces(command);
+	fv->combineHash = hashNew(0);
+	if (startsWithWord("count", command))
+	    {
+	    if (!isEmpty(s))
+		errAbort("Something following $count line %d of %s", fileLineNumber, fileName);
+	    fv->combineType = ctCount;
+	    fv->type = fvCount;
+	    }
+        else if (startsWithWord("list", command))
+	    {
+	    fv->combineType = ctUniq;
+	    if (isEmpty(skipLeadingSpaces(s)))
+	        errAbort("Missing parameters to $list line %d of %s", fileLineNumber, fileName);
+	    }
+        else if (startsWithWord("stats", command))
+	    {
+	    fv->combineType = ctStats;
+	    if (isEmpty(skipLeadingSpaces(s)))
+	        errAbort("Missing parameters to $stats line %d of %s", fileLineNumber, fileName);
+	    }
+	else
+	    {
+	    errAbort("Unrecognized command $%s line %d of %s", command, fileLineNumber, fileName);
+	    }
+	}
+    if (fv->combineHash == NULL || fv->combineType != ctCount)
+	{
 	if (isTotallySimple(s) && hashLookup(symbols->varHash, s) == NULL)
 	    {
 	    fv->val = cloneString(skipLeadingSpaces(s));
 	    eraseTrailingSpaces(fv->val);
 	    fv->type = fvVar;
 	    }
 	else
 	    {
 	    fv->val = cloneString(s);
 	    fv->exp = strexParseString(fv->val, fileName, fileLineNumber-1, symbols, lookup);
 	    fv->type = fvExp;
 	    gStrexFields += 1;
 	    }
 	}
+    }
 gTotalFields += 1;
 return fv;
 }
 
 static void symRecSetupPrecomputes(struct symRec *symbols)
 /* Clear out any precomputed variable values - should be
  * executed on each new line of table. */
 {
 /* Clear up any old precomputes - sort of sad these can't currently
  * be shared between output tables. Probably not enough of a time
  * bottleneck to be worth fixing though. */
 struct varVal *v;
 for (v = symbols->varList; v != NULL; v = v->next)
     {
     freez(&v->val);
     }
 }
 
 static void warnHandler(void *record, char *message)
 /* Our warn handler keeps a little hash to keep from repeating
  * messages for every row of the input sometimes. */
 {
 struct symRec *rec = record;
 static struct hash *uniq = NULL;
 if (uniq == NULL) uniq = hashNew(0);
 if (hashLookup(uniq, message) == NULL)
     {
     hashAdd(uniq, message, NULL);
     warn("%s line %d of %s", message, rec->lineIx, rec->fileName);
     }
 }
 
 static char *symLookup(void *record, char *key)
 /* Lookup symbol in hash */
 {
 struct symRec *rec = record;
 char *value = NULL;
 struct varVal *v = hashFindVal(rec->varHash, key);
 if (v != NULL)
     {
     if (v->val == NULL)
        {
        v->val = strexEvalAsString(v->exp, record, symLookup, warnHandler, NULL);
        }
     value = v->val;
     }
 else
     {
     int rowIx = hashIntValDefault(rec->rowHash, key, -1);
     if (rowIx >= 0)
 	value = rec->tableRow[rowIx];
     }
 return value;
 }
 
 static char *symExists(void *record, char *key)
 /* Lookup symbol in hash to see if a variable is there but not to 
  * calculate it's values. */
 {
 struct symRec *rec = record;
 struct varVal *v = hashFindVal(rec->varHash, key);
 if (v != NULL)
     {
     return v->name;
     }
 else
     {
     int rowIx = hashIntValDefault(rec->rowHash, key, -1);
     if (rowIx < 0)
         return NULL;
     return rec->tableRow[rowIx];
     }
 }
 
 
+struct uniqValLister
+/* Unique values seen for one key of a $list field, appended in the order first seen */
+   {
+   struct uniqValLister *next;
+   struct dyString *csv;    // Comma separated list of values seen so far
+   struct hash *uniq;	    // Hash of values seen so far.
+   };
+
+struct oneValCount
+/* Counts occurrences of one value */
+    {
+    struct oneValCount *next;
+    char *name;		// Value being counted - filled in by hashAddSaveName, not allocated here
+    int count;		// Number of times seen
+    };
+
+int oneValCountCmp(const void *va, const void *vb)
+/* Compare two oneValCounts so that the ones with higher counts sort first. */
+{
+const struct oneValCount *a = *((struct oneValCount **)va);
+const struct oneValCount *b = *((struct oneValCount **)vb);
+return b->count - a->count;
+}
+
+struct uniqValCounter
+/* Unique values seen for one key of a $stats field and how often each occurs */
+    {
+    struct uniqValCounter *next;
+    struct hash *uniq;	    // Keyed by value seen so far, with oneValCount values
+    struct oneValCount *list;    // List of uniq values seen so far
+    int total;	    /* Total of counts in list */
+    };
+
 
 void selectUniqueIntoTable(struct fieldedTable *inTable,  struct symRec *symbols,
     char *specFile,  // Just for error reporting
     struct newFieldInfo *fieldList, int keyFieldIx, struct fieldedTable *outTable)
 /* Populate out table with selected unique rows from newTable */
 {
 struct hash *uniqHash = hashNew(0);
 struct fieldedRow *fr;
 int outFieldCount = outTable->fieldCount;
 char *outRow[outFieldCount];
 
 if (slCount(fieldList) != outFieldCount)  // A little cheap defensive programming on inputs
     internalErr();
 
-struct dyString *csvScratch = dyStringNew(0);
 for (fr = inTable->rowList; fr != NULL; fr = fr->next)
     {
     symbols->lineIx = fr->id;
     /* Create new row from a scan through old table */
     char **inRow = fr->row;
     int i;
     struct newFieldInfo *unlinkedFv;
     boolean firstSymInRow = TRUE;  // Avoid updating symbol table until we have to
 
+    /* Fill out the normal fields */
     for (i=0, unlinkedFv=fieldList; i<outFieldCount && unlinkedFv != NULL; 
 	++i, unlinkedFv = unlinkedFv->next)
 	{
 	/* Skip through links. */
 	struct newFieldInfo *fv = unlinkedFv;
 	while (fv->type == fvLink)
 	    fv = fv->link;
 	
 	if (fv->type == fvVar)
 	    outRow[i] = inRow[fv->oldIx];
 	else if (fv->type == fvExp)
 	    {
 	    if (firstSymInRow)
 	        {
 		symbols->tableRow = inRow;
 		symRecSetupPrecomputes(symbols);
 		firstSymInRow = FALSE;
 		}
 	    outRow[i] = strexEvalAsString(fv->exp, symbols, symLookup, warnHandler, NULL);
 	    verbose(2, "evaluated %s to %s\n", fv->val, outRow[i]);
 	    }
+	else
+	    outRow[i] = NULL;
 	}
 
     char *key = outRow[keyFieldIx];
     if (!isEmpty(key))
 	{
+	/* Do any aggregate fields */
+	struct newFieldInfo *fv;
+	for (fv = fieldList; fv != NULL; fv = fv->next)
+	    {
+	    if (fv->combineHash != NULL)
+		{
+		switch (fv->combineType)
+		    {
+		    case ctCount:
+			hashIncInt(fv->combineHash, key);
+			break;
+		    case ctUniq:
+		        {
+			struct uniqValLister *lister = hashFindVal(fv->combineHash, key);
+			if (lister == NULL)
+			    {
+			    AllocVar(lister);
+			    lister->csv = dyStringNew(0);
+			    lister->uniq = hashNew(0);
+			    hashAdd(fv->combineHash, key, lister);
+			    }
+			char *val = outRow[fv->newIx];
+			if (hashLookup(lister->uniq, val) == NULL)
+			    {
+			    hashAdd(lister->uniq, val, NULL);
+			    csvEscapeAndAppend(lister->csv, val);
+			    }
+			break;
+			}
+		    case ctStats:
+		        {
+			struct uniqValCounter *counter = hashFindVal(fv->combineHash, key);
+			if (counter == NULL)
+			    {
+			    AllocVar(counter);
+			    counter->uniq = hashNew(0);
+			    hashAdd(fv->combineHash, key, counter);
+			    }
+			char *val = outRow[fv->newIx];
+			struct oneValCount *one = hashFindVal(counter->uniq, val);
+			if (one == NULL)
+			    {
+			    AllocVar(one);
+			    hashAddSaveName(counter->uniq, val, one, &one->name);
+			    slAddHead(&counter->list, one);
+			    }
+			one->count += 1;
+			counter->total += 1;
+			break;
+			}
+		    }
+		}
+	    }
+
 	struct fieldedRow *uniqFr = hashFindVal(uniqHash, key);
 	if (uniqFr == NULL)
 	    {
 	    uniqFr = fieldedTableAdd(outTable, outRow, outFieldCount, 0);
 	    hashAdd(uniqHash, key, uniqFr);
 	    }
 	else    /* Do error checking for true uniqueness of key */
 	    {
-	    int differentIx = firstDifferentIx(outRow, uniqFr->row, outFieldCount);
-	    if (differentIx >= 0)
+	    int i;
+	    char **uniqRow = uniqFr->row;
+	    for (i=0,fv=fieldList; fv != NULL; fv = fv->next, ++i)
+	        {
+		if (fv->combineHash == NULL)
+		     {
+		     if (!sameString(uniqRow[i], outRow[i]))
 		         {
-		warn("There is a problem with the key to table %s in %s", outTable->name, specFile);
-		warn("%s %s", uniqFr->row[keyFieldIx], uniqFr->row[differentIx]);
-		warn("%s %s", outRow[keyFieldIx], outRow[differentIx]);
+			 verbose(2, "fv->type of %s is %d\n", fv->name, (int)fv->type);
+			 warn("There is a problem with the key to table %s in %s", 
+			     outTable->name, specFile);
+			 warn("%s %s", uniqFr->row[keyFieldIx], uniqFr->row[i]);
+			 warn("%s %s", outRow[keyFieldIx], outRow[i]);
 			 warn("both exist, so key doesn't specify a unique %s field", 
-		    outTable->fields[differentIx]);
+			     outTable->fields[i]);
 			 errAbort("line %d of %s", fr->id, inTable->name);
 			 }
 		     }
 		}
 	    }
-dyStringFree(&csvScratch);
+	}
+    }
+
+/* Make a loop through output table fixing up aggregation-oriented fields */
+    {
+    struct newFieldInfo *fv;
+    for (fv = fieldList; fv != NULL; fv = fv->next)
+        {
+	if (fv->combineHash != NULL)
+	    {
+	    for (fr = outTable->rowList; fr != NULL; fr = fr->next)
+	        {
+		char *key = fr->row[keyFieldIx];
+		switch (fv->combineType)
+		    {
+		    case ctCount:
+			{
+			char countBuf[16];
+			safef(countBuf, sizeof(countBuf), "%d", hashIntVal(fv->combineHash, key));
+			fr->row[fv->newIx] = lmCloneString(outTable->lm, countBuf);
+			break;
+			}
+		    case ctUniq:
+			{
+			struct uniqValLister *lister = hashMustFindVal(fv->combineHash, key);
+			fr->row[fv->newIx] = lister->csv->string;
+			break;
+			}
+		    case ctStats:
+		        {
+			struct uniqValCounter *counter = hashMustFindVal(fv->combineHash, key);
+			struct dyString *dy = dyStringNew(0);
+			struct oneValCount *el;
+			slSort(&counter->list, oneValCountCmp);
+			for (el = counter->list; el != NULL; el = el->next)
+			    {
+			    dyStringPrintf(dy, "%s(%d %d%%),", el->name, el->count, 
+				round(100.0 * el->count / counter->total));
+			    }
+			fr->row[fv->newIx] = dyStringCannibalize(&dy);
+			break;
+			}
+		    }
+		}
+	    }
+	}
+    }
 }
 
 
 
 struct hash *hashFieldIx(char **fields, int fieldCount)
 /* Create a hash filled with fields with integer valued indexes */
 {
 int i;
 struct hash *hash = hashNew(0);
 for (i=0; i<fieldCount; ++i)
    hashAdd(hash, fields[i], intToPt(i));
 return hash;
 }
 
 struct fieldedTable *unrollTable(struct fieldedTable *input)
 /* Unroll input table,  which has to be filled with lockstepped CSV fields */
 {
 /* Make output table with fields matching input */
 int fieldCount = input->fieldCount;
 struct fieldedTable *output = fieldedTableNew(input->name, input->fields, fieldCount);
 output->startsSharp = input->startsSharp;
 
 /* We are going to be lots of splicing and dicing, so have some scratch space,
  * including some we'll store with the output tables local memory pool. */
 struct lm *lm = output->lm;
 struct dyString *scratch = dyStringNew(0);
 
 struct fieldedRow *inRow;
 for (inRow = input->rowList; inRow != NULL; inRow = inRow->next)
     {
     /* We are going to parse a bunch of csv's in parallel */
     char *inPos[fieldCount];
     int i;
     for (i=0; i<fieldCount; ++i)
 	inPos[i] = inRow->row[i];
 
     /* With this loop we parse out the next csv from all fields, and make sure that
      * they all actually do have the same number of values */
     int unrollCount = 0;
     for (;;)
        {
        char *uncsvRow[fieldCount];
        boolean anyNull = FALSE, allNull = TRUE;
        for (i=0; i<fieldCount; ++i)
            {
 	   char *oneVal = csvParseNext(&inPos[i], scratch);
 	   if (oneVal == NULL)
 	       anyNull = TRUE;
 	   else
 	       allNull = FALSE;
 	   uncsvRow[i] = lmCloneString(lm, oneVal);
 	   }
        if (anyNull)
            {
 	   if (allNull)
 	        break;	    // All is good!
 	   else
 	        errAbort("Can't unroll %s since not all fields have the same numbers of values.\n"
 		         "In row %d some have %d values, some more", 
 			 input->name, inRow->id, unrollCount);
 	   }
        ++unrollCount;
        fieldedTableAdd(output, uncsvRow, fieldCount, unrollCount);
        }
     }
 return output;
 }
 
 void tabToTabDir(char *inTabFile, char *specFile, char *outDir)
 /* tabToTabDir - Convert a large tab-separated table to a directory full of such tables 
  * according to a specification.. */
 {
 /* Read input table */
 struct fieldedTable *inTable = fieldedTableFromTabFile(inTabFile, inTabFile, NULL, 0);
 verbose(1, "Read %d columns, %d rows from %s\n", inTable->fieldCount, inTable->rowCount,
     inTabFile);
 
 /* Create what we need for managing strex's symbol table. */
 struct hash *inFieldHash = hashFieldIx(inTable->fields, inTable->fieldCount);
 struct hash *varHash = hashNew(5);
 struct symRec *symbols = symRecNew(inFieldHash, varHash, inTabFile, 0); 
 symbols->tableRow = inTable->fields;   // During parse pass fields will act as proxy for tableRow
 
 /* Snoop for a define stanza first that'll hold our variables. */
 struct lineFile *lf = lineFileOpen(specFile, TRUE);
 char *defLine;
 if (!lineFileNextReal(lf, &defLine))
      errAbort("%s is empty", specFile);
 if (startsWithWord("define",  defLine))  // Whee, we got vars! 
     {
     char *varName, *varSpec;
     while (raNextTagVal(lf, &varName, &varSpec, NULL))
         {
 	if (varSpec == NULL)
 	    errAbort("Expecting expression for variable %s line %d of %s", varName,
 		lf->lineIx, lf->fileName);
 	verbose(2, "var %s (%s)\n", varName, varSpec);
 	struct strexParse *exp = strexParseString(varSpec, lf->fileName, lf->lineIx-1, 
 	    symbols, symExists);
 	struct varVal *v = varValNew(varName, exp);
 	hashAdd(varHash, varName, v);
 	slAddHead(&symbols->varList, v);
 	}
     slReverse(&symbols->varList);
     }
 else
     lineFileReuse(lf);
 
 
 /* Read in rest of spec file as ra stanzas full of tables more or less */
 struct newTableInfo *newTableList = NULL, *newTable;
 while (raSkipLeadingEmptyLines(lf, NULL))
     {
     /* Read first tag, which we know is there because it's right after raSkipLeadingEmptyLines.
      * Make sure the tag is table, and that there is a following table name and key field name. */
     char *tableString, *tableSpec;
     boolean unroll = FALSE;
     raNextTagVal(lf, &tableString, &tableSpec, NULL);
     verbose(2, "Processing table %s '%s' line %d of %s\n",  tableString, tableSpec, 
 	lf->lineIx, lf->fileName);
     if (sameString(tableString, "unroll"))
         unroll = TRUE;
     else if (!sameString(tableString, "table"))
         errAbort("stanza that doesn't start with 'table' or 'unroll' ending line %d of %s",
 	    lf->lineIx, lf->fileName);
     char *tableName = nextWord(&tableSpec);
     char *keyFieldName = cloneString(nextWord(&tableSpec));
     if (isEmpty(keyFieldName))
        errAbort("No key field for table %s line %d of %s", tableName, lf->lineIx, lf->fileName);
 
     /* Start filling out newTable with these fields */
     AllocVar(newTable);
     newTable->unroll = unroll;
     newTable->name = cloneString(tableName);
     tableName = newTable->name;  /* Keep this handy variable. */
 
     /* Make up field list out of rest of the stanza */
     struct newFieldInfo *fvList = NULL;
     char *fieldName, *fieldSpec;
     int fieldCount = 0;
     while (raNextTagVal(lf, &fieldName, &fieldSpec, NULL))
         {
 	verbose(2, "  fieldName %s fieldSpec (%s)\n", fieldName, fieldSpec);
 	struct newFieldInfo *fv = parseFieldVal(fieldName, 
 	    fieldSpec, lf->fileName, lf->lineIx, symbols, symExists);
 	if (fv->type == fvVar)
 	    {
 	    char *oldName = fieldSpec;
 	    if (isEmpty(oldName))
 	       oldName = fieldName;
 	    int oldIx = stringArrayIx(oldName, inTable->fields, inTable->fieldCount);
 	    if (oldIx < 0)
 	       {
 	       if (fv->optional)
 	           continue;	    // Just skip optional ones we don't have
-	       errAbort("%s doesn't exist in the %d fields of %s line %d of %s", 
+	       errAbort("'%s' doesn't exist in the %d fields of %s line %d of %s", 
 		oldName, inTable->fieldCount, inTable->name,
 		    lf->lineIx, lf->fileName);
 	       }
 	    fv->oldIx = oldIx;
 	    }
 	fv->newIx = fieldCount++;
 	slAddHead(&fvList, fv);
 	}
     slReverse(&fvList);
 
     /* Create array of field names for output. */
     char *fieldNames[fieldCount];
     int i;
     struct newFieldInfo *fv = NULL;
     for (i=0, fv=fvList; i<fieldCount; ++i, fv=fv->next)
 	fieldNames[i] = fv->name;
 
     /* Create empty output table and track which fields of input go to output. */
     struct fieldedTable *outTable = fieldedTableNew(tableName, fieldNames, fieldCount);
     outTable->startsSharp = inTable->startsSharp;
 
     /* Make sure that key field is actually in field list */
     struct newFieldInfo *keyField = findField(fvList, keyFieldName);
     if (keyField == NULL)
        errAbort("key field %s is not found in field list for %s in %s\n", 
 	    keyFieldName, tableName, lf->fileName);
 
     /* Allocate structure to save results of this pass in and so so. */
     newTable->keyField = keyField;
     newTable->fieldList = fvList;
     newTable->table = outTable;
     slAddHead(&newTableList, newTable);
 
     /* Clean up */
     freez(&keyFieldName);
     }
 slReverse(&newTableList);
 
 /* Do links between tables */
 for (newTable = newTableList; newTable != NULL; newTable = newTable->next)
     {
     struct newFieldInfo *field;
     for (field = newTable->fieldList; field != NULL; field = field->next)
       {
       if (field->type == fvLink)
           {
 	  struct newTableInfo *linkedTable = findTable(newTableList, field->val);
 	  if (linkedTable == NULL)
 	     errAbort("@%s doesn't exist", field->name);
 	  field->link = linkedTable->keyField;
 	  }
       }
     }
 
 makeDirsOnPath(outDir);
 
 /* Output tables */
 verbose(1, "Outputting %d tables to %s\n", slCount(newTableList), outDir);
 for (newTable = newTableList; newTable != NULL; newTable = newTable->next)
     {
     /* Populate table */
     struct fieldedTable *outTable = newTable->table;
     selectUniqueIntoTable(inTable, symbols, specFile,
 	newTable->fieldList, newTable->keyField->newIx, outTable);
 
     /* If need be unroll table */
     if (newTable->unroll)
         {
 	outTable = unrollTable(outTable);
 	}
 
     /* Create output file name and save file. */
     char outTabName[FILENAME_LEN];
     safef(outTabName, sizeof(outTabName), "%s/%s.tsv", outDir, newTable->name);
     verbose(1, "Writing %s of %d columns %d rows\n",  
 	outTabName, outTable->fieldCount, outTable->rowCount);
     fieldedTableToTabFileWithId(outTable, outTabName, clId, clStartId);
     }
 verbose(1, "%d fields, %d (%g%%) evaluated with strex, %d (%.2f) links\n", 
     gTotalFields,  gStrexFields, 100.0 * gStrexFields / gTotalFields,
     gLinkFields, 100.0 * gLinkFields/gTotalFields);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 clId = optionVal("id", clId);
 clStartId = optionInt("startId", clStartId);
 if (argc != 4)
     usage();
 tabToTabDir(argv[1], argv[2], argv[3]);
 return 0;
 }