27f5d7f9b4cc6d337ab34422b25a1c94553623e9 kent Fri Aug 16 11:54:34 2019 -0700 Adding a define stanza with variables that get evaluated each line. diff --git src/tabFile/tabToTabDir/tabToTabDir.c src/tabFile/tabToTabDir/tabToTabDir.c index 55bf29e..4dee597 100644 --- src/tabFile/tabToTabDir/tabToTabDir.c +++ src/tabFile/tabToTabDir/tabToTabDir.c @@ -23,31 +23,37 @@ "where:\n" " in.tsv is a tab-separated input file. The first line is the label names and may start with #\n" " spec.txt is a file that says what columns to put into the output, described in more detail below\n" " outDir is a directory that will be populated with tab-separated files\n" "The spec.txt file contains one blank line separated stanza per output table.\n" "Each stanza should look like:\n" " table tableName key-column\n" " columnName1 sourceField1\n" " columnName2 sourceField2\n" " ...\n" "if the sourceField is missing it is assumed to be a column of the same name in in.tsv\n" "The sourceField can either be a column name in the in.tsv, or a string enclosed literal\n" "or an @ followed by a table name, in which case it refers to the key of that table.\n" "If the source column is in comma-separated-values format then the sourceField can include a\n" "constant array index to pick out an item from the csv list.\n" -"You can also use strex expressions for more complicated situations. See src/lib/strex.doc\n" +"You can also use strex expressions for more complicated situations.\n" +" See src/lib/strex.doc\n" +"In addition to the table stanza there can be a 'define' stanza that defines variables\n" +"that can be used in sourceFields for tables. This looks like:\n" +" define\n" +" variable1 sourceField1\n" +" variable2 sourceField2\n" ); } /* Command line validation table. */ static struct optionSpec options[] = { {NULL, 0}, }; static int firstDifferentIx(char **aa, char **bb, int count) /* Return true if first count of strings between aa and bb are the same */ { int i; for (i=0; iname = cloneString(name); char *s = skipLeadingSpaces(input); if (isEmpty(s)) { fv->type = fvVar; fv->val = cloneString(name); } else { char c = s[0]; if (c == '@') { char *val = fv->val = cloneString(skipLeadingSpaces(s+1)); trimSpaces(val); if (isEmpty(val)) errAbort("Nothing following %c", c); fv->type = fvLink; } else { - if (isTotallySimple(s)) + if (isTotallySimple(s) && lookup(symbols, s) == NULL) { fv->val = cloneString(skipLeadingSpaces(s)); eraseTrailingSpaces(fv->val); fv->type = fvVar; } else { fv->val = cloneString(s); fv->exp = strexParseString(fv->val, fileName, fileLineNumber-1, symbols, lookup); fv->type = fvExp; } } } return fv; } +struct varVal +/* A variable, what we need to compute it, and it's value */ + { + struct varVal *next; /* Next in list */ + char *name; /* Variable name */ + struct strexParse *exp; /* Parsed out expression. */ + char *val; /* Computed value - not owned by us. */ + }; + +struct varVal *varValNew(char *name, struct strexParse *exp) +/* Allocate new varVal structure */ +{ +struct varVal *v; +AllocVar(v); +v->name = cloneString(name); +v->exp = exp; +return v; +} + + struct symRec /* Something we pass as a record to symLookup */ { - struct hash *hash; /* The hash with symbol to row index */ - char **row; /* The row we are working on */ + struct hash *rowHash; /* The hash with symbol to row index */ + char **tableRow; /* The input row we are working on. You own.*/ + char **varRow; /* A slot for each computed variables results. We own */ + struct hash *varHash; /* Variables with varVal values */ + struct varVal *varList; /* List of all variables, same info as in hash above. */ + struct lm *lm; /* Local memory to use during eval phase */ }; +struct symRec *symRecNew(struct hash *rowHash, struct hash *varHash) +/* Return a new symRec. The rowHash is required and contains a hash with + * values that are indexes into the table row. The varHash is optional, + * and if present should have variable names keying parseExp values. */ +{ +struct symRec *rec; +AllocVar(rec); +rec->rowHash = rowHash; +if (varHash != NULL) + { + rec->varHash = varHash; + } +return rec; +} + +static void symRecSetupPrecomputes(struct symRec *symbols) +/* Clear out any precomputed variable values - should be + * executed on each new line of table. */ +{ +/* Clear up any old precomputes - sort of sad these can't currently + * be shared between output tables. Probably not enough of a time + * bottleneck to be worth fixing though. */ +struct varVal *v; +for (v = symbols->varList; v != NULL; v = v->next) + { + freez(&v->val); + } +} + static char *symLookup(void *record, char *key) /* Lookup symbol in hash */ { struct symRec *rec = record; -struct hash *hash = rec->hash; -char **row = rec->row; -int rowIx = hashIntValDefault(hash, key, -1); -if (rowIx < 0) - return NULL; +char *value = NULL; +struct varVal *v = hashFindVal(rec->varHash, key); +if (v != NULL) + { + if (v->val == NULL) + { + v->val = strexEvalAsString(v->exp, record, symLookup); + } + value = v->val; + } else - return row[rowIx]; + { + int rowIx = hashIntValDefault(rec->rowHash, key, -1); + if (rowIx >= 0) + value = rec->tableRow[rowIx]; + } +return value; } -void selectUniqueIntoTable(struct fieldedTable *inTable, struct hash *inFieldHash, +void selectUniqueIntoTable(struct fieldedTable *inTable, struct symRec *symbols, char *specFile, // Just for error reporting struct newFieldInfo *fieldList, int keyFieldIx, struct fieldedTable *outTable) /* Populate out table with selected rows from newTable */ { struct hash *uniqHash = hashNew(0); struct fieldedRow *fr; int outFieldCount = outTable->fieldCount; char *outRow[outFieldCount]; if (slCount(fieldList) != outFieldCount) // A little cheap defensive programming on inputs internalErr(); struct dyString *csvScratch = dyStringNew(0); for (fr = inTable->rowList; fr != NULL; fr = fr->next) { /* Create new row from a scan through old table */ char **inRow = fr->row; int i; struct newFieldInfo *unlinkedFv; + boolean firstSymInRow = TRUE; // Avoid updating symbol table until we have to + for (i=0, unlinkedFv=fieldList; inext) { /* Skip through links. */ struct newFieldInfo *fv = unlinkedFv; while (fv->type == fvLink) fv = fv->link; if (fv->type == fvVar) outRow[i] = inRow[fv->oldIx]; else if (fv->type == fvExp) { - struct symRec symRec = {inFieldHash, inRow}; - outRow[i] = strexEvalAsString(fv->exp, &symRec, symLookup); + if (firstSymInRow) + { + symbols->tableRow = inRow; + symRecSetupPrecomputes(symbols); + firstSymInRow = FALSE; + } + outRow[i] = strexEvalAsString(fv->exp, symbols, symLookup); verbose(2, "evaluated %s to %s\n", fv->val, outRow[i]); } } char *key = outRow[keyFieldIx]; if (!isEmpty(key)) { struct fieldedRow *uniqFr = hashFindVal(uniqHash, key); if (uniqFr == NULL) { uniqFr = fieldedTableAdd(outTable, outRow, outFieldCount, 0); hashAdd(uniqHash, key, uniqFr); } else /* Do error checking for true uniqueness of key */ { @@ -260,67 +336,93 @@ for (i=0; ifieldCount, inTable->rowCount, inTabFile); -/* Compute info on the fields */ +/* Create what we need for managing strex's symbol table. */ struct hash *inFieldHash = hashFieldIx(inTable->fields, inTable->fieldCount); -struct symRec symbols = {inFieldHash, inTable->fields}; // Sym lookup just returns symbol name during parsing +struct hash *varHash = hashNew(5); +struct symRec *symbols = symRecNew(inFieldHash, varHash); +symbols->tableRow = inTable->fields; // During parse pass fields will act as proxy for tableRow +/* Open spec file, check first real line, and maybe start defining variables. */ -/* Read in spec file as ra file stanzas that we convert into tableInfos. */ +/* Snoop for a define stanza first that'll hold our variables. */ struct lineFile *lf = lineFileOpen(specFile, TRUE); +char *defLine; +if (!lineFileNextReal(lf, &defLine)) + errAbort("%s is empty", specFile); +if (startsWithWord("define", defLine)) // Whee, we got vars! + { + char *varName, *varSpec; + while (raNextTagVal(lf, &varName, &varSpec, NULL)) + { + verbose(1, "var %s (%s)\n", varName, varSpec); + struct strexParse *exp = strexParseString(varSpec, lf->fileName, lf->lineIx-1, + symbols, symLookup); + strexParseDump(exp, 1, uglyOut); + struct varVal *v = varValNew(varName, exp); + hashAdd(varHash, varName, v); + slAddHead(&symbols->varList, v); + } + slReverse(&symbols->varList); + } +else + lineFileReuse(lf); + + +/* Read in rest of spec file as ra stanzas full of tables more or less */ struct newTableInfo *newTableList = NULL, *newTable; while (raSkipLeadingEmptyLines(lf, NULL)) { /* Read first tag, which we know is there because it's right after raSkipLeadingEmptyLines. * Make sure the tag is table, and that there is a following table name and key field name. */ char *tableString, *tableSpec; raNextTagVal(lf, &tableString, &tableSpec, NULL); verbose(2, "Processing table %s '%s' line %d of %s\n", tableString, tableSpec, lf->lineIx, lf->fileName); if (!sameString(tableString, "table")) errAbort("stanza that doesn't start with 'table' ending line %d of %s", lf->lineIx, lf->fileName); char *tableName = nextWord(&tableSpec); char *keyFieldName = cloneString(nextWord(&tableSpec)); if (isEmpty(keyFieldName)) errAbort("No key field for table %s line %d of %s", tableName, lf->lineIx, lf->fileName); /* Start filling out newTable with these fields */ AllocVar(newTable); newTable->name = cloneString(tableName); tableName = newTable->name; /* Keep this handy variable. */ /* Make up field list out of rest of the stanza */ struct newFieldInfo *fvList = NULL; char *fieldName, *fieldSpec; int fieldCount = 0; while (raNextTagVal(lf, &fieldName, &fieldSpec, NULL)) { verbose(2, " fieldName %s fieldSpec (%s)\n", fieldName, fieldSpec); - struct newFieldInfo *fv = parseFieldVal(fieldName, inFieldHash, - fieldSpec, lf->fileName, lf->lineIx, &symbols, symLookup); + struct newFieldInfo *fv = parseFieldVal(fieldName, + fieldSpec, lf->fileName, lf->lineIx, symbols, symLookup); if (fv->type == fvVar) { char *oldName = fieldSpec; if (isEmpty(oldName)) oldName = fieldName; int oldIx = stringArrayIx(oldName, inTable->fields, inTable->fieldCount); if (oldIx < 0) errAbort("%s doesn't exist in the %d fields of %s line %d of %s", oldName, inTable->fieldCount, inTable->name, lf->lineIx, lf->fileName); fv->oldIx = oldIx; } fv->newIx = fieldCount++; slAddHead(&fvList, fv); } @@ -361,35 +463,36 @@ for (field = newTable->fieldList; field != NULL; field = field->next) { if (field->type == fvLink) { struct newTableInfo *linkedTable = findTable(newTableList, field->val); if (linkedTable == NULL) errAbort("@%s doesn't exist", field->name); field->link = linkedTable->keyField; } } } makeDirsOnPath(outDir); /* Output tables */ +verbose(1, "Outputting %d tables to %s\n", slCount(newTableList), outDir); for (newTable = newTableList; newTable != NULL; newTable = newTable->next) { /* Populate table */ struct fieldedTable *outTable = newTable->table; - selectUniqueIntoTable(inTable, inFieldHash, specFile, + selectUniqueIntoTable(inTable, symbols, specFile, newTable->fieldList, newTable->keyField->newIx, outTable); /* Create output file name and save file. */ char outTabName[FILENAME_LEN]; safef(outTabName, sizeof(outTabName), "%s/%s.tsv", outDir, newTable->name); verbose(1, "Writing %s of %d columns %d rows\n", outTabName, outTable->fieldCount, outTable->rowCount); fieldedTableToTabFile(outTable, outTabName); } } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options);