0b7fc0972a745005ac329190569f51b12877ebe0 kent Sun Sep 15 15:24:53 2019 -0700 Adding unroll for unpacking csv arrays into a table with multiple rows. The driving use case is the contacts/contributors merge for HCA. diff --git src/tabFile/tabToTabDir/tabToTabDir.c src/tabFile/tabToTabDir/tabToTabDir.c index e3fc031..2cdfbc9 100644 --- src/tabFile/tabToTabDir/tabToTabDir.c +++ src/tabFile/tabToTabDir/tabToTabDir.c @@ -1,28 +1,29 @@ /* tabToTabDir - Convert a large tab-separated table to a directory full of such tables according * to a specification.. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "obscure.h" #include "sqlNum.h" #include "portable.h" #include "ra.h" #include "csv.h" #include "fieldedTable.h" #include "strex.h" +#include "localmem.h" char *clId = NULL; // Flag set from command line to add an id column int clStartId = 1; // What number id column should start with void usage() /* Explain usage and exit. */ { errAbort( "tabToTabDir - Convert a large tab-separated table to a directory full of such tables according\n" "to a specification.\n" "command line:\n" " tabToTabDir in.tsv spec.txt outDir\n" "options:\n" " -id=fieldName - Add a numeric id field of given name that starts at 1 and autoincrements \n" " for each table\n" @@ -102,30 +103,31 @@ struct newFieldInfo *el; for (el = list; el != NULL; el = el->next) if (sameString(name, el->name)) return el; return NULL; } struct newTableInfo /* Info on a new table we are making */ { struct newTableInfo *next; /* Next in list */ char *name; /* Name of table */ struct newFieldInfo *keyField; /* Key field within table */ struct newFieldInfo *fieldList; /* List of fields */ struct fieldedTable *table; /* Table to fill in. */ + boolean unroll; /* If true it's a table we unroll from arrays */ }; struct newTableInfo *findTable(struct newTableInfo *list, char *name) /* Find named element in list, or NULL if not found. 
*/ { struct newTableInfo *el; for (el = list; el != NULL; el = el->next) if (sameString(name, el->name)) return el; return NULL; } struct varVal /* A variable, what we need to compute it, and it's value */ { @@ -391,30 +393,83 @@ dyStringFree(&csvScratch); } struct hash *hashFieldIx(char **fields, int fieldCount) /* Create a hash filled with fields with integer valued indexes */ { int i; struct hash *hash = hashNew(0); for (i=0; i<fieldCount; ++i) hashAdd(hash, fields[i], intToPt(i)); return hash; } +struct fieldedTable *unrollTable(struct fieldedTable *input) +/* Unroll input table, which has to be filled with lockstepped CSV fields */ +{ +/* Make output table with fields matching input */ +int fieldCount = input->fieldCount; +struct fieldedTable *output = fieldedTableNew(input->name, input->fields, fieldCount); +output->startsSharp = input->startsSharp; + +/* We are going to be doing lots of splicing and dicing, so have some scratch space, + * including some we'll store with the output table's local memory pool. */ +struct lm *lm = output->lm; +struct dyString *scratch = dyStringNew(0); + +struct fieldedRow *inRow; +for (inRow = input->rowList; inRow != NULL; inRow = inRow->next) + { + /* We are going to parse a bunch of csv's in parallel */ + char *inPos[fieldCount]; + int i; + for (i=0; i<fieldCount; ++i) + inPos[i] = inRow->row[i]; + + /* With this loop we parse out the next csv from all fields, and make sure that + * they all actually do have the same number of values */ + int unrollCount = 0; + for (;;) + { + char *uncsvRow[fieldCount]; + boolean anyNull = FALSE, allNull = TRUE; + for (i=0; i<fieldCount; ++i) + { + char *oneVal = csvParseNext(&inPos[i], scratch); + if (oneVal == NULL) + anyNull = TRUE; + else + allNull = FALSE; + uncsvRow[i] = lmCloneString(lm, oneVal); + } + if (anyNull) + { + if (allNull) + break; // All is good! 
+ else + errAbort("Can't unroll %s since not all fields have the same numbers of values.\n" + "In row %d some have %d values, some more", + input->name, inRow->id, unrollCount); + } + ++unrollCount; + fieldedTableAdd(output, uncsvRow, fieldCount, unrollCount); + } + } +return output; +} void tabToTabDir(char *inTabFile, char *specFile, char *outDir) /* tabToTabDir - Convert a large tab-separated table to a directory full of such tables * according to a specification.. */ { /* Read input table */ struct fieldedTable *inTable = fieldedTableFromTabFile(inTabFile, inTabFile, NULL, 0); verbose(1, "Read %d columns, %d rows from %s\n", inTable->fieldCount, inTable->rowCount, inTabFile); /* Create what we need for managing strex's symbol table. */ struct hash *inFieldHash = hashFieldIx(inTable->fields, inTable->fieldCount); struct hash *varHash = hashNew(5); struct symRec *symbols = symRecNew(inFieldHash, varHash, inTabFile, 0); symbols->tableRow = inTable->fields; // During parse pass fields will act as proxy for tableRow @@ -440,43 +495,47 @@ slAddHead(&symbols->varList, v); } slReverse(&symbols->varList); } else lineFileReuse(lf); /* Read in rest of spec file as ra stanzas full of tables more or less */ struct newTableInfo *newTableList = NULL, *newTable; while (raSkipLeadingEmptyLines(lf, NULL)) { /* Read first tag, which we know is there because it's right after raSkipLeadingEmptyLines. * Make sure the tag is table, and that there is a following table name and key field name. 
*/ char *tableString, *tableSpec; + boolean unroll = FALSE; raNextTagVal(lf, &tableString, &tableSpec, NULL); verbose(2, "Processing table %s '%s' line %d of %s\n", tableString, tableSpec, lf->lineIx, lf->fileName); - if (!sameString(tableString, "table")) - errAbort("stanza that doesn't start with 'table' ending line %d of %s", + if (sameString(tableString, "unroll")) + unroll = TRUE; + else if (!sameString(tableString, "table")) + errAbort("stanza that doesn't start with 'table' or 'unroll' ending line %d of %s", lf->lineIx, lf->fileName); char *tableName = nextWord(&tableSpec); char *keyFieldName = cloneString(nextWord(&tableSpec)); if (isEmpty(keyFieldName)) errAbort("No key field for table %s line %d of %s", tableName, lf->lineIx, lf->fileName); /* Start filling out newTable with these fields */ AllocVar(newTable); + newTable->unroll = unroll; newTable->name = cloneString(tableName); tableName = newTable->name; /* Keep this handy variable. */ /* Make up field list out of rest of the stanza */ struct newFieldInfo *fvList = NULL; char *fieldName, *fieldSpec; int fieldCount = 0; while (raNextTagVal(lf, &fieldName, &fieldSpec, NULL)) { verbose(2, " fieldName %s fieldSpec (%s)\n", fieldName, fieldSpec); struct newFieldInfo *fv = parseFieldVal(fieldName, fieldSpec, lf->fileName, lf->lineIx, symbols, symExists); if (fv->type == fvVar) { char *oldName = fieldSpec; @@ -541,30 +600,36 @@ } } } makeDirsOnPath(outDir); /* Output tables */ verbose(1, "Outputting %d tables to %s\n", slCount(newTableList), outDir); for (newTable = newTableList; newTable != NULL; newTable = newTable->next) { /* Populate table */ struct fieldedTable *outTable = newTable->table; selectUniqueIntoTable(inTable, symbols, specFile, newTable->fieldList, newTable->keyField->newIx, outTable); + /* If need be unroll table */ + if (newTable->unroll) + { + outTable = unrollTable(outTable); + } + /* Create output file name and save file. 
*/ char outTabName[FILENAME_LEN]; safef(outTabName, sizeof(outTabName), "%s/%s.tsv", outDir, newTable->name); verbose(1, "Writing %s of %d columns %d rows\n", outTabName, outTable->fieldCount, outTable->rowCount); fieldedTableToTabFileWithId(outTable, outTabName, clId, clStartId); } verbose(1, "%d fields, %d (%g%%) evaluated with strex, %d (%.2f) links\n", gTotalFields, gStrexFields, 100.0 * gStrexFields / gTotalFields, gLinkFields, 100.0 * gLinkFields/gTotalFields); } int main(int argc, char *argv[]) /* Process command line. */ {