5864663e6eb6f35d2f994a33e18494d1f393fc25 kent Wed Nov 24 14:17:48 2021 -0800
Added -sort option.

diff --git src/tabFile/tabToTabDir/tabToTabDir.c src/tabFile/tabToTabDir/tabToTabDir.c
index 19d48d7..77eaa24 100644
--- src/tabFile/tabToTabDir/tabToTabDir.c
+++ src/tabFile/tabToTabDir/tabToTabDir.c
@@ -3,65 +3,68 @@
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "obscure.h"
 #include "sqlNum.h"
 #include "portable.h"
 #include "ra.h"
 #include "csv.h"
 #include "fieldedTable.h"
 #include "strex.h"
 #include "localmem.h"
 
 char *clId = NULL; // Flag set from command line to add an id column
 int clStartId = 1; // What number id column should start with
+boolean clSort = FALSE; // Set from the -sort flag; when TRUE each output table is sorted on its key field before it is written
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
 "tabToTabDir - Convert a large tab-separated table to a directory full of such tables according\n"
 "to a specification. The program is designed to make it relatively easy to unpack overloaded\n"
 "single fields into multiple fields, and to created normalized less redundant representations.\n"
 "The command line is:\n"
 " tabToTabDir in.tsv spec.x outDir\n"
 "options:\n"
 " -id=fieldName - Add a numeric id field of given name that starts at 1 and autoincrements \n"
 " for each table\n"
 " -startId=fieldName - sets starting ID to be something other than 1\n"
+" -sort - if set then sort tables before output\n"
 "usage:\n"
 " in.tsv is a tab-separated input file. The first line is the label names and may start with #\n"
 " spec.x is a file that says what columns to put into the output, described in more detail below.\n"
 "The spec.x file contains one blank line separated stanza per output table.\n"
 "Each stanza should look like:\n"
 " table tableName key-column\n"
 " columnName1 sourceExpression1\n"
 " columnName2 sourceExpression2\n"
 " ...\n"
 "if the sourceExpression is missing it is assumed to be a just a field of the same name from in.tsv\n"
 "Otherwise the sourceExpression can be a strex expression involving fields in in.tsv.\n"
 "\n"
 "Each output table has duplicate rows merged using the key-column to determine uniqueness.\n"
 "Please see tabToTabDir.doc in the source code for more information on what can go into spec.x.\n"
 );
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {"id", OPTION_STRING},
    {"startId", OPTION_INT},
+   {"sort", OPTION_BOOLEAN},
    {NULL, 0},
 };
 
 enum fieldValType
 /* A type */
     {
     fvVar, fvLink, fvExp, fvCount,
     };
 
 enum combineType
 /* A way to combine values from a field */
     {
     ctCount, ctUniq, ctStats,
     };
 
@@ -766,38 +769,43 @@
 /* Output tables */
 verbose(1, "Outputting %d tables to %s\n", slCount(newTableList), outDir);
 for (newTable = newTableList; newTable != NULL; newTable = newTable->next)
     {
     /* Populate table */
     struct fieldedTable *outTable = newTable->table;
     selectUniqueIntoTable(inTable, symbols, specFile, newTable->fieldList,
         newTable->keyField->newIx, outTable);
 
     /* If need be unroll table */
     if (newTable->unroll)
         {
         outTable = unrollTable(outTable);
         }
 
+    /* With -sort, sort the (possibly unrolled) table on its key field before writing; NOTE(review): the FALSE argument is presumably "not reversed" - confirm in fieldedTable.h */
+    if (clSort)
+        fieldedTableSortOnField(outTable, newTable->keyField->name, FALSE);
+
     /* Create output file name and save file. */
     char outTabName[FILENAME_LEN];
     safef(outTabName, sizeof(outTabName), "%s/%s.tsv", outDir, newTable->name);
     verbose(1, "Writing %s of %d columns %d rows\n",
         outTabName, outTable->fieldCount, outTable->rowCount);
     fieldedTableToTabFileWithId(outTable, outTabName, clId, clStartId);
     }
 verbose(1, "%d fields, %d (%g%%) evaluated with strex, %d (%.2f) links\n",
     gTotalFields, gStrexFields, 100.0 * gStrexFields / gTotalFields,
     gLinkFields, 100.0 * gLinkFields/gTotalFields);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 clId = optionVal("id", clId);
 clStartId = optionInt("startId", clStartId);
+clSort = optionExists("sort");
 if (argc != 4)
     usage();
 tabToTabDir(argv[1], argv[2], argv[3]);
 return 0;
 }