1bd63a5b1364a075195e59a06c8161244f934c59 kent Wed Jul 31 13:31:19 2019 -0700 The tabToTabDir program seems to work. diff --git src/tabFile/tabToTabDir/tabToTabDir.c src/tabFile/tabToTabDir/tabToTabDir.c new file mode 100644 index 0000000..3196e40 --- /dev/null +++ src/tabFile/tabToTabDir/tabToTabDir.c @@ -0,0 +1,145 @@ +/* tabToTabDir - Convert a large tab-separated table to a directory full of such tables according + * to a specification.. */ +#include "common.h" +#include "linefile.h" +#include "hash.h" +#include "options.h" +#include "portable.h" +#include "ra.h" +#include "fieldedTable.h" + +void usage() +/* Explain usage and exit. */ +{ +errAbort( +"tabToTabDir - Convert a large tab-separated table to a directory full of such tables according\n" +"to a specification.\n" +"usage:\n" +" tabToTabDir in.tsv spec.ra outDir\n" +"where:\n" +" in.tsv is a tab-separated input file. The first line is the label names and may start with #\n" +" spec is a file that says what columns to put into the output, described in more detail below\n" +" outDir is a directory that will be populated with tab-separated files\n" +"spec.ra file format:\n" +" This is in a ra format with one stanza per output table.\n" +" Each stanza should look like:\n" +" tableName key-field\n" +" oldColumn [newColumn]\n" +" if the newColumn is missing it is assumed the same as the old\n" +"options:\n" +" -xxx=XXX\n" +); +} + +/* Command line validation table. */ +static struct optionSpec options[] = { + {NULL, 0}, +}; + + +boolean allStringsSame(char **aa, char **bb, int count) +/* Return true if first count of strings between aa and bb are the same */ +{ +int i; +for (i=0; i<count; ++i) + if (!sameString(aa[i], bb[i])) + return FALSE; +return TRUE; +} + +void selectUniqueIntoTable(struct fieldedTable *inTable, + int keyFieldIx, int oldFieldIx[], struct fieldedTable *outTable) +/* Populate out table with selected rows from newTable */ +{ +struct hash *uniqHash = hashNew(0); +struct fieldedRow *fr; +int outFieldCount = outTable->fieldCount; +char *outRow[outFieldCount]; +for (fr = inTable->rowList; fr != NULL; fr = fr->next) + { + char **inRow = fr->row; + char *key = inRow[keyFieldIx]; + int i; + for (i=0; i<outFieldCount; ++i) + outRow[i] = inRow[oldFieldIx[i]]; + struct fieldedRow *uniqFr = hashFindVal(uniqHash, key); + if (uniqFr == NULL) + { + uniqFr = fieldedTableAdd(outTable, outRow, outFieldCount, 0); + hashAdd(uniqHash, key, uniqFr); + } + else /* Do error checking for true uniqueness of key */ + { + if (!allStringsSame(outRow, uniqFr->row, outFieldCount)) + errAbort("Duplicate id %s but different data in key field %s of %s.", + key, inTable->fields[keyFieldIx], outTable->name); + } + } +} + +void tabToTabDir(char *inTabFile, char *specFile, char *outDir) +/* tabToTabDir - Convert a large tab-separated table to a directory full of such tables + * according to a specification.. */ +{ +struct fieldedTable *inTable = fieldedTableFromTabFile(inTabFile, inTabFile, NULL, 0); +verbose(1, "Read %d columns, %d rows from %s\n", inTable->fieldCount, inTable->rowCount, + inTabFile); +struct lineFile *lf = lineFileOpen(specFile, TRUE); +makeDirsOnPath(outDir); + +struct slPair *specStanza = NULL; +while ((specStanza = raNextStanzAsPairs(lf)) != NULL) + { + /* Parse out table name and key field name. */ + verbose(1, "Processing spec stanza of %d lines\n", slCount(specStanza)); + struct slPair *table = specStanza; + char *tableName = table->name; + char *keyFieldName = trimSpaces(table->val); + if (isEmpty(keyFieldName)) + errAbort("No key field for table %s.", tableName); + + /* Make sure that key field is actually in field list */ + struct slPair *fieldList = table->next; + int keyFieldIx = fieldedTableMustFindFieldIx(inTable, keyFieldName); + if (keyFieldIx < 0) + errAbort("key field %s is not found in field list for %s\n", tableName, keyFieldName); + + + /* Create empty output table and track which fields of input go to output. */ + int fieldCount = slCount(fieldList); + uglyf("Got %d fields\n", fieldCount); + char *fieldNames[fieldCount]; + int oldFieldIx[fieldCount]; + int i; + struct slPair *field; + for (i=0, field=fieldList; i<fieldCount; ++i, field=field->next) + { + char *newName = trimSpaces(field->val); + if (isEmpty(newName)) + newName = field->name; + fieldNames[i] = newName; + oldFieldIx[i] = fieldedTableMustFindFieldIx(inTable, field->name); + } + struct fieldedTable *outTable = fieldedTableNew(tableName, fieldNames, fieldCount); + outTable->startsSharp = inTable->startsSharp; + + /* Populate table */ + selectUniqueIntoTable(inTable, keyFieldIx, oldFieldIx, outTable); + + /* Create output file name and save file. */ + char outTabName[FILENAME_LEN]; + safef(outTabName, sizeof(outTabName), "%s/%s.tsv", outDir, tableName); + uglyf("Saving table to %s\n", outTabName); + fieldedTableToTabFile(outTable, outTabName); + } +} + +int main(int argc, char *argv[]) +/* Process command line. */ +{ +optionInit(&argc, argv, options); +if (argc != 4) + usage(); +tabToTabDir(argv[1], argv[2], argv[3]); +return 0; +}