ab8341f7c9bc30766f0a0b766020437458f06d07 kent Tue Apr 17 17:32:25 2012 -0700 Moving out of hg since no need for MySQL. diff --git src/utils/verticalSplitSqlTable/verticalSplitSqlTable.c src/utils/verticalSplitSqlTable/verticalSplitSqlTable.c new file mode 100644 index 0000000..34dfb0a --- /dev/null +++ src/utils/verticalSplitSqlTable/verticalSplitSqlTable.c @@ -0,0 +1,349 @@ +/* verticalSplitSqlTable - Split a database table into two new related tables that share a field. */ +#include "common.h" +#include "linefile.h" +#include "hash.h" +#include "options.h" +#include "ra.h" +#include "portable.h" +#include "obscure.h" +#include "errabort.h" +#include "asParse.h" + +/* Command line options. */ +boolean partialOk = FALSE; +boolean mergeOk = FALSE; + +void usage() +/* Explain usage and exit. */ +{ +errAbort( + "verticalSplitSqlTable - Split a database table into two new related tables that share a field.\n" + "Note, this program just creates tab-separated files for the split tables, it does not actually\n" + "alter the database.\n" + "usage:\n" + " verticalSplitSqlTable oldTab oldAs splitSpec outDir\n" + "where:\n" + " oldTab is a tab-separated file for the unsplit table\n" + " oldAs is an autoSql file describing the unsplit table\n" + " splitSpec is a text file saying how to split the fields, see more below\n" + " outDir is a directory that will get populated with files for creating split tables\n" + "The splitSpec is a .ra formatted file with tags: table1 table2 fields1 fields2 sharedKey\n" + "Here's an example of this that splits a 'driver' tabel into a 'person' and 'car' table\n" + "Where the original driver table has the fields name, phone, ssn, carLicense, carMake, carModel\n" + " table1 person\n" + " fields1 name=name phone=phone ssn=ssn car=carLicense\n" + " description1 A person's name, phone, and social security number\n" + " table2 car\n" + " fields2 license=carLicense make=carMake model=carModel\n" + " description2 Basic info on a car\n" + " sharedKey carLicense\n" + "The result will be .as and .tab files for table1 and table2 in outDir\n" + "Note that the new table1 will have as many fields as the old table, while the\n" + "new one will just have one row for each unique sharedKey\n" + "options:\n" + " -partialOk - if set, ok for not all fields in old table to be in output\n" + " -mergeOk - if set, won't be considered an error to have multiple rows\n" + " corresponding to one sharedKey in table2. Just first row will\n" + " be output, rest will be in outDir/mergeErrs.txt\n" + ); +} + +static struct optionSpec options[] = { + {"partialOk", OPTION_BOOLEAN}, + {"mergeOk", OPTION_BOOLEAN}, + {NULL, 0}, +}; + +char *mustFindInSplitSpec(char *key, struct hash *hash, char *fileName) +/* Look up key in hash and return value. If not found complain in a message that + * includes file name. */ +{ +char *val = hashFindVal(hash, key); +if (val == NULL) + errAbort("Required field '%s' missing from %s\n", key, fileName); +return val; +} + +void checkFieldsInAs(struct slPair *list, char *listFile, struct asObject *as, char *asFile) +/* Make sure that all fields in list values are actually in the as. File names are just + * for error reporting. */ +{ +struct slPair *el; +for (el = list; el != NULL; el = el->next) + { + char *field = el->val; + if (asColumnFind(as, field) == NULL) + errAbort("Field %s is in %s but not %s\n", field, listFile, asFile); + } +} + +void checkSharedKeyInList(char *sharedKey, char *splitSpec, char *fields, struct slPair *fieldList) +/* Make sure that sharedKey really is in fieldList. splitSpec and fields are just + * for error reporting*/ +{ +struct slPair *el; +for (el = fieldList; el != NULL; el = el->next) + { + char *field = el->val; + if (sameString(field, sharedKey)) + return; // We are good + } +errAbort("Error: sharedKey %s not found after equals in '%s'", sharedKey, fields); +} + +void outputPartialAs(struct asObject *as, char *newTable, struct slPair *newFieldList, + char *newDescription, char *outDir) +/* Create outdir/newTable.as based on a subset of as. */ +{ +/* Create output file. */ +char outPath[PATH_LEN]; +safef(outPath, sizeof(outPath), "%s/%s.as", outDir, newTable); +FILE *f = mustOpen(outPath, "w"); + +/* Print table header. */ +fprintf(f, "table %s\n", newTable); +fprintf(f, "\"%s\"\n", newDescription); +char *indent = " "; +fprintf(f, "%s(\n", indent); + +/* Print selected columns. */ +struct slPair *fieldPair; +for (fieldPair = newFieldList; fieldPair != NULL; fieldPair = fieldPair->next) + { + char *newField = fieldPair->name; + char *oldField = fieldPair->val; + struct asColumn *col = asColumnFind(as, oldField); + fprintf(f, "%s%s %s; \"%s\"\n", indent, col->lowType->name, newField, col->comment); + } + +/* Close out table and file. */ +fprintf(f, "%s)\n", indent); +carefulClose(&f); +} + +int *makeNewToOldArray(struct asObject *as, struct slPair *fieldList) +/* Return an array where we can lookup old index given new index. */ +{ +int oldFieldCount = slCount(as->columnList); +int newFieldCount = slCount(fieldList); +int *oldIx; +AllocArray(oldIx, newFieldCount); +int i; +struct slPair *fieldPair; +for (i=0, fieldPair = fieldList; inext) + { + char *oldName = fieldPair->val; + struct asColumn *col = asColumnFind(as, oldName); + assert(col != NULL); /* We checked earlier but... */ + int ix = slIxFromElement(as->columnList, col); + assert(ix >= 0 && ix <= oldFieldCount); + oldIx[i] = ix; + } +return oldIx; +} + +void outputPartialTab(char *inTab, struct asObject *as, struct slPair *fieldList, char *outName) +/* Output columns in fieldList from inTab (described by as) into outName */ +{ +/* Open input and output. */ +struct lineFile *lf = lineFileOpen(inTab, TRUE); +FILE *f = mustOpen(outName, "w"); + +/* Set up array for input fields with more than we expect for better error reporting. */ +int oldFieldCount = slCount(as->columnList); +int newFieldCount = slCount(fieldList); +int allocFields = oldFieldCount+10; +char *words[allocFields]; + +/* Set up array for output fields that says where to find them in input. */ +int *oldIx = makeNewToOldArray(as, fieldList); + +/* Go through each line of input, outputting selected columns. */ +int fieldCount; +while ((fieldCount = lineFileChopNextTab(lf, words, allocFields)) > 0) + { + lineFileExpectWords(lf, oldFieldCount, fieldCount); + fprintf(f, "%s", words[oldIx[0]]); + int i; + for (i=1; icolumnList); +int newFieldCount = slCount(fieldList); +int allocFields = oldFieldCount+10; +char *words[allocFields]; + +/* Set up array for output fields that says where to find them in input. */ +int *oldIx = makeNewToOldArray(as, fieldList); + +/* Figure out index of key field. */ +int keyIx = slIxFromElement(as->columnList, keyCol); + +/* Go through each line of input, outputting selected columns. */ +struct hash *uniqHash = hashNew(18); +struct hash *errHash = hashNew(0); +struct dyString *dy = dyStringNew(1024); +int fieldCount; +while ((fieldCount = lineFileChopNextTab(lf, words, allocFields)) > 0) + { + lineFileExpectWords(lf, oldFieldCount, fieldCount); + + /* Collect possible output into dy. */ + dyStringClear(dy); + dyStringPrintf(dy, "%s", words[oldIx[0]]); + int i; + for (i=1; istring)) + { + /* Error reporting is a little complex. We want to output all lines associated + * with key, including the first one, but we only want to do first line once. */ + if (!hashLookup(errHash, key)) + { + hashAdd(errHash, key, NULL); + fputs(oldVal, fErr); + } + fputs(dy->string, fErr); + } + } + else + { + hashAdd(uniqHash, key, cloneString(dy->string)); + fputs(dy->string, f); + } + } + +/* Report error summary */ +if (errHash->elCount > 0) + { + warn("Warning: %d shared keys have multiple values in table 2. See %s.\n" + "Only first row for each key put in %s" , errHash->elCount, outErr, outTab); + if (!mergeOk) + noWarnAbort(); + } + +/* Clean up and go home. */ +freez(&oldIx); +carefulClose(&fErr); +carefulClose(&f); +lineFileClose(&lf); +} + +void verticalSplitSqlTable(char *oldTab, char *oldAs, char *splitSpec, char *outDir) +/* verticalSplitSqlTable - Split a database table into two new related tables that share a field. */ +{ +struct asObject *as = asParseFile(oldAs); +if (as->next != NULL) + errAbort("%d records in %s, only 1 allowed\n", slCount(as), oldAs); +uglyf("Read %s from %s\n", as->name, oldAs); + +/* Read fields from splitSpec, and make sure there are no extra. */ +struct hash *ra = raReadSingle(splitSpec); +char *table1 = mustFindInSplitSpec("table1", ra, splitSpec); +char *fields1 = mustFindInSplitSpec("fields1", ra, splitSpec); +char *description1 = mustFindInSplitSpec("description1", ra, splitSpec); +char *table2 = mustFindInSplitSpec("table2", ra, splitSpec); +char *fields2 = mustFindInSplitSpec("fields2", ra, splitSpec); +char *description2 = mustFindInSplitSpec("description2", ra, splitSpec); +char *sharedKey = mustFindInSplitSpec("sharedKey", ra, splitSpec); +if (ra->elCount > 7) + errAbort("Extra fields in %s", splitSpec); + +/* Convert this=that strings to lists of pairs. */ +struct slPair *fieldList1 = slPairFromString(fields1); +struct slPair *fieldList2 = slPairFromString(fields2); + +/* Do some more checks */ +if (sameString(table1, table2)) + errAbort("Error: table1 and table2 are the same (%s) in %s", table1, splitSpec); +checkSharedKeyInList(sharedKey, splitSpec, fields1, fieldList1); +checkSharedKeyInList(sharedKey, splitSpec, fields2, fieldList2); +struct asColumn *keyCol = asColumnFind(as, sharedKey); +if (keyCol == NULL) + errAbort("The sharedKey '%s' is not in %s", sharedKey, oldAs); + +/* Make sure that all fields in splitSpec are actually in the oldAs file. */ +checkFieldsInAs(fieldList1, splitSpec, as, oldAs); +checkFieldsInAs(fieldList2, splitSpec, as, oldAs); + +/* Make sure that all old table fields are covered */ +if (!partialOk) + { + struct hash *covered = hashNew(0); + struct slPair *field; + for (field = fieldList1; field != NULL; field = field->next) + hashAdd(covered, field->val, NULL); + for (field = fieldList2; field != NULL; field = field->next) + hashAdd(covered, field->val, NULL); + struct asColumn *col; + for (col = as->columnList; col != NULL; col = col->next) + { + if (!hashLookup(covered, col->name)) + errAbort("Field %s in %s not output, use -partialOk flag if this is intentional", + col->name, oldAs); + } + } + +/* Ok, input is checked, start on output.. */ +if (lastChar(outDir) == '/') + trimLastChar(outDir); +makeDirsOnPath(outDir); + +/* Output .as files. */ +outputPartialAs(as, table1, fieldList1, description1, outDir); +outputPartialAs(as, table2, fieldList2, description2, outDir); + +/* Output first split file - a straight up subset of columns. */ +char path[PATH_LEN]; +safef(path, sizeof(path), "%s/%s.tab", outDir, table1); +outputPartialTab(oldTab, as, fieldList1, path); + + +/* Output second split file */ +char errPath[PATH_LEN]; +safef(path, sizeof(path), "%s/%s.tab", outDir, table2); +safef(errPath, sizeof(path), "%s/mergeErrs.txt", outDir); +outputUniqueOnSharedKey(oldTab, as, keyCol, fieldList2, path, errPath); +} + + +int main(int argc, char *argv[]) +/* Process command line. */ +{ +optionInit(&argc, argv, options); +partialOk = optionExists("partialOk"); +mergeOk = optionExists("mergeOk"); +if (argc != 5) + usage(); +verticalSplitSqlTable(argv[1], argv[2], argv[3], argv[4]); +return 0; +}