3383c94388349981d5e84f542918e420502326cc kent Sun Aug 11 10:25:04 2019 -0700 Improving error message for when key is not unique. This was a fair bit of work for a good error message, but it comes up often in the most difficult part of wrangling, coming up with a good unique id for a subtable. diff --git src/tabFile/tabToTabDir/tabToTabDir.c src/tabFile/tabToTabDir/tabToTabDir.c index db6cb5b..1a5eab7 100644 --- src/tabFile/tabToTabDir/tabToTabDir.c +++ src/tabFile/tabToTabDir/tabToTabDir.c @@ -33,38 +33,38 @@ "if the sourceField is missing it is assumed to be a column of the same name in in.tsv\n" "The sourceField can either be a column name in the in.tsv, or a string enclosed literal\n" "or an @ followed by a table name, in which case it refers to the key of that table.\n" "If the source column is in comma-separated-values format then the sourceField can include a\n" "constant array index to pick out an item from the csv list.\n" "You can also use strex expressions for more complicated situations. See src/lib/strex.doc\n" ); } /* Command line validation table. */ static struct optionSpec options[] = { {NULL, 0}, }; -boolean allStringsSame(char **aa, char **bb, int count) +static int firstDifferentIx(char **aa, char **bb, int count) /* Return true if first count of strings between aa and bb are the same */ { int i; for (i=0; i<count; ++i) if (!sameString(aa[i], bb[i])) - return FALSE; -return TRUE; + return i; +return -1; } enum fieldValType /* A type */ { fvVar, fvLink, fvExp, }; struct newFieldInfo /* An expression that can define what fits in a field */ { struct newFieldInfo *next; /* Might want to hang these on a list. */ char *name; /* Name of field in new table */ enum fieldValType type; /* Constant, link, or variable */ int oldIx; /* For variable and link ones where field is in old table */ @@ -165,36 +165,40 @@ } struct symRec /* Something we pass as a record to symLookup */ { struct hash *hash; /* The hash with symbol to row index */ char **row; /* The row we are working on */ }; static char *symLookup(void *record, char *key) /* Lookup symbol in hash */ { struct symRec *rec = record; struct hash *hash = rec->hash; char **row = rec->row; -int rowIx = hashIntVal(hash, key); +int rowIx = hashIntValDefault(hash, key, -1); +if (rowIx < 0) + return NULL; +else return row[rowIx]; } void selectUniqueIntoTable(struct fieldedTable *inTable, struct hash *inFieldHash, + char *specFile, // Just for error reporting struct newFieldInfo *fieldList, int keyFieldIx, struct fieldedTable *outTable) /* Populate out table with selected rows from newTable */ { struct hash *uniqHash = hashNew(0); struct fieldedRow *fr; int outFieldCount = outTable->fieldCount; char *outRow[outFieldCount]; if (slCount(fieldList) != outFieldCount) // A little cheap defensive programming on inputs internalErr(); struct dyString *csvScratch = dyStringNew(0); for (fr = inTable->rowList; fr != NULL; fr = fr->next) { /* Create new row from a scan through old table */ @@ -216,33 +220,38 @@ struct symRec symRec = {inFieldHash, inRow}; outRow[i] = strexEvalAsString(fv->exp, &symRec, symLookup); verbose(2, "evaluated %s to %s\n", fv->val, outRow[i]); } } char *key = outRow[keyFieldIx]; struct fieldedRow *uniqFr = hashFindVal(uniqHash, key); if (uniqFr == NULL) { uniqFr = fieldedTableAdd(outTable, outRow, outFieldCount, 0); hashAdd(uniqHash, key, uniqFr); } else /* Do error checking for true uniqueness of key */ { - if (!allStringsSame(outRow, uniqFr->row, outFieldCount)) - errAbort("Duplicate id %s but different data in key field %s of %s.", - key, outTable->fields[keyFieldIx], outTable->name); + int differentIx = firstDifferentIx(outRow, uniqFr->row, outFieldCount); + if (differentIx >= 0) + { + warn("There is a problem with the key to table %s in %s", outTable->name, specFile); + warn("%s %s", uniqFr->row[keyFieldIx], uniqFr->row[differentIx]); + warn("%s %s", outRow[keyFieldIx], outRow[differentIx]); + errAbort("both exist, so they key is not unique to all values"); + } } } dyStringFree(&csvScratch); } struct hash *hashFieldIx(char **fields, int fieldCount) /* Create a hash filled with fields with integer valued indexes */ { int i; struct hash *hash = hashNew(0); for (i=0; i<fieldCount; ++i) hashAdd(hash, fields[i], intToPt(i)); return hash; @@ -268,30 +277,31 @@ char *tableString, *tableSpec; raNextTagVal(lf, &tableString, &tableSpec, NULL); verbose(2, "Processing table %s '%s' line %d of %s\n", tableString, tableSpec, lf->lineIx, lf->fileName); if (!sameString(tableString, "table")) errAbort("stanza that doesn't start with 'table' ending line %d of %s", lf->lineIx, lf->fileName); char *tableName = nextWord(&tableSpec); char *keyFieldName = cloneString(nextWord(&tableSpec)); if (isEmpty(keyFieldName)) errAbort("No key field for table %s line %d of %s", tableName, lf->lineIx, lf->fileName); /* Start filling out newTable with these fields */ AllocVar(newTable); newTable->name = cloneString(tableName); + tableName = newTable->name; /* Keep this handy variable. */ /* Make up field list out of rest of the stanza */ struct newFieldInfo *fvList = NULL; char *fieldName, *fieldSpec; int fieldCount = 0; while (raNextTagVal(lf, &fieldName, &fieldSpec, NULL)) { verbose(2, " fieldName %s fieldSpec ((%s))\n", fieldName, fieldSpec); struct newFieldInfo *fv = parseFieldVal(fieldName, fieldSpec, lf->fileName, lf->lineIx); if (fv->type == fvVar) { char *oldName = fieldSpec; if (isEmpty(oldName)) oldName = fieldName; int oldIx = stringArrayIx(oldName, inTable->fields, inTable->fieldCount); @@ -345,31 +355,31 @@ struct newTableInfo *linkedTable = findTable(newTableList, field->val); if (linkedTable == NULL) errAbort("@%s doesn't exist", field->name); field->link = linkedTable->keyField; } } } struct hash *inFieldHash = hashFieldIx(inTable->fields, inTable->fieldCount); /* Output tables */ for (newTable = newTableList; newTable != NULL; newTable = newTable->next) { /* Populate table */ struct fieldedTable *outTable = newTable->table; - selectUniqueIntoTable(inTable, inFieldHash, + selectUniqueIntoTable(inTable, inFieldHash, specFile, newTable->fieldList, newTable->keyField->newIx, outTable); /* Create output file name and save file. */ char outTabName[FILENAME_LEN]; safef(outTabName, sizeof(outTabName), "%s/%s.tsv", outDir, newTable->name); verbose(1, "Writing %s of %d fields %d rows\n", outTabName, outTable->fieldCount, outTable->rowCount); fieldedTableToTabFile(outTable, outTabName); } } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options);