f2b012bfbb525fda8d876b13425f4237601522f7 kent Fri Aug 2 21:38:43 2019 -0700 Adding a # operator that'll give you a sha1 hash of the value. Finishing implementing @ operator. diff --git src/tabFile/tabToTabDir/tabToTabDir.c src/tabFile/tabToTabDir/tabToTabDir.c index 6e009d4..0626f6f 100644 --- src/tabFile/tabToTabDir/tabToTabDir.c +++ src/tabFile/tabToTabDir/tabToTabDir.c @@ -1,27 +1,28 @@ /* tabToTabDir - Convert a large tab-separated table to a directory full of such tables according * to a specification.. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "obscure.h" #include "sqlNum.h" #include "portable.h" #include "ra.h" #include "csv.h" #include "fieldedTable.h" +#include "hmac.h" void usage() /* Explain usage and exit. */ { errAbort( "tabToTabDir - Convert a large tab-separated table to a directory full of such tables according\n" "to a specification.\n" "usage:\n" " tabToTabDir in.tsv spec.txt outDir\n" "where:\n" " in.tsv is a tab-separated input file. The first line is the label names and may start with #\n" " spec.txt is a file that says what columns to put into the output, described in more detail below\n" " outDir is a directory that will be populated with tab-separated files\n" "The spec.txt file contains one blank line separated stanza per output table.\n" "Each stanza should look like:\n" @@ -53,52 +54,53 @@ return TRUE; } enum fieldValType /* A type */ { fvVar, fvArray, fvLink, fvConst, }; struct newFieldInfo /* An expression that can define what fits in a field */ { struct newFieldInfo *next; /* Might want to hang these on a list. */ char *name; /* Name of field in new table */ enum fieldValType type; /* Constant, link, or variable */ + boolean justHash; /* Just do hash of field */ int oldIx; /* For variable and link ones where field is in old table */ char *val; /* For constant ones the string value */ int arrayIx; /* If it's an array then the value */ + struct newFieldInfo *link; /* If it's fvLink then pointer to the linked field */ }; struct newFieldInfo *findField(struct newFieldInfo *list, char *name) /* Find named element in list, or NULL if not found. */ { struct newFieldInfo *el; for (el = list; el != NULL; el = el->next) if (sameString(name, el->name)) return el; return NULL; } struct newTableInfo /* Info on a new table we are making */ { struct newTableInfo *next; /* Next in list */ char *name; /* Name of table */ - char *keyField; /* Key field within table */ - int keyFieldIx; /* Index of key field */ + struct newFieldInfo *keyField; /* Key field within table */ struct newFieldInfo *fieldList; /* List of fields */ struct fieldedTable *table; /* Table to fill in. */ }; struct newTableInfo *findTable(struct newTableInfo *list, char *name) /* Find named element in list, or NULL if not found. */ { struct newTableInfo *el; for (el = list; el != NULL; el = el->next) if (sameString(name, el->name)) return el; return NULL; } struct newFieldInfo *parseFieldVal(char *name, char *input) @@ -106,31 +108,39 @@ { /* Make up return structure. */ struct newFieldInfo *fv; AllocVar(fv); fv->name = cloneString(name); char *s = skipLeadingSpaces(input); if (isEmpty(s)) { fv->type = fvVar; fv->val = cloneString(name); } else { + /* Set flag if we start with a hash */ char c = s[0]; + if (c == '#') + { + fv->justHash = TRUE; + s = skipLeadingSpaces(s+1); + c = s[0]; + } + if (c == '"' || c == '\'') { char *val = fv->val = cloneString(s); if (!parseQuotedString(val, val, NULL)) errAbort("in %s", input); fv->type = fvConst; } else if (c == '@') { char *val = fv->val = cloneString(skipLeadingSpaces(s+1)); trimSpaces(val); if (isEmpty(val)) errAbort("Nothing following %c", c); fv->type = fvLink; } @@ -165,56 +175,66 @@ struct hash *uniqHash = hashNew(0); struct fieldedRow *fr; int outFieldCount = outTable->fieldCount; char *outRow[outFieldCount]; if (slCount(fieldList) != outFieldCount) // A little cheap defensive programming on inputs internalErr(); struct dyString *csvScratch = dyStringNew(0); for (fr = inTable->rowList; fr != NULL; fr = fr->next) { /* Create new row from a scan through old table */ char **inRow = fr->row; char *key = inRow[keyFieldIx]; int i; - struct newFieldInfo *fv; - for (i=0, fv=fieldList; i<outFieldCount && fv != NULL; ++i, fv = fv->next) + struct newFieldInfo *unlinkedFv; + for (i=0, unlinkedFv=fieldList; i<outFieldCount && unlinkedFv != NULL; + ++i, unlinkedFv = unlinkedFv->next) { + /* Skip through links. */ + struct newFieldInfo *fv = unlinkedFv; + while (fv->type == fvLink) + fv = fv->link; + if (fv->type == fvConst) outRow[i] = fv->val; else if (fv->type == fvVar) outRow[i] = inRow[fv->oldIx]; else if (fv->type == fvArray) { char *csv = inRow[fv->oldIx]; int j; for (j=0; ; ++j) { char *el = csvParseNext(&csv, csvScratch); if (el == NULL) { outRow[i] = "out of range"; break; } - if (i >= fv->arrayIx) + if (j >= fv->arrayIx) { outRow[i] = cloneString(el); break; } } } + if (fv->justHash) + { + outRow[i] = hmacSha1("key", outRow[i]); + } } struct fieldedRow *uniqFr = hashFindVal(uniqHash, key); if (uniqFr == NULL) { uniqFr = fieldedTableAdd(outTable, outRow, outFieldCount, 0); hashAdd(uniqHash, key, uniqFr); } else /* Do error checking for true uniqueness of key */ { if (!allStringsSame(outRow, uniqFr->row, outFieldCount)) errAbort("Duplicate id %s but different data in key field %s of %s.", key, inTable->fields[keyFieldIx], outTable->name); } } @@ -262,61 +282,59 @@ fieldNames[i] = newName; slAddHead(&fvList, fv); } slReverse(&fvList); struct fieldedTable *outTable = fieldedTableNew(tableName, fieldNames, fieldCount); outTable->startsSharp = inTable->startsSharp; /* Make sure that key field is actually in field list */ struct newFieldInfo *keyField = findField(fvList, keyFieldName); if (keyField == NULL) errAbort("key field %s is not found in field list for %s\n", tableName, keyFieldName); /* Allocate structure to save results of this pass in and so so. */ AllocVar(newTable); newTable->name = tableName; - newTable->keyField = keyFieldName; + newTable->keyField = keyField; newTable->fieldList = fvList; newTable->table = outTable; - newTable->keyFieldIx = keyField->oldIx; slAddHead(&newTableList, newTable); } slReverse(&newTableList); /* Do links between tables */ for (newTable = newTableList; newTable != NULL; newTable = newTable->next) { struct newFieldInfo *field; for (field = newTable->fieldList; field != NULL; field = field->next) { if (field->type == fvLink) { struct newTableInfo *linkedTable = findTable(newTableList, field->val); if (linkedTable == NULL) errAbort("@%s doesn't exist", field->name); - field->oldIx = linkedTable->keyFieldIx; - field->type = fvVar; + field->link = linkedTable->keyField; } } } /* Output tables */ for (newTable = newTableList; newTable != NULL; newTable = newTable->next) { /* Populate table */ struct fieldedTable *outTable = newTable->table; - selectUniqueIntoTable(inTable, newTable->fieldList, newTable->keyFieldIx, outTable); + selectUniqueIntoTable(inTable, newTable->fieldList, newTable->keyField->oldIx, outTable); /* Create output file name and save file. */ char outTabName[FILENAME_LEN]; safef(outTabName, sizeof(outTabName), "%s/%s.tsv", outDir, newTable->name); verbose(1, "Writing %s of %d fields %d rows\n", outTabName, outTable->fieldCount, outTable->rowCount); fieldedTableToTabFile(outTable, outTabName); } } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 4)