ff052c8366616779b8fe428bb6630b80c14d1d4d kent Fri Aug 2 22:18:37 2019 -0700 Switching to somewhat shorter md5 hash. Making first line of stanza have to start with 'table' to make the formatting stand out a little better. diff --git src/tabFile/tabToTabDir/tabToTabDir.c src/tabFile/tabToTabDir/tabToTabDir.c index 0626f6f..b4926eb 100644 --- src/tabFile/tabToTabDir/tabToTabDir.c +++ src/tabFile/tabToTabDir/tabToTabDir.c @@ -23,30 +23,31 @@ "where:\n" " in.tsv is a tab-separated input file. The first line is the label names and may start with #\n" " spec.txt is a file that says what columns to put into the output, described in more detail below\n" " outDir is a directory that will be populated with tab-separated files\n" "The spec.txt file contains one blank line separated stanza per output table.\n" "Each stanza should look like:\n" " tableName key-column\n" " columnName1 sourceField1\n" " columnName2 sourceField2\n" " ...\n" "if the sourceField is missing it is assumed to be a column of the same name in in.tsv\n" "The sourceField can either be a column name in the in.tsv, or a string enclosed literal\n" "or an @ followed by a table name, in which case it refers to the key of that table.\n" "If the source column is in comma-separated-values format then the sourceField can include a\n" "constant array index to pick out an item from the csv list.\n" +"If sourceField begins with a #, a md5 hash of the value is used instead\n" ); } /* Command line validation table. */ static struct optionSpec options[] = { {NULL, 0}, }; boolean allStringsSame(char **aa, char **bb, int count) /* Return true if first count of strings between aa and bb are the same */ { int i; for (i=0; i<count; ++i) if (!sameString(aa[i], bb[i])) @@ -209,31 +210,31 @@ char *el = csvParseNext(&csv, csvScratch); if (el == NULL) { outRow[i] = "out of range"; break; } if (j >= fv->arrayIx) { outRow[i] = cloneString(el); break; } } } if (fv->justHash) { - outRow[i] = hmacSha1("key", outRow[i]); + outRow[i] = hmacMd5("", outRow[i]); } } struct fieldedRow *uniqFr = hashFindVal(uniqHash, key); if (uniqFr == NULL) { uniqFr = fieldedTableAdd(outTable, outRow, outFieldCount, 0); hashAdd(uniqHash, key, uniqFr); } else /* Do error checking for true uniqueness of key */ { if (!allStringsSame(outRow, uniqFr->row, outFieldCount)) errAbort("Duplicate id %s but different data in key field %s of %s.", key, inTable->fields[keyFieldIx], outTable->name); } @@ -246,33 +247,37 @@ * according to a specification.. */ { struct fieldedTable *inTable = fieldedTableFromTabFile(inTabFile, inTabFile, NULL, 0); verbose(1, "Read %d columns, %d rows from %s\n", inTable->fieldCount, inTable->rowCount, inTabFile); struct lineFile *lf = lineFileOpen(specFile, TRUE); makeDirsOnPath(outDir); /* Read in file as ra file stanzas that we convert into tableInfos. */ struct newTableInfo *newTableList = NULL, *newTable; struct slPair *specStanza = NULL; while ((specStanza = raNextStanzAsPairs(lf)) != NULL) { /* Parse out table name and key field name. */ verbose(2, "Processing spec stanza of %d lines\n", slCount(specStanza)); - struct slPair *table = specStanza; - char *tableName = table->name; - char *keyFieldName = trimSpaces(table->val); + struct slPair *tableSl = specStanza; + if (!sameString(tableSl->name, "table")) + errAbort("stanza that doesn't start with 'table' ending line %d of %s", + lf->lineIx, lf->fileName); + char *tableSpec = tableSl->val; + char *tableName = nextWord(&tableSpec); + char *keyFieldName = nextWord(&tableSpec); if (isEmpty(keyFieldName)) errAbort("No key field for table %s.", tableName); /* Have dealt with first line of stanza, which is about table, rest of lines are fields */ struct slPair *fieldList = specStanza->next; int fieldCount = slCount(fieldList); /* Create empty output table and track which fields of input go to output. */ char *fieldNames[fieldCount]; int i; struct slPair *field; struct newFieldInfo *fvList = NULL; for (i=0, field=fieldList; i<fieldCount; ++i, field=field->next) { char *newName = field->name;