src/tabFile/tabToTabDir/tabToTabDir.c ff052c8366616779b8fe428bb6630b80c14d1d4d

ff052c8366616779b8fe428bb6630b80c14d1d4d
kent
  Fri Aug 2 22:18:37 2019 -0700
Switching to the somewhat shorter md5 hash. Requiring the first line of each stanza to start with 'table' so that the formatting stands out a little better.

diff --git src/tabFile/tabToTabDir/tabToTabDir.c src/tabFile/tabToTabDir/tabToTabDir.c
index 0626f6f..b4926eb 100644
--- src/tabFile/tabToTabDir/tabToTabDir.c
+++ src/tabFile/tabToTabDir/tabToTabDir.c
@@ -23,30 +23,31 @@
 "where:\n"
 "   in.tsv is a tab-separated input file.  The first line is the label names and may start with #\n"
 "   spec.txt is a file that says what columns to put into the output, described in more detail below\n"
 "   outDir is a directory that will be populated with tab-separated files\n"
 "The spec.txt file contains one blank line separated stanza per output table.\n"
 "Each stanza should look like:\n"
 "        tableName    key-column\n"
 "        columnName1	sourceField1\n"
 "        columnName2	sourceField2\n"
 "              ...\n"
 "if the sourceField is missing it is assumed to be a column of the same name in in.tsv\n"
 "The sourceField can either be a column name in the in.tsv, or a string enclosed literal\n"
 "or an @ followed by a table name, in which case it refers to the key of that table.\n"
 "If the source column is in comma-separated-values format then the sourceField can include a\n"
 "constant array index to pick out an item from the csv list.\n"
+"If sourceField begins with a #, a md5 hash of the value is used instead\n"
 );
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {NULL, 0},
 };
 
 
 boolean allStringsSame(char **aa, char **bb, int count)
 /* Return true if first count of strings between aa and bb are the same */
 {
 int i;
 for (i=0; i<count; ++i)
     if (!sameString(aa[i], bb[i]))
@@ -209,31 +210,31 @@
 		char *el = csvParseNext(&csv, csvScratch);
 		if (el == NULL)
 		    {
 		    outRow[i] = "out of range";
 		    break;
 		    }
 		if (j >= fv->arrayIx)
 		    {
 		    outRow[i] = cloneString(el);
 		    break;
 		    }
 		}
 	    }
 	if (fv->justHash)
 	    {
-	    outRow[i] = hmacSha1("key", outRow[i]);
+	    outRow[i] = hmacMd5("", outRow[i]);
 	    }
 	}
 
     struct fieldedRow *uniqFr = hashFindVal(uniqHash, key);
     if (uniqFr == NULL)
         {
 	uniqFr = fieldedTableAdd(outTable, outRow, outFieldCount, 0);
 	hashAdd(uniqHash, key, uniqFr);
 	}
     else    /* Do error checking for true uniqueness of key */
         {
 	if (!allStringsSame(outRow, uniqFr->row, outFieldCount))
 	    errAbort("Duplicate id %s but different data in key field %s of %s.",
 		key, inTable->fields[keyFieldIx], outTable->name);
 	}
@@ -246,33 +247,37 @@
  * according to a specification.. */
 {
 struct fieldedTable *inTable = fieldedTableFromTabFile(inTabFile, inTabFile, NULL, 0);
 verbose(1, "Read %d columns, %d rows from %s\n", inTable->fieldCount, inTable->rowCount,
     inTabFile);
 struct lineFile *lf = lineFileOpen(specFile, TRUE);
 makeDirsOnPath(outDir);
 
 /* Read in file as ra file stanzas that we convert into tableInfos. */
 struct newTableInfo *newTableList = NULL, *newTable;
 struct slPair *specStanza = NULL;
 while ((specStanza = raNextStanzAsPairs(lf)) != NULL)
     {
     /* Parse out table name and key field name. */
     verbose(2, "Processing spec stanza of %d lines\n",  slCount(specStanza));
-    struct slPair *table = specStanza;
-    char *tableName = table->name;
-    char *keyFieldName = trimSpaces(table->val);
+    struct slPair *tableSl = specStanza;
+    if (!sameString(tableSl->name, "table"))
+        errAbort("stanza that doesn't start with 'table' ending line %d of %s",
+	    lf->lineIx, lf->fileName);
+    char *tableSpec = tableSl->val;
+    char *tableName = nextWord(&tableSpec);
+    char *keyFieldName = nextWord(&tableSpec);
     if (isEmpty(keyFieldName))
        errAbort("No key field for table %s.", tableName);
 
     /* Have dealt with first line of stanza, which is about table,  rest of lines are fields */
     struct slPair *fieldList = specStanza->next;
     int fieldCount = slCount(fieldList);
 
     /* Create empty output table and track which fields of input go to output. */
     char *fieldNames[fieldCount];
     int i;
     struct slPair *field;
     struct newFieldInfo *fvList = NULL;
     for (i=0, field=fieldList; i<fieldCount; ++i, field=field->next)
         {
 	char *newName = field->name;