src/tabFile/tabToTabDir/tabToTabDir.c 0b7fc0972a745005ac329190569f51b12877ebe0

0b7fc0972a745005ac329190569f51b12877ebe0
kent
  Sun Sep 15 15:24:53 2019 -0700
Adding unroll for unpacking csv arrays into a table with multiple rows.  The driving use case is the contacts/contributors merge for HCA.

diff --git src/tabFile/tabToTabDir/tabToTabDir.c src/tabFile/tabToTabDir/tabToTabDir.c
index e3fc031..2cdfbc9 100644
--- src/tabFile/tabToTabDir/tabToTabDir.c
+++ src/tabFile/tabToTabDir/tabToTabDir.c
@@ -1,28 +1,29 @@
 /* tabToTabDir - Convert a large tab-separated table to a directory full of such tables according 
  * to a specification.. */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "obscure.h"
 #include "sqlNum.h"
 #include "portable.h"
 #include "ra.h"
 #include "csv.h"
 #include "fieldedTable.h"
 #include "strex.h"
+#include "localmem.h"
 
 char *clId = NULL;  // Flag set from command line to add an id column
 int clStartId = 1;  // What number id column should start with
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
 "tabToTabDir - Convert a large tab-separated table to a directory full of such tables according\n"
 "to a specification.\n"
 "command line:\n"
 "   tabToTabDir in.tsv spec.txt outDir\n"
 "options:\n"
 "   -id=fieldName - Add a numeric id field of given name that starts at 1 and autoincrements \n"
 "                   for each table\n"
@@ -102,30 +103,31 @@
 struct newFieldInfo *el;
 for (el = list; el != NULL; el = el->next)
     if (sameString(name, el->name))
         return el;
 return NULL;
 }
 
 struct newTableInfo
 /* Info on a new table we are making */
     {
     struct newTableInfo *next;	/* Next in list */
     char *name;			/* Name of table */
     struct newFieldInfo *keyField;	/* Key field within table */
     struct newFieldInfo *fieldList; /* List of fields */
     struct fieldedTable *table;	    /* Table to fill in. */
+    boolean unroll;		    /* If true it's a table we unroll from arrays */
     };
 
 struct newTableInfo *findTable(struct newTableInfo *list, char *name)
 /* Find named element in list, or NULL if not found. */
 {
 struct newTableInfo *el;
 for (el = list; el != NULL; el = el->next)
     if (sameString(name, el->name))
         return el;
 return NULL;
 }
 
 struct varVal
 /* A variable, what we need to compute it, and it's value */
      {
@@ -391,30 +393,83 @@
 dyStringFree(&csvScratch);
 }
 
 
 
 struct hash *hashFieldIx(char **fields, int fieldCount)
 /* Create a hash filled with fields with integer valued indexes */
 {
 int i;
 struct hash *hash = hashNew(0);
 for (i=0; i<fieldCount; ++i)
    hashAdd(hash, fields[i], intToPt(i));
 return hash;
 }
 
+struct fieldedTable *unrollTable(struct fieldedTable *input)
+/* Unroll input table, which has to be filled with lockstepped CSV fields.  For each input
+ * row the n'th value of every field is gathered into the n'th output row for that row.
+ * Aborts if fields within a row hold differing numbers of values. */
+{
+/* Make output table with fields matching input */
+int fieldCount = input->fieldCount;
+struct fieldedTable *output = fieldedTableNew(input->name, input->fields, fieldCount);
+output->startsSharp = input->startsSharp;
+if (fieldCount <= 0)
+    return output;	// No fields: nothing to unroll, and the loop below would never end
+
+/* Scratch space for csv parsing; unrolled values are stored in output's local memory */
+struct lm *lm = output->lm;
+struct dyString *scratch = dyStringNew(0);
+struct fieldedRow *inRow;
+for (inRow = input->rowList; inRow != NULL; inRow = inRow->next)
+    {
+    /* We are going to parse a bunch of csv's in parallel */
+    char *inPos[fieldCount];
+    int i;
+    for (i=0; i<fieldCount; ++i)
+	inPos[i] = inRow->row[i];
+
+    /* With this loop we parse out the next csv from all fields, and make sure that
+     * they all actually do have the same number of values */
+    int unrollCount = 0;
+    for (;;)
+       {
+       char *uncsvRow[fieldCount];
+       boolean anyNull = FALSE, allNull = TRUE;
+       for (i=0; i<fieldCount; ++i)
+           {
+	   char *oneVal = csvParseNext(&inPos[i], scratch);
+	   if (oneVal == NULL)
+	       anyNull = TRUE;
+	   else
+	       allNull = FALSE;
+	   uncsvRow[i] = lmCloneString(lm, oneVal);  // own copy; scratch goes to the next parse
+	   }
+       if (allNull)
+	   break;	    // Every field ran out on the same pass - all is good!
+       if (anyNull)
+	   errAbort("Can't unroll %s since not all fields have the same numbers of values.\n"
+		    "In row %d some have %d values, some more",
+		    input->name, inRow->id, unrollCount);
+       ++unrollCount;
+       fieldedTableAdd(output, uncsvRow, fieldCount, unrollCount);
+       }
+    }
+dyStringFree(&scratch);
+return output;
+}
 
 void tabToTabDir(char *inTabFile, char *specFile, char *outDir)
 /* tabToTabDir - Convert a large tab-separated table to a directory full of such tables 
  * according to a specification.. */
 {
 /* Read input table */
 struct fieldedTable *inTable = fieldedTableFromTabFile(inTabFile, inTabFile, NULL, 0);
 verbose(1, "Read %d columns, %d rows from %s\n", inTable->fieldCount, inTable->rowCount,
     inTabFile);
 
 /* Create what we need for managing strex's symbol table. */
 struct hash *inFieldHash = hashFieldIx(inTable->fields, inTable->fieldCount);
 struct hash *varHash = hashNew(5);
 struct symRec *symbols = symRecNew(inFieldHash, varHash, inTabFile, 0); 
 symbols->tableRow = inTable->fields;   // During parse pass fields will act as proxy for tableRow
@@ -440,43 +495,47 @@
 	slAddHead(&symbols->varList, v);
 	}
     slReverse(&symbols->varList);
     }
 else
     lineFileReuse(lf);
 
 
 /* Read in rest of spec file as ra stanzas full of tables more or less */
 struct newTableInfo *newTableList = NULL, *newTable;
 while (raSkipLeadingEmptyLines(lf, NULL))
     {
     /* Read first tag, which we know is there because it's right after raSkipLeadingEmptyLines.
      * Make sure the tag is table, and that there is a following table name and key field name. */
     char *tableString, *tableSpec;
+    boolean unroll = FALSE;
     raNextTagVal(lf, &tableString, &tableSpec, NULL);
     verbose(2, "Processing table %s '%s' line %d of %s\n",  tableString, tableSpec, 
 	lf->lineIx, lf->fileName);
-    if (!sameString(tableString, "table"))
-        errAbort("stanza that doesn't start with 'table' ending line %d of %s",
+    if (sameString(tableString, "unroll"))
+        unroll = TRUE;
+    else if (!sameString(tableString, "table"))
+        errAbort("stanza that doesn't start with 'table' or 'unroll' ending line %d of %s",
 	    lf->lineIx, lf->fileName);
     char *tableName = nextWord(&tableSpec);
     char *keyFieldName = cloneString(nextWord(&tableSpec));
     if (isEmpty(keyFieldName))
        errAbort("No key field for table %s line %d of %s", tableName, lf->lineIx, lf->fileName);
 
     /* Start filling out newTable with these fields */
     AllocVar(newTable);
+    newTable->unroll = unroll;
     newTable->name = cloneString(tableName);
     tableName = newTable->name;  /* Keep this handy variable. */
 
     /* Make up field list out of rest of the stanza */
     struct newFieldInfo *fvList = NULL;
     char *fieldName, *fieldSpec;
     int fieldCount = 0;
     while (raNextTagVal(lf, &fieldName, &fieldSpec, NULL))
         {
 	verbose(2, "  fieldName %s fieldSpec (%s)\n", fieldName, fieldSpec);
 	struct newFieldInfo *fv = parseFieldVal(fieldName, 
 	    fieldSpec, lf->fileName, lf->lineIx, symbols, symExists);
 	if (fv->type == fvVar)
 	    {
 	    char *oldName = fieldSpec;
@@ -541,30 +600,36 @@
 	  }
       }
     }
 
 makeDirsOnPath(outDir);
 
 /* Output tables */
 verbose(1, "Outputting %d tables to %s\n", slCount(newTableList), outDir);
 for (newTable = newTableList; newTable != NULL; newTable = newTable->next)
     {
     /* Populate table */
     struct fieldedTable *outTable = newTable->table;
     selectUniqueIntoTable(inTable, symbols, specFile,
 	newTable->fieldList, newTable->keyField->newIx, outTable);
 
+    /* If need be unroll table */
+    if (newTable->unroll)
+        {
+	outTable = unrollTable(outTable);
+	}
+
     /* Create output file name and save file. */
     char outTabName[FILENAME_LEN];
     safef(outTabName, sizeof(outTabName), "%s/%s.tsv", outDir, newTable->name);
     verbose(1, "Writing %s of %d columns %d rows\n",  
 	outTabName, outTable->fieldCount, outTable->rowCount);
     fieldedTableToTabFileWithId(outTable, outTabName, clId, clStartId);
     }
 verbose(1, "%d fields, %d (%g%%) evaluated with strex, %d (%.2f) links\n", 
     gTotalFields,  gStrexFields, 100.0 * gStrexFields / gTotalFields,
     gLinkFields, 100.0 * gLinkFields/gTotalFields);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {