src/tabFile/tabToTabDir/tabToTabDir.c 5d0b7bb9098685d0a547b24328de982f090b9100

5d0b7bb9098685d0a547b24328de982f090b9100
kent
  Sun Oct 4 09:23:46 2020 -0700
Adding first cut at stats aggregator

diff --git src/tabFile/tabToTabDir/tabToTabDir.c src/tabFile/tabToTabDir/tabToTabDir.c
index d83c29d..852625e 100644
--- src/tabFile/tabToTabDir/tabToTabDir.c
+++ src/tabFile/tabToTabDir/tabToTabDir.c
@@ -34,31 +34,31 @@
 "   spec.txt is a file that says what columns to put into the output, described in more detail below.\n"
 "The spec.x file contains one blank line separated stanza per output table.\n"
 "Each stanza should look like:\n"
 "        table tableName    key-column\n"
 "        columnName1	sourceExpression1\n"
 "        columnName2	sourceExpression2\n"
 "              ...\n"
 "if the sourceExpression is missing it is assumed to be a just a field of the same name from in.tsv\n"
 "Otherwise the sourceField can be a strex expression involving fields in in.tsv.\n"
 "\n"
 "Each output table has duplicate rows merged using the key-column to determine uniqueness.\n"
 "If a more than one row of the input generates the same key in the output that is ok so long as\n"
 "all of the other fields that are generated agree as well.  An exception for this is made for\n"
 "summary expressions,  which all begin with the character '$'.   The allowed summary expressions are\n"
 "    $count - counts up number of input rows that yield this row\n"
-"    $all sourceExpression - creates comma separated list of all values of sourceExpression\n"
+"    $stats sourceExpression - creates comma separated list of all values and some statistics\n"
 "    $list sourceExpression - creates comma separated list of unique values of sourceExpression\n"
 "If the source field starts with '@' then it is followed\n"
 "by a table name and is intepreted as the same value as the key field in the this table\n" 
 "\n"
 "If there is a '?' in front of the column name it is taken to mean an optional field.\n"
 "if the corresponding source field does not exist then there's no error (and no output)\n"
 "for that column\n"
 "\n"
 "You can also use strex expressions for more complicated situations.\n"
 "            See src/lib/strex.doc\n"
 "In addition to the table stanza there can be a 'define' stanza that defines variables\n"
 "that can be used in sourceFields for tables.  This looks like:\n"
 "         define\n"
 "         variable1 sourceField1\n"
 "         variable2 sourceField2\n"
@@ -70,31 +70,31 @@
    {"id", OPTION_STRING},
    {"startId", OPTION_INT},
    {NULL, 0},
 };
 
 
 enum fieldValType
 /* A type */
     {
     fvVar, fvLink, fvExp, fvCount,
     };
 
 enum combineType
 /* A way to combine values from a field */
     {
-    ctCount, ctUniq, ctAll,
+    ctCount, ctUniq, ctStats,	/* Selected by $count, $list and $stats in the spec, respectively */
     };
 
 struct newFieldInfo
 /* An expression that can define what fits in a field */
     {
     struct newFieldInfo *next;	/* Might want to hang these on a list. */
     char *name;			/* Name of field in new table */
     enum fieldValType type;	/* Constant, link, or variable */
     int oldIx;			/* For variable and link ones where field is in old table */
     int newIx;			/* Where field is in new table. */
     char *val;			/* For constant ones the string value */
     int arrayIx;		/* If it's an array then the value */
     struct newFieldInfo *link;	/* If it's fvLink then pointer to the linked field */
     struct strexParse *exp;	/* A parsed out string expression */
     boolean optional;		/* If true, then skip rather than stop if old field doesn't exist */
@@ -238,33 +238,34 @@
 	{
 	char *command = skipLeadingSpaces(s+1);
 	s = skipToSpaces(command);
 	fv->combineHash = hashNew(0);
 	if (startsWithWord("count", command))
 	    {
 	    if (!isEmpty(s))
 		errAbort("Something following $count line %d of %s", fileLineNumber, fileName);;
 	    fv->combineType = ctCount;
 	    fv->type = fvCount;
 	    }
         else if (startsWithWord("list", command))
 	    {
 	    fv->combineType = ctUniq;
 	    }
-        else if (startsWithWord("all", command))
+        else if (startsWithWord("stats", command))
 	    {
-	    fv->combineType = ctAll;
+	    fv->combineType = ctStats;
+	    uglyf("ctStats command\n");
 	    }
 	else
 	    {
 	    errAbort("Unrecognized command $%s line %d of %s", command, fileLineNumber, fileName);
 	    }
 	}
     if (fv->combineHash == NULL || fv->combineType != ctCount)
 	{
 	if (isTotallySimple(s) && hashLookup(symbols->varHash, s) == NULL)
 	    {
 	    fv->val = cloneString(skipLeadingSpaces(s));
 	    eraseTrailingSpaces(fv->val);
 	    fv->type = fvVar;
 	    }
 	else
@@ -342,35 +343,42 @@
     return v->name;
     }
 else
     {
     int rowIx = hashIntValDefault(rec->rowHash, key, -1);
     if (rowIx < 0)
         return NULL;
     return rec->tableRow[rowIx];
     }
 }
 
 
 struct uniqValLister
 /* A list of unique values */
    {
-   struct uniqValList *next;
+   struct uniqValLister *next;  // Next lister, in case these get hung on a list
    struct dyString *csv;    // Comma separated list of values seen so far
    struct hash *uniq;	    // Hash of values seen so far.
    };
 
+struct uniqValCounter
+/* Unique values seen for one output key and how often each of them occurred */
+    {
+    struct uniqValCounter *next;
+    struct hash *uniq;	    // Integer valued hash: value -> number of times seen so far
+    struct slName *list;    // Unique values, each added at the head the first time it is seen
+    };
 
 
 void selectUniqueIntoTable(struct fieldedTable *inTable,  struct symRec *symbols,
     char *specFile,  // Just for error reporting
     struct newFieldInfo *fieldList, int keyFieldIx, struct fieldedTable *outTable)
 /* Populate out table with selected unique rows from newTable */
 {
 struct hash *uniqHash = hashNew(0);
 struct fieldedRow *fr;
 int outFieldCount = outTable->fieldCount;
 char *outRow[outFieldCount];
 
 if (slCount(fieldList) != outFieldCount)  // A little cheap defensive programming on inputs
     internalErr();
 
@@ -393,30 +401,32 @@
 	    fv = fv->link;
 	
 	if (fv->type == fvVar)
 	    outRow[i] = inRow[fv->oldIx];
 	else if (fv->type == fvExp)
 	    {
 	    if (firstSymInRow)
 	        {
 		symbols->tableRow = inRow;
 		symRecSetupPrecomputes(symbols);
 		firstSymInRow = FALSE;
 		}
 	    outRow[i] = strexEvalAsString(fv->exp, symbols, symLookup, warnHandler, NULL);
 	    verbose(2, "evaluated %s to %s\n", fv->val, outRow[i]);
 	    }
+	else
+	    outRow[i] = NULL;
 	}
 
     char *key = outRow[keyFieldIx];
     if (!isEmpty(key))
 	{
 	/* Do any aggregate fields */
 	struct newFieldInfo *fv;
 	for (fv = fieldList; fv != NULL; fv = fv->next)
 	    {
 	    if (fv->combineHash != NULL)
 		{
 		switch (fv->combineType)
 		    {
 		    case ctCount:
 			hashIncInt(fv->combineHash, key);
@@ -427,33 +437,45 @@
 			if (lister == NULL)
 			    {
 			    AllocVar(lister);
 			    lister->csv = dyStringNew(0);
 			    lister->uniq = hashNew(0);
 			    hashAdd(fv->combineHash, key, lister);
 			    }
 			char *val = outRow[fv->newIx];
 			if (hashLookup(lister->uniq, val) == NULL)
 			    {
 			    hashAdd(lister->uniq, val, NULL);
 			    csvEscapeAndAppend(lister->csv, val);
 			    }
 			break;
 			}
-		    case ctAll:
+		    case ctStats:
+		        {
+			struct uniqValCounter *counter = hashFindVal(fv->combineHash, key);
+			if (counter == NULL)
+			    {
+			    AllocVar(counter);
+			    counter->uniq = hashNew(0);
+			    hashAdd(fv->combineHash, key, counter);
+			    }
+			char *val = outRow[fv->newIx];
+			if (hashLookup(counter->uniq, val) == NULL)
 			    {
-			errAbort("ctAll is Not yet implemented");
+			    slNameAddHead(&counter->list, val);
+			    }
+			hashIncInt(counter->uniq, val);
 			break;
 			}
 		    }
 		}
 	    }
 
 	struct fieldedRow *uniqFr = hashFindVal(uniqHash, key);
 	if (uniqFr == NULL)
 	    {
 	    uniqFr = fieldedTableAdd(outTable, outRow, outFieldCount, 0);
 	    hashAdd(uniqHash, key, uniqFr);
 	    }
 	else    /* Do error checking for true uniqueness of key */
 	    {
 	    int i;
@@ -492,33 +514,40 @@
 		switch (fv->combineType)
 		    {
 		    case ctCount:
 			{
 			char countBuf[16];
 			safef(countBuf, sizeof(countBuf), "%d", hashIntVal(fv->combineHash, key));
 			fr->row[fv->newIx] = lmCloneString(outTable->lm, countBuf);
 			break;
 			}
 		    case ctUniq:
 			{
 			struct uniqValLister *lister = hashMustFindVal(fv->combineHash, key);
 			fr->row[fv->newIx] = lister->csv->string;
 			break;
 			}
-		    case ctAll:
+		    case ctStats:
 		        {
-			errAbort("Ctal not implemented");
+			struct uniqValCounter *counter = hashMustFindVal(fv->combineHash, key);
+			struct dyString *dy = dyStringNew(0);
+			struct slName *el;
+			for (el = counter->list; el != NULL; el = el->next)
+			    {
+			    dyStringPrintf(dy, "%s(%d),", el->name, hashIntVal(counter->uniq, el->name) );
+			    }
+			fr->row[fv->newIx] = dyStringCannibalize(&dy);
 			break;
 			}
 		    }
 		}
 	    }
 	}
     }
 }
 
 
 
 struct hash *hashFieldIx(char **fields, int fieldCount)
 /* Create a hash filled with fields with integer valued indexes */
 {
 int i;