src/tabFile/tabToTabDir/tabToTabDir.c 7e420071260c119b5917eb52a2aa20edf4bc1d94

7e420071260c119b5917eb52a2aa20edf4bc1d94
kent
  Tue Oct 6 17:45:55 2020 -0700
Adding percentage after count in stats output.  Moving doc mostly from usage to new tabToTabDir.doc file.

diff --git src/tabFile/tabToTabDir/tabToTabDir.c src/tabFile/tabToTabDir/tabToTabDir.c
index 1d34ddb..6e3f3f8 100644
--- src/tabFile/tabToTabDir/tabToTabDir.c
+++ src/tabFile/tabToTabDir/tabToTabDir.c
@@ -30,50 +30,31 @@
 "                   for each table\n"
 "   -startId=fieldName - sets starting ID to be something other than 1\n"
 "usage:\n"
 "   in.tsv is a tab-separated input file.  The first line is the label names and may start with #\n"
 "   spec.txt is a file that says what columns to put into the output, described in more detail below.\n"
 "The spec.x file contains one blank line separated stanza per output table.\n"
 "Each stanza should look like:\n"
 "        table tableName    key-column\n"
 "        columnName1	sourceExpression1\n"
 "        columnName2	sourceExpression2\n"
 "              ...\n"
 "if the sourceExpression is missing it is assumed to be a just a field of the same name from in.tsv\n"
 "Otherwise the sourceField can be a strex expression involving fields in in.tsv.\n"
 "\n"
 "Each output table has duplicate rows merged using the key-column to determine uniqueness.\n"
-"If a more than one row of the input generates the same key in the output that is ok so long as\n"
-"all of the other fields that are generated agree as well.  An exception for this is made for\n"
-"summary expressions,  which all begin with the character '$'.   The allowed summary expressions are\n"
-"    $count - counts up number of input rows that yield this row\n"
-"    $stats sourceExpression - creates comma separated list of all values and some statistics\n"
-"    $list sourceExpression - creates comma separated list of unique values of sourceExpression\n"
-"If the source field starts with '@' then it is followed\n"
-"by a table name and is intepreted as the same value as the key field in the this table\n" 
-"\n"
-"If there is a '?' in front of the column name it is taken to mean an optional field.\n"
-"if the corresponding source field does not exist then there's no error (and no output)\n"
-"for that column\n"
-"\n"
-"You can also use strex expressions for more complicated situations.\n"
-"            See src/lib/strex.doc\n"
-"In addition to the table stanza there can be a 'define' stanza that defines variables\n"
-"that can be used in sourceFields for tables.  This looks like:\n"
-"         define\n"
-"         variable1 sourceField1\n"
-"         variable2 sourceField2\n"
+"Please see tabToTabDir.doc in the source code for more information on what can go into spec.x.\n"
 );
 }
 
 /* Command line validation table: option name and value type per entry, ended by a NULL entry.  NOTE(review): usage text shows -startId=fieldName but startId is OPTION_INT here - confirm. */
 static struct optionSpec options[] = {
    {"id", OPTION_STRING},
    {"startId", OPTION_INT},
    {NULL, 0},
 };
 
 
 enum fieldValType
 /* A type */
     {
     fvVar, fvLink, fvExp, fvCount,
@@ -369,30 +350,31 @@
 
 int oneValCountCmp(const void *va, const void *vb)
 /* Compare two oneValCounts by count so that larger counts sort first.  va and vb each point to a pointer to a oneValCount. */
 {
 const struct oneValCount *a = *((struct oneValCount **)va);
 const struct oneValCount *b = *((struct oneValCount **)vb);
 return b->count - a->count;  /* b minus a gives descending order */
 }
 
 struct uniqValCounter
 /* The unique values seen for one key, how often each occurs, and the overall total */
     {
     struct uniqValCounter *next;
     struct hash *uniq;	    // Hash keyed by value string, with oneValCount values, for values seen so far
     struct oneValCount *list;    // List of the unique oneValCount entries seen so far
+    int total;	    /* Sum of the counts of all entries in list */
     };
 
 
 void selectUniqueIntoTable(struct fieldedTable *inTable,  struct symRec *symbols,
     char *specFile,  // Just for error reporting
     struct newFieldInfo *fieldList, int keyFieldIx, struct fieldedTable *outTable)
 /* Populate out table with selected unique rows from newTable */
 {
 struct hash *uniqHash = hashNew(0);
 struct fieldedRow *fr;
 int outFieldCount = outTable->fieldCount;
 char *outRow[outFieldCount];
 
 if (slCount(fieldList) != outFieldCount)  // A little cheap defensive programming on inputs
     internalErr();
@@ -470,30 +452,31 @@
 			if (counter == NULL)
 			    {
 			    AllocVar(counter);
 			    counter->uniq = hashNew(0);
 			    hashAdd(fv->combineHash, key, counter);
 			    }
 			char *val = outRow[fv->newIx];
 			struct oneValCount *one = hashFindVal(counter->uniq, val);
 			if (one == NULL)
 			    {
 			    AllocVar(one);
 			    hashAddSaveName(counter->uniq, val, one, &one->name);
 			    slAddHead(&counter->list, one);
 			    }
 			one->count += 1;
+			counter->total += 1;
 			break;
 			}
 		    }
 		}
 	    }
 
 	struct fieldedRow *uniqFr = hashFindVal(uniqHash, key);
 	if (uniqFr == NULL)
 	    {
 	    uniqFr = fieldedTableAdd(outTable, outRow, outFieldCount, 0);
 	    hashAdd(uniqHash, key, uniqFr);
 	    }
 	else    /* Do error checking for true uniqueness of key */
 	    {
 	    int i;
@@ -540,31 +523,32 @@
 			}
 		    case ctUniq:
 			{
 			struct uniqValLister *lister = hashMustFindVal(fv->combineHash, key);
 			fr->row[fv->newIx] = lister->csv->string;
 			break;
 			}
 		    case ctStats:
 		        {
 			struct uniqValCounter *counter = hashMustFindVal(fv->combineHash, key);
 			struct dyString *dy = dyStringNew(0);
 			struct oneValCount *el;
 			slSort(&counter->list, oneValCountCmp);
 			for (el = counter->list; el != NULL; el = el->next)
 			    {
-			    dyStringPrintf(dy, "%s(%d),", el->name, el->count);
+			    dyStringPrintf(dy, "%s(%d %d%%),", el->name, el->count, 
+				round(100.0 * el->count / counter->total));
 			    }
 			fr->row[fv->newIx] = dyStringCannibalize(&dy);
 			break;
 			}
 		    }
 		}
 	    }
 	}
     }
 }
 
 
 
 struct hash *hashFieldIx(char **fields, int fieldCount)
 /* Create a hash filled with fields with integer valued indexes */