3ab2c488f953187a6f6849b3d75f40126bb02703
kent
  Fri May 26 12:51:46 2017 -0700
Making a tagInfo utility that parallels tagStormInfo.

diff --git src/tabFile/tabInfo/tabInfo.c src/tabFile/tabInfo/tabInfo.c
new file mode 100644
index 0000000..3f67f86
--- /dev/null
+++ src/tabFile/tabInfo/tabInfo.c
@@ -0,0 +1,165 @@
+/* tabInfo - Get basic info on a tab-separated-file. */
+#include "common.h"
+#include "linefile.h"
+#include "hash.h"
+#include "options.h"
+#include "fieldedTable.h"
+#include "tagToSql.h"
+#include "obscure.h"
+
+/* Global vars. */
+boolean anySchema;
+
+/* Command line variables. */
+boolean clCounts;
+int clVals = 0;	
+boolean clSchema = FALSE;
+boolean clLooseSchema = FALSE;
+boolean clTightSchema = FALSE;
+
+void usage()
+/* Explain usage and exit. */
+{
+errAbort(
+  "tabInfo - Get basic info on a tab-separated-file\n"
+  "usage:\n"
+  "   tabInfo input.tsv\n"
+  "options:\n"
+  "   -counts - if set output names, use counts, and value counts of each tag\n"
+  "   -vals=N - display tags and the top N values for them\n"
+  "   -schema - put a schema that will fit this tag storm in output.txt\n"
+  "   -looseSchema - put a less fussy schema instead\n"
+  "   -tightSchema - put a more fussy schema instead\n"
+  );
+}
+
+/* Command line validation table. */
+static struct optionSpec options[] = {
+   {"counts", OPTION_BOOLEAN},
+   {"vals", OPTION_INT},
+   {"schema", OPTION_BOOLEAN},
+   {"tightSchema", OPTION_BOOLEAN},
+   {"looseSchema", OPTION_BOOLEAN},
+   {NULL, 0},
+};
+
+struct tagInfo
+/* Keeps track of number of uses and unique values of a tag */
+    {
+    struct tagInfo *next;   /* Next in list */
+    char *tagName;	    /* Name of tag */
+    int useCount;	    /* Number of times tag is used */
+    struct hash *tagVals;   /* Hash of tag values, integer valued */
+    };
+
+void tagInfoAdd(struct tagInfo *tagInfo, char *tagVal)
+/* Add information about tag to tagInfo */
+{
+if (!isEmpty(tagVal))
+    tagInfo->useCount += 1;
+hashIncInt(tagInfo->tagVals, tagVal);
+}
+
+struct tagInfo *tagInfoNew(char *tagName)
+/* Create a new tagInfo structure */
+{
+struct tagInfo *tagInfo;
+AllocVar(tagInfo);
+tagInfo->tagName = cloneString(tagName);
+tagInfo->tagVals = hashNew(0);
+return tagInfo;
+}
+
+void tabInfo(char *fileName)
+/* tabInfo - Get basic info on a tab-separated-file. */
+{
+/* Read table from file and unpack some common fields into local vars */
+struct fieldedTable *table = fieldedTableFromTabFile(fileName, fileName, NULL, 0);
+int fieldCount = table->fieldCount;
+char **fields = table->fields;
+
+/* Do we do something fancy? */
+if (clCounts || clVals > 0 || anySchema)
+    {
+    /* Make up array of tagInfos and tagTypeInfos */
+    struct tagInfo *tagArray[fieldCount];
+    struct tagTypeInfo *typeArray[fieldCount];
+    int fieldIx;
+    for (fieldIx=0; fieldIx<fieldCount; ++fieldIx)
+	{
+	char *field = fields[fieldIx];
+	tagArray[fieldIx] = tagInfoNew(field);
+	if (anySchema)
+	    typeArray[fieldIx] = tagTypeInfoNew(field);
+	}
+
+    /* Loop through table collecting info */
+    struct fieldedRow *fr;
+    for (fr = table->rowList; fr != NULL; fr = fr->next)
+	{
+	char **row = fr->row;
+	for (fieldIx=0; fieldIx<fieldCount; ++fieldIx)
+	    {
+	    char *val = row[fieldIx];
+	    tagInfoAdd(tagArray[fieldIx], val);
+	    if (anySchema)
+		tagTypeInfoAdd(typeArray[fieldIx], val);
+	    }
+	}
+
+    /* Output information on each field */
+    for (fieldIx=0; fieldIx<fieldCount; ++fieldIx)
+	{
+	struct tagInfo *tagInfo = tagArray[fieldIx];
+	struct hash *valHash = tagInfo->tagVals;
+	if (clVals > 0)
+	    {
+	    struct hashEl *valEl, *valList = hashElListHash(valHash);
+	    printf("%s has %d uses with %d vals\n", tagInfo->tagName, tagInfo->useCount,
+		slCount(valList));
+	    slSort(&valList, hashElCmpIntValDesc);
+	    int soFar = 0, j;
+	    for (j=0, valEl = valList; j < clVals && valEl != NULL; ++j, valEl = valEl->next)
+	        {
+		int valCount = ptToInt(valEl->val);
+		soFar += valCount;
+		printf("   %d\t%s\n", valCount, valEl->name);
+		}
+	    int otherCount = tagInfo->useCount - soFar;
+	    if (otherCount > 0)
+	        printf("   %d\t(in %d others)\n", otherCount, slCount(valEl));
+	    slFreeList(&valList);
+	    }
+	else if (anySchema)
+	    {
+	    tagTypeInfoPrintSchemaLine(typeArray[fieldIx], tagInfo->useCount, valHash, 
+		clLooseSchema, clTightSchema, stdout);
+	    }
+	else
+	    {
+	    printf("%d\t%d\t%s\n", tagInfo->useCount, valHash->elCount, tagInfo->tagName);
+	    }
+	}
+    }
+else
+    {
+    printf("columns\t%d\n", fieldCount);
+    printf("rows\t%d\n", table->rowCount);
+    }
+}
+
+int main(int argc, char *argv[])
+/* Process command line. */
+{
+optionInit(&argc, argv, options);
+if (argc != 2)
+    usage();
+clCounts = optionExists("counts");
+clVals = optionInt("vals", clVals);
+clSchema = optionExists("schema");
+clLooseSchema = optionExists("looseSchema");
+clTightSchema = optionExists("tightSchema");
+anySchema = (clSchema || clLooseSchema || clTightSchema);
+tabInfo(argv[1]);
+return 0;
+}