46cbe5852e09da1ac8f78402722f7cd74229dcb1 kent Sat Oct 3 22:13:29 2020 -0700 Adding way to create a field that aggregates unique values. diff --git src/tabFile/tabToTabDir/tabToTabDir.c src/tabFile/tabToTabDir/tabToTabDir.c index 6cc868a..d83c29d 100644 --- src/tabFile/tabToTabDir/tabToTabDir.c +++ src/tabFile/tabToTabDir/tabToTabDir.c @@ -33,34 +33,33 @@ " in.tsv is a tab-separated input file. The first line is the label names and may start with #\n" " spec.txt is a file that says what columns to put into the output, described in more detail below.\n" "The spec.x file contains one blank line separated stanza per output table.\n" "Each stanza should look like:\n" " table tableName key-column\n" " columnName1 sourceExpression1\n" " columnName2 sourceExpression2\n" " ...\n" "if the sourceExpression is missing it is assumed to be a just a field of the same name from in.tsv\n" "Otherwise the sourceField can be a strex expression involving fields in in.tsv.\n" "\n" "Each output table has duplicate rows merged using the key-column to determine uniqueness.\n" "If a more than one row of the input generates the same key in the output that is ok so long as\n" "all of the other fields that are generated agree as well. An exception for this is made for\n" "summary expressions, which all begin with the character '$'. The allowed summary expressions are\n" -" $ - counts up number of input rows that yield this row\n" -// " $count - counts up number of input rows that yield this row\n" -// " $list sourceExpression - creates comma separated list of all values of sourceExpression\n" -// " $uniq sourceExpression - creates comma separated list of unique values of sourceExpression\n" +" $count - counts up number of input rows that yield this row\n" +" $all sourceExpression - creates comma separated list of all values of sourceExpression\n" +" $list sourceExpression - creates comma separated list of unique values of sourceExpression\n" "If the source field starts with '@' then it is followed\n" "by a table name and is intepreted as the same value as the key field in the this table\n" "\n" "If there is a '?' in front of the column name it is taken to mean an optional field.\n" "if the corresponding source field does not exist then there's no error (and no output)\n" "for that column\n" "\n" "You can also use strex expressions for more complicated situations.\n" " See src/lib/strex.doc\n" "In addition to the table stanza there can be a 'define' stanza that defines variables\n" "that can be used in sourceFields for tables. This looks like:\n" " define\n" " variable1 sourceField1\n" " variable2 sourceField2\n" ); @@ -68,44 +67,51 @@ /* Command line validation table. */ static struct optionSpec options[] = { {"id", OPTION_STRING}, {"startId", OPTION_INT}, {NULL, 0}, }; enum fieldValType /* A type */ { fvVar, fvLink, fvExp, fvCount, }; +enum combineType +/* A way to combine values from a field */ + { + ctCount, ctUniq, ctAll, + }; + struct newFieldInfo /* An expression that can define what fits in a field */ { struct newFieldInfo *next; /* Might want to hang these on a list. */ char *name; /* Name of field in new table */ enum fieldValType type; /* Constant, link, or variable */ int oldIx; /* For variable and link ones where field is in old table */ int newIx; /* Where field is in new table. */ char *val; /* For constant ones the string value */ int arrayIx; /* If it's an array then the value */ struct newFieldInfo *link; /* If it's fvLink then pointer to the linked field */ struct strexParse *exp; /* A parsed out string expression */ boolean optional; /* If true, then skip rather than stop if old field doesn't exist */ - struct hash *countHash; /* If it's type fvCount an int valued hash here */ + struct hash *combineHash; /* If it's type fvCombine an int valued hash here */ + enum combineType combineType; /* How to combine if multiple values allowed */ }; struct newFieldInfo *findField(struct newFieldInfo *list, char *name) /* Find named element in list, or NULL if not found. */ { struct newFieldInfo *el; for (el = list; el != NULL; el = el->next) if (sameString(name, el->name)) return el; return NULL; } struct newTableInfo /* Info on a new table we are making */ { @@ -210,57 +216,78 @@ { errAbort("Strange character %c starting line %d of %s", c, fileLineNumber, fileName); } fv->name = cloneString(name); char *s = trimSpaces(input); if (isEmpty(s)) { s = cloneString(name); } c = s[0]; if (c == '@') { char *val = fv->val = cloneString(skipLeadingSpaces(s+1)); if (isEmpty(val)) - errAbort("Nothing following %c", c); + errAbort("Nothing following %c line %d of %s", c, fileLineNumber, fileName); fv->type = fvLink; ++gLinkFields; } -else if (c == '$') +else + { + if (c == '$') { - if (!isEmpty(skipLeadingSpaces(s+1))) - errAbort("Something following %c", c); + char *command = skipLeadingSpaces(s+1); + s = skipToSpaces(command); + fv->combineHash = hashNew(0); + if (startsWithWord("count", command)) + { + if (!isEmpty(s)) + errAbort("Something following $count line %d of %s", fileLineNumber, fileName);; + fv->combineType = ctCount; fv->type = fvCount; - fv->countHash = hashNew(0); + } + else if (startsWithWord("list", command)) + { + fv->combineType = ctUniq; + } + else if (startsWithWord("all", command)) + { + fv->combineType = ctAll; } else { + errAbort("Unrecognized command $%s line %d of %s", command, fileLineNumber, fileName); + } + } + if (fv->combineHash == NULL || fv->combineType != ctCount) + { if (isTotallySimple(s) && hashLookup(symbols->varHash, s) == NULL) { fv->val = cloneString(skipLeadingSpaces(s)); eraseTrailingSpaces(fv->val); fv->type = fvVar; } else { fv->val = cloneString(s); fv->exp = strexParseString(fv->val, fileName, fileLineNumber-1, symbols, lookup); fv->type = fvExp; gStrexFields += 1; } } + } gTotalFields += 1; return fv; } static void symRecSetupPrecomputes(struct symRec *symbols) /* Clear out any precomputed variable values - should be * executed on each new line of table. */ { /* Clear up any old precomputes - sort of sad these can't currently * be shared between output tables. Probably not enough of a time * bottleneck to be worth fixing though. */ struct varVal *v; for (v = symbols->varList; v != NULL; v = v->next) { freez(&v->val); @@ -312,30 +339,39 @@ struct varVal *v = hashFindVal(rec->varHash, key); if (v != NULL) { return v->name; } else { int rowIx = hashIntValDefault(rec->rowHash, key, -1); if (rowIx < 0) return NULL; return rec->tableRow[rowIx]; } } +struct uniqValLister +/* A list of unique values */ + { + struct uniqValList *next; + struct dyString *csv; // Comma separated list of values seen so far + struct hash *uniq; // Hash of values seen so far. + }; + + void selectUniqueIntoTable(struct fieldedTable *inTable, struct symRec *symbols, char *specFile, // Just for error reporting struct newFieldInfo *fieldList, int keyFieldIx, struct fieldedTable *outTable) /* Populate out table with selected unique rows from newTable */ { struct hash *uniqHash = hashNew(0); struct fieldedRow *fr; int outFieldCount = outTable->fieldCount; char *outRow[outFieldCount]; if (slCount(fieldList) != outFieldCount) // A little cheap defensive programming on inputs internalErr(); for (fr = inTable->rowList; fr != NULL; fr = fr->next) @@ -362,81 +398,130 @@ { if (firstSymInRow) { symbols->tableRow = inRow; symRecSetupPrecomputes(symbols); firstSymInRow = FALSE; } outRow[i] = strexEvalAsString(fv->exp, symbols, symLookup, warnHandler, NULL); verbose(2, "evaluated %s to %s\n", fv->val, outRow[i]); } } char *key = outRow[keyFieldIx]; if (!isEmpty(key)) { - /* Increment any count fields */ + /* Do any aggregate fields */ struct newFieldInfo *fv; for (fv = fieldList; fv != NULL; fv = fv->next) { - if (fv->type == fvCount) - hashIncInt(fv->countHash, key); + if (fv->combineHash != NULL) + { + switch (fv->combineType) + { + case ctCount: + hashIncInt(fv->combineHash, key); + break; + case ctUniq: + { + struct uniqValLister *lister = hashFindVal(fv->combineHash, key); + if (lister == NULL) + { + AllocVar(lister); + lister->csv = dyStringNew(0); + lister->uniq = hashNew(0); + hashAdd(fv->combineHash, key, lister); + } + char *val = outRow[fv->newIx]; + if (hashLookup(lister->uniq, val) == NULL) + { + hashAdd(lister->uniq, val, NULL); + csvEscapeAndAppend(lister->csv, val); + } + break; + } + case ctAll: + { + errAbort("ctAll is Not yet implemented"); + break; + } + } + } } struct fieldedRow *uniqFr = hashFindVal(uniqHash, key); if (uniqFr == NULL) { uniqFr = fieldedTableAdd(outTable, outRow, outFieldCount, 0); hashAdd(uniqHash, key, uniqFr); } else /* Do error checking for true uniqueness of key */ { int i; char **uniqRow = uniqFr->row; for (i=0,fv=fieldList; fv != NULL; fv = fv->next, ++i) { - if (fv->type != fvCount) + if (fv->combineHash == NULL) { if (!sameString(uniqRow[i], outRow[i])) { + uglyf("fv->type of %s is %d\n", fv->name, (int)fv->type); warn("There is a problem with the key to table %s in %s", outTable->name, specFile); warn("%s %s", uniqFr->row[keyFieldIx], uniqFr->row[i]); warn("%s %s", outRow[keyFieldIx], outRow[i]); warn("both exist, so key doesn't specify a unique %s field", outTable->fields[i]); errAbort("line %d of %s", fr->id, inTable->name); } } } } } } -/* Make a loop through output table fixing up count-oriented fields */ +/* Make a loop through output table fixing up aggregation-oriented fields */ { struct newFieldInfo *fv; for (fv = fieldList; fv != NULL; fv = fv->next) { - if (fv->countHash != NULL) + if (fv->combineHash != NULL) { for (fr = outTable->rowList; fr != NULL; fr = fr->next) { char *key = fr->row[keyFieldIx]; + switch (fv->combineType) + { + case ctCount: + { char countBuf[16]; - safef(countBuf, sizeof(countBuf), "%d", hashIntVal(fv->countHash, key)); + safef(countBuf, sizeof(countBuf), "%d", hashIntVal(fv->combineHash, key)); fr->row[fv->newIx] = lmCloneString(outTable->lm, countBuf); + break; + } + case ctUniq: + { + struct uniqValLister *lister = hashMustFindVal(fv->combineHash, key); + fr->row[fv->newIx] = lister->csv->string; + break; + } + case ctAll: + { + errAbort("Ctal not implemented"); + break; + } + } } } } } } struct hash *hashFieldIx(char **fields, int fieldCount) /* Create a hash filled with fields with integer valued indexes */ { int i; struct hash *hash = hashNew(0); for (i=0; i<fieldCount; ++i) hashAdd(hash, fields[i], intToPt(i)); @@ -572,31 +657,31 @@ while (raNextTagVal(lf, &fieldName, &fieldSpec, NULL)) { verbose(2, " fieldName %s fieldSpec (%s)\n", fieldName, fieldSpec); struct newFieldInfo *fv = parseFieldVal(fieldName, fieldSpec, lf->fileName, lf->lineIx, symbols, symExists); if (fv->type == fvVar) { char *oldName = fieldSpec; if (isEmpty(oldName)) oldName = fieldName; int oldIx = stringArrayIx(oldName, inTable->fields, inTable->fieldCount); if (oldIx < 0) { if (fv->optional) continue; // Just skip optional ones we don't have - errAbort("%s doesn't exist in the %d fields of %s line %d of %s", + errAbort("'%s' doesn't exist in the %d fields of %s line %d of %s", oldName, inTable->fieldCount, inTable->name, lf->lineIx, lf->fileName); } fv->oldIx = oldIx; } fv->newIx = fieldCount++; slAddHead(&fvList, fv); } slReverse(&fvList); /* Create array of field names for output. */ char *fieldNames[fieldCount]; int i; struct newFieldInfo *fv = NULL; for (i=0, fv=fvList; i<fieldCount; ++i, fv=fv->next)