72d331125b6436857a882ac41a4588810da20651 kent Fri May 26 14:36:55 2017 -0700 Adding new utility to collapse fields with same name into a single comma-separated field. diff --git src/tabFile/tabRepeatedFieldsToArrayField/tabRepeatedFieldsToArrayField.c src/tabFile/tabRepeatedFieldsToArrayField/tabRepeatedFieldsToArrayField.c new file mode 100644 index 0000000..7db4209 --- /dev/null +++ src/tabFile/tabRepeatedFieldsToArrayField/tabRepeatedFieldsToArrayField.c @@ -0,0 +1,139 @@ +/* tabRepeatedFieldsToArrayField - Convert columns that are repeated in a tab-separated file to a + * single column with comma separated values.. */ +#include "common.h" +#include "linefile.h" +#include "hash.h" +#include "options.h" +#include "fieldedTable.h" + +void usage() +/* Explain usage and exit. */ +{ +errAbort( + "tabRepeatedFieldsToArrayField - Convert columns that are repeated in a tab-separated file to a\n" + "single column with comma separated values.\n" + "usage:\n" + " tabRepeatedFieldsToArrayField in.tsv out.tsv\n" + "options:\n" + " -xxx=XXX\n" + ); +} + +/* Command line validation table. */ +static struct optionSpec options[] = { + {NULL, 0}, +}; + +struct fieldInfo +/* Information about a field */ + { + struct fieldInfo *next; + char *name; + struct slInt *offsetList; + }; + +void csvItemOutput(char *val, FILE *f) +/* Rewrite val, which may have some quotes or commas in it, in a way to be more compatable with + * csv list representation */ +{ +/* If there are no commas just output it */ +if (strchr(val, ',') == NULL) + { + fputs(val, f); + return; + } + +/* Strip surrounding quotes if any */ +val = trimSpaces(val); +int valLen = strlen(val); +if (valLen > 2 && val[0] == '"' && lastChar(val) == '"') + { + val[valLen-1] = 0; + val += 1; + } + +/* Put quotes around it and output, escaping internal quotes with double quotes */ +fputc('"', f); +char c; +while ((c = *val++) != 0) + { + if (c == '"') + fputc('"', f); + fputc(c, f); + } +fputc('"', f); +} + +void tabRepeatedFieldsToArrayField(char *inFile, char *outFile) +/* tabRepeatedFieldsToArrayField - Convert columns that are repeated in a tab-separated file to a + * single column with comma separated values. */ +{ +/* Read in tab-sep file */ +struct fieldedTable *table = fieldedTableFromTabFile(inFile, inFile, NULL, 0); + +/* Build up list and hash of fieldInfo from table's field list */ +struct hash *hash = hashNew(0); +struct fieldInfo *list = NULL, *field; +int i; +for (i=0; i<table->fieldCount; ++i) + { + char *name = table->fields[i]; + field = hashFindVal(hash, name); + if (field == NULL) + { + AllocVar(field); + field->name = name; + slAddHead(&list, field); + hashAdd(hash, name, field); + } + struct slInt *si = slIntNew(i); + slAddTail(&field->offsetList, si); + } +slReverse(&list); + + +/* Open output file and write out header row with optional leading # */ +FILE *f = mustOpen(outFile, "w"); +if (table->startsSharp) + fputc('#', f); +char *sep = ""; +for (field = list; field != NULL; field = field->next) + { + fprintf(f, "%s%s", sep, field->name); + sep = "\t"; + } +fputc('\n', f); + +/* Write out main rows */ +struct fieldedRow *fr; +for (fr = table->rowList; fr != NULL; fr = fr->next) + { + char **row = fr->row; + char *sep = ""; + for (field = list; field != NULL; field = field->next) + { + fputs(sep, f); + struct slInt *offset; + for (offset = field->offsetList; offset != NULL; offset = offset->next) + { + char *val = row[offset->val]; + csvItemOutput(val, f); + if (offset->next != NULL) + fputc(',', f); + } + sep = "\t"; + } + fputc('\n', f); + } +fputc('\n', f); +} + +int main(int argc, char *argv[]) +/* Process command line. */ +{ +optionInit(&argc, argv, options); +if (argc != 3) + usage(); +tabRepeatedFieldsToArrayField(argv[1], argv[2]); +return 0; +}