ff2840f7c732b4d9d368ce18ddcac2ce6c7148ee kent Mon Jan 11 14:50:23 2021 -0800 Adding options to do table lookup on labels, to trim them, and renaming some old options to be more clear. diff --git src/utils/matrixRelabel/matrixRelabel.c src/utils/matrixRelabel/matrixRelabel.c index 975a9c3..3f46ac8 100644 --- src/utils/matrixRelabel/matrixRelabel.c +++ src/utils/matrixRelabel/matrixRelabel.c @@ -1,119 +1,158 @@ /* matrixRelabel - Relabel rows and/or columns of a matrix. */ #include "common.h" #include "linefile.h" +#include "localmem.h" #include "hash.h" #include "options.h" #include "obscure.h" void usage() /* Explain usage and exit. */ { errAbort( - "matrixRelabel - Relabel rows and/or columns of a matrix\n" + "matrixRelabel - Relabel rows and/or columns of a matrix, that is a tab-sep-value\n" + "file where first row and first column are labels and rest are values\n" "usage:\n" " matrixRelabel in.tsv out.tsv\n" "options:\n" - " -newCol=colLabels - one line per label in a file\n" - " -newRow=rowLabels - one line per label in a file\n" + " -setCol=colLabels - set new column labels. Input is one line per label in a file\n" + " -setRow=rowLabels - set new row labels. Input is one line per label in a file\n" + " -lookupRow=lookup.tsv - pass row labels through lookup table of form old/new label\n" + " -trim - remove rows that don't lookup rather than aborting\n" ); } /* Command line validation table. */ static struct optionSpec options[] = { - {"newCol", OPTION_STRING}, - {"newRow", OPTION_STRING}, - {"first", OPTION_STRING}, + {"setCol", OPTION_STRING}, + {"setRow", OPTION_STRING}, + {"lookupRow", OPTION_STRING}, + {"trim", OPTION_BOOLEAN}, {NULL, 0}, }; +struct hash *hashTwoColTsvFile(char *fileName) +/* Make a hash out of a two column tsv file where first col is key, second val */ +{ +struct hash *hash = hashNew(0); +struct lineFile *lf = lineFileOpen(fileName, TRUE); +char *row[2]; +while (lineFileRowTab(lf, row)) + { + hashAdd(hash, row[0], lmCloneString(hash->lm, row[1])); + } +lineFileClose(&lf); +return hash; +} + void readLineArray(char *fileName, int *retCount, char ***retLines) /* Return an array of strings, one for each line of file. Return # of lines in file too */ { /* This is sloppy with memory but it doesn't matter since we won't free it. */ struct slName *el, *list = readAllLines(fileName); if (list == NULL) errAbort("%s is empty", fileName); int count = slCount(list); char **lines; AllocArray(lines, count); int i; for (i=0, el=list; i<count; ++i, el = el->next) { lines[i] = el->name; } *retCount = count; *retLines = lines; } void matrixRelabel(char *input, char *output) /* matrixRelabel - Relabel rows and/or columns of a matrix. */ { /* Set up stuff to relabel a column if need be */ char **newColumns = NULL; int newColumnCount = 0; -char *newColumnFile = optionVal("newCol", NULL); +char *newColumnFile = optionVal("setCol", NULL); if (newColumnFile != NULL) readLineArray(newColumnFile, &newColumnCount, &newColumns); - /* Set up stuff to relabel a row if new be */ char **newRows = NULL; int newRowCount = 0; -char *newRowFile = optionVal("newRow", NULL); +char *newRowFile = optionVal("setRow", NULL); if (newRowFile != NULL) readLineArray(newRowFile, &newRowCount, &newRows); +/* Handle row lookup */ +struct hash *lookupHash = NULL; +char *lookupRowFile = optionVal("lookupRow", NULL); +if (lookupRowFile != NULL) + lookupHash = hashTwoColTsvFile(lookupRowFile); +boolean doTrim = optionExists("trim"); + +/* Open main input and output */ struct lineFile *lf = lineFileOpen(input, TRUE); FILE *f = mustOpen(output, "w"); /* Get first row. Set colCount from it */ char *line; int lineSize; lineFileNeedNext(lf, &line, &lineSize); int colCount = 0; if (newColumns != NULL) { colCount = newColumnCount; fputs(newColumns[0], f); int i; for (i=1; i<colCount; ++i) fprintf(f, "\t%s", newColumns[i]); fputc('\n', f); } else { - char *word; + char *word = nextTabWord(&line); + if (word != NULL) + fprintf(f, "%s", word); while ((word = nextTabWord(&line)) != NULL) { fprintf(f, "\t%s", word); colCount += 1; } fputc('\n', f); } int rowIx = 0; while (lineFileNext(lf, &line, NULL)) { + char *rowLabel = nextTabWord(&line); if (newRows != NULL) { ++rowIx; if (rowIx >= newRowCount) errAbort("Not enough lines in %s for %s", newRowFile, input); - fputs(newRows[rowIx], f); - fputc('\t', f); - nextTabWord(&line); // skip over old first word + rowLabel = newRows[rowIx]; } - fputs(line, f); - fputc('\n', f); + if (lookupHash != NULL) + { + char *newLabel = hashFindVal(lookupHash, rowLabel); + if (newLabel == NULL) + { + if (doTrim) + continue; + else + errAbort("Couldn't find %s from line %d of %s in %s", + rowLabel, lf->lineIx, lf->fileName, lookupRowFile); + } + rowLabel = newLabel; + } + fprintf(f, "%s\t%s\n", rowLabel, line); } } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 3) usage(); matrixRelabel(argv[1], argv[2]); return 0; }