be4311c07e14feb728abc6425ee606ffaa611a58 markd Fri Jan 22 06:46:58 2021 -0800 merge with master diff --git src/utils/matrixRelabel/matrixRelabel.c src/utils/matrixRelabel/matrixRelabel.c new file mode 100644 index 0000000..47eecdb --- /dev/null +++ src/utils/matrixRelabel/matrixRelabel.c @@ -0,0 +1,181 @@ +/* matrixRelabel - Relabel rows and/or columns of a matrix. */ +#include "common.h" +#include "linefile.h" +#include "localmem.h" +#include "hash.h" +#include "options.h" +#include "obscure.h" + +void usage() +/* Explain usage and exit. */ +{ +errAbort( + "matrixRelabel - Relabel rows and/or columns of a matrix, that is a tab-sep-value\n" + "file where first row and first column are labels and rest are values\n" + "usage:\n" + " matrixRelabel in.tsv out.tsv\n" + "options:\n" + " -setCol=colLabels - set new column labels. Input is one line per label in a file\n" + " -setRow=rowLabels - set new row labels. Input is one line per label in a file\n" + " -lookupRow=lookup.tsv - pass row labels through lookup table of form old/new label\n" + " -trim - remove rows that don't lookup rather than aborting\n" + ); +} + +/* Command line validation table. */ +static struct optionSpec options[] = { + {"setCol", OPTION_STRING}, + {"setRow", OPTION_STRING}, + {"lookupRow", OPTION_STRING}, + {"trim", OPTION_BOOLEAN}, + {NULL, 0}, +}; + +struct hash *hashTwoColTsvFile(char *fileName) +/* Make a hash out of a two column tsv file where first col is key, second val */ +{ +struct hash *hash = hashNew(0); +struct lineFile *lf = lineFileOpen(fileName, TRUE); +char *row[2]; +while (lineFileRowTab(lf, row)) + { + hashAdd(hash, row[0], lmCloneString(hash->lm, row[1])); + } +lineFileClose(&lf); +return hash; +} + +void readLineArray(char *fileName, int *retCount, char ***retLines) +/* Return an array of strings, one for each line of file. Return # of lines in file too */ +{ +/* This is sloppy with memory but it doesn't matter since we won't free it. */ +struct slName *el, *list = readAllLines(fileName); +if (list == NULL) + errAbort("%s is empty", fileName); +int count = slCount(list); +char **lines; +AllocArray(lines, count); +int i; +for (i=0, el=list; i<count; ++i, el = el->next) + { + lines[i] = el->name; + } +*retCount = count; +*retLines = lines; +} + +void matrixRelabel(char *input, char *output) +/* matrixRelabel - Relabel rows and/or columns of a matrix. */ +{ +/* Set up stuff to relabel a column if need be */ +char **newColumns = NULL; +int newColumnCount = 0; +char *newColumnFile = optionVal("setCol", NULL); +if (newColumnFile != NULL) + readLineArray(newColumnFile, &newColumnCount, &newColumns); + +/* Set up stuff to relabel a row if new be */ +char **newRows = NULL; +int newRowCount = 0; +char *newRowFile = optionVal("setRow", NULL); +if (newRowFile != NULL) + readLineArray(newRowFile, &newRowCount, &newRows); + +/* Handle row lookup */ +struct hash *lookupHash = NULL; +char *lookupRowFile = optionVal("lookupRow", NULL); +if (lookupRowFile != NULL) + lookupHash = hashTwoColTsvFile(lookupRowFile); +boolean doTrim = optionExists("trim"); + +/* Open main input and output */ +struct lineFile *lf = lineFileOpen(input, TRUE); +FILE *f = mustOpen(output, "w"); + +/* Get first row. Set colCount from it */ +char *line; +int lineSize; +lineFileNeedNext(lf, &line, &lineSize); +int colCount = 0; +if (newColumns != NULL) + { + colCount = newColumnCount; + fputs(newColumns[0], f); + int i; + for (i=1; i<colCount; ++i) + fprintf(f, "\t%s", newColumns[i]); + fputc('\n', f); + } +else + { + char *word = nextTabWord(&line); + if (word != NULL) + { + fprintf(f, "%s", word); + colCount += 1; + } + while ((word = nextTabWord(&line)) != NULL) + { + fprintf(f, "\t%s", word); + colCount += 1; + } + fputc('\n', f); + } + +int rowIx = 1; +boolean checkedColCount = FALSE; +while (lineFileNext(lf, &line, NULL)) + { + char *rowLabel = nextTabWord(&line); + if (newRows != NULL) + { + if (rowIx >= newRowCount) + errAbort("Not enough lines in %s for %s", newRowFile, input); + rowLabel = newRows[rowIx]; + ++rowIx; + } + if (lookupHash != NULL) + { + char *newLabel = hashFindVal(lookupHash, rowLabel); + if (newLabel == NULL) + { + if (doTrim) + continue; + else + errAbort("Couldn't find %s from line %d of %s in %s", + rowLabel, lf->lineIx, lf->fileName, lookupRowFile); + } + rowLabel = newLabel; + } + if (!checkedColCount) + { + char *dupe = cloneString(line); + char *s = dupe; + int count = 1; // Include rowLabel + while (nextTabWord(&s) != NULL) + ++count; + freez(&dupe); + if (count != colCount) + lineFileExpectWords(lf, colCount, count); + checkedColCount = TRUE; + } + fprintf(f, "%s\t%s\n", rowLabel, line); + } +if (newRows != NULL) + { + if (rowIx != newRowCount) + errAbort("%s has %d rows and %s has %d, not all row labels written", + input, rowIx, newRowFile, newRowCount); + } +carefulClose(&f); +} + +int main(int argc, char *argv[]) +/* Process command line. */ +{ +optionInit(&argc, argv, options); +if (argc != 3) + usage(); +matrixRelabel(argv[1], argv[2]); +return 0; +}