be4311c07e14feb728abc6425ee606ffaa611a58 markd Fri Jan 22 06:46:58 2021 -0800 merge with master diff --git src/utils/subColumn/subColumn.c src/utils/subColumn/subColumn.c index 00c2eaa..341d0b5 100644 --- src/utils/subColumn/subColumn.c +++ src/utils/subColumn/subColumn.c @@ -1,136 +1,156 @@ /* subColumn - Substitute one column in a tab-separated file.. */ /* Copyright (C) 2011 The Regents of the University of California * See README in this or parent directory for licensing information. */ #include "common.h" +#include "localmem.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "dystring.h" #include "obscure.h" boolean isList = FALSE; FILE *fMiss = NULL; void usage() /* Explain usage and exit. */ { errAbort( "subColumn - Substitute one column in a tab-separated file.\n" "usage:\n" " subColumn column in.tab sub.tab out.tab\n" "Where:\n" " column is the column number (starting with 1)\n" " in.tab is a tab-separated file\n" " sub.tab is a where first column is old values, second new\n" " out.tab is the substituted output\n" "options:\n" " -list - Column is a comma-separated list. Substitute all elements in list\n" " -miss=fileName - Print misses to this file instead of aborting\n" ); } static struct optionSpec options[] = { {"list", OPTION_BOOLEAN}, {"miss", OPTION_STRING}, {NULL, 0}, }; char *subCommaList(struct hash *subHash, char *in) /* In is a comma-separated list. Return the list with * substitution done. This routine not reentrant. */ { static struct dyString *dy; if (dy == NULL) dy = dyStringNew(0); else dyStringClear(dy); while (in != NULL && in[0] != 0) { char *e = strchr(in, ','); if (e != NULL) *e++ = 0; char *s = hashMustFindVal(subHash, in); dyStringPrintf(dy, "%s,", s); in = e; } return dy->string; } int missCount = 0; +struct hash *hashTwoColumnTsv(char *fileName) +/* Given a two column file (key, value) return a hash. */ +{ +struct lineFile *lf = lineFileOpen(fileName, TRUE); +struct hash *hash = hashNew(16); +char *row[3]; +int fields = 0; +while ((fields = lineFileChopTab(lf, row)) != 0) + { + lineFileExpectWords(lf, 2, fields); + char *name = row[0]; + char *value = lmCloneString(hash->lm, row[1]); + hashAdd(hash, name, value); + } +lineFileClose(&lf); +return hash; +} + + void subColumn(char *asciiColumn, char *inFile, char *subFile, char *outFile) /* subColumn - Substitute one column in a tab-separated file.. */ { -struct hash *subHash = hashTwoColumnFile(subFile); +struct hash *subHash = hashTwoColumnTsv(subFile); int column = atoi(asciiColumn); if (column == 0) usage(); else column -= 1; char *row[1024*4]; struct lineFile *lf = lineFileOpen(inFile, TRUE); FILE *f = mustOpen(outFile, "w"); int rowCount; while ((rowCount = lineFileChopNextTab(lf, row, ArraySize(row))) > 0) { if (rowCount == ArraySize(row)) errAbort("Too many columns (%d) line %d of %s.", rowCount, lf->lineIx, lf->fileName); if (column >= rowCount) errAbort("Not enough columns (%d) line %d of %s.", rowCount, lf->lineIx, lf->fileName); int i; for (i=0; i<rowCount; ++i) { char *s = row[i]; if (i == column) { if (isList) { s = subCommaList(subHash, s); } else { char *sub = hashFindVal(subHash, s); if (sub == NULL) { if (fMiss) { fprintf(fMiss, "%s\n", s); ++missCount; } else errAbort("%s not in %s line %d of %s", s, subFile, lf->lineIx, lf->fileName); } else s = sub; } } fputs(s, f); if (i == rowCount-1) fputc('\n', f); else fputc('\t', f); } } carefulClose(&f); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 5) usage(); isList = optionExists("list"); char *fileName = optionVal("miss", NULL); if (fileName != NULL) fMiss = mustOpen(fileName, "w"); subColumn(argv[1], argv[2], argv[3], argv[4]); if (fMiss != NULL) { carefulClose(&fMiss); warn("missed %d\n", missCount); } return 0; }