bfbbdcad6821fa4a7acf957b7061f5164c0018ad kent Tue Mar 22 18:01:12 2011 -0700 Utility to print out lines shared by all files (or from a column in all files) seems to work. diff --git src/utils/linesInAllFiles/linesInAllFiles.c src/utils/linesInAllFiles/linesInAllFiles.c new file mode 100644 index 0000000..f544cf1 --- /dev/null +++ src/utils/linesInAllFiles/linesInAllFiles.c @@ -0,0 +1,105 @@ +/* linesInAllFiles - Print lines that are in all input files.. */ +#include "common.h" +#include "linefile.h" +#include "hash.h" +#include "options.h" + +static char const rcsid[] = "$Id: newProg.c,v 1.30 2010/03/24 21:18:33 hiram Exp $"; + +void usage() +/* Explain usage and exit. */ +{ +errAbort( + "linesInAllFiles - Print lines that are in all input files to stdout.\n" + "usage:\n" + " linesInAllFiles in1.txt in2.txt ... inN.txt\n" + "The order of output will follow the order in the last file.\n" + "This only puts out the first occurence of the line if it occurs multiple times\n" + "options:\n" + " -col=N - if set, file is tab-delimited and just the given column (starting with 1)\n" + " In this case only that column will be output\n" + ); +} + +int col = -1; + +static struct optionSpec options[] = { + {"col", OPTION_INT,}, + {NULL, 0}, +}; + +char *next(struct lineFile *lf) +/* Return next input from file or NULL if at end. */ +{ +if (col <= 0) + { + char *line; + if (!lineFileNext(lf, &line, NULL)) + return NULL; + return line; + } +else + { + char *row[col]; + if (!lineFileRow(lf, row)) + return NULL; + return row[col-1]; + } +} + +void linesInAllFiles(int inCount, char *inFiles[]) +/* linesInAllFiles - Print lines that are in all input files.. */ +{ +/* Hash first file. */ +struct hash *curHash = hashNew(17); +struct lineFile *lf = lineFileOpen(inFiles[0], TRUE); +char *s; +while ((s = next(lf)) != NULL) + hashStore(curHash, s); +lineFileClose(&lf); + +/* For middle files just replace hash with a smaller one. */ +int i; +int lastIn = inCount-1; +for (i=1; i<lastIn; ++i) + { + struct hash *nextHash = hashNew(17); + lf = lineFileOpen(inFiles[i], TRUE); + while ((s = next(lf)) != NULL) + { + if (hashLookup(curHash, s) != NULL) + hashStore(nextHash, s); + } + lineFileClose(&lf); + hashFree(&curHash); + curHash = nextHash; + } + +/* For last one print out hits. */ +struct hash *uniqHash = hashNew(17); +lf = lineFileOpen(inFiles[lastIn], TRUE); +while ((s = next(lf)) != NULL) + { + if (hashLookup(curHash, s) != NULL) + { + if (hashLookup(uniqHash, s) == NULL) + printf("%s\n", s); + else + hashAdd(uniqHash,s, NULL); + } + } +lineFileClose(&lf); +} + +int main(int argc, char *argv[]) +/* Process command line. */ +{ +optionInit(&argc, argv, options); +if (argc < 3) + usage(); +col = optionInt("col", col); +if (col > 1000) + errAbort("col must be a number between 1 and 1000"); +linesInAllFiles(argc-1, argv+1); +return 0; +}