3a729cfecfb1a7fe3d219501174d5e419cd4177f kent Wed Sep 28 12:15:57 2011 -0700 Making a utility to compare two lists of IDs. diff --git src/utils/vennIds/vennIds.c src/utils/vennIds/vennIds.c new file mode 100644 index 0000000..045142a --- /dev/null +++ src/utils/vennIds/vennIds.c @@ -0,0 +1,124 @@ +/* vennIds - From two lists of unique IDs, calculate number shared between each list and number + * only in each list.. */ +#include "common.h" +#include "linefile.h" +#include "hash.h" +#include "options.h" +#include "obscure.h" + +static char const rcsid[] = "$Id: newProg.c,v 1.30 2010/03/24 21:18:33 hiram Exp $"; + +void usage() +/* Explain usage and exit. */ +{ +errAbort( + "vennIds - From two lists of unique IDs, calculate number shared between each list and number\n" + "only in each list.\n" + "usage:\n" + " vennIds a.lst b.lst\n" + "where a.lst and b.lst are whitespace delimited files of IDs\n" + "options:\n" + " -both=both.out\n - if set put ids that are in both here, one per line" + " -aOnly=aOnly.out - if set put ids that are in a.lst only here, one per line\n" + " -bOnly=bOnly.out - if set put ids that are in b.lst only here, one per line\n" + ); +} + +static struct optionSpec options[] = { + {"aOnly", OPTION_STRING}, + {"bOnly", OPTION_STRING}, + {NULL, 0}, +}; + +struct hash *hashUniqueWords(char *fileName, char **words, int wordCount) +/* Create a new hash filled with words, checking that they are unique. */ +{ +struct hash *hash = hashNew(0); +int i; +for (i=0; i<wordCount; ++i) + { + char *word = words[i]; + if (hashLookup(hash, word) != NULL) + errAbort("Identifier %s is duplicated in file %s", word, fileName); + hashAdd(hash, word, NULL); + } +return hash; +} + +FILE *createFileForOption(char *option) +/* If option exists, create a file to write named by the option val. */ +{ +char *fileName = optionVal(option, NULL); +if (fileName == NULL) + return NULL; +else + return mustOpen(fileName, "w"); +} + +void vennIds(char *aFile, char *bFile) +/* vennIds - From two lists of unique IDs, calculate number shared between each list and number only in each list.. */ +{ +/* Read all words in both files into two arrays. */ +int aWordCount = 0, bWordCount = 0; +char **aWords, *aBuf, **bWords, *bBuf; +readAllWords(aFile, &aWords, &aWordCount, &aBuf); +readAllWords(bFile, &bWords, &bWordCount, &bBuf); + +/* Make hashes, ensuring that each word is unique. */ +struct hash *aHash = hashUniqueWords(aFile, aWords, aWordCount); +struct hash *bHash = hashUniqueWords(bFile, bWords, bWordCount); + +/* Open up output files if any. */ +FILE *aOnlyOut = createFileForOption("aOnly"); +FILE *bOnlyOut = createFileForOption("bOnly"); +FILE *bothOut = createFileForOption("both"); + +/* Count number of words that are in both. */ +int i, sharedCount = 0; +for (i=0; i<aWordCount; ++i) + { + char *aWord = aWords[i]; + if (hashLookup(bHash, aWord)) + { + if (bothOut != NULL) + fprintf(bothOut, "%s\n", aWord); + ++sharedCount; + } + else + { + if (aOnlyOut != NULL) + fprintf(aOnlyOut, "%s\n", aWord); + } + } + +/* If we are outputting bOnly, we need to scan through bWords too. */ +if (bOnlyOut) + { + for (i=0; i<bWordCount; ++i) + { + char *bWord = bWords[i]; + if (!hashLookup(aHash, bWord)) + fprintf(bOnlyOut, "%s\n", bWord); + } + } + +/* Calculate overlap statistics. */ +double total = aWordCount + bWordCount - sharedCount; +int aOnlyCount = aWordCount - sharedCount; +int bOnlyCount = bWordCount - sharedCount; + +/* Print result. */ +printf("%d (%4.2f%%) only in %s\n", aOnlyCount, 100.0 * aOnlyCount / total, aFile); +printf("%d (%4.2f%%) in both\n", sharedCount, 100.0 * sharedCount/total); +printf("%d (%4.2f%%) only in %s\n", bOnlyCount, 100.0 * bOnlyCount / total, bFile); +} + +int main(int argc, char *argv[]) +/* Process command line. */ +{ +optionInit(&argc, argv, options); +if (argc != 3) + usage(); +vennIds(argv[1], argv[2]); +return 0; +}