3a729cfecfb1a7fe3d219501174d5e419cd4177f
kent
  Wed Sep 28 12:15:57 2011 -0700
Making a utility to compare two lists of IDs.
diff --git src/utils/vennIds/vennIds.c src/utils/vennIds/vennIds.c
new file mode 100644
index 0000000..045142a
--- /dev/null
+++ src/utils/vennIds/vennIds.c
@@ -0,0 +1,124 @@
+/* vennIds - From two lists of unique IDs, calculate the number of IDs shared by both lists
+ * and the number found only in each list. */
+#include "common.h"
+#include "linefile.h"
+#include "hash.h"
+#include "options.h"
+#include "obscure.h"
+
+static char const rcsid[] = "$Id: newProg.c,v 1.30 2010/03/24 21:18:33 hiram Exp $"; /* NOTE(review): Id still names newProg.c, presumably the template this file was made from - confirm. */
+
+void usage()
+/* Explain usage and exit, by handing the complete help text to errAbort. */
+{
+errAbort(
+  "vennIds - From two lists of unique IDs, calculate number shared between each list and number\n"
+  "only in each list.\n"
+  "usage:\n"
+  "   vennIds a.lst b.lst\n"
+  "where a.lst and b.lst are whitespace delimited files of IDs\n"
+  "options:\n"
+  "   -both=both.out    - if set put ids that are in both here, one per line\n"
+  "   -aOnly=aOnly.out  - if set put ids that are in a.lst only here, one per line\n"
+  "   -bOnly=bOnly.out  - if set put ids that are in b.lst only here, one per line\n"
+  );
+}
+
+static struct optionSpec options[] = {  /* Options accepted; given to optionInit in main. */
+   {"aOnly", OPTION_STRING}, {"bOnly", OPTION_STRING},
+   {"both", OPTION_STRING},  /* Looked up by vennIds and listed in usage, so it must be here. */
+   {NULL, 0},
+};
+
+struct hash *hashUniqueWords(char *fileName, char **words, int wordCount)
+/* Return a new hash keyed by each of the wordCount strings in words.  Aborts, naming
+ * fileName in the message, as soon as a word turns up a second time. */
+{
+struct hash *seen = hashNew(0);
+int left;
+for (left = wordCount; left > 0; --left, ++words)
+    {
+    if (hashLookup(seen, *words) != NULL)
+        errAbort("Identifier %s is duplicated in file %s", *words, fileName);
+    hashAdd(seen, *words, NULL);
+    }
+return seen;
+}
+
+FILE *createFileForOption(char *option)
+/* Return a file opened for writing at the path given as the value of the named command
+ * line option, or NULL when that option was not set. */
+{
+char *path = optionVal(option, NULL);
+if (path != NULL)
+    return mustOpen(path, "w");
+return NULL;
+}
+
+void vennIds(char *aFile, char *bFile)
+/* Compare the unique IDs in aFile and bFile: print how many are only in a, in both, and
+ * only in b, and write each class to the -aOnly, -both and -bOnly files when requested. */
+{
+/* Read all words in both files into two arrays. */
+int aWordCount = 0, bWordCount = 0;
+char **aWords, *aBuf, **bWords, *bBuf;
+readAllWords(aFile, &aWords, &aWordCount, &aBuf);
+readAllWords(bFile, &bWords, &bWordCount, &bBuf);
+
+/* Make hashes, ensuring that each word is unique. */
+struct hash *aHash = hashUniqueWords(aFile, aWords, aWordCount);
+struct hash *bHash = hashUniqueWords(bFile, bWords, bWordCount);
+
+/* Open up output files if any. */
+FILE *aOnlyOut = createFileForOption("aOnly");
+FILE *bOnlyOut = createFileForOption("bOnly");
+FILE *bothOut = createFileForOption("both");
+
+/* Count words of a that are also in b, writing each to the both or aOnly output. */
+int i, sharedCount = 0;
+for (i=0; i<aWordCount; ++i)
+    {
+    char *aWord = aWords[i];
+    if (hashLookup(bHash, aWord))
+        {
+        ++sharedCount;
+        if (bothOut != NULL)
+            fprintf(bothOut, "%s\n", aWord);
+        }
+    else if (aOnlyOut != NULL)
+        fprintf(aOnlyOut, "%s\n", aWord);
+    }
+
+/* The counts never need a pass over b, so only scan it when bOnly output was requested. */
+for (i=0; bOnlyOut != NULL && i<bWordCount; ++i)
+    {
+    if (!hashLookup(aHash, bWords[i]))
+        fprintf(bOnlyOut, "%s\n", bWords[i]);
+    }
+
+/* Close whichever outputs were opened; carefulClose ignores handles that are NULL. */
+carefulClose(&aOnlyOut);
+carefulClose(&bOnlyOut);
+carefulClose(&bothOut);
+
+/* Calculate overlap statistics, using 1 for an empty union so 0/0 never prints as nan. */
+int unionCount = aWordCount + bWordCount - sharedCount;
+double total = (unionCount > 0 ? unionCount : 1);
+int aOnlyCount = aWordCount - sharedCount;
+int bOnlyCount = bWordCount - sharedCount;
+
+/* Print result. */
+printf("%d (%4.2f%%) only in %s\n", aOnlyCount, 100.0 * aOnlyCount / total, aFile);
+printf("%d (%4.2f%%) in both\n", sharedCount, 100.0 * sharedCount/total);
+printf("%d (%4.2f%%) only in %s\n", bOnlyCount, 100.0 * bOnlyCount / total, bFile);
+}
+
+int main(int argc, char *argv[])
+/* Process command line: parse options, then pass the two list file names to vennIds. */
+{
+optionInit(&argc, argv, options);
+if (argc != 3)    /* Anything but program name plus two file arguments gets the usage text. */
+    usage();
+vennIds(argv[1], argv[2]);
+return 0;
+}