2d05d30ed4df1612d72ba84c812d004de935b122
angie
  Fri May 17 16:08:54 2024 -0700
Add lib module mmHash (memory-mapped hash), util tabToMmHash, and hgPhyloPlace support for using mmHash files instead of tab-separated files for metadata and name lookup.
Using mmHash for name lookup saves about 50-55 seconds for SARS-CoV-2 hgPhyloPlace name/ID queries.

diff --git src/lib/tests/mmHashTest.c src/lib/tests/mmHashTest.c
new file mode 100644
index 0000000..f0654eb
--- /dev/null
+++ src/lib/tests/mmHashTest.c
@@ -0,0 +1,105 @@
+/* mmHashTest - Make a hash of strings, write an mmHash file, memory-map it and look up items. */
+
+/* Copyright (C) 2024 The Regents of the University of California
+ * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
+
+#include "common.h"
+#include "linefile.h"
+#include "mmHash.h"
+#include "options.h"
+
+void usage()
+/* Report the command-line synopsis through errAbort; used when the arguments are wrong. */
+{
+static const char *const synopsis =
+  "mmHashTest - Make a hash of strings, write an mmHash file, memory-map it and look up items.\n"
+  "usage:\n"
+  "  mmHashTest in.txt out.mmh out.txt\n"
+  "Each line of in.txt contains a key followed by a tab character followed by a value.\n"
+  "in.txt is read into a hash which is converted to mmHash and written out to out.mmh.\n"
+  "out.mmh is read back in as a memory-mapped file, items are looked up and written to out.txt.\n"
+  "out.txt should contain the same contents as in.txt unless there are multiple lines with\n"
+  "the same key; in that case, only the last line with the key will be included in out.txt.\n";
+errAbort("%s", synopsis);
+}
+
+static struct optionSpec options[] = {   /* Option table handed to optionInit() in main. */
+    {NULL, 0},   /* Only entry: no options are defined (presumably the end-of-list marker). */
+};
+
+static void makeRandomString(char *buf, int bufSize)
+/* Fill buf with bufSize-1 random printable chars (ASCII 32-126) and 0-terminate; no-op if bufSize < 1. */
+{
+if (bufSize < 1)   // no room even for the terminator; buf[bufSize-1] would be out of bounds
+    return;
+for (int i = 0;  i < bufSize-1;  i++)
+    buf[i] = 32 + (rand() % 95);   // 95 printable codes: 32 through 126
+buf[bufSize-1] = 0;
+}
+
+void mmHashTest(char *inFileName, char *mmHashFileName, char *outFileName)
+/* Read key<tab>value lines of inFileName into a hash, save the hash as mmHash file mmHashFileName,
+ * load that file back, look up every input key and write key<tab>value lines to outFileName. */
+{
+FILE *f = mustOpen(outFileName, "w");
+// Read inFile into hash; keyList records the keys in input order for the lookups below.
+struct lineFile *lf = lineFileOpen(inFileName, TRUE);
+struct hash *hash = hashNew(0);
+struct slName *keyList = NULL;
+char *line;
+int size;
+while (lineFileNext(lf, &line, &size))
+    {
+    char *key = line;
+    char *val = "";   // a line without a tab is a key with an empty value
+    char *tab = strchr(line, '\t');
+    if (tab != NULL)
+        {
+        *tab = '\0';
+        val = tab + 1;
+        }
+    hashAdd(hash, key, cloneString(val));
+    slNameAddHead(&keyList, key);
+    }
+lineFileClose(&lf);
+slReverse(&keyList);
+
+// Convert hash to mmHash file.
+hashToMmHashFile(hash, mmHashFileName);
+freeHashAndVals(&hash);
+
+// Get that file memory-mapped.
+struct mmHash *mmh = mmHashFromFile(mmHashFileName);
+
+// Sanity check: a long random key should never appear in the input, so looking it up must
+// return NULL rather than crash or return some value.
+char longRandomString[512];
+makeRandomString(longRandomString, sizeof longRandomString);
+const char *shouldBeNull = mmHashFindVal(mmh, longRandomString);
+if (shouldBeNull != NULL)
+    errAbort("Error: I really did not expect to find the following line as a key in %s:\n%s\n"
+             "-- but there it was, with a value of '%s'",
+             inFileName, longRandomString, shouldBeNull);
+
+// Look up and write out all the items.
+for (struct slName *key = keyList;  key != NULL;  key = key->next)
+    {
+    const char *val = mmHashFindVal(mmh, key->name);
+    if (val == NULL)
+        errAbort("Lookup of key '%s' failed.", key->name);
+    fprintf(f, "%s\t%s\n", key->name, val);
+    }
+carefulClose(&f);
+slNameFreeList(&keyList);   // release the key list built while reading the input
+}
+
+int main(int argc, char *argv[])
+/* Parse options, require exactly three file name arguments, then run mmHashTest on them. */
+{
+optionInit(&argc, argv, options);
+if (argc != 4)
+    usage();
+char *inFileName = argv[1], *mmHashFileName = argv[2], *outFileName = argv[3];
+mmHashTest(inFileName, mmHashFileName, outFileName);
+return 0;
+}