2d05d30ed4df1612d72ba84c812d004de935b122
angie
  Fri May 17 16:08:54 2024 -0700
Add lib module mmHash (memory-mapped hash), util tabToMmHash, and hgPhyloPlace support for using mmHash files instead of tab-separated files for metadata and name lookup.
Using mmHash for name lookup saves about 50-55 seconds for SARS-CoV-2 hgPhyloPlace name/ID queries.

diff --git src/inc/mmHash.h src/inc/mmHash.h
new file mode 100644
index 0000000..7cf7d0b
--- /dev/null
+++ src/inc/mmHash.h
@@ -0,0 +1,71 @@
+/* Memory mapped hash: both keys and values are 0-terminated strings */
+
+/* This file is copyright 2024 UCSC Genome Browser Authors, but license is hereby
+ * granted for all use - public, private or commercial. */
+
+// Typical use:
+//
+// * build a regular hash, convert to mmHash format and write to file
+//    struct hash *hash = hashFrom...(...);
+//    hashToMmHashFile(hash, pathToFile);
+//
+// * read mmHash from memory mapped file path, look up values
+//    struct mmHash *mmh = mmHashFromFile(pathToFile);
+//    char *val = mmHashFindVal(mmh, itemName);
+//    ...
+//    mmHashFree(&mmh);
+
+// Memory mapped file layout:
+//
+// * magic bytes    [7 bytes]  0xF0 m m h v 1 0x0F
+// * powerOfTwoSize [1 byte]
+// * bucketOffsets  [8 bytes * 2^powerOfTwoSize]
+// * variable-length key and value storage, into which bucketOffsets point.  Series of bucket elems:
+//   * elCount [2 bytes]
+//   * repeating elCount times:
+//     * keyString [0-terminated string]
+//     * valString [0-terminated string]
+// Values of bucketOffsets are either 0 (no items in bucket) or offsets into the memory mapped file.
+// They are 8 bytes because 4 bytes would limit the total size of the file (buckets, keys, values,
+// overhead) to 4GB which is a limit we could conceivably run up against with lots of large values.
+// 8 bytes seems wasteful but at least it's aligned.
+
+#ifndef MMHASH_H
+#define MMHASH_H
+
+#include "hash.h"
+#include <stdint.h>
+
+// The min is somewhat arbitrary, just meant to make sure caller isn't passing in garbage.
+#define MMHASH_MIN_POWER_OF_2_SIZE 3
+#define MMHASH_MAX_POWER_OF_2_SIZE hashMaxSize
+
+// elCount is two bytes so this is a hard limit.  Hopefully hash bucket element counts will always
+// be much less than this.
+#define MMHASH_MAX_EL_COUNT ((1 << 16) - 1)
+
+struct mmHash
+{
+    uint32_t powerOfTwoSize;    // power of two size between MMHASH_{MIN,MAX}_POWER_OF_2_SIZE
+    uint32_t mask;              // for convenience, mask on hashed key ((1 << powerOfTwoSize) - 1)
+    char *mmapFileName;         // path to memory-mapped file
+    unsigned char *mmapBytes;   // pointer to start of memory-mapped file
+    size_t mmapLength;          // number of memory-mapped bytes
+    uint64_t *bucketOffsets;    // for convenience, pointer to start of bucket array in mmapBytes
+};
+
+void hashToMmHashFile(struct hash *hash, char *mmapFilePath);
+/* Convert hash, whose values are null-terminated strings, to memory-mapped hash format and
+ * write to file at mmapFilePath. */
+
+struct mmHash *mmHashFromFile(char *mmapFilePath);
+/* Return an mmHash read in from memory-mapped file at mmapFilePath. */
+
+const char *mmHashFindVal(struct mmHash *mmh, char *key);
+/* Look up key in mmh and return its string value or NULL if not found.  Do not modify return val. */
+
+void mmHashFree(struct mmHash **pMmh);
+/* Free the allocated memory for *pMmh and unmap the mapped memory range if not NULL,
+ * but leave the memory-mapped file in place for other processes. */
+
+#endif // MMHASH_H