2d05d30ed4df1612d72ba84c812d004de935b122 angie Fri May 17 16:08:54 2024 -0700 Add lib module mmHash (memory-mapped hash), util tabToMmHash, and hgPhyloPlace support for using mmHash files instead of tab-separated files for metadata and name lookup. Using mmHash for name lookup saves about 50-55 seconds for SARS-CoV-2 hgPhyloPlace name/ID queries. diff --git src/lib/mmHash.c src/lib/mmHash.c new file mode 100644 index 0000000..e72881c --- /dev/null +++ src/lib/mmHash.c @@ -0,0 +1,153 @@ +/* Memory mapped hash of strings */ + +/* This file is copyright 2024 UCSC Genome Browser Authors, but license is hereby + * granted for all use - public, private or commercial. */ + +#include "common.h" +#include +#include +#include "mmHash.h" + +unsigned char mmhMagicBytes[] = {0xF0, 'm', 'm', 'h', 'v', '1', 0x0F}; +size_t mmhMagicLen = sizeof mmhMagicBytes; + +void hashToMmHashFile(struct hash *hash, char *mmapFilePath) +/* Convert hash, whose values are null-terminated strings, to memory-mapped hash format and + * write to file at mmapFilePath. */ +{ +if (hash->powerOfTwoSize < MMHASH_MIN_POWER_OF_2_SIZE || + hash->powerOfTwoSize > MMHASH_MAX_POWER_OF_2_SIZE) + errAbort("hashToMmHashFile: power of two size must be between %d and %d but is %d", + MMHASH_MIN_POWER_OF_2_SIZE, MMHASH_MAX_POWER_OF_2_SIZE, hash->powerOfTwoSize); +verbose(2, "hashToMmHashFile: power of two size %d\n", hash->powerOfTwoSize); +unsigned char powerOfTwoSize = hash->powerOfTwoSize; +// First write the file: magic bytes, byte with powerOfTwoSize, bucket offsets and key/val storage +FILE *f = mustOpen(mmapFilePath, "w"); +mustWrite(f, mmhMagicBytes, mmhMagicLen); +mustWrite(f, &powerOfTwoSize, 1); +// Allocate the bucket array in memory and seek past it in the file; we will fill it in with +// offsets as we go, and when done, seek back and write it to file. +size_t bucketCount = (1 << powerOfTwoSize); +size_t bucketArraySize = bucketCount * 8; +uint64_t *bucketOffsets = needLargeZeroedMem(bucketArraySize); +off_t bucketStartOffset = mmhMagicLen + 1; +off_t offset = bucketStartOffset + bucketArraySize; +mustSeek(f, offset, SEEK_SET); +int occupiedBuckets = 0; +int maxElCount = 0; +// For each hash bucket, if it has elements, then write its element count, key(s) and string value(s) +// to file and store its offset in bucketOffsets. +off_t i; +for (i = 0; i < bucketCount; i++) + { + struct hashEl *helList = hash->table[i]; + if (helList != NULL) + { + bucketOffsets[i] = offset; + int helCount = slCount(helList); + if (helCount > MMHASH_MAX_EL_COUNT) + errAbort("hashToMmHashFile: bucket %lu has %d elements but mmHash can have up to %d; " + "can't convert this hash to mmHash, sorry.", + i, helCount, MMHASH_MAX_EL_COUNT); + if (helCount > maxElCount) + maxElCount = helCount; + if (helCount > 100 && verboseLevel() >= 2) + { + verbose(2, "hashToMmHashFile: bucket %lu has %d elements, here are some example keys", + i, helCount); + struct hashEl *hel; + int n; + for (hel = helList, n = 0; hel != NULL && n < 20; hel = hel->next, n++) + verbose(2, ", '%s'", hel->name); + verbose(2, "\n"); + } + bits16 elCount = helCount; + mustWrite(f, &elCount, sizeof elCount); + offset += sizeof elCount; + struct hashEl *hel; + for (hel = helList; hel != NULL; hel = hel->next) + { + int len = strlen(hel->name) + 1; + mustWrite(f, hel->name, len); + offset += len; + len = strlen((char *)hel->val) + 1; + mustWrite(f, hel->val, len); + offset += len; + } + occupiedBuckets++; + } + } +// Now seek back to the start of bucketOffsets and write it. +mustSeek(f, bucketStartOffset, SEEK_SET); +mustWrite(f, bucketOffsets, bucketArraySize); +carefulClose(&f); +verbose(2, "hashToMmHashFile: %d / %lu buckets occupied, max element count is %d\n", + occupiedBuckets, bucketCount, maxElCount); +} + +struct mmHash *mmHashFromFile(char *mmapFilePath) +/* Return an mmHash read in from memory-mapped file at mmapFilePath. */ +{ +struct mmHash *mmh; +AllocVar(mmh); +mmh->mmapFileName = cloneString(mmapFilePath); +mmh->mmapLength = fileSize(mmapFilePath); +FILE *f = mustOpen(mmapFilePath, "r"); +mmh->mmapBytes = mmap(NULL, mmh->mmapLength, PROT_READ, MAP_PRIVATE, fileno(f), 0); +if (mmh->mmapBytes == MAP_FAILED) + errnoAbort("mmHashFromFile: mmap of file failed: %s", mmapFilePath); +if (madvise(mmh->mmapBytes, mmh->mmapLength, MADV_RANDOM | MADV_WILLNEED) < 0) + errnoAbort("mmHashFromFile: madvise of file failed: %s", mmapFilePath); +// Check first 7 magic bytes +if (strncmp((char *)mmh->mmapBytes, (char *)mmhMagicBytes, mmhMagicLen)) + errAbort("mmHashFromFile: magic bytes not found at start of file %s", mmapFilePath); +unsigned char powerOfTwoSize = mmh->mmapBytes[mmhMagicLen]; +if (powerOfTwoSize < MMHASH_MIN_POWER_OF_2_SIZE || powerOfTwoSize > MMHASH_MAX_POWER_OF_2_SIZE) + errAbort("mmHashFromFile: power of two size must be between %d and %d but is %d", + MMHASH_MIN_POWER_OF_2_SIZE, MMHASH_MAX_POWER_OF_2_SIZE, powerOfTwoSize); +mmh->powerOfTwoSize = powerOfTwoSize; +mmh->mask = ((1 << powerOfTwoSize) - 1); +mmh->bucketOffsets = (uint64_t *)(mmh->mmapBytes + mmhMagicLen + 1); +carefulClose(&f); +return mmh; +} + +const char *mmHashFindVal(struct mmHash *mmh, char *key) +/* Look up key in mmh and return its string value or NULL if not found. Do not modify return val. */ +{ +bits32 keyHash = hashString(key) & mmh->mask; +size_t bucketOffset = mmh->bucketOffsets[keyHash]; +// Bucket offsets are always greater than 0 because of the header stuff at the beginning of the file. +// If bucketOffset is 0, that is a special code for 'nothing in the bucket for this hash'. +if (bucketOffset == 0) + return NULL; +unsigned char *p = mmh->mmapBytes + bucketOffset; +// First two bytes of bucket is element count +bits16 elCount = *((bits16 *)p); +p += sizeof elCount; +int i; +for (i = 0; i < elCount; i++) + { + char *keyStr = (char *)p; + p += strlen(keyStr) + 1; + const char *valStr = (const char *)p; + p += strlen(valStr) + 1; + if (!strcmp(key, keyStr)) + return valStr; + } +return NULL; +} + +void mmHashFree(struct mmHash **pMmh) +/* Free the allocated memory for *pMmh and unmap the mapped memory range if not NULL, + * but leave the memory-mapped file in place for other processes. */ +{ +if (pMmh != NULL && *pMmh != NULL) + { + struct mmHash *mmh = *pMmh; + if (munmap(mmh->mmapBytes, mmh->mmapLength)) + errnoAbort("mmHashFree: munmap failed"); + freez(&(mmh->mmapFileName)); + freez(pMmh); + } +}