2d05d30ed4df1612d72ba84c812d004de935b122 angie Fri May 17 16:08:54 2024 -0700 Add lib module mmHash (memory-mapped hash), util tabToMmHash, and hgPhyloPlace support for using mmHash files instead of tab-separated files for metadata and name lookup. Using mmHash for name lookup saves about 50-55 seconds for SARS-CoV-2 hgPhyloPlace name/ID queries. diff --git src/utils/tabToMmHash/tabToMmHash.c src/utils/tabToMmHash/tabToMmHash.c new file mode 100644 index 0000000..837c554 --- /dev/null +++ src/utils/tabToMmHash/tabToMmHash.c @@ -0,0 +1,67 @@ +/* tabToMmHash - Read in a tab-sep file, hash first column to string of remaining columns, + * write mmHash file. */ +#include "common.h" +#include "linefile.h" +#include "mmHash.h" +#include "options.h" + +/* This file is copyright 2024 UCSC Genome Browser Authors, but license is hereby + * granted for all use - public, private or commercial. */ + +void usage() +/* Explain usage and exit. */ +{ +errAbort( + "tabToMmHash - Read in a tab-sep file, hash first column to string of remaining columns, write mmHash file\n" + "usage:\n" + " tabToMmHash in.tab out.mmh\n" +// "options:\n" +// " -xxx=XXX\n" + ); +} + +/* Command line validation table. */ +static struct optionSpec options[] = { + {NULL, 0}, +}; + +void tabToMmHash(char *tabIn, char *mmhOut) +/* tabToMmHash - Read in a tab-sep file, hash first column to string of remaining columns, + * write mmHash file. */ +{ +// Read inFile into hash +struct lineFile *lf = lineFileOpen(tabIn, TRUE); +struct hash *hash = hashNew(0); +struct slName *keyList = NULL; +char *line; +int size; +while (lineFileNext(lf, &line, &size)) + { + char *key = line; + char *val = ""; + char *tab = strchr(line, '\t'); + if (tab != NULL) + { + *tab = '\0'; + val = tab + 1; + } + hashAdd(hash, key, cloneString(val)); + slNameAddHead(&keyList, key); + } +lineFileClose(&lf); +slReverse(&keyList); + +// Convert hash to mmHash file. +hashToMmHashFile(hash, mmhOut); +freeHashAndVals(&hash); +} + +int main(int argc, char *argv[]) +/* Process command line. */ +{ +optionInit(&argc, argv, options); +if (argc != 3) + usage(); +tabToMmHash(argv[1], argv[2]); +return 0; +}