src/hg/hgPhyloPlace/phyloPlace.c 2d05d30ed4df1612d72ba84c812d004de935b122

2d05d30ed4df1612d72ba84c812d004de935b122
angie
  Fri May 17 16:08:54 2024 -0700
Add lib module mmHash (memory-mapped hash), util tabToMmHash, and hgPhyloPlace support for using mmHash files instead of tab-separated files for metadata and name lookup.
Using mmHash for name lookup saves about 50-55 seconds for SARS-CoV-2 hgPhyloPlace name/ID queries.

diff --git src/hg/hgPhyloPlace/phyloPlace.c src/hg/hgPhyloPlace/phyloPlace.c
index 82fa6f1..b19a387 100644
--- src/hg/hgPhyloPlace/phyloPlace.c
+++ src/hg/hgPhyloPlace/phyloPlace.c
@@ -7,30 +7,31 @@
 #include "cart.h"
 #include "cheapcgi.h"
 #include "errCatch.h"
 #include "fa.h"
 #include "genePred.h"
 #include "grp.h"
 #include "hCommon.h"
 #include "hash.h"
 #include "hgConfig.h"
 #include "htmshell.h"
 #include "hubConnect.h"
 #include "hui.h"
 #include "iupac.h"
 #include "jsHelper.h"
 #include "linefile.h"
+#include "mmHash.h"
 #include "obscure.h"
 #include "parsimonyProto.h"
 #include "phyloPlace.h"
 #include "phyloTree.h"
 #include "pipeline.h"
 #include "psl.h"
 #include "pthreadWrap.h"
 #include "ra.h"
 #include "regexHelper.h"
 #include "trackHub.h"
 #include "trashDir.h"
 #include "vcf.h"
 
 // Globals:
 static boolean measureTiming = FALSE;
@@ -914,154 +915,201 @@
 struct variantPathNode *vpn;
 for (vpn = variantPath;  vpn != NULL;  vpn = vpn->next)
     {
     if (vpn != variantPath)
         fprintf(f, " > ");
     struct singleNucChange *snc;
     for (snc = vpn->sncList;  snc != NULL;  snc = snc->next)
         {
         if (snc != vpn->sncList)
             fprintf(f, ", ");
         fprintf(f, "%c%d%c", snc->parBase, snc->chromStart+1, snc->newBase);
         }
     }
 }
 
-static struct hash *getSampleMetadata(char *metadataFile)
+static void getSampleMetadataHash(char *metadataFile, struct sampleMetadataStore *sms)
 /* If config.ra defines a metadataFile, load its contents into a hash indexed by name (and INSDC
  * nucleotide accession sans dot if present) and return the hash; otherwise return NULL. */
 {
-struct hash *sampleMetadata = NULL;
 if (isNotEmpty(metadataFile) && fileExists(metadataFile))
     {
-    sampleMetadata = hashNew(0);
+    struct hash *sampleMetadata = hashNew(0);
     struct lineFile *lf = lineFileOpen(metadataFile, TRUE);
     int headerWordCount = 0;
     char **headerWords = NULL;
     char *line;
     // Check for header line
     if (lineFileNext(lf, &line, NULL))
         {
         if (startsWithWord("strain", line))
             {
             char *headerLine = cloneString(line);
-            headerWordCount = chopString(headerLine, "\t", NULL, 0);
+            headerWordCount = chopByChar(headerLine, '\t', NULL, 0);
             AllocArray(headerWords, headerWordCount);
-            chopString(headerLine, "\t", headerWords, headerWordCount);
+            chopByChar(headerLine, '\t', headerWords, headerWordCount);
             }
         else
             errAbort("Missing header line from metadataFile %s", metadataFile);
         }
     // Look for genbank_accession in header line so we can index by that as well
     int genbankIx = stringArrayIx("genbank_accession", headerWords, headerWordCount);
     while (lineFileNext(lf, &line, NULL))
         {
-        struct sampleMetadata *met;
-        AllocVar(met);
-        met->columnCount = headerWordCount;
-        met->columnNames = headerWords;
-        AllocArray(met->columnValues, headerWordCount);
+        char **columnValues = NULL;
+        AllocArray(columnValues, headerWordCount);
         char *lineCopy = cloneString(line);
-        int wordCount = chopByChar(lineCopy, '\t', met->columnValues, headerWordCount);
+        int wordCount = chopByChar(lineCopy, '\t', columnValues, headerWordCount);
         lineFileExpectWords(lf, headerWordCount, wordCount);
-        if (genbankIx >= 0 && !isEmpty(met->columnValues[genbankIx]) &&
-            !sameString("?", met->columnValues[genbankIx]))
+        if (genbankIx >= 0 && !isEmpty(columnValues[genbankIx]) &&
+            !sameString("?", columnValues[genbankIx]))
             {
-            if (strchr(met->columnValues[genbankIx], '.'))
+            if (strchr(columnValues[genbankIx], '.'))
                 {
                 // Index by versionless accession
-                char copy[strlen(met->columnValues[genbankIx])+1];
-                safecpy(copy, sizeof copy, met->columnValues[genbankIx]);
+                char copy[strlen(columnValues[genbankIx])+1];
+                safecpy(copy, sizeof copy, columnValues[genbankIx]);
                 char *dot = strchr(copy, '.');
                 *dot = '\0';
-                hashAdd(sampleMetadata, copy, met);
+                hashAdd(sampleMetadata, copy, columnValues);
                 }
             else
-                hashAdd(sampleMetadata, met->columnValues[genbankIx], met);
+                hashAdd(sampleMetadata, columnValues[genbankIx], columnValues);
             }
-        hashAdd(sampleMetadata, met->columnValues[0], met);
+        hashAdd(sampleMetadata, columnValues[0], columnValues);
         }
     lineFileClose(&lf);
+    sms->hash = sampleMetadata;
+    sms->columnCount = headerWordCount;
+    sms->columnNames = headerWords;
+    }
+}
+
+static struct sampleMetadataStore *getSampleMetadata(char *metadataFile)
+/* If config.ra defines a metadataFile, load its contents into a hash indexed by name (and INSDC
+ * nucleotide accession sans dot if present) and return the hash; otherwise return NULL. */
+{
+struct sampleMetadataStore *sampleMetadata = NULL;
+if (isNotEmpty(metadataFile) && fileExists(metadataFile))
+    {
+    AllocVar(sampleMetadata);
+    if (endsWith(metadataFile, ".mmh"))
+        sampleMetadata->mmh = mmHashFromFile(metadataFile);
+    else
+        getSampleMetadataHash(metadataFile, sampleMetadata);
     }
 return sampleMetadata;
 }
 
 char *gbIdFromSampleName(char *sampleId)
 /* If a GenBank accession is present somewhere in sampleId, extract and return it, otherwise NULL. */
 {
 char *gbId = NULL;
 regmatch_t substrs[2];
 // If it's preceded by anything, make sure it's a | so we ignore isolate names that glom on the
 // reference's GenBank accession in them (e.g. ..._MN908947.3/2022|OQ070230.1).
 if (regexMatchSubstr(sampleId, "^(.*\\|)?([A-Z][A-Z][0-9]{6})", substrs, ArraySize(substrs)))
     {
     // Make sure there is a word boundary at the end of the match too
     if (!isalnum(sampleId[substrs[1].rm_eo]))
         gbId = cloneStringZ(sampleId+substrs[1].rm_so, substrs[1].rm_eo - substrs[1].rm_so);
     }
 return gbId;
 }
 
-struct sampleMetadata *metadataForSample(struct hash *sampleMetadata, char *sampleId)
+static char **metadataForSampleHash(struct hash *sampleMetadata, char *sampleId)
 /* Look up sampleId in sampleMetadata, by accession if sampleId seems to include an accession. */
 {
-struct sampleMetadata *met = NULL;
-if (sampleMetadata == NULL)
-    return NULL;
-met = hashFindVal(sampleMetadata, sampleId);
+char **met = hashFindVal(sampleMetadata, sampleId);
 if (met)
     return met;
 if (!met)
     {
     char *gbId = gbIdFromSampleName(sampleId);
     if (gbId)
         met = hashFindVal(sampleMetadata, gbId);
     }
 if (!met && strchr(sampleId, '|'))
     {
     char copy[strlen(sampleId)+1];
     safecpy(copy, sizeof copy, sampleId);
     char *words[4];
-    int wordCount = chopString(copy, "|", words, ArraySize(words));
+    int wordCount = chopByChar(copy, '|', words, ArraySize(words));
     if (isNotEmpty(words[0]))
         met = hashFindVal(sampleMetadata, words[0]);
     if (met == NULL && wordCount > 1 && isNotEmpty(words[1]))
         met = hashFindVal(sampleMetadata, words[1]);
     }
-// If it's one of our collapsed node names, dig out the example name and try that.
-if (!met && isdigit(sampleId[0]) && strstr(sampleId, "_from_"))
+return met;
+}
+
+static char **metadataForSampleMmHash(struct sampleMetadataStore *sms, char *sampleId)
+/* Look up sampleId in mmHash and parse string value into struct sampleMetadata, or return NULL if
+ * not found. */
+{
+char **met = NULL;
+const char *metadataLine = mmHashFindVal(sms->mmh, sampleId);
+if (metadataLine)
     {
-    char *eg = strstr(sampleId, "_eg_");
-    if (eg)
-        met = hashFindVal(sampleMetadata, eg+strlen("_eg_"));
+    if (sms->columnNames == NULL)
+        {
+        const char *headerLine = mmHashFindVal(sms->mmh, "strain");
+        if (headerLine == NULL)
+            errAbort("metadataForSampleMmHash: can't find column definitions in file "
+                     "(no line starts with 'strain')");
+        char *headerLineCopy = cloneString(headerLine);
+        sms->columnCount = chopByChar(headerLineCopy, '\t', NULL, 0);
+        AllocArray(sms->columnNames, sms->columnCount);
+        chopByChar(headerLineCopy, '\t', sms->columnNames, sms->columnCount);
+        }
+    char *lineCopy = cloneString(metadataLine);
+    int metWordCount = chopByChar(lineCopy, '\t', NULL, 0);
+    if (metWordCount != sms->columnCount)
+        errAbort("metadataForSampleMmHash: found %lu header columns but %d data columns for '%s'"
+                 " ('%s')",
+                 sms->columnCount, metWordCount, sampleId, metadataLine);
+    AllocArray(met, sms->columnCount);
+    chopByChar(lineCopy, '\t', met, sms->columnCount);
     }
 return met;
 }
 
-static char *lineageForSample(struct hash *sampleMetadata, char *sampleId)
+char **metadataForSample(struct sampleMetadataStore *sampleMetadata, char *sampleId)
+/* Look up sampleId in sampleMetadata, by accession if sampleId seems to include an accession. */
+{
+if (sampleMetadata == NULL)
+    return NULL;
+if (sampleMetadata->hash)
+    return metadataForSampleHash(sampleMetadata->hash, sampleId);
+else
+    return metadataForSampleMmHash(sampleMetadata, sampleId);
+}
+
+
+static char *lineageForSample(struct sampleMetadataStore *sms, char *sampleId)
 /* Look up sampleId's lineage in epiToLineage file. Return NULL if we don't find a match. */
 {
 char *lineage = NULL;
-struct sampleMetadata *met = metadataForSample(sampleMetadata, sampleId);
+char **met = metadataForSample(sms, sampleId);
 if (met)
     {
     int ix;
-    if ((ix = stringArrayIx("pangolin_lineage", met->columnNames, met->columnCount)) >= 0 ||
-        (ix = stringArrayIx("pango_lineage", met->columnNames, met->columnCount)) >= 0 ||
-        (ix = stringArrayIx("Nextstrain_lineage", met->columnNames, met->columnCount)) >= 0 ||
-        (ix = stringArrayIx("GCC_nextclade", met->columnNames, met->columnCount)) >= 0)
-        lineage = met->columnValues[ix];
+    if ((ix = stringArrayIx("pangolin_lineage", sms->columnNames, sms->columnCount)) >= 0 ||
+        (ix = stringArrayIx("pango_lineage", sms->columnNames, sms->columnCount)) >= 0 ||
+        (ix = stringArrayIx("Nextstrain_lineage", sms->columnNames, sms->columnCount)) >= 0 ||
+        (ix = stringArrayIx("GCC_nextclade", sms->columnNames, sms->columnCount)) >= 0)
+        lineage = met[ix];
     }
 return lineage;
 }
 
 static void printLineageLink(char *lineage, char *db)
 /* Print lineage with link to outbreak.info or pango-designation issue if appropriate.
  * Caller must handle case of empty/NULL lineage. */
 {
 boolean isSarsCov2 = sameString(db, "wuhCor1");
 if (isEmpty(lineage))
     errAbort("programmer error:printLineageLink called with empty lineage");
 else if (isSarsCov2 && startsWith("proposed", lineage))
     printf("<a href='"PANGO_DESIGNATION_ISSUE_URLBASE"%s' target=_blank>%s</a>",
            lineage+strlen("proposed"), lineage);
 else if (isSarsCov2 && differentString(lineage, "None") && differentString(lineage, "Unassigned"))
@@ -1222,31 +1270,31 @@
     struct placementInfo *info = hashFindVal(samplePlacements, row->name);
     if (info)
         {
         if (isNotEmpty(info->nearestNeighbor))
             neighbor = info->nearestNeighbor;
         if (isNotEmpty(info->neighborLineage))
             lineage = info->neighborLineage;
         }
     printf("%-*s  %s  %s\n", maxLen, asciiRow, neighbor, lineage);
     }
 slNameFreeList(&rowList);
 dyStringFree(&dy);
 }
 
 static void describeSamplePlacements(struct slName *sampleIds, struct hash *samplePlacements,
-                                     struct phyloTree *subtree, struct hash *sampleMetadata,
+                                     struct phyloTree *subtree,
                                      char *source, char *refAcc, char *db)
 /* Report how each sample fits into the big tree. */
 {
 // Sort sample placements by sampleMuts so we can group identical samples.
 struct slRef *placementRefs = getPlacementRefList(sampleIds, samplePlacements);
 slSort(&placementRefs, placementInfoRefCmpSampleMuts);
 int relatedCount = slCount(placementRefs);
 int clumpSize = countIdentical(placementRefs);
 if (clumpSize < relatedCount && relatedCount > 2)
     {
     // Not all of the related sequences are identical, so they will be broken down into
     // separate "clumps".  List all related samples first to avoid confusion.
     puts("<pre>");
     asciiTreeWithNeighborInfo(subtree, samplePlacements);
     puts("</pre>");
@@ -1339,47 +1387,47 @@
                                                 int *retIx)
 /* Find the subtree that contains sample name and set *retIx to its index in the list.
  * If we can't find it, return NULL and set *retIx to -1. */
 {
 struct subtreeInfo *ti;
 int ix;
 for (ti = subtreeInfoList, ix = 0;  ti != NULL;  ti = ti->next, ix++)
     if (slNameInListUseCase(ti->subtreeUserSampleIds, name))
         break;
 if (ti == NULL)
     ix = -1;
 *retIx = ix;
 return ti;
 }
 
-static void findNearestNeighbors(struct usherResults *results, struct hash *sampleMetadata)
+static void findNearestNeighbors(struct usherResults *results, struct sampleMetadataStore *sms)
 /* For each placed sample, find the nearest neighbor in the subtree and its assigned lineage,
  * and fill in those two fields of placementInfo. */
 {
 struct hashCookie cookie = hashFirst(results->samplePlacements);
 struct hashEl *hel;
 while ((hel = hashNext(&cookie)) != NULL)
     {
     struct placementInfo *info = hel->val;
     int ix;
     struct subtreeInfo *ti = subtreeInfoForSample(results->subtreeInfoList, info->sampleId, &ix);
     if (!ti)
         continue;
     info->nearestNeighbor = findNearestNeighbor(ti->subtree, info->sampleId,
                                                 results->samplePlacements);
     if (isNotEmpty(info->nearestNeighbor))
-        info->neighborLineage = lineageForSample(sampleMetadata, info->nearestNeighbor);
+        info->neighborLineage = lineageForSample(sms, info->nearestNeighbor);
     }
 }
 
 static void lookForCladesAndLineages(struct hash *samplePlacements,
                                      boolean *retGotClades, boolean *retGotLineages)
 /* See if UShER has annotated any clades and/or lineages for seqs. */
 {
 boolean gotClades = FALSE, gotLineages = FALSE;
 struct hashEl *hel;
 struct hashCookie cookie = hashFirst(samplePlacements);
 while ((hel = hashNext(&cookie)) != NULL)
     {
     struct placementInfo *pi = hel->val;
     if (pi)
         {
@@ -2098,31 +2146,31 @@
             {
             if (gotClades)
                 printf("<td>n/a</td>");
             if (gotLineages)
                 printf("<td>n/a</td>");
             printf("<td>n/a</td><td>n/a</td><td>n/a</td><td>n/a</td><td>n/a</td>");
             }
         printSubtreeTd(ur->subtreeInfoList, jsonTns, si->seq->name, subtreeSize);
         puts("</tr>");
         }
     puts("</tbody></table><p></p>");
     }
 }
 
 static void summarizeSubtrees(struct slName *sampleIds, struct usherResults *results,
-                              struct hash *sampleMetadata, struct tempName *jsonTns[],
+                              struct sampleMetadataStore *sms, struct tempName *jsonTns[],
                               char *db, int subtreeSize)
 /* Print a summary table of pasted/uploaded identifiers and subtrees */
 {
 boolean gotClades = FALSE, gotLineages = FALSE;
 lookForCladesAndLineages(results->samplePlacements, &gotClades, &gotLineages);
 puts("<table class='seqSummary'><tbody>");
 puts("<tr><th>Sequence</th>");
 boolean isRsv = (stringIn("GCF_000855545", db) || stringIn("GCF_002815475", db) ||
                  startsWith("RGCC", db));
 if (gotClades)
     {
     if (sameString(db, "wuhCor1"))
         puts("<th>Nextstrain clade (UShER)"
      TOOLTIP("The <a href='https://nextstrain.org/blog/2021-01-06-updated-SARS-CoV-2-clade-naming' "
              "target=_blank>Nextstrain clade</a> "
@@ -2183,31 +2231,31 @@
     if (pi)
         {
         if (gotClades)
             printf("<td>%s</td>", pi->nextClade ? pi->nextClade : "n/a");
         if (gotLineages)
             printLineageTd(pi->pangoLineage, "n/a", db);
         }
     else
         {
         if (gotClades)
             printf("<td>n/a</td>");
         if (gotLineages)
             printf("<td>n/a</td>");
         }
     // pangolin-assigned lineage
-    char *lineage = lineageForSample(sampleMetadata, si->name);
+    char *lineage = lineageForSample(sms, si->name);
     printLineageTd(lineage, "n/a", db);
     // Maybe also #mutations with mouseover to show mutation path?
     printSubtreeTd(results->subtreeInfoList, jsonTns, si->name, subtreeSize);
     }
 puts("</tbody></table><p></p>");
 }
 
 static struct singleNucChange *sncListFromSampleMutsAndImputed(struct slName *sampleMuts,
                                                                struct baseVal *imputedBases,
                                                                struct seqWindow *gSeqWin)
 /* Convert a list of "<ref><pos><alt>" names to struct singleNucChange list.
  * However, if <alt> is ambiguous, skip it because variantProjector doesn't like it.
  * Add imputed base predictions. */
 {
 struct singleNucChange *sncList = NULL;
@@ -2834,31 +2882,31 @@
 static void addNameMaybeComponents(struct hash *nameHash, char *fullName, boolean addComponents)
 /* Add entries to nameHash mapping fullName to itself, and components of fullName to fullName.
  * If addComponents and fullName is |-separated name|ID|date, add name and ID to nameHash. */
 {
 char *fullNameHashStored = hashStoreName(nameHash, fullName);
 // Now that we have hash storage for fullName, make it point to itself.
 struct hashEl *hel = hashLookup(nameHash, fullName);
 if (hel == NULL)
     errAbort("Can't look up '%s' right after adding it", fullName);
 hel->val = fullNameHashStored;
 if (addComponents)
     {
     char *words[4];
     char copy[strlen(fullName)+1];
     safecpy(copy, sizeof copy, fullName);
-    int wordCount = chopString(copy, "|", words, ArraySize(words));
+    int wordCount = chopByChar(copy, '|', words, ArraySize(words));
     if (wordCount == 3)
         {
         // name|ID|date
         hashAdd(nameHash, words[0], fullNameHashStored);
         maybeAddIsolate(nameHash, words[0], fullNameHashStored);
         addModVersion(nameHash, words[1], fullNameHashStored);
         }
     else if (wordCount == 2)
         {
         // ID|date
         addModVersion(nameHash, words[0], fullNameHashStored);
         // ID might be a COG-UK country/isolate/year
         maybeAddIsolate(nameHash, words[0], fullNameHashStored);
         }
     }
@@ -2904,208 +2952,239 @@
         int wordCount = chopTabs(line, words);
         lineFileExpectWords(lf, 2, wordCount);
         char *fullName = hashFindVal(nameHash, words[1]);
         if (fullName)
             hashAdd(nameHash, words[0], fullName);
         else
             {
             if (missExample == NULL)
                 missExample = cloneString(words[1]);
             }
         }
     lineFileClose(&lf);
     }
 }
 
-static struct hash *getTreeNames(char *nameFile, char *protobufFile,
+static struct hash *getTreeNamesHash(char *nameFile, char *protobufFile,
                                      struct mutationAnnotatedTree **pBigTree,
                                      boolean addComponents, int *pStartTime)
 /* Make a hash of full names of leaves of bigTree, using nameFile if it exists, otherwise
  * falling back on the bigTree itself, loading it if necessary.  If addComponents, also map
  * components of those names to the full names in case the user gives us partial names/IDs. */
 {
 struct hash *nameHash = NULL;
 if (isNotEmpty(nameFile) && fileExists(nameFile))
     {
     nameHash = hashNew(26);
     struct lineFile *lf = lineFileOpen(nameFile, TRUE);
     char *line;
     while (lineFileNext(lf, &line, NULL))
         addNameMaybeComponents(nameHash, line, addComponents);
     lineFileClose(&lf);
     }
 else
     {
     if (*pBigTree == NULL)
         {
         *pBigTree = parseParsimonyProtobuf(protobufFile);
         reportTiming(pStartTime, "parse protobuf file (at startup, because sample names file was "
                      "not provided)");
         }
     struct mutationAnnotatedTree *bigTree = *pBigTree;
     int nodeCount = bigTree->nodeHash->elCount;
     nameHash = hashNew(digitsBaseTwo(nodeCount) + 3);
     rAddLeafNames(bigTree->tree, bigTree->condensedNodes, nameHash, addComponents);
     }
-reportTiming(pStartTime, "get tree names");
 return nameHash;
 }
 
-static char *matchName(struct hash *nameHash, char *name)
+static struct hashOrMmHash *getTreeNames(char *nameFile, char *protobufFile,
+                                         struct mutationAnnotatedTree **pBigTree,
+                                         boolean addComponents, int *pStartTime)
+/* Make a hash of full names of leaves of bigTree, using nameFile if it exists, otherwise
+ * falling back on the bigTree itself, loading it if necessary.  If addComponents, also map
+ * components of those names to the full names in case the user gives us partial names/IDs. */
+{
+struct hashOrMmHash *treeNames = NULL;
+AllocVar(treeNames);
+if (isNotEmpty(nameFile) && fileExists(nameFile) && endsWith(nameFile, ".mmh"))
+    treeNames->mmh = mmHashFromFile(nameFile);
+else
+    treeNames->hash = getTreeNamesHash(nameFile, protobufFile, pBigTree, addComponents, pStartTime);
+
+reportTiming(pStartTime, "get tree names");
+return treeNames;
+}
+
+static const char *hashOrMmHashFindVal(struct hashOrMmHash *homh, char *key)
+/* Look up string value for key in homh; return NULL if not found. */
+{
+const char *val = NULL;
+if (homh)
+    {
+    if (homh->hash)
+        val = hashFindVal(homh->hash, key);
+    else
+        val = mmHashFindVal(homh->mmh, key);
+    }
+return val;
+}
+
+static const char *matchName(struct hashOrMmHash *nameHash, char *name)
 /* Look for a possibly partial name or ID provided by the user in nameHash.  Return the result,
  * possibly NULL.  If the full name doesn't match, try components of the name. */
 {
 name = trimSpaces(name);
 // GISAID fasta headers all have hCoV-19/country/isolate/year|EPI_ISL_#|date; strip the hCoV-19
 // because Nextstrain strips it in nextmeta/nextfasta download files, and so do I when building
 // UCSC's tree.
 if (startsWithNoCase("hCoV-19/", name))
     name += strlen("hCoV-19/");
-char *match = hashFindVal(nameHash, name);
+const char *match = hashOrMmHashFindVal(nameHash, name);
 int minWordSize=5;
 if (match == NULL && strchr(name, '|'))
     {
     // GISAID fasta headers have name|ID|date, and so do our tree IDs; try ID and name separately.
     char *words[4];
     char copy[strlen(name)+1];
     safecpy(copy, sizeof copy, name);
-    int wordCount = chopString(copy, "|", words, ArraySize(words));
+    int wordCount = chopByChar(copy, '|', words, ArraySize(words));
     if (wordCount == 3)
         {
         // name|ID|date; try ID first.
         if (strlen(words[1]) > minWordSize)
-            match = hashFindVal(nameHash, words[1]);
+            match = hashOrMmHashFindVal(nameHash, words[1]);
         if (match == NULL && strlen(words[0]) > minWordSize)
             {
-            match = hashFindVal(nameHash, words[0]);
+            match = hashOrMmHashFindVal(nameHash, words[0]);
             // Sometimes country/isolate names have spaces... strip out if present.
             if (match == NULL && strchr(words[0], ' '))
                 {
                 stripChar(words[0], ' ');
-                match = hashFindVal(nameHash, words[0]);
+                match = hashOrMmHashFindVal(nameHash, words[0]);
                 }
             }
         }
     else if (wordCount == 2)
         {
         // ID|date
         if (strlen(words[0]) > minWordSize)
-             match = hashFindVal(nameHash, words[0]);
+             match = hashOrMmHashFindVal(nameHash, words[0]);
         }
     }
 else if (match == NULL && strchr(name, ' '))
     {
     // GISAID sequence names may include spaces, in both country names ("South Korea") and
     // isolate names.  That messes up FASTA headers, so Nextstrain strips out spaces when
     // making the nextmeta and nextfasta download files for GISAID.  Try stripping out spaces:
     char copy[strlen(name)+1];
     safecpy(copy, sizeof copy, name);
     stripChar(copy, ' ');
-    match = hashFindVal(nameHash, copy);
+    match = hashOrMmHashFindVal(nameHash, copy);
     }
 return match;
 }
 
-static boolean tallyMatch(char *match, char *term,
+static boolean tallyMatch(const char *match, char *term,
                           struct slName **retMatches, struct slName **retUnmatched)
 /* If match is non-NULL, add result to retMatches and return TRUE, otherwise add term to
  * retUnmatched and return FALSE. */
 {
 boolean foundIt = FALSE;
 if (match)
     {
     foundIt = TRUE;
     slNameAddHead(retMatches, match);
     }
 else
     slNameAddHead(retUnmatched, term);
 return foundIt;
 }
 
-static boolean matchIdRange(struct hash *nameHash, char *line,
+static boolean matchIdRange(struct hashOrMmHash *nameHash, char *line,
                             struct slName **retMatches, struct slName **retUnmatched)
 /* If line looks like it might contain a range of IDs, for example EPI_ISL_123-129 from an EPI_SET,
  * then expand the range(s) into individual IDs, look up the IDs, set retMatches and retUnmatched
  * to per-ID results, and return TRUE. */
 {
 boolean foundAny = FALSE;
 *retMatches = *retUnmatched = NULL;
 regmatch_t substrArr[7];
 // Line may contain a list of distinct IDs and/or ID ranges
 #define oneIdExp "([A-Z_]+)([0-9]+)"
 #define rangeEndExp "- *([A-Z_]*)([0-9]+)"
 #define rangeListExp "^("oneIdExp",? *("rangeEndExp")?),? *"
 while (regexMatchSubstr(line, rangeListExp, substrArr, ArraySize(substrArr)))
     {
     char *prefixA = regexSubstringClone(line, substrArr[2]);
     char *numberA = regexSubstringClone(line, substrArr[3]);
     if (regexSubstrMatched(substrArr[4]))
         {
         // Looks like a well-formed ID range
         char *prefixB = regexSubstringClone(line, substrArr[5]);
         char *numberB = regexSubstringClone(line, substrArr[6]);
         int start = atol(numberA);
         int end = atol(numberB);
         if ((isEmpty(prefixB) || sameString(prefixA, prefixB)) && end >= start)
             {
             char oneId[strlen(line)+1];
             int num;
             for (num = start;  num <= end;  num++)
                 {
                 safef(oneId, sizeof oneId, "%s%d", prefixA, num);
-                char *match = hashFindVal(nameHash, oneId);
+                const char *match = hashOrMmHashFindVal(nameHash, oneId);
                 foundAny |= tallyMatch(match, oneId, retMatches, retUnmatched);
                 }
             }
         else
             {
             // It matched the regex but the prefixes don't match and/or numbers are out of order
             // so we don't know what to do with it -- try matchName just in case.
             char *regMatch = regexSubstringClone(line, substrArr[1]);
-            char *match = matchName(nameHash, regMatch);
+            const char *match = matchName(nameHash, regMatch);
             foundAny |= tallyMatch(match, regMatch, retMatches, retUnmatched);
             }
         }
     else
         {
         // Just one ID
         char oneId[strlen(line)+1];
         safef(oneId, sizeof oneId, "%s%s", prefixA, numberA);
-        char *match = hashFindVal(nameHash, oneId);
+        const char *match = hashOrMmHashFindVal(nameHash, oneId);
         foundAny |= tallyMatch(match, oneId, retMatches, retUnmatched);
         }
     // Skip past this match to see if the line has another range next.
     line += (substrArr[0].rm_eo - substrArr[0].rm_so);
     }
 return foundAny;
 }
 
-static struct slName *readSampleIds(struct lineFile *lf, struct hash *nameHash)
+static struct slName *readSampleIds(struct lineFile *lf, struct hashOrMmHash *nameHash)
 /* Read a file of sample names/IDs from the user; typically these will not be exactly the same
  * as the protobuf's (UCSC protobuf names are typically country/isolate/year|ID|date), so attempt
  * to find component matches if an exact match isn't found. */
 {
 struct slName *sampleIds = NULL;
 struct slName *unmatched = NULL;
 char *line;
 while (lineFileNext(lf, &line, NULL))
     {
     // If tab-sep, just try first word in line
     char *tab = strchr(line, '\t');
     if (tab)
         *tab = '\0';
-    char *match = matchName(nameHash, line);
+    const char *match = matchName(nameHash, line);
     if (match)
         slNameAddHead(&sampleIds, match);
     else
         {
         struct slName *rangeMatches = NULL, *rangeUnmatched = NULL;
         if (matchIdRange(nameHash, line, &rangeMatches, &rangeUnmatched))
             {
             sampleIds = slCat(rangeMatches, sampleIds);
             unmatched = slCat(rangeUnmatched, unmatched);
             }
         else
             {
             // If comma-sep, just try first word in line
             char *comma = strchr(line, ',');
             if (comma)
@@ -3137,31 +3216,31 @@
     warn("Unable to find %d of your sequences in the tree, e.g. %s",
          slCount(unmatched), firstFew->string);
     dyStringFree(&firstFew);
     }
 else if (sampleIds == NULL)
     warn("Could not find any names in input; empty file uploaded?");
 slNameFreeList(&unmatched);
 return sampleIds;
 }
 
 void *loadMetadataWorker(void *arg)
 /* pthread worker function to read a tab-sep metadata file and return it hashed by name.  */
 {
 char *metadataFile = arg;
 int startTime = clock1000();
-struct hash *sampleMetadata = getSampleMetadata(metadataFile);
+struct sampleMetadataStore *sampleMetadata = getSampleMetadata(metadataFile);
 reportTiming(&startTime, "read sample metadata (in a pthread)");
 return sampleMetadata;
 }
 
 static pthread_t *mayStartLoaderPthread(char *filename, void *(*workerFunction)(void *))
 /* Fork off a child process that parses a file and returns the resulting data structure. */
 {
 pthread_t *pt;
 AllocVar(pt);
 if (! pthreadMayCreate(pt, NULL, workerFunction, filename))
     pt = NULL;
 return pt;
 }
 
 static struct dnaSeq *getChromSeq(char *org, char *ref, char *db)
@@ -3243,31 +3322,31 @@
 if (0)
     {
     bigTree = parseParsimonyProtobuf(protobufPath);
     reportTiming(&startTime, "parse protobuf file (at startup, for excluding informativeBases "
                  "from maskSites)");
     informativeBasesFromTree(bigTree->tree, maskSites, refGenome->size);
     reportTiming(&startTime, "remove any informative bases in tree from maskSites");
     }
 boolean isFasta = FALSE;
 boolean subtreesOnly = FALSE;
 struct seqInfo *seqInfoList = NULL;
 if (lfLooksLikeFasta(lf))
     {
     struct slPair *failedSeqs;
     struct slPair *failedPsls;
-    struct hash *treeNames = NULL;
+    struct hashOrMmHash *treeNames = NULL;
     // We need to check uploaded names in fasta only for original usher, not usher-sampled(-server).
     if (!serverIsConfigured(org) && !endsWith(usherPath, "-sampled"))
         treeNames = getTreeNames(sampleNameFile, protobufPath, &bigTree, FALSE, &startTime);
     vcfTn = vcfFromFasta(lf, org, refName, refGenome, maskSites, treeNames,
                          &sampleIds, &seqInfoList, &failedSeqs, &failedPsls, &startTime);
     if (failedSeqs)
         {
         puts("<p>");
         struct slPair *fail;
         for (fail = failedSeqs;  fail != NULL;  fail = fail->next)
             printf("%s<br>\n", fail->name);
         puts("</p>");
         }
     if (failedPsls)
         {
@@ -3278,60 +3357,64 @@
         puts("</p>");
         }
     if (seqInfoList == NULL)
         printf("<p>Sorry, could not align any sequences to reference well enough to place in "
                "the phylogenetic tree.</p>\n");
     isFasta = TRUE;
     }
 else if (lfLooksLikeVcf(lf))
     {
     vcfTn = checkAndSaveVcf(lf, refGenome, maskSites, &seqInfoList, &sampleIds);
     reportTiming(&startTime, "check uploaded VCF");
     }
 else
     {
     subtreesOnly = TRUE;
-    struct hash *treeNames = getTreeNames(sampleNameFile, protobufPath, &bigTree, TRUE, &startTime);
-    addAliases(treeNames, aliasFile);
+    struct hashOrMmHash *treeNames = getTreeNames(sampleNameFile, protobufPath, &bigTree, TRUE,
+                                                  &startTime);
+    if (treeNames && treeNames->hash)
+        {
+        addAliases(treeNames->hash, aliasFile);
         reportTiming(&startTime, "load sample name aliases");
+        }
     sampleIds = readSampleIds(lf, treeNames);
     reportTiming(&startTime, "look up uploaded sample names");
     }
 lineFileClose(&lf);
 if (sampleIds == NULL)
     {
     return;
     }
 
 // Kick off child thread to load metadata simultaneously with running usher or matUtils.
 pthread_t *metadataPthread = mayStartLoaderPthread(metadataFile, loadMetadataWorker);
 
 struct usherResults *results = NULL;
 if (vcfTn)
     {
     fflush(stdout);
     results = runUsher(org, usherPath, protobufPath, vcfTn->forCgi, subtreeSize, &sampleIds,
                        treeChoices, &startTime);
     }
 else if (subtreesOnly)
     {
     char *matUtilsPath = getMatUtilsPath(TRUE);
     results = runMatUtilsExtractSubtrees(refName, matUtilsPath, protobufPath, subtreeSize,
                                          sampleIds, treeChoices, &startTime);
     }
 
-struct hash *sampleMetadata = NULL;
+struct sampleMetadataStore *sampleMetadata = NULL;
 if (metadataPthread)
     {
     pthreadJoin(metadataPthread, (void **)(&sampleMetadata));
     reportTiming(&startTime, "wait for sample metadata loading thread to finish");
     }
 else
     {
     // We really need metadata -- load it the slow way.
     sampleMetadata = getSampleMetadata(metadataFile);
     reportTiming(&startTime, "load sample metadata without pthread");
     }
 
 if (results && results->singleSubtreeInfo)
     {
     if (retSuccess)
@@ -3446,31 +3529,31 @@
                 {
                 int subtreeUserSampleCount = slCount(ti->subtreeUserSampleIds);
                 printf("<h3>Subtree %d: ", ix+1);
                 if (subtreeUserSampleCount > 1)
                     printf("%d related samples", subtreeUserSampleCount);
                 else if (subtreeCount > 1)
                     printf("Unrelated sample");
                 printf("</h3>\n");
                 makeNextstrainButtonN("viewNextstrainSub", ix, subtreeUserSampleCount, subtreeSize,
                                       jsonTns);
                 puts("<br>");
                 // Make a sub-subtree with only user samples for display:
                 struct phyloTree *subtree = phyloOpenTree(ti->subtreeTn->forCgi);
                 subtree = phyloPruneToIds(subtree, ti->subtreeUserSampleIds);
                 describeSamplePlacements(ti->subtreeUserSampleIds, results->samplePlacements,
-                                         subtree, sampleMetadata, source, refAcc, refName);
+                                         subtree, source, refAcc, refName);
                 }
             reportTiming(&startTime, "describe placements");
             }
         else
             printf("<p>(Skipping details; "
                    "you uploaded %d sequences, and details are shown only when "
                    "you upload at most %d sequences.)</p>\n",
                    seqCount, MAX_SEQ_DETAILS);
         }
 
     puts("<h3>Downloads</h3>");
     if (! subtreesOnly)
         {
         puts("<ul>");
         // Offer big tree w/new samples for download