src/hg/hgPhyloPlace/phyloPlace.c 0fffa3c31de4845a9bd3f06c0992f971e4d8a7a3

0fffa3c31de4845a9bd3f06c0992f971e4d8a7a3
angie
  Fri Oct 28 15:08:06 2022 -0700
Performance improvements for trees with millions of sequences:
* Use @yceh's usher-sampled-server if configured; it preloads protobufs and can start placing sequences immediately using usher-sampled, a faster version of usher
* Use usher-sampled instead of usher if server is not configured but usher-sampled is available
* Load sample metadata file in a pthread while usher(-sampled(-server)) or matUtils is running
* Skip checking for sample name clashes in uploaded fasta when using usher-sampled(-server)'s new --no-ignore-prefix option (but look for the prefix when parsing results)
* Avoid parsing the protobuf and traversing the big tree unless absolutely necessary
** Subtrees from usher/matUtils have not included condensed nodes in a long time; remove lots of condensedNodes/summarization code from phyloPlace.c, runUsher.c, writeCustomTracks.c
** Use subtrees instead of big tree when possible (in findNearestNeighbor, treeToBaseAlleles, uploadedSamplesTree)
** Skip the informativeBases stuff that inhibits masking of sites from Problematic Sites set when the tree was built with an earlier version -- that pretty much never applies anymore now that only daily-updated trees are offered, not a range from old to new.
** Allow config.ra to specify a flat file of sample names (needed for searching user's uploaded names/IDs before calling matUtils) instead of getting names from the big tree

diff --git src/hg/hgPhyloPlace/phyloPlace.c src/hg/hgPhyloPlace/phyloPlace.c
index 7cf8d44..fe1e4a4 100644
--- src/hg/hgPhyloPlace/phyloPlace.c
+++ src/hg/hgPhyloPlace/phyloPlace.c
@@ -1,42 +1,43 @@
 /* Place SARS-CoV-2 sequences in phylogenetic tree using usher program. */
 
-/* Copyright (C) 2020 The Regents of the University of California */
+/* Copyright (C) 2020-2022 The Regents of the University of California */
 
 #include "common.h"
 #include "bigBed.h"
 #include "cart.h"
 #include "cheapcgi.h"
 #include "errCatch.h"
 #include "fa.h"
 #include "genePred.h"
 #include "grp.h"
 #include "hCommon.h"
 #include "hash.h"
 #include "hgConfig.h"
 #include "htmshell.h"
 #include "hubConnect.h"
 #include "hui.h"
 #include "iupac.h"
 #include "jsHelper.h"
 #include "linefile.h"
 #include "obscure.h"
 #include "parsimonyProto.h"
 #include "phyloPlace.h"
 #include "phyloTree.h"
 #include "pipeline.h"
 #include "psl.h"
+#include "pthreadWrap.h"
 #include "ra.h"
 #include "regexHelper.h"
 #include "trackHub.h"
 #include "trashDir.h"
 #include "vcf.h"
 
 // Globals:
 static boolean measureTiming = FALSE;
 
 // Parameter constants:
 int maxGenotypes = 1000;        // Upper limit on number of samples user can upload at once.
 boolean showParsimonyScore = FALSE;
 
 
 struct slName *phyloPlaceDbList(struct cart *cart)
@@ -141,36 +142,43 @@
 {
 char *fileName = phyloPlaceDbSetting(db, settingName);
 if (isNotEmpty(fileName) && fileName[0] != '/' && !isUrl(fileName) && !fileExists(fileName))
     {
     struct dyString *dy = dyStringCreate(PHYLOPLACE_DATA_DIR "/%s/%s",
                                          trackHubSkipHubName(db), fileName);
     if (fileExists(dy->string))
         return dyStringCannibalize(&dy);
     else
         return NULL;
     }
 return fileName;
 }
 
 char *getUsherPath(boolean abortIfNotFound)
-/* Return hgPhyloPlaceData/usher if it exists, else NULL.  Do not free the returned value. */
+/* Return hgPhyloPlaceData/usher-sampled if it exists, else try hgPhyloPlaceData/usher, else NULL.
+ * Do not free the returned value. */
 {
-char *usherPath = PHYLOPLACE_DATA_DIR "/usher";
+char *usherPath = PHYLOPLACE_DATA_DIR "/usher-sampled";
 if (fileExists(usherPath))
     return usherPath;
-else if (abortIfNotFound)
+else
+    {
+    usherPath = PHYLOPLACE_DATA_DIR "/usher";
+    if (fileExists(usherPath))
+        return usherPath;
+    }
+if (abortIfNotFound)
     errAbort("Missing usher executable (expected to be at %s)", usherPath);
 return NULL;
 }
 
 char *getMatUtilsPath(boolean abortIfNotFound)
 /* Return hgPhyloPlaceData/matUtils if it exists, else NULL.  Do not free the returned value. */
 {
 char *matUtilsPath = PHYLOPLACE_DATA_DIR "/matUtils";
 if (fileExists(matUtilsPath))
     return matUtilsPath;
 else if (abortIfNotFound)
     errAbort("Missing matUtils executable (expected to be at %s)", matUtilsPath);
 return NULL;
 }
 
@@ -211,64 +219,70 @@
 
 struct treeChoices *loadTreeChoices(char *db)
 /* If <db>/config.ra specifies a treeChoices file, load it up, else return NULL. */
 {
 struct treeChoices *treeChoices = NULL;
 char *filename = phyloPlaceDbSettingPath(db, "treeChoices");
 if (isNotEmpty(filename) && fileExists(filename))
     {
     AllocVar(treeChoices);
     int maxChoices = 128;
     AllocArray(treeChoices->protobufFiles, maxChoices);
     AllocArray(treeChoices->metadataFiles, maxChoices);
     AllocArray(treeChoices->sources, maxChoices);
     AllocArray(treeChoices->descriptions, maxChoices);
     AllocArray(treeChoices->aliasFiles, maxChoices);
+    AllocArray(treeChoices->sampleNameFiles, maxChoices);
     struct lineFile *lf = lineFileOpen(filename, TRUE);
     char *line;
     while (lineFileNextReal(lf, &line))
         {
-        char *words[6];
+        char *words[7];
         int wordCount = chopTabs(line, words);
         lineFileExpectAtLeast(lf, 4, wordCount);
         if (treeChoices->count >= maxChoices)
             {
             warn("File %s has too many lines, only showing first %d phylogenetic tree choices",
                  filename, maxChoices);
             break;
             }
         struct dyString *dy = dyStringNew(0);
         addPathIfNecessary(dy, db, words[0]);
         treeChoices->protobufFiles[treeChoices->count] = cloneString(dy->string);
         addPathIfNecessary(dy, db, words[1]);
         treeChoices->metadataFiles[treeChoices->count] = cloneString(dy->string);
         treeChoices->sources[treeChoices->count] = cloneString(words[2]);
         // Description can be either a file or just some text.
         addPathIfNecessary(dy, db, words[3]);
         if (fileExists(dy->string))
             {
             char *desc = NULL;
             readInGulp(dy->string, &desc, NULL);
             treeChoices->descriptions[treeChoices->count] = desc;
             }
         else
             treeChoices->descriptions[treeChoices->count] = cloneString(words[3]);
         if (wordCount > 4)
             {
             addPathIfNecessary(dy, db, words[4]);
             treeChoices->aliasFiles[treeChoices->count] = cloneString(dy->string);
             }
+        if (wordCount > 5)
+            {
+            addPathIfNecessary(dy, db, words[5]);
+            treeChoices->sampleNameFiles[treeChoices->count] = cloneString(dy->string);
+            }
         treeChoices->count++;
         dyStringFree(&dy);
         }
     lineFileClose(&lf);
     }
 return treeChoices;
 }
 
 static char *urlFromTn(struct tempName *tn)
 /* Make a full URL to a trash file that our net.c code will be able to follow, for when we can't
  * just leave it up to the user's web browser to do the right thing with "../". */
 {
 struct dyString *dy = dyStringCreate("%s%s", hLocalHostCgiBinUrl(), tn->forHtml);
 return dyStringCannibalize(&dy);
 }
@@ -555,33 +569,35 @@
     struct seqInfo *si = hashFindVal(seqInfoHash, pi->sampleId);
     if (si)
         {
         struct slName *sampleMuts = NULL;
         struct singleNucChange *snc;
         for (snc = si->sncList;  snc != NULL;  snc = snc->next)
             {
             char mutStr[64];
             safef(mutStr, sizeof mutStr, "%c%d%c", snc->refBase, snc->chromStart+1, snc->newBase);
             slNameAddHead(&sampleMuts, mutStr);
             }
         slReverse(&sampleMuts);
         pi->sampleMuts = sampleMuts;
         }
     else
+        {
         errAbort("addSampleMutsFromSeqInfo: no seqInfo for placed sequence '%s'", pi->sampleId);
         }
     }
+}
 
 static void displaySampleMuts(struct placementInfo *info, char *refAcc)
 {
 printf("<p>Differences from the reference genome "
        "(<a href='https://www.ncbi.nlm.nih.gov/nuccore/%s' target=_blank>%s</a>): ",
        refAcc, refAcc);
 if (info->sampleMuts == NULL)
     printf("(None; identical to reference)");
 else
     {
     struct slName *sln;
     for (sln = info->sampleMuts;  sln != NULL;  sln = sln->next)
         {
         if (sln != info->sampleMuts)
             printf(", ");
@@ -624,127 +640,91 @@
 static void displayVariantPath(struct variantPathNode *variantPath, char *sampleId)
 /* Display mutations on the path to this sample. */
 {
 printf("<p>Mutations along the path from the root of the phylogenetic tree to %s:<br>",
        sampleId);
 if (variantPath)
     {
     variantPathPrint(variantPath);
     puts("<br>");
     }
 else
     puts("(None; your sample was placed at the root of the phylogenetic tree)");
 puts("</p>");
 }
 
-static struct variantPathNode *findLastInternalNode(struct variantPathNode *variantPath,
-                                                    int minNewNode)
-/* Return the last node in variantPath with a numeric name less than minNewNode, or NULL. */
-{
-if (!variantPath || !isInternalNodeName(variantPath->nodeName, minNewNode))
-    return NULL;
-while (variantPath->next && isInternalNodeName(variantPath->next->nodeName, minNewNode))
-    variantPath = variantPath->next;
-if (variantPath && isInternalNodeName(variantPath->nodeName, minNewNode))
-    return variantPath;
-return NULL;
-}
-
 static int mutCountCmp(const void *a, const void *b)
 /* Compare number of mutations of phyloTree nodes a and b. */
 {
 const struct phyloTree *nodeA = *(struct phyloTree * const *)a;
 const struct phyloTree *nodeB = *(struct phyloTree * const *)b;
 return slCount(nodeA->priv) - slCount(nodeB->priv);
 }
 
-static char *leafWithLeastMuts(struct phyloTree *node, struct mutationAnnotatedTree *bigTree)
-/* If node has a leaf child with no mutations of its own, return the name of that leaf.
+static char *leafWithLeastMuts(struct phyloTree *node, struct hash *excludeHash)
+/* If node has a leaf child (not in excludeHash) with no mutations of its own, return that leaf name.
  * Otherwise, if node has leaf children, return the name of the leaf with the fewest mutations.
  * Otherwise return NULL. */
 {
 char *leafName = NULL;
 int leafCount = 0;
 int i;
 for (i = 0;  i < node->numEdges;  i++)
     {
     struct phyloTree *kid = node->edges[i];
-    if (kid->numEdges == 0)
+    if (kid->numEdges == 0 && !hashLookup(excludeHash, kid->ident->name))
         {
         leafCount++;
         struct singleNucChange *kidMuts = kid->priv;
         if (!kidMuts)
             {
-            struct slName *nodeList = hashFindVal(bigTree->condensedNodes, kid->ident->name);
-            if (nodeList)
-                leafName = cloneString(nodeList->name);
-            else
             leafName = cloneString(kid->ident->name);
             break;
             }
         }
     }
 if (leafName == NULL && leafCount)
     {
     // Pick the leaf with the fewest mutations.
     struct phyloTree *leafKids[leafCount];
     int leafIx = 0;
     for (i = 0;  i < node->numEdges;  i++)
         {
         struct phyloTree *kid = node->edges[i];
-        if (kid->numEdges == 0)
+        if (kid->numEdges == 0 && !hashLookup(excludeHash, kid->ident->name))
                 leafKids[leafIx++] = kid;
         }
     qsort(leafKids, leafCount, sizeof(leafKids[0]), mutCountCmp);
     leafName = cloneString(leafKids[0]->ident->name);
     }
 return leafName;
 }
 
-static char *findNearestNeighbor(struct mutationAnnotatedTree *bigTree, char *sampleId,
-                                 struct variantPathNode *variantPath)
-/* Use the sequence of mutations in variantPath to find sampleId's parent node in bigTree,
- * then look for most similar leaf sibling(s). */
-{
-char *nearestNeighbor = NULL;
-int bigTreeINodeCount = phyloCountInternalNodes(bigTree->tree);
-int minNewNode = bigTreeINodeCount + 1; // 1-based
-struct variantPathNode *lastOldNode = findLastInternalNode(variantPath, minNewNode);
-struct phyloTree *node = lastOldNode ? hashFindVal(bigTree->nodeHash, lastOldNode->nodeName) :
-                                       bigTree->tree;
-if (lastOldNode && !node)
-    {
-    if (startsWith(USHER_NODE_PREFIX, lastOldNode->nodeName))
-        // protobuf still has number even if usher prepends USHER_NODE_PREFIX when parsing.
-        node = hashFindVal(bigTree->nodeHash, lastOldNode->nodeName+strlen(USHER_NODE_PREFIX));
-    if (node == NULL)
-        errAbort("Can't find last internal node %s for sample %s", lastOldNode->nodeName, sampleId);
-    }
-// Look for a leaf kid with no mutations relative to the parent, should be closest.
-if (node->numEdges == 0)
+static char *findNearestNeighbor(struct phyloTree *tree, char *sampleId,
+                                 struct hash *samplePlacements)
+/* Find sampleId's parent node in bigTree, then look for most similar leaf sibling(s),
+ * excluding other uploaded samples (which can be found in samplePlacements). */
 {
-    struct slName *nodeList = hashFindVal(bigTree->condensedNodes, node->ident->name);
-    if (nodeList)
-        nearestNeighbor = cloneString(nodeList->name);
-    else
-        nearestNeighbor = cloneString(node->ident->name);
-    }
-else
+struct phyloTree *sampleNode = phyloFindName(tree, sampleId);
+if (!sampleNode || ! sampleNode->parent)
+    return NULL;
+struct phyloTree *node = sampleNode->parent;
+char *nearestNeighbor = leafWithLeastMuts(node, samplePlacements);
+while (nearestNeighbor == NULL && node->parent != NULL)
     {
-    nearestNeighbor = leafWithLeastMuts(node, bigTree);
-    if (nearestNeighbor == NULL && node->parent != NULL)
-        nearestNeighbor = leafWithLeastMuts(node->parent, bigTree);
+    node = node->parent;
+    nearestNeighbor = leafWithLeastMuts(node, samplePlacements);
     }
 return nearestNeighbor;
 }
 
 static void printVariantPathNoNodeNames(FILE *f, struct variantPathNode *variantPath)
 /* Print out variant path with no node names (even if non-numeric) to f. */
 {
 struct variantPathNode *vpn;
 for (vpn = variantPath;  vpn != NULL;  vpn = vpn->next)
     {
     if (vpn != variantPath)
         fprintf(f, " > ");
     struct singleNucChange *snc;
     for (snc = vpn->sncList;  snc != NULL;  snc = snc->next)
         {
@@ -942,48 +922,30 @@
         met = hashFindVal(sampleMetadata, eg+strlen("_eg_"));
     }
 return met;
 }
 
 static char *lineageForSample(struct hash *sampleMetadata, char *sampleId)
 /* Look up sampleId's lineage in epiToLineage file. Return NULL if we don't find a match. */
 {
 char *lineage = NULL;
 struct sampleMetadata *met = metadataForSample(sampleMetadata, sampleId);
 if (met)
     lineage = met->lineage ? met->lineage : met->nLineage;
 return lineage;
 }
 
-static void findNearestNeighbors(struct hash *samplePlacements, struct hash *sampleMetadata,
-                                 struct mutationAnnotatedTree *bigTree)
-/* For each placed sample, find the nearest neighbor in the bigTree and its assigned lineage,
- * and fill in those two fields of placementInfo. */
-{
-if (!bigTree)
-    return;
-struct hashCookie cookie = hashFirst(samplePlacements);
-struct hashEl *hel;
-while ((hel = hashNext(&cookie)) != NULL)
-    {
-    struct placementInfo *info = hel->val;
-    info->nearestNeighbor = findNearestNeighbor(bigTree, info->sampleId, info->variantPath);
-    if (isNotEmpty(info->nearestNeighbor))
-        info->neighborLineage = lineageForSample(sampleMetadata, info->nearestNeighbor);
-    }
-}
-
 static void printLineageLink(char *lineage, char *db)
 /* Print lineage with link to outbreak.info or pango-designation issue if appropriate.
  * Caller must handle case of empty/NULL lineage. */
 {
 boolean isSarsCov2 = sameString(db, "wuhCor1");
 if (isEmpty(lineage))
     errAbort("programmer error:printLineageLink called with empty lineage");
 else if (isSarsCov2 && startsWith("proposed", lineage))
     printf("<a href='"PANGO_DESIGNATION_ISSUE_URLBASE"%s' target=_blank>%s</a>",
            lineage+strlen("proposed"), lineage);
 else if (isSarsCov2 && differentString(lineage, "None") && differentString(lineage, "Unassigned"))
     {
     // Trim _suffix that I add to extra tree annotations for problematic branches, if present.
     char lineageCopy[strlen(lineage)+1];
     safecpy(lineageCopy, sizeof lineageCopy, lineage);
@@ -1040,32 +1002,40 @@
             break;
         }
     }
 return diff;
 }
 
 static struct slRef *getPlacementRefList(struct slName *sampleIds, struct hash *samplePlacements)
 /* Look up sampleIds in samplePlacements and return ref list of placements. */
 {
 struct slRef *placementRefs = NULL;
 struct slName *sample;
 for (sample = sampleIds;  sample != NULL;  sample = sample->next)
     {
     struct placementInfo *info = hashFindVal(samplePlacements, sample->name);
     if (!info)
+        {
+        // Usher might have added a prefix to distinguish from a sequence with the same name
+        // already in the tree.
+        char nameWithPrefix[strlen(USHER_DEDUP_PREFIX) + strlen(sample->name) + 1];
+        safef(nameWithPrefix, sizeof nameWithPrefix, "%s%s", USHER_DEDUP_PREFIX, sample->name);
+        info = hashFindVal(samplePlacements, nameWithPrefix);
+        if (!info)
             errAbort("getPlacementRefList: can't find placement info for sample '%s'",
                      sample->name);
+        }
     slAddHead(&placementRefs, slRefNew(info));
     }
 slReverse(&placementRefs);
 return placementRefs;
 }
 
 static int countIdentical(struct slRef *placementRefs)
 /* Return the number of placements that have identical sampleMuts lists. */
 {
 int clumpCount = 0;
 struct slRef *ref;
 for (ref = placementRefs;  ref != NULL;  ref = ref->next)
     {
     clumpCount++;
     if (ref->next == NULL || placementInfoRefCmpSampleMuts(&ref, &ref->next))
@@ -1225,51 +1195,74 @@
     if (kidCount > 1)
         {
         slReverse(&prunedKids);
         // There is no phyloTreeFree, but if we ever add one, should use it here.
         node->numEdges = kidCount;
         struct phyloTree *kid;
         for (i = 0, kid = prunedKids;  i < kidCount;  i++, kid = kid->next)
             {
             node->edges[i] = kid;
             kid->parent = node;
             }
         }
     else
         return prunedKids;
     }
-else if (! (node->ident->name && slNameInList(sampleIds, node->ident->name)))
+else if (! (node->ident->name && slNameInListUseCase(sampleIds, node->ident->name)))
     node = NULL;
 return node;
 }
 
+// NOTE: it would be more efficient to associate a subtree with each sample after parsing
+// and sorting subtrees, e.g. hash and/or store a link in placementInfo so we don't have to search
+// subtrees for every sample like this, but this will do for now.
 static struct subtreeInfo *subtreeInfoForSample(struct subtreeInfo *subtreeInfoList, char *name,
                                                 int *retIx)
 /* Find the subtree that contains sample name and set *retIx to its index in the list.
  * If we can't find it, return NULL and set *retIx to -1. */
 {
 struct subtreeInfo *ti;
 int ix;
 for (ti = subtreeInfoList, ix = 0;  ti != NULL;  ti = ti->next, ix++)
-    if (slNameInList(ti->subtreeUserSampleIds, name))
+    if (slNameInListUseCase(ti->subtreeUserSampleIds, name))
         break;
 if (ti == NULL)
     ix = -1;
 *retIx = ix;
 return ti;
 }
 
+static void findNearestNeighbors(struct usherResults *results, struct hash *sampleMetadata)
+/* For each placed sample, find the nearest neighbor in the subtree and its assigned lineage,
+ * and fill in those two fields of placementInfo. */
+{
+struct hashCookie cookie = hashFirst(results->samplePlacements);
+struct hashEl *hel;
+while ((hel = hashNext(&cookie)) != NULL)
+    {
+    struct placementInfo *info = hel->val;
+    int ix;
+    struct subtreeInfo *ti = subtreeInfoForSample(results->subtreeInfoList, info->sampleId, &ix);
+    if (!ti)
+        continue;
+    info->nearestNeighbor = findNearestNeighbor(ti->subtree, info->sampleId,
+                                                results->samplePlacements);
+    if (isNotEmpty(info->nearestNeighbor))
+        info->neighborLineage = lineageForSample(sampleMetadata, info->nearestNeighbor);
+    }
+}
+
 static void lookForCladesAndLineages(struct hash *samplePlacements,
                                      boolean *retGotClades, boolean *retGotLineages)
 /* See if UShER has annotated any clades and/or lineages for seqs. */
 {
 boolean gotClades = FALSE, gotLineages = FALSE;
 struct hashEl *hel;
 struct hashCookie cookie = hashFirst(samplePlacements);
 while ((hel = hashNext(&cookie)) != NULL)
     {
     struct placementInfo *pi = hel->val;
     if (pi)
         {
         if (isNotEmpty(pi->nextClade))
             gotClades = TRUE;
         if (isNotEmpty(pi->pangoLineage))
@@ -1832,31 +1825,31 @@
             if (gotClades)
                 printf("<td>n/a</td>");
             if (gotLineages)
                 printf("<td>n/a</td>");
             printf("<td>n/a</td><td>n/a</td><td>n/a</td><td>n/a</td><td>n/a</td>");
             }
         printSubtreeTd(ur->subtreeInfoList, jsonTns, si->seq->name);
         puts("</tr>");
         }
     puts("</tbody></table><p></p>");
     }
 }
 
 static void summarizeSubtrees(struct slName *sampleIds, struct usherResults *results,
                               struct hash *sampleMetadata, struct tempName *jsonTns[],
-                              struct mutationAnnotatedTree *bigTree, char *db)
+                              char *db)
 /* Print a summary table of pasted/uploaded identifiers and subtrees */
 {
 boolean gotClades = FALSE, gotLineages = FALSE;
 lookForCladesAndLineages(results->samplePlacements, &gotClades, &gotLineages);
 puts("<table class='seqSummary'><tbody>");
 puts("<tr><th>Sequence</th>");
 if (gotClades)
     puts("<th>Nextstrain clade (UShER)"
      TOOLTIP("The <a href='https://nextstrain.org/blog/2021-01-06-updated-SARS-CoV-2-clade-naming' "
              "target=_blank>Nextstrain clade</a> "
              "assigned to the sequence by UShER according to its place in the phylogenetic tree")
          "</th>");
 if (gotLineages)
     puts("<th>Pango lineage (UShER)"
          TOOLTIP("The <a href='https://cov-lineages.org/' "
@@ -2284,37 +2277,48 @@
 {
 if (node->numEdges)
     {
     int i;
     for (i = 0;  i < node->numEdges;  i++)
         rWriteTsvSummaryTreeOrder(node->edges[i], f, results, seqInfoHash, geneInfoList, gSeqWin,
                                   spikeChanges);
     }
 else
     {
     writeOneTsvRow(f, node->ident->name, results, seqInfoHash, geneInfoList, gSeqWin, spikeChanges);
     }
 }
 
 
-static struct hash *hashFromSeqInfoList(struct seqInfo *seqInfoList)
-/* Hash sequence name to seqInfo for quick lookup. */
+static struct hash *hashFromSeqInfoListAndIds(struct seqInfo *seqInfoList, struct slName *sampleIds)
+/* Hash sequence name (possibly prefixed by usher) to seqInfo for quick lookup. */
 {
 struct hash *hash = hashNew(0);
 struct seqInfo *si;
 for (si = seqInfoList;  si != NULL;  si = si->next)
+    {
+    if (! slNameInListUseCase(sampleIds, si->seq->name))
+        {
+        // Usher might have added a prefix to distinguish from a sequence with the same name
+        // already in the tree.
+        char nameWithPrefix[strlen(USHER_DEDUP_PREFIX) + strlen(si->seq->name) + 1];
+        safef(nameWithPrefix, sizeof nameWithPrefix, "%s%s", USHER_DEDUP_PREFIX, si->seq->name);
+        if (slNameInListUseCase(sampleIds, nameWithPrefix))
+            si->seq->name = cloneString(nameWithPrefix);
+        }
     hashAdd(hash, si->seq->name, si);
+    }
 return hash;
 }
 
 static struct tempName *writeTsvSummary(struct usherResults *results, struct phyloTree *sampleTree,
                                         struct slName *sampleIds, struct hash *seqInfoHash,
                                         struct geneInfo *geneInfoList, struct seqWindow *gSeqWin,
                                         struct hash *spikeChanges, int *pStartTime)
 /* Write a tab-separated summary file for download.  If the user uploaded enough samples to make
  * a tree, then write out samples in tree order; otherwise use sampleIds list.
  * Accumulate S gene changes. */
 {
 struct tempName *tsvTn = NULL;
 AllocVar(tsvTn);
 trashDirFile(tsvTn, "ct", "usher_samples", ".tsv");
 FILE *f = mustOpen(tsvTn->forCgi, "w");
@@ -2444,66 +2448,70 @@
 printf("<a href='%s' download>Global phylogenetic tree with your sequences</a> | ", treeFile);
 printf("<a href='%s' download>TSV summary of sequences and placements</a> | ", sampleSummaryFile);
 printf("<a href='%s' download>TSV summary of Spike mutations</a> | ", spikeSummaryFile);
 printf("<a href='%s' download>ZIP file of subtree JSON and Newick files</a> | ", subtreeZipFile);
 puts("</p>");
 }
 
 static int subTreeInfoUserSampleCmp(const void *pa, const void *pb)
 /* Compare subtreeInfo by number of user sample IDs (highest number first). */
 {
 struct subtreeInfo *tiA = *(struct subtreeInfo **)pa;
 struct subtreeInfo *tiB = *(struct subtreeInfo **)pb;
 return slCount(tiB->subtreeUserSampleIds) - slCount(tiA->subtreeUserSampleIds);
 }
 
-static void getProtobufMetadataSource(char *db, char *protobufFile, char **retProtobufPath,
-                                      char **retMetadataFile, char **retSource, char **retAliasFile)
+static void getProtobufMetadataSource(char *db, struct treeChoices *treeChoices,
+                                      char *protobufFile, char **retProtobufPath,
+                                      char **retMetadataFile, char **retSource,
+                                      char **retAliasFile, char **retSampleNameFile)
 /* If the config file specifies a list of tree choices, and protobufFile is a valid choice, then
  * set ret* to the files associated with that choice.  Otherwise fall back on older conf settings.
  * Return the selected treeChoice if there is one. */
 {
-struct treeChoices *treeChoices = loadTreeChoices(db);
 if (treeChoices)
     {
     *retProtobufPath = protobufFile;
     if (isEmpty(*retProtobufPath))
         *retProtobufPath = treeChoices->protobufFiles[0];
     int i;
     for (i = 0;  i < treeChoices->count;  i++)
         if (sameString(treeChoices->protobufFiles[i], *retProtobufPath))
             {
             *retMetadataFile = treeChoices->metadataFiles[i];
             *retSource = treeChoices->sources[i];
             *retAliasFile = treeChoices->aliasFiles[i];
+            *retSampleNameFile = treeChoices->sampleNameFiles[i];
             break;
             }
     if (i == treeChoices->count)
         {
         *retProtobufPath = treeChoices->protobufFiles[0];
         *retMetadataFile = treeChoices->metadataFiles[0];
         *retSource = treeChoices->sources[0];
         *retAliasFile = treeChoices->aliasFiles[0];
+        *retSampleNameFile = treeChoices->sampleNameFiles[0];
         }
     }
 else
     {
     // Fall back on old settings
     *retProtobufPath = getUsherAssignmentsPath(db, TRUE);
     *retMetadataFile = phyloPlaceDbSettingPath(db, "metadataFile");
     *retSource = "GISAID";
     *retAliasFile = NULL;
+    *retSampleNameFile = NULL;
     }
 }
 
 static void addModVersion(struct hash *nameHash, char *id, char *fullName)
 /* Map id to fullName.  If id has .version, strip that (modifying id!) and map versionless id
  * to fullName. */
 {
 hashAdd(nameHash, id, fullName);
 char *p = strchr(id, '.');
 if (p)
     {
     *p = '\0';
     hashAdd(nameHash, id, fullName);
     }
 }
@@ -2598,39 +2606,61 @@
         int wordCount = chopTabs(line, words);
         lineFileExpectWords(lf, 2, wordCount);
         char *fullName = hashFindVal(nameHash, words[1]);
         if (fullName)
             hashAdd(nameHash, words[0], fullName);
         else
             {
             if (missExample == NULL)
                 missExample = cloneString(words[1]);
             }
         }
     lineFileClose(&lf);
     }
 }
 
-static struct hash *getTreeNames(struct mutationAnnotatedTree *bigTree, char *aliasFile,
-                                 boolean addComponents)
-/* Make a hash of full names of leaves of bigTree; if addComponents, also map components of those
- * names to the full names in case the user gives us partial names/IDs. */
+static struct hash *getTreeNames(char *nameFile, char *protobufFile,
+                                 struct mutationAnnotatedTree **pBigTree,
+                                 boolean addComponents, int *pStartTime)
+/* Make a hash of full names of leaves of bigTree, using nameFile if it exists, otherwise
+ * falling back on the bigTree itself, loading it if necessary.  If addComponents, also map
+ * components of those names to the full names in case the user gives us partial names/IDs. */
 {
+struct hash *nameHash = NULL;
+if (isNotEmpty(nameFile) && fileExists(nameFile))
+    {
+    nameHash = hashNew(26);
+    struct lineFile *lf = lineFileOpen(nameFile, TRUE);
+    char *line;
+    while (lineFileNext(lf, &line, NULL))
+        addNameMaybeComponents(nameHash, line, addComponents);
+    lineFileClose(&lf);
+    }
+else
+    {
+    if (*pBigTree == NULL)
+        {
+        *pBigTree = parseParsimonyProtobuf(protobufFile);
+        reportTiming(pStartTime, "parse protobuf file (at startup, because sample names file was "
+                     "not provided)");
+        }
+    struct mutationAnnotatedTree *bigTree = *pBigTree;
     int nodeCount = bigTree->nodeHash->elCount;
-struct hash *nameHash = hashNew(digitsBaseTwo(nodeCount) + 3);
+    nameHash = hashNew(digitsBaseTwo(nodeCount) + 3);
     rAddLeafNames(bigTree->tree, bigTree->condensedNodes, nameHash, addComponents);
-addAliases(nameHash, aliasFile);
+    }
+reportTiming(pStartTime, "get tree names");
 return nameHash;
 }
 
 static char *matchName(struct hash *nameHash, char *name)
 /* Look for a possibly partial name or ID provided by the user in nameHash.  Return the result,
  * possibly NULL.  If the full name doesn't match, try components of the name. */
 {
 name = trimSpaces(name);
 // GISAID fasta headers all have hCoV-19/country/isolate/year|EPI_ISL_#|date; strip the hCoV-19
 // because Nextstrain strips it in nextmeta/nextfasta download files, and so do I when building
 // UCSC's tree.
 if (startsWithNoCase("hCoV-19/", name))
     name += strlen("hCoV-19/");
 char *match = hashFindVal(nameHash, name);
 int minWordSize=5;
@@ -2739,39 +2769,37 @@
         }
     else
         {
         // Just one ID
         char oneId[strlen(line)+1];
         safef(oneId, sizeof oneId, "%s%s", prefixA, numberA);
         char *match = hashFindVal(nameHash, oneId);
         foundAny |= tallyMatch(match, oneId, retMatches, retUnmatched);
         }
     // Skip past this match to see if the line has another range next.
     line += (substrArr[0].rm_eo - substrArr[0].rm_so);
     }
 return foundAny;
 }
 
-static struct slName *readSampleIds(struct lineFile *lf, struct mutationAnnotatedTree *bigTree,
-                                    char *aliasFile)
+static struct slName *readSampleIds(struct lineFile *lf, struct hash *nameHash)
 /* Read a file of sample names/IDs from the user; typically these will not be exactly the same
  * as the protobuf's (UCSC protobuf names are typically country/isolate/year|ID|date), so attempt
  * to find component matches if an exact match isn't found. */
 {
 struct slName *sampleIds = NULL;
 struct slName *unmatched = NULL;
-struct hash *nameHash = getTreeNames(bigTree, aliasFile, TRUE);
 char *line;
 while (lineFileNext(lf, &line, NULL))
     {
     // If tab-sep or comma-sep, just try first word in line
     char *tab = strchr(line, '\t');
     if (tab)
         *tab = '\0';
     else
         {
         char *comma = strchr(line, ',');
         if (comma)
             *comma = '\0';
         }
     char *match = matchName(nameHash, line);
     if (match)
@@ -2798,203 +2826,253 @@
          i++, example = example->next)
         {
         dyStringAppendSep(firstFew, ", ");
         dyStringPrintf(firstFew, "'%s'", example->name);
         }
     warn("Unable to find %d of your sequences in the tree, e.g. %s",
          slCount(unmatched), firstFew->string);
     dyStringFree(&firstFew);
     }
 else if (sampleIds == NULL)
     warn("Could not find any names in input; empty file uploaded?");
 slNameFreeList(&unmatched);
 return sampleIds;
 }
 
+void *loadMetadataWorker(void *arg)
+/* pthread worker function to read a tab-sep metadata file and return it hashed by name.  */
+{
+char *metadataFile = arg;
+int startTime = clock1000();
+struct hash *sampleMetadata = getSampleMetadata(metadataFile);
+reportTiming(&startTime, "read sample metadata (in a pthread)");
+return sampleMetadata;
+}
+
+static pthread_t *mayStartLoaderPthread(char *filename, void *(*workerFunction)(void *))
+/* Fork off a child process that parses a file and returns the resulting data structure. */
+{
+pthread_t *pt;
+AllocVar(pt);
+if (! pthreadMayCreate(pt, NULL, workerFunction, filename))
+    pt = NULL;
+return pt;
+}
+
 char *phyloPlaceSamples(struct lineFile *lf, char *db, char *defaultProtobuf,
                         boolean doMeasureTiming, int subtreeSize, int fontHeight,
                         boolean *retSuccess)
 /* Given a lineFile that contains either FASTA, VCF, or a list of sequence names/ids:
  * If FASTA/VCF, then prepare VCF for usher; if that goes well then run usher, report results,
  * make custom track files and return the top-level custom track file.
  * If list of seq names/ids, then attempt to find their full names in the protobuf, run matUtils
  * to make subtrees, show subtree results, and return NULL.  Set retSuccess to TRUE if we were
  * able to get at least some results for the user's input. */
 {
 char *ctFile = NULL;
 if (retSuccess)
     *retSuccess = FALSE;
 measureTiming = doMeasureTiming;
 int startTime = clock1000();
 struct tempName *vcfTn = NULL;
 struct slName *sampleIds = NULL;
 char *usherPath = getUsherPath(TRUE);
 char *protobufPath = NULL;
 char *source = NULL;
 char *metadataFile = NULL;
 char *aliasFile = NULL;
-getProtobufMetadataSource(db, defaultProtobuf, &protobufPath, &metadataFile, &source, &aliasFile);
-struct mutationAnnotatedTree *bigTree = parseParsimonyProtobuf(protobufPath);
-reportTiming(&startTime, "parse protobuf file");
-if (! bigTree)
-    {
-    warn("Problem parsing %s; can't make subtree subtracks.", protobufPath);
-    }
+char *sampleNameFile = NULL;
+struct treeChoices *treeChoices = loadTreeChoices(db);
+getProtobufMetadataSource(db, treeChoices, defaultProtobuf,
+                          &protobufPath, &metadataFile, &source, &aliasFile, &sampleNameFile);
+reportTiming(&startTime, "start up and find the tree etc. files");
+struct mutationAnnotatedTree *bigTree = NULL;
 lineFileCarefulNewlines(lf);
 char *chrom = hDefaultChrom(db);
 int chromSize = hChromSize(db, chrom);
 struct slName **maskSites = getProblematicSites(db, chrom, chromSize);
-boolean *informativeBases = informativeBasesFromTree(bigTree->tree, maskSites, chromSize);
+//#*** TODO: add CGI param option for this almost-never-needed tweak:
+if (0)
+    {
+    bigTree = parseParsimonyProtobuf(protobufPath);
+    reportTiming(&startTime, "parse protobuf file (at startup, for excluding informativeBases "
+                 "from maskSites)");
+    informativeBasesFromTree(bigTree->tree, maskSites, chromSize);
+    reportTiming(&startTime, "remove any informative bases in tree from maskSites");
+    }
 struct dnaSeq *refGenome = hChromSeq(db, chrom, 0, chromSize);
 boolean isFasta = FALSE;
 boolean subtreesOnly = FALSE;
 struct seqInfo *seqInfoList = NULL;
 if (lfLooksLikeFasta(lf))
     {
     struct slPair *failedSeqs;
     struct slPair *failedPsls;
-    struct hash *treeNames = getTreeNames(bigTree, NULL, FALSE);
-    vcfTn = vcfFromFasta(lf, db, refGenome, informativeBases, maskSites, treeNames,
+    struct hash *treeNames = NULL;
+    // We need to check uploaded names in fasta only for original usher, not usher-sampled(-server).
+    if (!serverIsConfigured(db) && !endsWith(usherPath, "-sampled"))
+        treeNames = getTreeNames(sampleNameFile, protobufPath, &bigTree, FALSE, &startTime);
+    vcfTn = vcfFromFasta(lf, db, refGenome, maskSites, treeNames,
                          &sampleIds, &seqInfoList, &failedSeqs, &failedPsls, &startTime);
     if (failedSeqs)
         {
         puts("<p>");
         struct slPair *fail;
         for (fail = failedSeqs;  fail != NULL;  fail = fail->next)
             printf("%s<br>\n", fail->name);
         puts("</p>");
         }
     if (failedPsls)
         {
         puts("<p>");
         struct slPair *fail;
         for (fail = failedPsls;  fail != NULL;  fail = fail->next)
             printf("%s<br>\n", fail->name);
         puts("</p>");
         }
     if (seqInfoList == NULL)
         printf("<p>Sorry, could not align any sequences to reference well enough to place in "
                "the phylogenetic tree.</p>\n");
     isFasta = TRUE;
     }
 else if (lfLooksLikeVcf(lf))
     {
     vcfTn = checkAndSaveVcf(lf, refGenome, maskSites, &seqInfoList, &sampleIds);
     reportTiming(&startTime, "check uploaded VCF");
     }
 else
     {
     subtreesOnly = TRUE;
-    sampleIds = readSampleIds(lf, bigTree, aliasFile);
+    struct hash *treeNames = getTreeNames(sampleNameFile, protobufPath, &bigTree, TRUE, &startTime);
+    addAliases(treeNames, aliasFile);
+    reportTiming(&startTime, "load sample name aliases");
+    sampleIds = readSampleIds(lf, treeNames);
+    reportTiming(&startTime, "look up uploaded sample names");
     }
 lineFileClose(&lf);
-struct hash *seqInfoHash = hashFromSeqInfoList(seqInfoList);
 if (sampleIds == NULL)
     {
     return ctFile;
     }
+
+// Kick off child thread to load metadata simultaneously with running usher or matUtils.
+pthread_t *metadataPthread = mayStartLoaderPthread(metadataFile, loadMetadataWorker);
+
 struct usherResults *results = NULL;
 if (vcfTn)
     {
     fflush(stdout);
-    results = runUsher(usherPath, protobufPath, vcfTn->forCgi,
-                       subtreeSize, sampleIds, bigTree->condensedNodes,
-                       &startTime);
-    if (results)
-        addSampleMutsFromSeqInfo(results->samplePlacements, seqInfoHash);
+    results = runUsher(db, usherPath, protobufPath, vcfTn->forCgi, subtreeSize, &sampleIds,
+                       treeChoices, &startTime);
     }
 else if (subtreesOnly)
     {
     char *matUtilsPath = getMatUtilsPath(TRUE);
     results = runMatUtilsExtractSubtrees(matUtilsPath, protobufPath, subtreeSize,
-                                         sampleIds, bigTree->condensedNodes,
-                                         &startTime);
+                                         sampleIds, &startTime);
+    }
+
+struct hash *sampleMetadata = NULL;
+if (metadataPthread)
+    {
+    pthreadJoin(metadataPthread, (void **)(&sampleMetadata));
+    reportTiming(&startTime, "wait for sample metadata loading thread to finish");
     }
+else
+    {
+    // We really need metadata -- load it the slow way.
+    sampleMetadata = getSampleMetadata(metadataFile);
+    reportTiming(&startTime, "load sample metadata without pthread");
+    }
+
 if (results && results->singleSubtreeInfo)
     {
     if (retSuccess)
         *retSuccess = TRUE;
     puts("<p></p>");
     readQcThresholds(db);
     int subtreeCount = slCount(results->subtreeInfoList);
     // Sort subtrees by number of user samples (largest first).
     slSort(&results->subtreeInfoList, subTreeInfoUserSampleCmp);
     // Make Nextstrain/auspice JSON file for each subtree.
     char *bigGenePredFile = phyloPlaceDbSettingPath(db, "bigGenePredFile");
     struct geneInfo *geneInfoList = getGeneInfoList(bigGenePredFile, refGenome);
     struct seqWindow *gSeqWin = memSeqWindowNew(chrom, refGenome->dna);
-    struct hash *sampleMetadata = getSampleMetadata(metadataFile);
     struct hash *sampleUrls = hashNew(0);
     struct tempName *jsonTns[subtreeCount];
     struct subtreeInfo *ti;
     int ix;
     for (ix = 0, ti = results->subtreeInfoList;  ti != NULL;  ti = ti->next, ix++)
         {
         AllocVar(jsonTns[ix]);
         char subtreeName[512];
         safef(subtreeName, sizeof(subtreeName), "subtreeAuspice%d", ix+1);
         trashDirFile(jsonTns[ix], "ct", subtreeName, ".json");
         treeToAuspiceJson(ti, db, geneInfoList, gSeqWin, sampleMetadata, NULL,
                           jsonTns[ix]->forCgi, source);
         // Add a link for every sample to this subtree, so the single-subtree JSON can
         // link to subtree JSONs
         char *subtreeUrl = nextstrainUrlFromTn(jsonTns[ix]);
         struct slName *sample;
         for (sample = ti->subtreeUserSampleIds;  sample != NULL;  sample = sample->next)
             hashAdd(sampleUrls, sample->name, subtreeUrl);
         }
     struct tempName *singleSubtreeJsonTn;
     AllocVar(singleSubtreeJsonTn);
     trashDirFile(singleSubtreeJsonTn, "ct", "singleSubtreeAuspice", ".json");
     treeToAuspiceJson(results->singleSubtreeInfo, db, geneInfoList, gSeqWin, sampleMetadata,
                       sampleUrls, singleSubtreeJsonTn->forCgi, source);
+    reportTiming(&startTime, "make Auspice JSON");
     struct subtreeInfo *subtreeInfoForButtons = results->subtreeInfoList;
     if (subtreeCount > MAX_SUBTREE_BUTTONS)
         subtreeInfoForButtons = NULL;
     makeButtonRow(singleSubtreeJsonTn, jsonTns, subtreeInfoForButtons, subtreeSize, isFasta,
                   !subtreesOnly);
     printf("<p>If you have metadata you wish to display, click a 'view subtree in "
            "Nextstrain' button, and then you can drag on a CSV file to "
            "<a href='"NEXTSTRAIN_DRAG_DROP_DOC"' target=_blank>add it to the tree view</a>."
            "</p>\n");
     puts("<p><em>Note: "
          "The Nextstrain subtree views, and Download files below, are temporary files and will "
          "expire within two days.  "
          "Please download the Nextstrain subtree JSON files if you will want to view them "
          "again in the future.  The JSON files can be drag-dropped onto "
          "<a href='https://auspice.us/' target=_blank>https://auspice.us/</a>."
          "</em></p>");
 
     struct tempName *tsvTn = NULL, *sTsvTn = NULL;
     struct tempName *zipTn = makeSubtreeZipFile(results, jsonTns, singleSubtreeJsonTn,
                                                 &startTime);
     struct tempName *ctTn = NULL;
     if (subtreesOnly)
         {
-        summarizeSubtrees(sampleIds, results, sampleMetadata, jsonTns, bigTree, db);
+        summarizeSubtrees(sampleIds, results, sampleMetadata, jsonTns, db);
         reportTiming(&startTime, "describe subtrees");
         }
     else
         {
-        findNearestNeighbors(results->samplePlacements, sampleMetadata, bigTree);
+        findNearestNeighbors(results, sampleMetadata);
+        reportTiming(&startTime, "find nearest neighbors");
 
         // Make custom tracks for uploaded samples and subtree(s).
         struct phyloTree *sampleTree = NULL;
-        ctTn = writeCustomTracks(db, vcfTn, results, sampleIds, bigTree, source, fontHeight,
+        ctTn = writeCustomTracks(db, vcfTn, results, sampleIds, source, fontHeight,
                                  &sampleTree, &startTime);
 
         // Make a sample summary TSV file and accumulate S gene changes
+        struct hash *seqInfoHash = hashFromSeqInfoListAndIds(seqInfoList, sampleIds);
+        addSampleMutsFromSeqInfo(results->samplePlacements, seqInfoHash);
         struct hash *spikeChanges = hashNew(0);
         tsvTn = writeTsvSummary(results, sampleTree, sampleIds, seqInfoHash,
                                 geneInfoList, gSeqWin, spikeChanges, &startTime);
         sTsvTn = writeSpikeChangeSummary(spikeChanges, slCount(sampleIds));
         downloadsRow(results->bigTreePlusTn->forHtml, tsvTn->forHtml, sTsvTn->forHtml,
                      zipTn->forHtml);
 
         int seqCount = slCount(seqInfoList);
         if (seqCount <= MAX_SEQ_DETAILS)
             {
             char *refAcc = cloneString(chrom);
             if (regexMatch(refAcc, "v[0-9]+$"))
                 {
                 char *v = strrchr(refAcc, 'v');
                 assert(v != NULL);