src/hg/hgPhyloPlace/runUsher.c 0fffa3c31de4845a9bd3f06c0992f971e4d8a7a3

0fffa3c31de4845a9bd3f06c0992f971e4d8a7a3
angie
  Fri Oct 28 15:08:06 2022 -0700
Performance improvements for trees with millions of sequences:
* Use @yceh's usher-sampled-server if configured; it preloads protobufs and can start placing sequences immediately using usher-sampled, a faster version of usher
* Use usher-sampled instead of usher if server is not configured but usher-sampled is available
* Load sample metadata file in a pthread while usher(-sampled(-server)) or matUtils is running
* Skip checking for sample name clashes in uploaded fasta when using usher-sampled(-server)'s new --no-ignore-prefix option (but look for the prefix when parsing results)
* Avoid parsing the protobuf and traversing the big tree unless absolutely necessary
** Subtrees from usher/matUtils have not included condensed nodes in a long time; remove lots of condensedNodes/summarization code from phyloPlace.c, runUsher.c, writeCustomTracks.c
** Use subtrees instead of big tree when possible (in findNearestNeighbor, treeToBaseAlleles, uploadedSamplesTree)
** Skip the informativeBases stuff that inhibits masking of sites from Problematic Sites set when the tree was built with an earlier version -- that pretty much never applies anymore now that only daily-updated trees are offered, not a range from old to new.
** Allow config.ra to specify a flat file of sample names (needed for searching user's uploaded names/IDs before calling matUtils) instead of getting names from the big tree

diff --git src/hg/hgPhyloPlace/runUsher.c src/hg/hgPhyloPlace/runUsher.c
index 3c04bdc..d35c987 100644
--- src/hg/hgPhyloPlace/runUsher.c
+++ src/hg/hgPhyloPlace/runUsher.c
@@ -1,857 +1,1345 @@
 /* Invoke usher to place user's uploaded samples in the phylogenetic tree & parse output files. */
 
-/* Copyright (C) 2020 The Regents of the University of California */
+/* Copyright (C) 2020-2022 The Regents of the University of California */
 
 #include "common.h"
 #include "dnautil.h"
 #include "hash.h"
+#include "hgConfig.h"
 #include "linefile.h"
 #include "obscure.h"
 #include "parsimonyProto.h"
 #include "phyloPlace.h"
 #include "regexHelper.h"
 #include "pipeline.h"
+#include "trackHub.h"
 #include "trashDir.h"
+#include <signal.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
 
 // Keywords in stderr output of usher:
 #define sampleIdPrefix "Sample name:"
 #define pScorePrefix "Parsimony score:"
 #define numPlacementsPrefix "Number of parsimony-optimal placements:"
 #define imputedMutsPrefix "Imputed mutations:"
 
 static void parseSampleIdAndParsimonyScore(char **words, char **retSampleId,
                                            struct hash *samplePlacements)
 /* If words[] seems to contain columns of the line that gives sample ID and parsimony score,
  * then parse out those values. */
 {
 // Example line:
 // words[0] = Current tree size (#nodes): 70775
 // words[1] = Sample name: MyLabSequence2
 // words[2] = Parsimony score: 1
 // words[3] = Number of parsimony-optimal placements: 1
 char *p = stringIn(sampleIdPrefix, words[1]);
 if (p)
     {
     *retSampleId = cloneString(trimSpaces(p + strlen(sampleIdPrefix)));
     struct placementInfo *info;
     AllocVar(info);
     hashAdd(samplePlacements, *retSampleId, info);
     info->sampleId = *retSampleId;
     p = stringIn(pScorePrefix, words[2]);
     if (p)
         info->parsimonyScore = atoi(skipToSpaces(p + strlen(pScorePrefix)));
     else
         errAbort("Problem parsing stderr output of usher: "
                  "expected '" sampleIdPrefix "...' to be followed by '"
                  pScorePrefix "...' but could not find the latter." );
     p = stringIn(numPlacementsPrefix, words[3]);
     if (p)
         info->bestNodeCount = atoi(skipToSpaces(p + strlen(numPlacementsPrefix)));
     else
         errAbort("Problem parsing stderr output of usher: "
                  "expected '" sampleIdPrefix "... " pScorePrefix " ...' to be followed by '"
                  numPlacementsPrefix "...' but could not find the latter." );
     }
 else
     errAbort("Unexpected format of sample ID line:\n%s\t%s\t%s\t%s",
              words[0], words[1], words[2], words[3]);
 }
 
 static struct singleNucChange *parseSnc(char *sncStr)
 /* If sncStr looks like a <old><pos><new>-style single nucleotide change then parse out those
  * values & return singleNucChange (with parBase and newBase; no refBase), otherwise return NULL.  */
 {
 struct singleNucChange *snc = NULL;
 regmatch_t substrs[4];
 if (regexMatchSubstr(sncStr, "^([ACGT])([0-9]+)([ACGT])$", substrs, ArraySize(substrs)))
     {
     int chromStart = regexSubstringInt(sncStr, substrs[2]) - 1;
     snc = sncNew(chromStart, '\0', sncStr[0], sncStr[substrs[3].rm_so]);
     }
 return snc;
 }
 
 static struct baseVal *bvListFromSemiColonSep(char *mutStr)
 /* Parse a string of ;-sep'd position:allele & return a list of struct baseVal. */
 {
 char *muts[strlen(mutStr) / 4];
 int mutCount = chopString(mutStr, ";", muts, ArraySize(muts));
 struct baseVal *bvList = NULL;
 int i;
 for (i = 0;  i < mutCount;  i++)
     {
     boolean problem = FALSE;
     char *colon = strchr(muts[i], ':');
     if (colon)
         {
         int pos = atoi(muts[i]);
         char *val = cloneString(colon+1);
         if (pos < 1)
             problem = TRUE;
         else if (!isAllNt(val, strlen(val)))
             problem = TRUE;
         else
             {
             struct baseVal *bv;
             AllocVar(bv);
             bv->chromStart = pos - 1;
             bv->val = val;
             slAddHead(&bvList, bv);
             }
         }
     if (problem)
         errAbort("Problem parsing stderr output of usher: "
                  "expected imputed mutation to be number:base, but got '%s'", muts[i]);
     }
 slReverse(&bvList);
 return bvList;
 }
 
 static boolean parseImputedMutations(char **words, struct placementInfo *info)
 /* If words[] looks like it defines imputed mutations of the most recently named sample,
  * then parse out the list and add to info->imputedBases and return TRUE. */
 {
 // Example line:
 // words[0] = Imputed mutations: 
 // words[1] = 6709:A;23403:G
 boolean matches = FALSE;
 if (stringIn(imputedMutsPrefix, words[0]))
     {
     matches = TRUE;
     info->imputedBases = bvListFromSemiColonSep(words[1]);
     }
 return matches;
 }
 
 static void parseStderr(char *amsStderrFile, struct hash *samplePlacements)
 /* The stderr output of usher is where we find important info for each sample:
  * the path of variants on nodes from root to sample leaf, imputed values of ambiguous bases
  * (if any), and parsimony score. */
 {
 struct lineFile *lf = lineFileOpen(amsStderrFile, TRUE);
 char *sampleId = NULL;
 int size;
 char *line;
 while (lineFileNext(lf, &line, &size))
     {
     char lineCpy[size+1];
     safencpy(lineCpy, sizeof lineCpy, line, size);
     char *words[16];
     int wordCount = chopTabs(lineCpy, words);
     if (wordCount == 4)
         parseSampleIdAndParsimonyScore(words, &sampleId, samplePlacements);
     else if (wordCount == 2)
         {
         if (! sampleId)
             errAbort("Problem parsing stderr output of usher: "
                      "Got line starting with '%s' that was not preceded by a line that "
                      "defines sample ID.:\n%s", words[0], line);
         struct placementInfo *info = hashFindVal(samplePlacements, sampleId);
         if (!info)
             errAbort("Problem parsing stderr output of usher: "
                      "Can't find placement info for sample '%s'", sampleId);
         parseImputedMutations(words, info);
         }
     }
 }
 
-static void parsePlacements(char *outDirName, char *stderrName, struct hash *samplePlacements)
+static void parsePlacements(char *outDirName, char *stderrName, struct hash *samplePlacements,
+                            struct slName **pSampleIds)
 /* If usher created the file outdir/placement_stats.tsv then parse its contents into
- * samplePlacements; otherwise parse the same info out of the stderr output. */
+ * samplePlacements; otherwise parse the same info out of the stderr output.
+ * Check placements vs. *pSampleIds; if usher renamed a sample with prefix to avoid clash with
+ * sample already in the tree then rename the sample in *pSampleIds. */
 {
 char placementsFileName[PATH_LEN];
 safef(placementsFileName, sizeof placementsFileName, "%s/placement_stats.tsv", outDirName);
+int placementCount = 0;
 if (fileExists(placementsFileName))
     {
+    // Frustratingly, the file from server can exist but appear to be empty for a couple seconds!
+    // Worse, it might have a multiple-of-4096 size yet still be incomplete!
+    // If we don't get this file's data, then we'll run into errors when parsing subsequent files.
+    // So wait a while if the size seems suspicious.
+    int maxTries = 30;
+    int nx = 0;
+    long long size = 0;
+    while ((size = fileSize(placementsFileName)) % 4096 == 0 && nx < maxTries)
+        {
+        fprintf(stderr, "parsePlacements: after %ds, %s has multiple-of-4096 size %lld.\n",
+                nx, placementsFileName, size);
+        sleep(1);
+        nx++;
+        }
+    if ((size % 4096) == 0)
+        {
+        fprintf(stderr, "parsePlacements: %s still has multiple-of-4096 size %lld but we've waited "
+                "a long time (%ds), proceed.\n",
+                placementsFileName, size, nx);
+        }
+    else if (nx > 0)
+        fprintf(stderr, "parsePlacements: after %ds, %s size is %lld, getting started.\n",
+                nx, placementsFileName, size);
     struct lineFile *lf = lineFileOpen(placementsFileName, TRUE);
     char *line;
     while (lineFileNext(lf, &line, NULL))
         {
         char *words[5];
         int wordCount = chopTabs(line, words);
         lineFileExpectAtLeast(lf, 4, wordCount);
         char *sampleId = words[0];
         struct placementInfo *info;
         AllocVar(info);
         hashAdd(samplePlacements, sampleId, info);
         info->sampleId = cloneString(sampleId);
         info->parsimonyScore = atoi(words[1]);
         info->bestNodeCount = atoi(words[2]);
         info->imputedBases = bvListFromSemiColonSep(words[3]);
+        placementCount++;
         }
     lineFileClose(&lf);
     }
 else
+    {
+    fprintf(stderr, "parsePlacements: No %s, reading stderr %s\n", placementsFileName, stderrName);
     parseStderr(stderrName, samplePlacements);
     }
+// Check whether we placed all samples, and whether any sample names have the USHER_DEDUP_PREFIX
+// in usher results files.
+struct slName *newSampleIds = NULL;
+struct slName *sample, *next = NULL;
+for (sample = *pSampleIds;  sample != NULL;  sample = next)
+    {
+    next = sample->next;
+    if (!hashLookup(samplePlacements, sample->name))
+        {
+        // Usher might have added a prefix to distinguish from a sequence with the same name
+        // already in the tree.
+        char nameWithPrefix[strlen(USHER_DEDUP_PREFIX) + strlen(sample->name) + 1];
+        safef(nameWithPrefix, sizeof nameWithPrefix, "%s%s", USHER_DEDUP_PREFIX, sample->name);
+        if (!hashLookup(samplePlacements, nameWithPrefix))
+            {
+            warn("parsePlacements: did not find placement for sample '%s'", sample->name);
+            slAddHead(&newSampleIds, sample);
+            }
+        else
+            {
+            warn("Renamed %s to %s to distinguish it from sequence already in the tree",
+                 sample->name, nameWithPrefix);
+            slNameAddHead(&newSampleIds, nameWithPrefix);
+            }
+        }
+    else
+        slAddHead(&newSampleIds, sample);
+    }
+slReverse(&newSampleIds);
+*pSampleIds = newSampleIds;
+}
 
 static void parseVariantPaths(char *filename, struct hash *samplePlacements)
 /* Parse out space-sep list of {node ID, ':', node-associated ,-sep variant list} into
  * variantPathNode list and associate with sample ID. */
 {
 // Example line (note the back-mutation at 28144T... may want to highlight those):
 // words[0] = MySeq
 // words[1] = 1:C8782T,T28144C 2309:C29095T 2340:T8782C 2342:T29095C 2588:C28144T MySeq:C29867T 
 struct lineFile *lf = lineFileOpen(filename, TRUE);
 char *line;
 while (lineFileNext(lf, &line, NULL))
     {
     char *words[3];
     int wordCount = chopTabs(line, words);
     lineFileExpectWords(lf, 2, wordCount);
     char *sampleId = words[0];
     char *nodePath = words[1];
     char *nodes[strlen(nodePath) / 4];
     int nodeCount = chopString(nodePath, " ", nodes, ArraySize(nodes));
     struct variantPathNode *vpNodeList = NULL;
     int i;
     for (i = 0;  i < nodeCount;  i++)
         {
         struct variantPathNode *vpn;
         AllocVar(vpn);
         char *nodeVariants = nodes[i];
         // First there is a node ID followed by ':'.  If node ID is numeric, ignore;
         // otherwise it is a sample ID, indicating a mutation specific to the sample ID,
         // so include it.
         char *colon = strrchr(nodeVariants, ':');
         if (colon)
             {
             *colon = '\0';
             char *nodeName = nodeVariants;
             nodeVariants = colon+1;
             vpn->nodeName = cloneString(nodeName);
             }
         // Next there should be a comma-sep list of mutations/variants.
         char *variants[strlen(nodeVariants) / 4];
         int varCount = chopCommas(nodeVariants, variants);
         int j;
         for (j = 0;  j < varCount;  j++)
             {
             variants[j] = trimSpaces(variants[j]);
             struct singleNucChange *snc = parseSnc(variants[j]);
             if (snc)
                 slAddHead(&vpn->sncList, snc);
             else
                 errAbort("parseVariantPath: Expected variant path for %s to specify "
                          "single-nucleotide changes but got '%s'", sampleId, variants[j]);
             }
         slReverse(&vpn->sncList);
         slAddHead(&vpNodeList, vpn);
         }
     slReverse(&vpNodeList);
     struct placementInfo *info = hashFindVal(samplePlacements, sampleId);
     if (!info)
         errAbort("parseVariantPath: can't find placementInfo for sample '%s'", sampleId);
     info->variantPath = vpNodeList;
     }
 lineFileClose(&lf);
 }
 
 static struct variantPathNode *parseSubtreeMut(char *line)
 /* Parse a line with a node name and list of mutations.  Examples:
  * ROOT->1: C241T,C14408T,A23403G,C3037T,A20268G,C28854T,T24076C
  * 1: C20759T
  * USA/CA-CZB-4019/2020|EPI_ISL_548621|20-08-01: G17608T,G22199T
  * node_6849_condensed_4_leaves: 
  */
 {
 struct variantPathNode *vpn = NULL;
 char *colon = strrchr(line, ':');
 if (colon)
     {
     AllocVar(vpn);
     char *nodeName = line;
     *colon = '\0';
     vpn->nodeName = cloneString(nodeName);
     char *mutString = trimSpaces(colon+1);
     char *mutWords[strlen(mutString)/4];
     int mutCount = chopCommas(mutString, mutWords);
     int i;
     for (i = 0;  i < mutCount;  i++)
         {
         struct singleNucChange *snc = parseSnc(mutWords[i]);
         if (snc)
             slAddHead(&vpn->sncList, snc);
         else
             errAbort("parseSubtreeMut: Expected subtree mutation list to specify single-nucleotide "
                      "changes but got '%s'", mutWords[i]);
         }
     slReverse(&vpn->sncList);
     }
 else
     errAbort("parseSubtreeMut: Expected line to contain colon but got '%s'", line);
 return vpn;
 }
 
 static struct variantPathNode *parseSubtreeMutations(char *filename)
 /* Parse subtree node mutation lists out of usher subtree-N-mutations.txt file. */
 {
 struct variantPathNode *subtreeMutList = NULL;
 struct lineFile *lf = lineFileOpen(filename, TRUE);
 char *line;
 while (lineFileNext(lf, &line, NULL))
     {
     struct variantPathNode *vpn = parseSubtreeMut(line);
     if (vpn)
         {
         // The ROOT->1 (subtree ancestor) sncList needs to be prepended to the subtree
         // root's sncList because Auspice doesn't seem to display mutations on nodes
         // with only one child.  Discard the subtree ancestor element.
         if (subtreeMutList && subtreeMutList->next == NULL &&
             startsWith("ROOT->", subtreeMutList->nodeName))
             {
             vpn->sncList = slCat(subtreeMutList->sncList, vpn->sncList);
             subtreeMutList = NULL;
             }
         slAddHead(&subtreeMutList, vpn);
         }
     }
 slReverse(&subtreeMutList);
 lineFileClose(&lf);
 return subtreeMutList;
 }
 
 static struct usherResults *usherResultsNew()
 /* Allocate & return usherResults (just bigTreePlusTn and samplePlacements, subtrees come later). */
 {
 struct usherResults *results;
 AllocVar(results);
 AllocVar(results->bigTreePlusTn);
 trashDirFile(results->bigTreePlusTn, "ct", "phyloPlusSamples", ".nwk");
 results->samplePlacements = hashNew(0);
 return results;
 }
 
 static void rPhyloLeafNames(struct phyloTree *node, struct slName **pList)
 /* Build up a list of leaf names under node in reverse depth-first-search order. */
 {
 if (node->numEdges > 0)
     {
     int i;
     for (i = 0;  i < node->numEdges;  i++)
         rPhyloLeafNames(node->edges[i], pList);
     }
 else
     slAddHead(pList, slNameNew(node->ident->name));
 }
 
 static struct slName *phyloLeafNames(struct phyloTree *tree)
 /* Return a list of leaf names in tree in depth-first-search order. */
 {
 struct slName *list = NULL;
 rPhyloLeafNames(tree, &list);
 slReverse(&list);
 return list;
 }
 
 static struct hash *slNameListToIxHash(struct slName *list)
 /* Given a list of names, add each name to a hash of ints with the index of the name in the list. */
 {
 struct hash *hash = hashNew(0);
 struct slName *sln;
 int ix;
 for (ix = 0, sln = list;  sln != NULL;  ix++, sln = sln->next)
     hashAddInt(hash, sln->name, ix);
 return hash;
 }
 
 static struct slName *getSubtreeSampleIds(struct slName *sampleIds, struct hash *subtreeIds)
 /* Return a list of sampleIds that are found in subtreeIds. */
 {
 struct slName *subtreeSampleIds = NULL;
 struct slName *id;
 for (id = sampleIds;  id != NULL;  id = id->next)
     if (hashLookup(subtreeIds, id->name))
         slNameAddHead(&subtreeSampleIds, id->name);
 slReverse(&subtreeSampleIds);
 return subtreeSampleIds;
 }
 
-static int hashElIntCmpDesc(const void *va, const void *vb)
-/* Compare two hashEl by integer value, descending. */
-{
-const struct hashEl *a = *((struct hashEl **)va);
-const struct hashEl *b = *((struct hashEl **)vb);
-return b->val - a->val;
-}
-
-static char *summarizeCountries(struct slName *idList)
-/* If enough sample names in idList have identifiable countries, then return a summary string,
- * else NULL. */
-{
-char *summary = NULL;
-int idCount = slCount(idList);
-int idWithCountryCount = 0;
-struct hash *countryCounts = hashNew(0);
-struct slName *id;
-for (id = idList;  id != NULL;  id = id->next)
-    {
-    regmatch_t substrs[2];
-    if (regexMatchSubstr(id->name, "([A-Za-z_]+)/[^/]+/20", substrs, ArraySize(substrs)))
-        {
-        char country[256];
-        regexSubstringCopy(id->name, substrs[1], country, sizeof country);
-        idWithCountryCount++;
-        hashIncInt(countryCounts, country);
-        }
-    }
-if ((double)idWithCountryCount / idCount >= 0.5)
-    {
-    struct dyString *dy = dyStringCreate("%d from ", idCount);
-    struct hashEl *helList = hashElListHash(countryCounts);
-    if (helList->next == NULL)
-        dyStringAppend(dy, helList->name);
-    else
-        {
-        slSort(&helList, hashElIntCmpDesc);
-        struct hashEl *hel;
-        for (hel = helList;  hel != NULL;  hel = hel->next)
-            {
-            if (hel != helList)
-                dyStringAppend(dy, "+");
-            dyStringAppend(dy, hel->name);
-            int count = ptToInt(hel->val);
-            if (count > 1)
-                dyStringPrintf(dy, "[%d]", count);
-            if (dyStringLen(dy) > 100)
-                {
-                dyStringAppend(dy, "+...");
-                break;
-                }
-            }
-        }
-    // Labels have to be unique -- tack on an example ID
-    dyStringPrintf(dy, " eg %s", idList->name);
-    summary = dyStringCannibalize(&dy);
-    subChar(summary, ' ', '_');
-    }
-return summary;
-}
-
-static char *expandCondensedNodeName(struct hash *condensedNodes, char *name)
-/* If name is found in condensedNodes, then return an expanded name that indicates countries of
- * origin (if we can find them) and an example ID.  Otherwise return NULL. */
-{
-char *expandedName = NULL;
-struct slName *nodeList = hashFindVal(condensedNodes, name);
-if (nodeList)
+static struct hash *hashVpnList(struct variantPathNode *vpnList)
+/* Return a hash of nodeName to sncList for each variantPathNode in vpnList. */
 {
-    char *summary = summarizeCountries(nodeList);
-    if (summary)
-        expandedName = summary;
-    else
-        {
-        int nodeCount = slCount(nodeList);
-        struct dyString *dy = dyStringCreate("%s_and_%d_others", nodeList->name, nodeCount-1);
-        expandedName=  dyStringCannibalize(&dy);
-        }
-    }
-return expandedName;
-}
-
-static struct hash *expandCondensedNodeNames(struct hash *condensedNodes, struct slName *idList)
-/* If idList contains names found in condensedNodes, then hash those names to new names that are
- * _-sep strings from the lists in condensedNodes.  Return hash (which may have no elements
- * if nothing in idList is found in condensedNodes). */
-{
-struct hash *hash = hashNew(0);
-struct slName *id;
-for (id = idList;  id != NULL;  id = id->next)
-    {
-    char *expandedName = expandCondensedNodeName(condensedNodes, id->name);
-    if (expandedName)
-        hashAdd(hash, id->name, expandedName);
-    }
-return hash;
-}
-
-static void rSubstTreeNames(struct phyloTree *node, struct hash *nameSubstitutions)
-/* If node or descendants have names in nameSubstitutions, then substitute those names. */
-{
-if (node->ident->name)
-    {
-    char *subst = hashFindVal(nameSubstitutions, node->ident->name);
-    if (subst)
-        node->ident->name = subst;
-    }
-int i;
-for (i = 0;  i < node->numEdges;  i++)
-    rSubstTreeNames(node->edges[i], nameSubstitutions);
-}
-
-static struct tempName *substituteTreeNames(struct phyloTree *tree, char *treeName,
-                                            struct hash *nameSubstitutions)
-/* If tree has any nodes whose names are in nameSubstitutions, then substitute those names.
- * Write tree out to a trash file and return its location. */
-{
-rSubstTreeNames(tree, nameSubstitutions);
-struct tempName *newTn;
-AllocVar(newTn);
-trashDirFile(newTn, "ct", treeName, ".nwk");
-FILE *f = mustOpen(newTn->forCgi, "w");
-phyloPrintTree(tree, f);
-carefulClose(&f);
-return newTn;
-}
-
-static struct slName *substituteNameList(struct slName *idList, struct hash *nameSubstitutions)
-/* Return a new list that is just like idList, except if any item in idList has a value in
- * nameSubstitutions, then the item is replaced by the substitution. */
-{
-struct slName *newList = NULL;
-struct slName *id;
-for (id = idList;  id != NULL;  id = id->next)
-    {
-    char *subst = hashFindVal(nameSubstitutions, id->name);
-    slNameAddHead(&newList, subst ? subst : id->name);
-    }
-slReverse(&newList);
-return newList;
+struct hash *vpnHash = hashNew(13);
+struct variantPathNode *vpn;
+for (vpn = vpnList;  vpn != NULL;  vpn = vpn->next)
+    hashAdd(vpnHash, vpn->nodeName, vpn);
+return vpnHash;
 }
 
-static void addMutationsToTree(struct phyloTree *node, struct variantPathNode **pNodeMutList)
-/* Store the list of mutations associated with each node in node->priv. */
+static void addMutationsToTree(struct phyloTree *node, struct hash *subtreeMutHash)
+/* Store the list of mutations associated with each node in node->priv, removing node from
+ * subtreeMutHash so we can check that it's empty when done with tree. */
 {
-struct variantPathNode *nodeMuts = slPopHead(pNodeMutList);
-if (! nodeMuts)
-    errAbort("addMutationsToTree: subtree mutation list has fewer nodes than subtree");
-if (node->ident->name && ! sameString(nodeMuts->nodeName, node->ident->name))
-    errAbort("addMutationsToTree: subtree node name is '%s' but subtree mutation list item is '%s'",
-             node->ident->name, nodeMuts->nodeName);
+if (! node->ident->name)
+    errAbort("addMutationsToTree: node name is NULL but I need a node name to find mutations");
 if (node->priv != NULL)
-    errAbort("addMutationsToTree: node already has mutations assigned");
+    errAbort("addMutationsToTree: node '%s' already has mutations assigned (duplicated in tree?)",
+             node->ident->name);
+struct variantPathNode *nodeMuts = hashFindVal(subtreeMutHash, node->ident->name);
+if (! nodeMuts)
+    errAbort("addMutationsToTree: can't find node '%s' in subtree mutations file",
+             node->ident->name);
+hashRemove(subtreeMutHash, node->ident->name);
 node->priv = nodeMuts->sncList;
 int i;
 for (i = 0;  i < node->numEdges;  i++)
-    addMutationsToTree(node->edges[i], pNodeMutList);
+    addMutationsToTree(node->edges[i], subtreeMutHash);
 }
 
 static struct subtreeInfo *parseOneSubtree(struct tempName *subtreeTn, char *subtreeName,
                                            struct variantPathNode *subtreeMuts,
-                                           struct slName *userSampleIds, struct hash *condensedNodes)
-/* Parse usher's subtree output, figure out which user samples are in subtree and expand names
- * of condensed nodes. */
+                                           struct slName *userSampleIds)
+/* Parse usher's subtree output and figure out which user samples are in subtree. */
 {
 struct subtreeInfo *ti;
 AllocVar(ti);
 ti->subtreeTn = subtreeTn;
 ti->subtree = phyloOpenTree(ti->subtreeTn->forCgi);
-addMutationsToTree(ti->subtree, &subtreeMuts);
-if (subtreeMuts != NULL)
-    errAbort("addMutationsToTree: subtreeMutationList has more nodes than subtree");
-struct slName *subtreeIdList = phyloLeafNames(ti->subtree);
-// Don't do name substitutions on condensed node names in subtreeIdToIx since the IDs have to
-// match those in the original tree from protobuf.
-ti->subtreeIdToIx = slNameListToIxHash(subtreeIdList);
+struct hash *subtreeMutHash = hashVpnList(subtreeMuts);
+addMutationsToTree(ti->subtree, subtreeMutHash);
+if (hashNumEntries(subtreeMutHash) != 0)
+    errAbort("addMutationsToTree: subtree has fewer nodes than defined in subtree mutations file");
+ti->subtreeNameList = phyloLeafNames(ti->subtree);
+ti->subtreeIdToIx = slNameListToIxHash(ti->subtreeNameList);
 ti->subtreeUserSampleIds = getSubtreeSampleIds(userSampleIds, ti->subtreeIdToIx);
 if (slCount(ti->subtreeUserSampleIds) == 0)
     errAbort("No user sample IDs found in subtree file %s", ti->subtreeTn->forCgi);
-// Substitute in nicer node names for condensed nodes for displaying to the user in
-// custom tracks and Nextstrain/auspice JSON.
-struct hash *nameSubstitutions = expandCondensedNodeNames(condensedNodes, subtreeIdList);
-if (nameSubstitutions->elCount > 0)
-    ti->subtreeTn = substituteTreeNames(ti->subtree, subtreeName, nameSubstitutions);
-ti->subtreeNameList = substituteNameList(subtreeIdList, nameSubstitutions);
-hashFree(&nameSubstitutions);
-slFreeList(&subtreeIdList);
+hashFree(&subtreeMutHash);
 return ti;
 }
 
 static struct subtreeInfo *parseSubtrees(int subtreeCount, struct tempName *singleSubtreeTn,
                                          struct variantPathNode *singleSubtreeMuts,
                                          struct tempName *subtreeTns[],
                                          struct variantPathNode *subtreeMuts[],
-                                         struct slName *userSampleIds, struct hash *condensedNodes)
-/* Parse usher's subtree output, figure out which user samples are in each subtree, expand names
- * of condensed nodes.  Add parsed singleSubtree at head of list, followed by numbered subtrees. */
+                                         struct slName *userSampleIds)
+/* Parse usher's subtree output and figure out which user samples are in each subtree.
+ * Add parsed singleSubtree at head of list, followed by numbered subtrees. */
 {
 struct subtreeInfo *subtreeInfoList = NULL;
 int sIx;
 for (sIx = 0;  sIx < subtreeCount;  sIx++)
     {
     char subtreeName[512];
     safef(subtreeName, sizeof(subtreeName), "subtree%d", sIx+1);
     struct subtreeInfo *ti = parseOneSubtree(subtreeTns[sIx], subtreeName, subtreeMuts[sIx],
-                                             userSampleIds, condensedNodes);
+                                             userSampleIds);
     slAddHead(&subtreeInfoList, ti);
     }
 slReverse(&subtreeInfoList);
 struct subtreeInfo *ti = parseOneSubtree(singleSubtreeTn, "singleSubtree", singleSubtreeMuts,
-                                         userSampleIds, condensedNodes);
+                                         userSampleIds);
 slAddHead(&subtreeInfoList, ti);
 return subtreeInfoList;
 }
 
 static void parseClades(char *filename, struct hash *samplePlacements)
 /* Parse usher's clades.txt, which might have {sample, clade} or {sample, clade, lineage}. */
 {
 struct hash *wordStore = hashNew(0);
 struct lineFile *lf = lineFileOpen(filename, TRUE);
 char *line;
 while (lineFileNext(lf, &line, NULL))
     {
     char *words[3];
     int wordCount = chopTabs(line, words);
     char *sampleId = words[0];
     struct placementInfo *info = hashFindVal(samplePlacements, sampleId);
     if (!info)
         errAbort("parseClades: can't find placementInfo for sample '%s'", sampleId);
     if (wordCount > 1)
         {
         // Nextstrain's clade "20E (EU1)" has to be tweaked to "20E.EU1" for matUtils to avoid
         // whitespace trouble; tweak it back.
         if (sameString(words[1], "20E.EU1"))
             words[1] = "20E (EU1)";
+        // Chop extra chars in usher-sampled clades.txt output
+        char *ptr = strstr(words[1], "*|");
+        if (ptr)
+            *ptr = '\0';
         info->nextClade = hashStoreName(wordStore, words[1]);
         }
     if (wordCount > 2)
+        {
+        // Chop extra chars in usher-sampled clades.txt output
+        char *ptr = strstr(words[2], "*|");
+        if (ptr)
+            *ptr = '\0';
         info->pangoLineage = hashStoreName(wordStore, words[2]);
         }
+    }
 lineFileClose(&lf);
 }
 
 static char *dirPlusFile(struct dyString *dy, char *dir, char *file)
-/* Write dir/file into dy and return pointer to dy->string. */
+/* Write dir/file into dy and return pointer to dy->string.  Do not free result! */
 {
 dyStringClear(dy);
 dyStringPrintf(dy, "%s/%s", dir, file);
 return dy->string;
 }
 
+#define USHER_SAMPLED_TREE_PREFIX "tree-0-"
+
 static int processOutDirFiles(struct usherResults *results, char *outDir,
                               struct tempName **retSingleSubtreeTn,
                               struct variantPathNode **retSingleSubtreeMuts,
                               struct tempName *subtreeTns[], struct variantPathNode *subtreeMuts[],
                               int maxSubtrees)
-/* Get paths to files in outDir; parse them, move files that we'll keep up to trash/ct/,
- * remove outDir. */
+/* Get paths to files in outDir; parse them and move files that we'll keep up to trash/ct/,
+ * leaving behind files that we will remove when done. */
 {
 int subtreeCount = 0;
 memset(subtreeTns, 0, maxSubtrees * sizeof(*subtreeTns));
 memset(subtreeMuts, 0, maxSubtrees * sizeof(*subtreeMuts));
 struct dyString *dyScratch = dyStringNew(0);
 struct slName *outDirFiles = listDir(outDir, "*"), *file;
 for (file = outDirFiles;  file != NULL;  file = file->next)
     {
     char *path = dirPlusFile(dyScratch, outDir, file->name);
     if (sameString(file->name, "uncondensed-final-tree.nh"))
         {
         mustRename(path, results->bigTreePlusTn->forCgi);
         }
-    else if (sameString(file->name, "mutation-paths.txt"))
+    else if (sameString(file->name, "mutation-paths.txt") ||  // original usher
+             sameString(file->name, "mutation-paths-1.txt"))  // usher-sampled(-server)
         {
         parseVariantPaths(path, results->samplePlacements);
         }
-    else if (startsWith("subtree-", file->name))
+    else if (startsWith("subtree-", file->name) ||
+             startsWith(USHER_SAMPLED_TREE_PREFIX "subtree-", file->name))
         {
         char fnameCpy[strlen(file->name)+1];
+        if (startsWith(USHER_SAMPLED_TREE_PREFIX, file->name))
+            safecpy(fnameCpy, sizeof fnameCpy, file->name + strlen(USHER_SAMPLED_TREE_PREFIX));
+        else
             safecpy(fnameCpy, sizeof fnameCpy, file->name);
         char *parts[4];
         int partCount = chopString(fnameCpy, "-", parts, ArraySize(parts));
         if (partCount == 2)
             {
             // Expect "subtree-N.nh"
             char *dot = strstr(parts[1], ".nh");
             if (dot)
                 {
                 *dot = '\0';
                 int subtreeIx = atol(parts[1]) - 1;
+                if (subtreeIx < 0)
+                    errAbort("processOutDirFiles: expected subtree file names to have numbers "
+                             "starting with 1, but found one with 0 (%s)", file->name);
                 if (subtreeIx >= maxSubtrees)
                     errAbort("Too many subtrees in usher output (max %d)", maxSubtrees);
                 if (subtreeIx >= subtreeCount)
                     subtreeCount = subtreeIx + 1;
                 char sname[32];
                 safef(sname, sizeof sname, "subtree%d", subtreeIx+1);
                 AllocVar(subtreeTns[subtreeIx]);
                 trashDirFile(subtreeTns[subtreeIx], "ct", sname, ".nwk");
                 mustRename(path, subtreeTns[subtreeIx]->forCgi);
                 }
             else
                 warn("Unexpected filename '%s' from usher, ignoring", file->name);
             }
         else if (partCount == 3)
             {
             // Expect "subtree-N-mutations.txt" or "subtree-N-expanded.txt"
             int subtreeIx = atol(parts[1]) - 1;
+            if (subtreeIx < 0)
+                errAbort("processOutDirFiles: expected subtree file names to have numbers "
+                         "starting with 1, but found one with 0 (%s)", file->name);
             if (subtreeIx >= maxSubtrees)
                 errAbort("Too many subtrees in usher output (max %d)", maxSubtrees);
             if (subtreeIx >= subtreeCount)
                 subtreeCount = subtreeIx + 1;
             if (sameString(parts[2], "mutations.txt"))
                 {
                 subtreeMuts[subtreeIx] = parseSubtreeMutations(path);
                 }
             else if (sameString(parts[2], "expanded.txt"))
                 {
                 // Don't need this, just remove it
                 }
             else
                 warn("Unexpected filename '%s' from usher, ignoring", file->name);
             }
         else
             warn("Unexpected filename '%s' from usher, ignoring", file->name);
         }
-    else if (startsWith("single-subtree", file->name))
+    else if (startsWith("single-subtree", file->name) ||
+             startsWith(USHER_SAMPLED_TREE_PREFIX "single-subtree", file->name))
         {
         // One subtree to bind them all
         char fnameCpy[strlen(file->name)+1];
+        if (startsWith(USHER_SAMPLED_TREE_PREFIX, file->name))
+            safecpy(fnameCpy, sizeof fnameCpy, file->name + strlen(USHER_SAMPLED_TREE_PREFIX));
+        else
             safecpy(fnameCpy, sizeof fnameCpy, file->name);
         char *parts[4];
         int partCount = chopString(fnameCpy, "-", parts, ArraySize(parts));
         if (partCount == 2)
             {
             // Expect single-subtree.nh
             if (!endsWith(parts[1], ".nh"))
                 warn("Unexpected filename '%s' from usher, ignoring", file->name);
             else if (retSingleSubtreeTn)
                 {
                 struct tempName *tn;
                 AllocVar(tn);
                 trashDirFile(tn, "ct", "subtree", ".nwk");
                 mustRename(path, tn->forCgi);
                 *retSingleSubtreeTn = tn;
                 }
             }
         else if (partCount == 3)
             {
             // Expect single-subtree-mutations.txt or single-subtree-expanded.txt
             if (sameString(parts[2], "mutations.txt"))
                 {
                 if (retSingleSubtreeMuts)
                     *retSingleSubtreeMuts = parseSubtreeMutations(path);
                 }
             else if (sameString(parts[2], "expanded.txt"))
                 {
                 // Don't need this, just remove it
                 }
             else
                 warn("Unexpected filename '%s' from usher, ignoring", file->name);
             }
         else
             warn("Unexpected filename '%s' from usher, ignoring", file->name);
         }
     else if (sameString(file->name, "clades.txt"))
         {
         parseClades(path, results->samplePlacements);
         }
     else if (sameString(file->name, "final-tree.nh") ||
+             sameString(file->name, "current-tree.nh") ||
              sameString(file->name, "placement_stats.tsv"))
         {
-        // Don't need this, just remove it.
+        // Don't need this (or already parsed it elsewhere not here), just remove it.
         }
     else
         warn("Unexpected filename '%s' from usher, ignoring", file->name);
-    unlink(path);
     }
-rmdir(outDir);
+dyStringFree(&dyScratch);
 // Make sure we got a complete range of subtrees [0..subtreeCount-1]
 int i;
 for (i = 0;  i < subtreeCount;  i++)
     {
     if (subtreeTns[i] == NULL)
         errAbort("Missing file subtree-%d.nh in usher results", i+1);
     if (subtreeMuts[i] == NULL)
         errAbort("Missing file subtree-%d-mutations.txt in usher results", i+1);
     }
 return subtreeCount;
 }
 
+static void removeOutDir(char *outDir)
+/* Remove outDir and its files. */
+{
+struct slName *outDirFiles = listDir(outDir, "*"), *file;
+struct dyString *dyScratch = dyStringNew(0);
+for (file = outDirFiles;  file != NULL;  file = file->next)
+    {
+    char *path = dirPlusFile(dyScratch, outDir, file->name);
+    unlink(path);
+    }
+dyStringFree(&dyScratch);
+rmdir(outDir);
+}
+
+static void runUsherCommand(char *cmd[], char *stderrFile, int *pStartTime)
+/* Run the standalone usher command with its stderr output redirected to stderrFile. */
+{
+char **cmds[] = { cmd, NULL };
+struct pipeline *pl = pipelineOpen(cmds, pipelineRead, NULL, stderrFile, 0);
+pipelineClose(&pl);
+reportTiming(pStartTime, "run usher");
+}
+
+boolean serverIsConfigured(char *db)
+/* Return TRUE if all necessary configuration settings are in place to run usher-sampled-server. */
+{
+char *serverDir = cfgOption("hgPhyloPlaceServerDir");
+if (isNotEmpty(serverDir))
+    {
+    char *usherServerEnabled = phyloPlaceDbSetting(db, "usherServerEnabled");
+    if (isNotEmpty(usherServerEnabled) && SETTING_IS_ON(usherServerEnabled) &&
+        fileExists(PHYLOPLACE_DATA_DIR "/usher-sampled-server"))
+        {
+        return TRUE;
+        }
+    }
+return FALSE;
+}
+
+static char *getUsherServerFilePath(char *db, char *file)
+/* Alloc & return the path to special server file in $hgPhyloPlaceServerDir/$db/ . */
+{
+char *serverDir = cfgOption("hgPhyloPlaceServerDir");
+// We need host name in order to prevent clashes between hgw1 and hgw2 sharing the same hg.conf
+// contents and NFS filesystem.  And not the vhost name from getHost(), the real base host name.
+char host[PATH_LEN];
+int ret = gethostname(host, sizeof host);
+if (ret != 0)
+    safecpy(host, sizeof host, "hostunknown");
+char *p = strchr(host, '.');
+if (p)
+    *p = '\0';
+struct dyString *dyPath = dyStringCreate("%s/%s/%s.%s",
+                                         serverDir, trackHubSkipHubName(db), host, file);
+return dyStringCannibalize(&dyPath);
+}
+
+static char *getUsherServerMfifoPath(char *db)
+/* Alloc & return path to server manager fifo file for db. */
+{
+return getUsherServerFilePath(db, "server.fifo");
+}
+
+static char *getUsherServerSocketPath(char *db)
+/* Alloc & return path to server socket file for db. */
+{
+return getUsherServerFilePath(db, "server.socket");
+}
+
+static char *getUsherServerLogPath(char *db)
+/* Alloc & return path to server log file (very important because it tells us the pid) for db. */
+{
+return getUsherServerFilePath(db, "server.log");
+}
+
+boolean serverIsRunning(char *db, FILE *errFile)
+/* Return TRUE if logFile exists and its first line specifies a pid as expected from daemonishSpawn,
+ * and that pid looks alive according to /proc. */
+{
+boolean pidIsLive = FALSE;
+char *logFile = getUsherServerLogPath(db);
+struct lineFile *lf = lineFileMayOpen(logFile, TRUE);
+if (lf)
+    {
+    char *line;
+    if (lineFileNext(lf, &line, NULL))
+        {
+        if (isAllDigits(line))
+            {
+            int pid = atol(line);
+            if (pid > 1)
+                {
+                char procStatus[PATH_LEN];
+                safef(procStatus, sizeof procStatus, "/proc/%d/status", pid);
+                struct lineFile *psLf = lineFileMayOpen(procStatus, TRUE);
+                if (psLf)
+                    {
+                    const static char *statePrefix ="State:\t";
+                    boolean foundState = FALSE;
+                    while (lineFileNext(psLf, &line, NULL))
+                        {
+                        if (startsWith(statePrefix, line))
+                            {
+                            foundState = TRUE;
+                            char state = *(line + strlen(statePrefix));
+                            // State Running, Sleep or Deep (uninterruptible) sleep is OK
+                            if (state == 'R' || state == 'S' || state == 'D')
+                                {
+                                pidIsLive = TRUE;
+                                fprintf(errFile, "serverIsRunning: Found pid %d with state %c.\n",
+                                        pid, state);
+                                }
+                            else
+                                {
+                                // State sTopped or Zombie not OK
+                                fprintf(errFile, "serverIsRunning: WARNING: pid %d has state %c.\n",
+                                        pid, state);
+                                }
+                            }
+                        }
+                    lineFileClose(&psLf);
+                    if (!foundState)
+                        fprintf(errFile,
+                                "serverIsRunning: No line of %s begins with '%s', "
+                                "can't find process state.\n",
+                                procStatus, statePrefix);
+                    }
+                else
+                    fprintf(errFile, "serverIsRunning: Can't open %s, looks like server is dead.\n",
+                            procStatus);
+                }
+            else
+                fprintf(errFile, "serverIsRunning: expected number greater than 1 (pid) in "
+                        "first line of log file %s, but got '%s'\n", logFile, line);
+            }
+        else
+            {
+            fprintf(errFile, "serverIsRunning: First line of log file %s has unexpected format '%s' "
+                    "(expected to begin with pid)\n",
+                    logFile, line);
+            }
+        }
+    else
+        fprintf(errFile, "serverIsRunning: log file '%s' appears empty\n", logFile);
+    lineFileClose(&lf);
+    }
+else
+    fprintf(errFile, "serverIsRunning: Can't open log file '%s'\n", logFile);
+return pidIsLive;
+}
+
+
+//******************* BEGIN copied from lib/pipeline.c, consider libifying these ***************
+static void closeNonStdDescriptors(void)
+/* close non-standard file descriptors */
+{
+long maxFd = sysconf(_SC_OPEN_MAX);
+if (maxFd < 0)
+    maxFd = 4096;  // shouldn't really happen
+int fd;
+for (fd = STDERR_FILENO+1; fd < maxFd; fd++)
+    close(fd);
+}
+
+static int openRead(char *fname)
+/* open a file for reading */
+{
+int fd = open(fname, O_RDONLY);
+if (fd < 0)
+    errnoAbort("can't open for read access: %s", fname);
+return fd;
+}
+
+static int openWrite(char *fname, boolean append)
+/* open a file for write access */
+{
+int flags = O_WRONLY|O_CREAT;
+if (append)
+    flags |= O_APPEND;
+else
+    flags |= O_TRUNC;
+int fd = open(fname, flags, 0666);
+if (fd < 0)
+    errnoAbort("can't open for write access: %s", fname);
+return fd;
+}
+//******************* END copied from lib/pipeline.c, consider libifying these ***************
+
+
+static void daemonishInit(int newStderr)
+/* Isolate a double-fork-spawned process from its parent processes by closing file descriptors,
+ * setting stderr to errlogFile, resetting umask etc.
+ * See "man 7 daemon".  This function is "daemonish" because I'm not doing 100% of the recommended
+ * initialization things from "man 7 daemon", because I'm not running a Linux system service,
+ * just a program that needs to remain running in the background after this CGI exits.
+ * This errAborts if setting stdin, stdout or stderr fails, and ignores any other failures. */
+{
+// Set stdin and stdout to /dev/null, stderr to errlogFile, close all other possibly open fds.
+int newStdin = openRead("/dev/null");
+if (dup2(newStdin, STDIN_FILENO) < 0)
+    errnoAbort("daemonishInit: can't dup2 to stdin");
+int newStdout = openWrite("/dev/null", TRUE);
+if (dup2(newStdout, STDOUT_FILENO) < 0)
+    errnoAbort("daemonishInit: can't dup2 to stdout");
+if (dup2(newStderr, STDERR_FILENO) < 0)
+    errnoAbort("daemonishInit: can't dup2 to stderr");
+closeNonStdDescriptors();
+
+// Reset all signal handlers to their default.
+int ix;
+for (ix = 0;  ix <= SIGRTMAX;  ix++)
+    signal(ix, SIG_DFL);
+
+// Reset the signal mask
+sigset_t sigset;
+sigemptyset(&sigset);
+sigprocmask(SIG_SETMASK, &sigset, NULL);
+
+// Reset the umask
+umask(0);
+}
+
+int daemonishSpawn(char *cmd[], char *errlogFile)
+/* Use a double fork & setgid to create a process with parent process group = initd so it has
+ * no controlling terminal and so the CGI won't become a zombie process when done.
+ * See "man 7 daemon".  This function is "daemonish" because I'm not doing 100% of the recommended
+ * initialization things from "man 7 daemon", because I'm not running a Linux system service,
+ * just a program that needs to remain running in the background after this CGI exits.
+ * Double-fork grandchild execs cmd, uses /dev/null for stdin and stdout, and sends stderr
+ * to errlogFile -- after first writing its pid to errlogFile so that later processes can use
+ * errlogFile to check whether the spawned process is still running.
+ * Calling process/first fork parent returns result of first fork (pid for success, -1 for error) */
+{
+int fork1 = fork();
+if (fork1 < 0)
+    errnoWarn("first fork failed (%d)", fork1);
+else if (fork1 == 0)
+    {
+    // First fork child will setsid and fork again
+    if (setsid() < 0)
+        errnoAbort("setsid failed; first fork child errAborting");
+    else
+        {
+        int fork2 = fork();
+        if (fork2 < 0)
+            errnoAbort("second fork failed (%d); first fork child errAborting", fork2);
+        else if (fork2 > 0)
+            {
+            // First fork child / second fork parent is all done. Use _exit not exit so we
+            // don't close mysql connections etc.
+            _exit(EXIT_SUCCESS);
+            }
+        else
+            {
+            // Second fork child sets stdin and stdout to /dev/null, stderr to errLogFile (after
+            // writing pid), then becomes cmd using execvp.
+            int newStderr = openWrite(errlogFile, FALSE);
+            char pidBuf[16];
+            safef(pidBuf, sizeof pidBuf, "%llu\n", (long long unsigned)getpid());
+            write(newStderr, pidBuf, strlen(pidBuf));
+            daemonishInit(newStderr);
+            execvp(cmd[0], cmd);
+            }
+        }
+    }
+return fork1;
+}
+
+boolean startServer(char *db, struct treeChoices *treeChoices, FILE *errFile)
+/* Start up an usher-sampled-server process to run in the background. */
+{
+boolean success = FALSE;
+if (serverIsConfigured(db))
+    {
+    char *serverDir = cfgOption("hgPhyloPlaceServerDir");
+    if (! fileExists(serverDir))
+        makeDir(serverDir);
+    char path[PATH_LEN];
+    safef(path, sizeof path, "%s/%s", serverDir, db);
+    if (! fileExists(path))
+        makeDir(path);
+
+    char *usherServerPath = PHYLOPLACE_DATA_DIR "/usher-sampled-server";
+    // Each protobuf file from treeChoices must appear as a separate arg after -l,
+    // so we have to dynamically build up serverCmd, but can use a static for the first args.
+    char *serverCmdBase[] = { usherServerPath,
+                              "-m", getUsherServerMfifoPath(db),
+                              "-s", getUsherServerSocketPath(db),
+                              "-T", USHER_NUM_THREADS,
+                              "-t", USHER_SERVER_CHILD_TIMEOUT,
+                              "-l" };
+    size_t serverCmdBaseSize = ArraySize(serverCmdBase);
+    size_t serverCmdSize = serverCmdBaseSize + treeChoices->count + 1;
+    char *serverCmd[serverCmdSize];
+    int ix;
+    for (ix = 0;  ix < serverCmdBaseSize;   ix++)
+        serverCmd[ix] = serverCmdBase[ix];
+    for (ix = 0;  ix < treeChoices->count;  ix++)
+        serverCmd[serverCmdBaseSize + ix] = treeChoices->protobufFiles[ix];
+    serverCmd[serverCmdBaseSize + ix] = NULL;
+
+    fputs("Spawning server with command: ", errFile);
+    for (ix = 0;  serverCmd[ix] != NULL;  ix++)
+        {
+        fputc(' ', errFile);
+        fputs(serverCmd[ix], errFile);
+        }
+    fputc('\n', errFile);
+    success = (daemonishSpawn(serverCmd, getUsherServerLogPath(db)) > 0);
+    if (success)
+        fputs("Server spawned; first fork returned successfully.\n", errFile);
+    else
+        fputs("Server spawn failed at first fork!\n", errFile);
+    }
+else
+    errAbort("Usher server is not configured for %s, not starting", db);
+return success;
+}
+
+void serverReloadProtobufs(char *db, struct treeChoices *treeChoices)
+/* Send a reload command and list of protobufs for db to usher server. */
+{
+char *usherServerMfifoPath = getUsherServerMfifoPath(db);
+FILE *mf = fopen(usherServerMfifoPath, "a");
+if (mf)
+    {
+    fprintf(mf, "reload\n");
+    int ix;
+    for (ix = 0;  ix < treeChoices->count;  ix++)
+        fprintf(mf, "%s\n", treeChoices->protobufFiles[ix]);
+    fputc('\n', mf);
+    carefulClose(&mf);
+    }
+else
+    warn("serverReload: unable to open '%s', command not sent", usherServerMfifoPath);
+}
+
+void serverStop(char *db)
+/* Send stop command to usher server. */
+{
+char *usherServerMfifoPath = getUsherServerMfifoPath(db);
+FILE *mf = fopen(usherServerMfifoPath, "a");
+if (mf)
+    {
+    fprintf(mf, "stop\n");
+    carefulClose(&mf);
+    }
+else
+    warn("serverStop: unable to open '%s', command not sent", usherServerMfifoPath);
+}
+
+void serverSetThreadCount(char *db, int val)
+/* Send thread command and value to usher server. */
+{
+char *usherServerMfifoPath = getUsherServerMfifoPath(db);
+FILE *mf = fopen(usherServerMfifoPath, "a");
+if (mf)
+    {
+    if (val > 0)
+        fprintf(mf, "thread %d\n", val);
+    else
+        errAbort("Bad value %d passed to serverSetThreadCount, not sending", val);
+    carefulClose(&mf);
+    }
+else
+    warn("serverSetTimeout: unable to open '%s', command not sent", usherServerMfifoPath);
+}
+
+void serverSetTimeout(char *db, int val)
+/* Send timeout command and value (in seconds) to usher server. */
+{
+char *usherServerMfifoPath = getUsherServerMfifoPath(db);
+FILE *mf = fopen(usherServerMfifoPath, "a");
+if (mf)
+    {
+    if (val > 0)
+        fprintf(mf, "timeout %d\n", val);
+    else
+        errAbort("Bad value %d passed to serverSetTimeout, not sending", val);
+    carefulClose(&mf);
+    }
+else
+    warn("serverSetTimeout: unable to open '%s', command not sent", usherServerMfifoPath);
+}
+
+static int getServerSocket(char *db, struct treeChoices *treeChoices, FILE *errFile)
+/* Try to connect to server; attempt to restart server and then connect if it seems like the server
+ * is down.  Return -1 if unable to connect. */
+{
+int socketFd = socket(AF_UNIX, SOCK_STREAM, 0);
+if (socketFd < 0)
+    {
+    // OS-level failure, not anything we can do about it
+    fprintf(errFile, "Failed to create a UNIX socket: %s\n", strerror(errno));
+    }
+else
+    {
+    // From "man 7 unix":
+    // struct sockaddr_un {
+    //     sa_family_t sun_family;               /* AF_UNIX */
+    //     char        sun_path[UNIX_PATH_MAX];  /* pathname */
+    // };
+    struct sockaddr_un addr;
+    addr.sun_family=AF_UNIX;
+    char *usherServerSocketPath = getUsherServerSocketPath(db);
+    safecpy(addr.sun_path, sizeof(addr.sun_path), usherServerSocketPath);
+    int ret = connect(socketFd, (struct sockaddr *)&addr, sizeof(addr));
+    if (ret < 0)
+        {
+        fprintf(errFile, "Cannot connect socket %d to path '%s': %s\n",
+                socketFd, usherServerSocketPath, strerror(errno));
+        // Does the server need to be restarted, or (in case another web request prompted some
+        // other hgPhyloPlace process to restart it but it isn't quite up yet) should we wait a
+        // few seconds and try again?
+        if (serverIsRunning(db, errFile) ||
+            startServer(db, treeChoices, errFile))
+            {
+            sleep(5);
+            ret = connect(socketFd, (struct sockaddr *)&addr, sizeof(addr));
+            if (ret < 0)
+                {
+                // Give up - fall back on regular usher command
+                fprintf(errFile, "Second attempt to connect socket %d to path '%s' failed: %s\n",
+                        socketFd, usherServerSocketPath, strerror(errno));
+                socketFd = -1;
+                }
+            }
+        }
+    }
+return socketFd;
+}
+
+// Server sends ASCII EOT character (4) when done. Sadly I can't find a header file that defines EOT.
+#define EOT 4
+
+static boolean sendQuery(int socketFd, char *cmd[], char *db, struct treeChoices *treeChoices,
+                         FILE *errFile)
+/* Send command to socket, read response on socket, return TRUE if we get a successful response. */
+{
+boolean success = FALSE;
+struct dyString *dyMessage = dyStringNew(0);
+int ix;
+for (ix = 0;  cmd[ix] != NULL;  ix++)
+    {
+    // Don't include args from -T onward; server rejects requests with -T or --optimization_radius
+    if (sameString("-T", cmd[ix]))
+        break;
+    dyStringPrintf(dyMessage, "%s\n", cmd[ix]);
+    }
+// But we do need --no-ignore-prefix:
+dyStringPrintf(dyMessage, "--no-ignore-prefix\n"USHER_DEDUP_PREFIX"\n");
+dyStringAppendC(dyMessage, '\n');
+boolean serverError = FALSE;
+int bytesWritten = write(socketFd, dyMessage->string, dyMessage->stringSize);
+if (bytesWritten == dyMessage->stringSize)
+    {
+    struct lineFile *lf = lineFileAttach("server socket", TRUE, socketFd);
+    if (lf)
+        {
+        char *line;
+        while (lineFileNext(lf, &line, NULL))
+            {
+            if (startsWith("Tree", line) && endsWith(line, "not found"))
+                {
+                // Tell the server to reload the latest protobufs
+                serverReloadProtobufs(getUsherServerMfifoPath(db), treeChoices);
+                // Reloading multiple trees takes a while, so fall back on standalone usher(-sampled)
+                serverError = TRUE;
+                // Continue reading output from server.
+                }
+            else if (line[0] == EOT)
+                {
+                success = ! serverError;
+                break;
+                }
+            else if (isNotEmpty(line))
+                fprintf(errFile, "%s\n", line);
+            }
+        }
+    else
+        fprintf(errFile, "Failed to attach linefile to socket %d.\n", socketFd);
+    }
+else
+    fprintf(errFile, "Failed to send query to socket %d: attempted to write %ld bytes, ret=%d\n",
+            socketFd, dyMessage->stringSize, bytesWritten);
+dyStringFree(&dyMessage);
+return success;
+}
+
+static boolean runUsherServer(char *db, char *cmd[], char *stderrFile,
+                              struct treeChoices *treeChoices, int *pStartTime)
+/* Start the server if necessary, connect to it, send a query, get response and return TRUE if.
+ * all goes well. If unsuccessful, write reasons to errFile and return FALSE. */
+{
+boolean success = FALSE;
+if (serverIsConfigured(db))
+    {
+    FILE *errFile = mustOpen(stderrFile, "w");
+    int serverSocket = getServerSocket(db, treeChoices, errFile);
+
+    reportTiming(pStartTime, "get socket");
+    if (serverSocket > 0)
+        {
+        success = sendQuery(serverSocket, cmd, db, treeChoices, errFile);
+        close(serverSocket);
+        reportTiming(pStartTime, "send query");
+        }
+    carefulClose(&errFile);
+    }
+return success;
+}
+
 #define MAX_SUBTREES 1000
 
-struct usherResults *runUsher(char *usherPath, char *usherAssignmentsPath, char *vcfFile,
-                              int subtreeSize, struct slName *userSampleIds,
-                              struct hash *condensedNodes, int *pStartTime)
+struct usherResults *runUsher(char *db, char *usherPath, char *usherAssignmentsPath, char *vcfFile,
+                              int subtreeSize, struct slName **pUserSampleIds,
+                              struct treeChoices *treeChoices, int *pStartTime)
 /* Open a pipe from Yatish Turakhia's usher program, save resulting big trees and
  * subtrees to trash files, return list of slRef to struct tempName for the trash files
- * and parse other results out of stderr output. */
+ * and parse other results out of stderr output.  The usher-sampled version of usher might
+ * modify userSampleIds, adding a prefix if a sample with the same name is already in the tree. */
 {
 struct usherResults *results = usherResultsNew();
 char subtreeSizeStr[16];
 safef(subtreeSizeStr, sizeof subtreeSizeStr, "%d", subtreeSize);
 struct tempName tnOutDir;
 trashDirFile(&tnOutDir, "ct", "usher_outdir", ".dir");
 char *cmd[] = { usherPath, "-v", vcfFile, "-i", usherAssignmentsPath, "-d", tnOutDir.forCgi,
-                "-k", subtreeSizeStr, "-K", SINGLE_SUBTREE_SIZE, "-T", USHER_NUM_THREADS, "-u",
+                "-k", subtreeSizeStr, "-K", SINGLE_SUBTREE_SIZE, "-u",
+                "-T", USHER_NUM_THREADS,       // Don't pass args from -T onward to server
+                "--optimization_radius", "0",  // Don't pass these to original usher, only -sampled
+                "--no-ignore-prefix", USHER_DEDUP_PREFIX,
                 NULL };
-char **cmds[] = { cmd, NULL };
 struct tempName tnStderr;
 trashDirFile(&tnStderr, "ct", "usher_stderr", ".txt");
-struct pipeline *pl = pipelineOpen(cmds, pipelineRead, NULL, tnStderr.forCgi, 0);
-pipelineClose(&pl);
-reportTiming(pStartTime, "run usher");
-parseStderr(tnStderr.forCgi, results->samplePlacements);
+struct tempName tnServerStderr;
+trashDirFile(&tnServerStderr, "ct", "usher_server_stderr", ".txt");
+char *stderrFile = tnServerStderr.forCgi;
+if (! runUsherServer(db, cmd, tnServerStderr.forCgi, treeChoices, pStartTime))
+    {
+    if (!endsWith(usherPath, "-sampled"))
+        {
+        // Truncate cmd for original usher: remove usher-sampled-specific option
+        int ix = stringArrayIx("--optimization_radius", cmd, ArraySize(cmd)-1);
+        if (ix > 0)
+            cmd[ix] = NULL;
+        }
+    runUsherCommand(cmd, tnStderr.forCgi, pStartTime);
+    stderrFile = tnStderr.forCgi;
+    }
 
 struct tempName *singleSubtreeTn = NULL, *subtreeTns[MAX_SUBTREES];
 struct variantPathNode *singleSubtreeMuts = NULL, *subtreeMuts[MAX_SUBTREES];
-parsePlacements(tnOutDir.forCgi, tnStderr.forCgi, results->samplePlacements);
+parsePlacements(tnOutDir.forCgi, stderrFile, results->samplePlacements, pUserSampleIds);
 int subtreeCount = processOutDirFiles(results, tnOutDir.forCgi, &singleSubtreeTn,
                                       &singleSubtreeMuts, subtreeTns, subtreeMuts, MAX_SUBTREES);
-if (singleSubtreeTn == NULL)
+if (singleSubtreeTn)
     {
-    warn("Sorry, there was a problem running usher.  "
-         "Please ask genome-www@soe.ucsc.edu to take a look at %s.", tnStderr.forCgi);
-    return NULL;
-    }
     results->subtreeInfoList = parseSubtrees(subtreeCount, singleSubtreeTn, singleSubtreeMuts,
-                                         subtreeTns, subtreeMuts, userSampleIds, condensedNodes);
+                                             subtreeTns, subtreeMuts, *pUserSampleIds);
     results->singleSubtreeInfo = results->subtreeInfoList;
     results->subtreeInfoList = results->subtreeInfoList->next;
+    removeOutDir(tnOutDir.forCgi);
+    }
+else
+    {
+    results = NULL;
+    warn("Sorry, there was a problem running usher.  "
+         "Please ask genome-www@soe.ucsc.edu to take a look at %s.", stderrFile);
+    }
+reportTiming(pStartTime, "parse results from usher");
 return results;
 }
 
 static void addEmptyPlacements(struct slName *sampleIds, struct hash *samplePlacements)
 /* Parsing an usher-style clades.txt file from matUtils extract requires samplePlacements to
  * have placementInfo for each sample.  When running usher, those are added when we parse
  * usher stderr; when running matUtils, just allocate one for each sample. */
 {
 struct slName *sample;
 for (sample = sampleIds;  sample != NULL;  sample = sample->next)
     {
     struct placementInfo *info;
     AllocVar(info);
     hashAdd(samplePlacements, sample->name, info);
     info->sampleId = cloneString(sample->name);
     }
 }
 
 struct usherResults *runMatUtilsExtractSubtrees(char *matUtilsPath, char *protobufPath,
                                                 int subtreeSize, struct slName *sampleIds,
-                                                struct hash *condensedNodes, int *pStartTime)
+                                                int *pStartTime)
 /* Open a pipe from Yatish Turakhia and Jakob McBroome's matUtils extract to extract subtrees
  * containing sampleIds, save resulting subtrees to trash files, return subtree results.
  * Caller must ensure that sampleIds are names of leaves in the protobuf tree. */
 {
 struct usherResults *results = usherResultsNew();
 char subtreeSizeStr[16];
 safef(subtreeSizeStr, sizeof subtreeSizeStr, "%d", subtreeSize);
 char *numThreadsStr = "16";
 struct tempName tnSamples;
 trashDirFile(&tnSamples, "ct", "matUtilsExtractSamples", ".txt");
 FILE *f = mustOpen(tnSamples.forCgi, "w");
 struct slName *sample;
 for (sample = sampleIds;  sample != NULL;  sample = sample->next)
     fprintf(f, "%s\n", sample->name);
 carefulClose(&f);
 struct tempName tnOutDir;
 trashDirFile(&tnOutDir, "ct", "matUtils_outdir", ".dir");
 char *cmd[] = { matUtilsPath, "extract", "-i", protobufPath, "-d", tnOutDir.forCgi,
                 "-s", tnSamples.forCgi,
                 "-x", subtreeSizeStr, "-X", SINGLE_SUBTREE_SIZE, "-T", numThreadsStr,
                 "--usher-clades-txt", NULL };
 char **cmds[] = { cmd, NULL };
 struct tempName tnStderr;
 trashDirFile(&tnStderr, "ct", "matUtils_stderr", ".txt");
 struct pipeline *pl = pipelineOpen(cmds, pipelineRead, NULL, tnStderr.forCgi, 0);
 pipelineClose(&pl);
 reportTiming(pStartTime, "run matUtils");
 addEmptyPlacements(sampleIds, results->samplePlacements);
 struct tempName *singleSubtreeTn = NULL, *subtreeTns[MAX_SUBTREES];
 struct variantPathNode *singleSubtreeMuts = NULL, *subtreeMuts[MAX_SUBTREES];
 int subtreeCount = processOutDirFiles(results, tnOutDir.forCgi, &singleSubtreeTn, &singleSubtreeMuts,
                                       subtreeTns, subtreeMuts, MAX_SUBTREES);
 results->subtreeInfoList = parseSubtrees(subtreeCount, singleSubtreeTn, singleSubtreeMuts,
-                                         subtreeTns, subtreeMuts, sampleIds, condensedNodes);
+                                         subtreeTns, subtreeMuts, sampleIds);
 results->singleSubtreeInfo = results->subtreeInfoList;
 results->subtreeInfoList = results->subtreeInfoList->next;
+reportTiming(pStartTime, "process results from matUtils");
 return results;
 }