src/hg/hgPhyloPlace/runUsher.c a2a68e8ad4ff5927b604b27898c183a519c9e5f4

a2a68e8ad4ff5927b604b27898c183a519c9e5f4
angie
  Tue Feb 23 20:08:34 2021 -0800
Instead of replacing small subtrees (usher -k) with the new single subtree option (-K), make both (unless a large number of sequences are uploaded; then just -K).

diff --git src/hg/hgPhyloPlace/runUsher.c src/hg/hgPhyloPlace/runUsher.c
index 5cd7aa7..4903c14 100644
--- src/hg/hgPhyloPlace/runUsher.c
+++ src/hg/hgPhyloPlace/runUsher.c
@@ -581,77 +581,95 @@
 {
 struct variantPathNode *nodeMuts = slPopHead(pNodeMutList);
 if (! nodeMuts)
     errAbort("addMutationsToTree: subtree mutation list has fewer nodes than subtree");
 if (node->ident->name && ! sameString(nodeMuts->nodeName, node->ident->name))
     errAbort("addMutationsToTree: subtree node name is '%s' but subtree mutation list item is '%s'",
              node->ident->name, nodeMuts->nodeName);
 if (node->priv != NULL)
     errAbort("addMutationsToTree: node already has mutations assigned");
 node->priv = nodeMuts->sncList;
 int i;
 for (i = 0;  i < node->numEdges;  i++)
     addMutationsToTree(node->edges[i], pNodeMutList);
 }
 
-static struct subtreeInfo *parseSubtrees(int subtreeCount, struct tempName *subtreeTns[],
-                                         struct variantPathNode *subtreeMuts[],
+static struct subtreeInfo *parseOneSubtree(struct tempName *subtreeTn,
+                                           struct variantPathNode *subtreeMuts,
                                            struct slName *userSampleIds, struct hash *condensedNodes)
-/* Parse usher's subtree output, figure out which user samples are in each subtree, expand names
- * of condensed nodes, and write auspice json. */
-{
-struct subtreeInfo *subtreeInfoList = NULL;
-int sIx;
-for (sIx = 0;  sIx < subtreeCount;  sIx++)
+/* Parse usher's subtree output, figure out which user samples are in subtree and expand names
+ * of condensed nodes. */
 {
 struct subtreeInfo *ti;
 AllocVar(ti);
-    ti->subtreeTn = subtreeTns[sIx];
+ti->subtreeTn = subtreeTn;
 ti->subtree = phyloOpenTree(ti->subtreeTn->forCgi);
-    addMutationsToTree(ti->subtree, &subtreeMuts[sIx]);
-    if (subtreeMuts[sIx] != NULL)
+addMutationsToTree(ti->subtree, &subtreeMuts);
+if (subtreeMuts != NULL)
     errAbort("addMutationsToTree: subtreeMutationList has more nodes than subtree");
 struct slName *subtreeIdList = phyloLeafNames(ti->subtree);
 // Don't do name substitutions on condensed node names in subtreeIdToIx since the IDs have to
 // match those in the original tree from protobuf.
 ti->subtreeIdToIx = slNameListToIxHash(subtreeIdList);
 ti->subtreeUserSampleIds = getSubtreeSampleIds(userSampleIds, ti->subtreeIdToIx);
 if (slCount(ti->subtreeUserSampleIds) == 0)
     errAbort("No user sample IDs found in subtree file %s", ti->subtreeTn->forCgi);
 // Substitute in nicer node names for condensed nodes for displaying to the user in
 // custom tracks and Nextstrain/auspice JSON.
 struct hash *nameSubstitutions = expandCondensedNodeNames(condensedNodes, subtreeIdList);
 if (nameSubstitutions->elCount > 0)
     ti->subtreeTn = substituteTreeNames(ti->subtree, nameSubstitutions);
 ti->subtreeNameList = substituteNameList(subtreeIdList, nameSubstitutions);
-    slAddHead(&subtreeInfoList, ti);
 hashFree(&nameSubstitutions);
 slFreeList(&subtreeIdList);
+return ti;
+}
+
+static struct subtreeInfo *parseSubtrees(int subtreeCount, struct tempName *singleSubtreeTn,
+                                         struct variantPathNode *singleSubtreeMuts,
+                                         struct tempName *subtreeTns[],
+                                         struct variantPathNode *subtreeMuts[],
+                                         struct slName *userSampleIds, struct hash *condensedNodes)
+/* Parse usher's subtree output, figure out which user samples are in each subtree, expand names
+ * of condensed nodes.  Add parsed singleSubtree at head of list, followed by numbered subtrees. */
+{
+struct subtreeInfo *subtreeInfoList = NULL;
+int sIx;
+for (sIx = 0;  sIx < subtreeCount;  sIx++)
+    {
+    struct subtreeInfo *ti = parseOneSubtree(subtreeTns[sIx], subtreeMuts[sIx], userSampleIds,
+                                             condensedNodes);
+    slAddHead(&subtreeInfoList, ti);
     }
 slReverse(&subtreeInfoList);
+struct subtreeInfo *ti = parseOneSubtree(singleSubtreeTn, singleSubtreeMuts, userSampleIds,
+                                         condensedNodes);
+slAddHead(&subtreeInfoList, ti);
 return subtreeInfoList;
 }
 
 static char *dirPlusFile(struct dyString *dy, char *dir, char *file)
 /* Write dir/file into dy and return pointer to dy->string. */
 {
 dyStringClear(dy);
 dyStringPrintf(dy, "%s/%s", dir, file);
 return dy->string;
 }
 
 static int processOutDirFiles(struct usherResults *results, char *outDir,
+                              struct tempName **retSingleSubtreeTn,
+                              struct variantPathNode **retSingleSubtreeMuts,
                               struct tempName *subtreeTns[], struct variantPathNode *subtreeMuts[],
                               int maxSubtrees)
 /* Get paths to files in outDir; parse them, move files that we'll keep up to trash/ct/,
  * remove outDir. */
 {
 int subtreeCount = 0;
 memset(subtreeTns, 0, maxSubtrees * sizeof(*subtreeTns));
 memset(subtreeMuts, 0, maxSubtrees * sizeof(*subtreeMuts));
 struct dyString *dyScratch = dyStringNew(0);
 struct slName *outDirFiles = listDir(outDir, "*"), *file;
 for (file = outDirFiles;  file != NULL;  file = file->next)
     {
     char *path = dirPlusFile(dyScratch, outDir, file->name);
     if (sameString(file->name, "uncondensed-final-tree.nh"))
         {
@@ -700,60 +718,63 @@
                 {
                 subtreeMuts[subtreeIx] = parseSubtreeMutations(path);
                 }
             else if (sameString(parts[2], "expanded.txt"))
                 {
                 // Don't need this, just remove it
                 }
             else
                 warn("Unexpected filename '%s' from usher, ignoring", file->name);
             }
         else
             warn("Unexpected filename '%s' from usher, ignoring", file->name);
         }
     else if (startsWith("single-subtree", file->name))
         {
-        // We have a single subtree, not subtree-N-* files
-        int subtreeIx = 0;
-        subtreeCount = 1;
+        // One subtree to bind them all
         char fnameCpy[strlen(file->name)+1];
         safecpy(fnameCpy, sizeof fnameCpy, file->name);
         char *parts[4];
         int partCount = chopString(fnameCpy, "-", parts, ArraySize(parts));
         if (partCount == 2)
             {
             // Expect single-subtree.nh
             if (!endsWith(parts[1], ".nh"))
                 warn("Unexpected filename '%s' from usher, ignoring", file->name);
-            else
+            else if (retSingleSubtreeTn)
                 {
-                AllocVar(subtreeTns[subtreeIx]);
-                trashDirFile(subtreeTns[subtreeIx], "ct", "subtree", ".nwk");
-                mustRename(path, subtreeTns[subtreeIx]->forCgi);
+                struct tempName *tn;
+                AllocVar(tn);
+                trashDirFile(tn, "ct", "subtree", ".nwk");
+                mustRename(path, tn->forCgi);
+                *retSingleSubtreeTn = tn;
                 }
             }
         else if (partCount == 3)
             {
             // Expect single-subtree-mutations.txt or single-subtree-expanded.txt
             if (sameString(parts[2], "mutations.txt"))
                 {
-                subtreeMuts[subtreeIx] = parseSubtreeMutations(path);
+                if (retSingleSubtreeMuts)
+                    *retSingleSubtreeMuts = parseSubtreeMutations(path);
                 }
             else if (sameString(parts[2], "expanded.txt"))
                 {
                 // Don't need this, just remove it
                 }
+            else
+                warn("Unexpected filename '%s' from usher, ignoring", file->name);
             }
         else
             warn("Unexpected filename '%s' from usher, ignoring", file->name);
         }
     else if (sameString(file->name, "final-tree.nh"))
         {
         // Don't need this, just remove it.
         }
     else
         warn("Unexpected filename '%s' from usher, ignoring", file->name);
     unlink(path);
     }
 rmdir(outDir);
 // Make sure we got a complete range of subtrees [0..subtreeCount-1]
 int i;
@@ -771,33 +792,35 @@
 
 struct usherResults *runUsher(char *usherPath, char *usherAssignmentsPath, char *vcfFile,
                               int subtreeSize, struct slName *userSampleIds,
                               struct hash *condensedNodes, int *pStartTime)
 /* Open a pipe from Yatish Turakhia's usher program, save resulting big trees and
  * subtrees to trash files, return list of slRef to struct tempName for the trash files
  * and parse other results out of stderr output. */
 {
 struct usherResults *results = usherResultsNew();
 char subtreeSizeStr[16];
 safef(subtreeSizeStr, sizeof subtreeSizeStr, "%d", subtreeSize);
 char *numThreadsStr = "16";
 struct tempName tnOutDir;
 trashDirFile(&tnOutDir, "ct", "usher_outdir", ".dir");
 char *cmd[] = { usherPath, "-v", vcfFile, "-i", usherAssignmentsPath, "-d", tnOutDir.forCgi,
-                "-K", subtreeSizeStr, "-T", numThreadsStr, "-u", "-l", NULL };
+                "-k", subtreeSizeStr, "-K", "1000", "-T", numThreadsStr, "-u", "-l", NULL };
 char **cmds[] = { cmd, NULL };
 struct tempName tnStderr;
 trashDirFile(&tnStderr, "ct", "usher_stderr", ".txt");
 struct pipeline *pl = pipelineOpen(cmds, pipelineRead, NULL, tnStderr.forCgi);
 pipelineClose(&pl);
 reportTiming(pStartTime, "run usher");
 parseStderr(tnStderr.forCgi, results->samplePlacements);
 
-struct tempName *subtreeTns[MAX_SUBTREES];
-struct variantPathNode *subtreeMuts[MAX_SUBTREES];
-int subtreeCount = processOutDirFiles(results, tnOutDir.forCgi, subtreeTns, subtreeMuts,
-                                      MAX_SUBTREES);
-results->subtreeInfoList = parseSubtrees(subtreeCount, subtreeTns, subtreeMuts, userSampleIds,
-                                         condensedNodes);
+struct tempName *singleSubtreeTn = NULL, *subtreeTns[MAX_SUBTREES];
+struct variantPathNode *singleSubtreeMuts = NULL, *subtreeMuts[MAX_SUBTREES];
+int subtreeCount = processOutDirFiles(results, tnOutDir.forCgi, &singleSubtreeTn, &singleSubtreeMuts,
+                                      subtreeTns, subtreeMuts, MAX_SUBTREES);
+results->subtreeInfoList = parseSubtrees(subtreeCount, singleSubtreeTn, singleSubtreeMuts,
+                                         subtreeTns, subtreeMuts, userSampleIds, condensedNodes);
+results->singleSubtreeInfo = results->subtreeInfoList;
+results->subtreeInfoList = results->subtreeInfoList->next;
 return results;
 }