0b186effb2d2b536d2c280ec31bec6e153e6bd7e
angie
  Tue May 21 13:26:39 2024 -0700
Add support for uploaded names/IDs for the multi-ref/tree organism.ra case.  Add new config option anchorSamples and, if it is present, pass it to usher-sampled, matUtils and the usher-sampled server.
anchorSamples is a file with names of sequences that should always be included in the subtree to provide some larger-scale context, e.g. well-known vaccine or reference material strains.  This was requested by an influenza user.

diff --git src/hg/hgPhyloPlace/runUsher.c src/hg/hgPhyloPlace/runUsher.c
index 074ba4d..11401cc 100644
--- src/hg/hgPhyloPlace/runUsher.c
+++ src/hg/hgPhyloPlace/runUsher.c
@@ -714,31 +714,31 @@
 
 static void removeOutDir(char *outDir)
 /* Remove outDir and its files. */
 {
 struct slName *outDirFiles = listDir(outDir, "*"), *file;
 struct dyString *dyScratch = dyStringNew(0);
 for (file = outDirFiles;  file != NULL;  file = file->next)
     {
     char *path = dirPlusFile(dyScratch, outDir, file->name);
     unlink(path);
     }
 dyStringFree(&dyScratch);
 rmdir(outDir);
 }
 
-static void runUsherCommand(char *cmd[], char *stderrFile, int *pStartTime)
+static void runUsherCommand(char *cmd[], char *stderrFile, char *anchorFile, int *pStartTime)
 /* Run the standalone usher command with its stderr output redirected to stderrFile. */
 {
 char **cmds[] = { cmd, NULL };
 struct pipeline *pl = pipelineOpen(cmds, pipelineRead, NULL, stderrFile, 0);
 pipelineClose(&pl);
 reportTiming(pStartTime, "run usher command");
 }
 
 boolean serverIsConfigured(char *org)
 /* Return TRUE if all necessary configuration settings are in place to run usher-sampled-server. */
 {
 char *serverDir = cfgOption("hgPhyloPlaceServerDir");
 if (isNotEmpty(serverDir))
     {
     char *usherServerEnabled = phyloPlaceOrgSetting(org, "usherServerEnabled");
@@ -1140,46 +1140,48 @@
                 // Give up - fall back on regular usher command
                 fprintf(errFile, "Second attempt to connect socket %d to path '%s' failed: %s\n",
                         socketFd, usherServerSocketPath, strerror(errno));
                 socketFd = -1;
                 }
             }
         }
     }
 return socketFd;
 }
 
 // Server sends ASCII EOT character (4) when done. Sadly I can't find a header file that defines EOT.
 #define EOT 4
 
 static boolean sendQuery(int socketFd, char *cmd[], char *org, struct treeChoices *treeChoices,
-                         FILE *errFile, boolean addNoIgnorePrefix)
+                         FILE *errFile, boolean addNoIgnorePrefix, char *anchorFile)
 /* Send command to socket, read response on socket, return TRUE if we get a successful response. */
 {
 boolean success = FALSE;
 struct dyString *dyMessage = dyStringNew(0);
 int ix;
 for (ix = 0;  cmd[ix] != NULL;  ix++)
     {
     // Don't include args from -T onward; server rejects requests with -T or --optimization_radius
     if (sameString("-T", cmd[ix]))
         break;
     dyStringPrintf(dyMessage, "%s\n", cmd[ix]);
     }
 if (addNoIgnorePrefix)
     // Needed when placing uploaded sequences, but not when finding uploaded names
     dyStringPrintf(dyMessage, "--no-ignore-prefix\n"USHER_DEDUP_PREFIX"\n");
+if (isNotEmpty(anchorFile))
+    dyStringPrintf(dyMessage, "--anchor-samples\n%s\n", anchorFile);
 dyStringAppendC(dyMessage, '\n');
 boolean serverError = FALSE;
 int bytesWritten = write(socketFd, dyMessage->string, dyMessage->stringSize);
 if (bytesWritten == dyMessage->stringSize)
     {
     struct lineFile *lf = lineFileAttach("server socket", TRUE, socketFd);
     if (lf)
         {
         char *line;
         while (lineFileNext(lf, &line, NULL))
             {
             if (startsWith("Tree", line) && endsWith(line, "not found"))
                 {
                 // Tell the server to reload the latest protobufs
                 serverReloadProtobufs(getUsherServerMfifoPath(org), treeChoices);
@@ -1195,91 +1197,99 @@
             else if (isNotEmpty(line))
                 fprintf(errFile, "%s\n", line);
             }
         }
     else
         fprintf(errFile, "Failed to attach linefile to socket %d.\n", socketFd);
     }
 else
     fprintf(errFile, "Failed to send query to socket %d: attempted to write %ld bytes, ret=%d\n",
             socketFd, dyMessage->stringSize, bytesWritten);
 dyStringFree(&dyMessage);
 return success;
 }
 
 static boolean runUsherServer(char *org, char *cmd[], char *stderrFile,
-                              struct treeChoices *treeChoices, int *pStartTime)
+                              struct treeChoices *treeChoices, char *anchorFile, int *pStartTime)
 /* Start the server if necessary, connect to it, send a query, get response and return TRUE if.
  * all goes well. If unsuccessful, write reasons to errFile and return FALSE. */
 {
 boolean success = FALSE;
 if (serverIsConfigured(org))
     {
     FILE *errFile = mustOpen(stderrFile, "w");
     int serverSocket = getServerSocket(org, treeChoices, errFile);
 
     reportTiming(pStartTime, "get socket");
     if (serverSocket > 0)
         {
-        success = sendQuery(serverSocket, cmd, org, treeChoices, errFile, TRUE);
+        success = sendQuery(serverSocket, cmd, org, treeChoices, errFile, TRUE, anchorFile);
         close(serverSocket);
         if (success)
             reportTiming(pStartTime, "send query and get response (successful)");
         else
             reportTiming(pStartTime, "send query and get response (failed)");
         }
     carefulClose(&errFile);
     }
 return success;
 }
 
 #define MAX_SUBTREES 1000
 
 struct usherResults *runUsher(char *org, char *usherPath, char *usherAssignmentsPath, char *vcfFile,
                               int subtreeSize, struct slName **pUserSampleIds,
-                              struct treeChoices *treeChoices, int *pStartTime)
+                              struct treeChoices *treeChoices, char *anchorFile, int *pStartTime)
 /* Open a pipe from Yatish Turakhia's usher program, save resulting big trees and
  * subtrees to trash files, return list of slRef to struct tempName for the trash files
  * and parse other results out of stderr output.  The usher-sampled version of usher might
  * modify userSampleIds, adding a prefix if a sample with the same name is already in the tree. */
 {
 struct usherResults *results = usherResultsNew();
 char subtreeSizeStr[16];
 safef(subtreeSizeStr, sizeof subtreeSizeStr, "%d", subtreeSize);
 struct tempName tnOutDir;
 trashDirFile(&tnOutDir, "ct", "usher_outdir", ".dir");
 char *cmd[] = { usherPath, "-v", vcfFile, "-i", usherAssignmentsPath, "-d", tnOutDir.forCgi,
                 "-k", subtreeSizeStr, "-K", SINGLE_SUBTREE_SIZE, "-u",
                 "-T", USHER_NUM_THREADS,       // Don't pass args from -T onward to server
                 "--optimization_radius", "0",  // Don't pass these to original usher, only -sampled
                 "--no-ignore-prefix", USHER_DEDUP_PREFIX,
+                "--anchor-samples", anchorFile,
                 NULL };
 struct tempName tnStderr;
 trashDirFile(&tnStderr, "ct", "usher_stderr", ".txt");
 struct tempName tnServerStderr;
 trashDirFile(&tnServerStderr, "ct", "usher_server_stderr", ".txt");
 char *stderrFile = tnServerStderr.forCgi;
-if (! runUsherServer(org, cmd, tnServerStderr.forCgi, treeChoices, pStartTime))
+if (! runUsherServer(org, cmd, tnServerStderr.forCgi, treeChoices, anchorFile, pStartTime))
     {
     if (!endsWith(usherPath, "-sampled"))
         {
         // Truncate cmd for original usher: remove usher-sampled-specific option
         int ix = stringArrayIx("--optimization_radius", cmd, ArraySize(cmd)-1);
         if (ix > 0)
             cmd[ix] = NULL;
         }
-    runUsherCommand(cmd, tnStderr.forCgi, pStartTime);
+    else if (isEmpty(anchorFile))
+        {
+        // Don't pass --anchor-samples option unless it's configured
+        int ix = stringArrayIx("--anchor-samples", cmd, ArraySize(cmd)-1);
+        if (ix > 0)
+            cmd[ix] = NULL;
+        }
+    runUsherCommand(cmd, tnStderr.forCgi, anchorFile, pStartTime);
     stderrFile = tnStderr.forCgi;
     }
 
 struct tempName *singleSubtreeTn = NULL, *subtreeTns[MAX_SUBTREES];
 struct variantPathNode *singleSubtreeMuts = NULL, *subtreeMuts[MAX_SUBTREES];
 parsePlacements(tnOutDir.forCgi, stderrFile, results->samplePlacements, pUserSampleIds);
 int subtreeCount = processOutDirFiles(results, tnOutDir.forCgi, &singleSubtreeTn,
                                       &singleSubtreeMuts, subtreeTns, subtreeMuts, MAX_SUBTREES);
 if (singleSubtreeTn)
     {
     results->subtreeInfoList = parseSubtrees(subtreeCount, singleSubtreeTn, singleSubtreeMuts,
                                              subtreeTns, subtreeMuts, *pUserSampleIds);
     results->singleSubtreeInfo = results->subtreeInfoList;
     results->subtreeInfoList = results->subtreeInfoList->next;
     removeOutDir(tnOutDir.forCgi);
@@ -1299,100 +1309,108 @@
  * have placementInfo for each sample.  When running usher, those are added when we parse
  * placement_stats.tsv; when running matUtils, just allocate one for each sample. */
 {
 struct slName *sample;
 for (sample = sampleIds;  sample != NULL;  sample = sample->next)
     {
     struct placementInfo *info;
     AllocVar(info);
     hashAdd(samplePlacements, sample->name, info);
     info->sampleId = cloneString(sample->name);
     }
 }
 
 static void runMatUtilsExtractCommand(char *matUtilsPath, char *protobufPath,
                                       char *subtreeSizeStr, struct tempName *tnSamples,
-                                      struct tempName *tnOutDir, int *pStartTime)
+                                      struct tempName *tnOutDir, char *anchorFile, int *pStartTime)
 /* Open a pipe from Yatish Turakhia and Jakob McBroome's matUtils extract to extract subtrees
  * containing sampleIds, save resulting subtrees to trash files, return subtree results.
  * Caller must ensure that sampleIds are names of leaves in the protobuf tree. */
 {
 char *cmd[] = { matUtilsPath, "extract", "-i", protobufPath, "-d", tnOutDir->forCgi,
                 "-s", tnSamples->forCgi,
                 "-x", subtreeSizeStr, "-X", SINGLE_SUBTREE_SIZE, "-T", USHER_NUM_THREADS,
-                "--usher-clades-txt", NULL };
+                "--usher-clades-txt", "--usher-anchor-samples", anchorFile, NULL };
 char **cmds[] = { cmd, NULL };
+// Don't pass --usher-anchor-samples option unless it's configured
+if (isEmpty(anchorFile))
+    {
+    int ix = stringArrayIx("--usher-anchor-samples", cmd, ArraySize(cmd)-1);
+    if (ix > 0)
+        cmd[ix] = NULL;
+    }
 struct tempName tnStderr;
 trashDirFile(&tnStderr, "ct", "matUtils_stderr", ".txt");
 struct pipeline *pl = pipelineOpen(cmds, pipelineRead, NULL, tnStderr.forCgi, 0);
 pipelineClose(&pl);
 reportTiming(pStartTime, "run matUtils command");
 }
 
 static boolean runMatUtilsServer(char *org, char *protobufPath, char *subtreeSizeStr,
                                  struct tempName *tnSamples, struct tempName *tnOutDir,
-                                 struct treeChoices *treeChoices, int *pStartTime)
+                                 struct treeChoices *treeChoices, char *anchorFile, int *pStartTime)
 /* Cheng Ye added a 'matUtils mode' to usher-sampled-server so we can get subtrees super-fast
  * for uploaded sample names too. */
 {
 boolean success = FALSE;
 char *cmd[] = { "usher-sampled-server", "-i", protobufPath, "-d", tnOutDir->forCgi,
                 "-k", subtreeSizeStr, "-K", SINGLE_SUBTREE_SIZE,
                 "--existing_samples", tnSamples->forCgi, "-D",
                 NULL };
 struct tempName tnErrFile;
 trashDirFile(&tnErrFile, "ct", "matUtils_server_stderr", ".txt");
 if (serverIsConfigured(org))
     {
     FILE *errFile = mustOpen(tnErrFile.forCgi, "w");
     int serverSocket = getServerSocket(org, treeChoices, errFile);
 
     reportTiming(pStartTime, "get socket");
     if (serverSocket > 0)
         {
-        success = sendQuery(serverSocket, cmd, org, treeChoices, errFile, FALSE);
+        success = sendQuery(serverSocket, cmd, org, treeChoices, errFile, FALSE, anchorFile);
         close(serverSocket);
         if (success)
             reportTiming(pStartTime, "send query and get response (successful)");
         else
             reportTiming(pStartTime, "send query and get response (failed)");
         }
     carefulClose(&errFile);
     }
 return success;
 }
 
 struct usherResults *runMatUtilsExtractSubtrees(char *org, char *matUtilsPath, char *protobufPath,
                                                 int subtreeSize, struct slName *sampleIds,
-                                                struct treeChoices *treeChoices, int *pStartTime)
+                                                struct treeChoices *treeChoices, char *anchorFile,
+                                                int *pStartTime)
 /* Open a pipe from Yatish Turakhia and Jakob McBroome's matUtils extract to extract subtrees
  * containing sampleIds, save resulting subtrees to trash files, return subtree results.
  * Caller must ensure that sampleIds are names of leaves in the protobuf tree. */
 {
 struct usherResults *results = usherResultsNew();
 char subtreeSizeStr[16];
 safef(subtreeSizeStr, sizeof subtreeSizeStr, "%d", subtreeSize);
 struct tempName tnSamples;
 trashDirFile(&tnSamples, "ct", "matUtilsExtractSamples", ".txt");
 FILE *f = mustOpen(tnSamples.forCgi, "w");
 struct slName *sample;
 for (sample = sampleIds;  sample != NULL;  sample = sample->next)
     fprintf(f, "%s\n", sample->name);
 carefulClose(&f);
 struct tempName tnOutDir;
 trashDirFile(&tnOutDir, "ct", "matUtils_outdir", ".dir");
 if (! runMatUtilsServer(org, protobufPath, subtreeSizeStr, &tnSamples, &tnOutDir, treeChoices,
-                        pStartTime))
+                        anchorFile, pStartTime))
     runMatUtilsExtractCommand(matUtilsPath, protobufPath, subtreeSizeStr, &tnSamples, &tnOutDir,
-                              pStartTime);
+                              anchorFile, pStartTime);
 addEmptyPlacements(sampleIds, results->samplePlacements);
 struct tempName *singleSubtreeTn = NULL, *subtreeTns[MAX_SUBTREES];
 struct variantPathNode *singleSubtreeMuts = NULL, *subtreeMuts[MAX_SUBTREES];
 int subtreeCount = processOutDirFiles(results, tnOutDir.forCgi, &singleSubtreeTn, &singleSubtreeMuts,
                                       subtreeTns, subtreeMuts, MAX_SUBTREES);
 results->subtreeInfoList = parseSubtrees(subtreeCount, singleSubtreeTn, singleSubtreeMuts,
                                          subtreeTns, subtreeMuts, sampleIds);
 results->singleSubtreeInfo = results->subtreeInfoList;
 results->subtreeInfoList = results->subtreeInfoList->next;
 reportTiming(pStartTime, "process results from matUtils");
 return results;
 }