src/hg/hgPhyloPlace/runUsher.c 827de614bca5613228664b8944048f61c61f336a

827de614bca5613228664b8944048f61c61f336a
angie
  Fri Dec 2 09:44:45 2022 -0800
Use @yceh's new --existing_samples option to usher-sampled-server as a faster alternative to running matUtils.

diff --git src/hg/hgPhyloPlace/runUsher.c src/hg/hgPhyloPlace/runUsher.c
index d35c987..05f688d 100644
--- src/hg/hgPhyloPlace/runUsher.c
+++ src/hg/hgPhyloPlace/runUsher.c
@@ -1136,44 +1136,45 @@
                 // Give up - fall back on regular usher command
                 fprintf(errFile, "Second attempt to connect socket %d to path '%s' failed: %s\n",
                         socketFd, usherServerSocketPath, strerror(errno));
                 socketFd = -1;
                 }
             }
         }
     }
 return socketFd;
 }
 
 // Server sends ASCII EOT character (4) when done. Sadly I can't find a header file that defines EOT.
 #define EOT 4
 
 static boolean sendQuery(int socketFd, char *cmd[], char *db, struct treeChoices *treeChoices,
-                         FILE *errFile)
+                         FILE *errFile, boolean addNoIgnorePrefix)
 /* Send command to socket, read response on socket, return TRUE if we get a successful response. */
 {
 boolean success = FALSE;
 struct dyString *dyMessage = dyStringNew(0);
 int ix;
 for (ix = 0;  cmd[ix] != NULL;  ix++)
     {
     // Don't include args from -T onward; server rejects requests with -T or --optimization_radius
     if (sameString("-T", cmd[ix]))
         break;
     dyStringPrintf(dyMessage, "%s\n", cmd[ix]);
     }
-// But we do need --no-ignore-prefix:
+if (addNoIgnorePrefix)
+    // Needed when placing uploaded sequences, but not when finding uploaded names
     dyStringPrintf(dyMessage, "--no-ignore-prefix\n"USHER_DEDUP_PREFIX"\n");
 dyStringAppendC(dyMessage, '\n');
 boolean serverError = FALSE;
 int bytesWritten = write(socketFd, dyMessage->string, dyMessage->stringSize);
 if (bytesWritten == dyMessage->stringSize)
     {
     struct lineFile *lf = lineFileAttach("server socket", TRUE, socketFd);
     if (lf)
         {
         char *line;
         while (lineFileNext(lf, &line, NULL))
             {
             if (startsWith("Tree", line) && endsWith(line, "not found"))
                 {
                 // Tell the server to reload the latest protobufs
@@ -1203,31 +1204,31 @@
 
 static boolean runUsherServer(char *db, char *cmd[], char *stderrFile,
                               struct treeChoices *treeChoices, int *pStartTime)
 /* Start the server if necessary, connect to it, send a query, get response and return TRUE if.
  * all goes well. If unsuccessful, write reasons to errFile and return FALSE. */
 {
 boolean success = FALSE;
 if (serverIsConfigured(db))
     {
     FILE *errFile = mustOpen(stderrFile, "w");
     int serverSocket = getServerSocket(db, treeChoices, errFile);
 
     reportTiming(pStartTime, "get socket");
     if (serverSocket > 0)
         {
-        success = sendQuery(serverSocket, cmd, db, treeChoices, errFile);
+        success = sendQuery(serverSocket, cmd, db, treeChoices, errFile, TRUE);
         close(serverSocket);
         reportTiming(pStartTime, "send query");
         }
     carefulClose(&errFile);
     }
 return success;
 }
 
 #define MAX_SUBTREES 1000
 
 struct usherResults *runUsher(char *db, char *usherPath, char *usherAssignmentsPath, char *vcfFile,
                               int subtreeSize, struct slName **pUserSampleIds,
                               struct treeChoices *treeChoices, int *pStartTime)
 /* Open a pipe from Yatish Turakhia's usher program, save resulting big trees and
  * subtrees to trash files, return list of slRef to struct tempName for the trash files
@@ -1277,69 +1278,111 @@
     removeOutDir(tnOutDir.forCgi);
     }
 else
     {
     results = NULL;
     warn("Sorry, there was a problem running usher.  "
          "Please ask genome-www@soe.ucsc.edu to take a look at %s.", stderrFile);
     }
 reportTiming(pStartTime, "parse results from usher");
 return results;
 }
 
 static void addEmptyPlacements(struct slName *sampleIds, struct hash *samplePlacements)
 /* Parsing an usher-style clades.txt file from matUtils extract requires samplePlacements to
  * have placementInfo for each sample.  When running usher, those are added when we parse
- * usher stderr; when running matUtils, just allocate one for each sample. */
+ * placement_stats.tsv; when running matUtils, just allocate one for each sample. */
 {
 struct slName *sample;
 for (sample = sampleIds;  sample != NULL;  sample = sample->next)
     {
     struct placementInfo *info;
     AllocVar(info);
     hashAdd(samplePlacements, sample->name, info);
     info->sampleId = cloneString(sample->name);
     }
 }
 
-struct usherResults *runMatUtilsExtractSubtrees(char *matUtilsPath, char *protobufPath,
+static void runMatUtilsExtractCommand(char *matUtilsPath, char *protobufPath,
+                                      char *subtreeSizeStr, struct tempName *tnSamples,
+                                      struct tempName *tnOutDir, int *pStartTime)
+/* Run Yatish Turakhia and Jakob McBroome's matUtils extract on the samples named in the file
+ * tnSamples, directing its output (subtrees, usher-style clades.txt) to directory tnOutDir.
+ * Caller must ensure that the names in tnSamples are leaves in the protobuf tree. */
+{
+char *cmd[] = { matUtilsPath, "extract", "-i", protobufPath, "-d", tnOutDir->forCgi,
+                "-s", tnSamples->forCgi,
+                "-x", subtreeSizeStr, "-X", SINGLE_SUBTREE_SIZE, "-T", USHER_NUM_THREADS,
+                "--usher-clades-txt", NULL };
+char **cmds[] = { cmd, NULL };
+struct tempName tnStderr;
+trashDirFile(&tnStderr, "ct", "matUtils_stderr", ".txt");
+struct pipeline *pl = pipelineOpen(cmds, pipelineRead, NULL, tnStderr.forCgi, 0);
+pipelineClose(&pl);
+reportTiming(pStartTime, "run matUtils");
+}
+
+static boolean runMatUtilsServer(char *db, char *protobufPath, char *subtreeSizeStr,
+                                 struct tempName *tnSamples, struct tempName *tnOutDir,
+                                 struct treeChoices *treeChoices, int *pStartTime)
+/* Cheng Ye added a 'matUtils mode' to usher-sampled-server so we can get subtrees super-fast
+ * for uploaded sample names too.  Return sendQuery's result, or FALSE if no query was sent. */
+{
+boolean success = FALSE;
+char *cmd[] = { "usher-sampled-server", "-i", protobufPath, "-d", tnOutDir->forCgi,
+                "-k", subtreeSizeStr, "-K", SINGLE_SUBTREE_SIZE,
+                "--existing_samples", tnSamples->forCgi, "-D",
+                NULL };
+struct tempName tnErrFile;
+trashDirFile(&tnErrFile, "ct", "matUtils_server_stderr", ".txt");
+if (serverIsConfigured(db))
+    {
+    FILE *errFile = mustOpen(tnErrFile.forCgi, "w");
+    int serverSocket = getServerSocket(db, treeChoices, errFile);
+
+    reportTiming(pStartTime, "get socket");
+    if (serverSocket > 0)
+        {
+        success = sendQuery(serverSocket, cmd, db, treeChoices, errFile, FALSE);
+        close(serverSocket);
+        reportTiming(pStartTime, "send query");
+        }
+    carefulClose(&errFile);
+    }
+return success;
+}
+
+struct usherResults *runMatUtilsExtractSubtrees(char *db, char *matUtilsPath, char *protobufPath,
                                                 int subtreeSize, struct slName *sampleIds,
-                                                int *pStartTime)
+                                                struct treeChoices *treeChoices, int *pStartTime)
 /* Open a pipe from Yatish Turakhia and Jakob McBroome's matUtils extract to extract subtrees
  * containing sampleIds, save resulting subtrees to trash files, return subtree results.
  * Caller must ensure that sampleIds are names of leaves in the protobuf tree. */
 {
 struct usherResults *results = usherResultsNew();
 char subtreeSizeStr[16];
 safef(subtreeSizeStr, sizeof subtreeSizeStr, "%d", subtreeSize);
-char *numThreadsStr = "16";
 struct tempName tnSamples;
 trashDirFile(&tnSamples, "ct", "matUtilsExtractSamples", ".txt");
 FILE *f = mustOpen(tnSamples.forCgi, "w");
 struct slName *sample;
 for (sample = sampleIds;  sample != NULL;  sample = sample->next)
     fprintf(f, "%s\n", sample->name);
 carefulClose(&f);
 struct tempName tnOutDir;
 trashDirFile(&tnOutDir, "ct", "matUtils_outdir", ".dir");
-char *cmd[] = { matUtilsPath, "extract", "-i", protobufPath, "-d", tnOutDir.forCgi,
-                "-s", tnSamples.forCgi,
-                "-x", subtreeSizeStr, "-X", SINGLE_SUBTREE_SIZE, "-T", numThreadsStr,
-                "--usher-clades-txt", NULL };
-char **cmds[] = { cmd, NULL };
-struct tempName tnStderr;
-trashDirFile(&tnStderr, "ct", "matUtils_stderr", ".txt");
-struct pipeline *pl = pipelineOpen(cmds, pipelineRead, NULL, tnStderr.forCgi, 0);
-pipelineClose(&pl);
-reportTiming(pStartTime, "run matUtils");
+if (! runMatUtilsServer(db, protobufPath, subtreeSizeStr, &tnSamples, &tnOutDir, treeChoices,
+                        pStartTime))
+    runMatUtilsExtractCommand(matUtilsPath, protobufPath, subtreeSizeStr, &tnSamples, &tnOutDir,
+                              pStartTime);
 addEmptyPlacements(sampleIds, results->samplePlacements);
 struct tempName *singleSubtreeTn = NULL, *subtreeTns[MAX_SUBTREES];
 struct variantPathNode *singleSubtreeMuts = NULL, *subtreeMuts[MAX_SUBTREES];
 int subtreeCount = processOutDirFiles(results, tnOutDir.forCgi, &singleSubtreeTn, &singleSubtreeMuts,
                                       subtreeTns, subtreeMuts, MAX_SUBTREES);
 results->subtreeInfoList = parseSubtrees(subtreeCount, singleSubtreeTn, singleSubtreeMuts,
                                          subtreeTns, subtreeMuts, sampleIds);
 results->singleSubtreeInfo = results->subtreeInfoList;
 results->subtreeInfoList = results->subtreeInfoList->next;
 reportTiming(pStartTime, "process results from matUtils");
 return results;
 }