705420e703b067fbcad43ab67ed3e131552e7ac8 angie Wed Apr 24 14:23:03 2024 -0700 Pathogen drop-down choices can now be groups of references/trees, for example 'Dengue (types 1 - 4)' instead of a separate choice for each type. Instead of config.ra, each group has an organism.ra and subdirectories named after reference accessions that contain reference.ra files. nextclade sort is used to match the user's uploaded sequences against available references for the selected pathogen. SARS-CoV-2, M. tuberculosis and hMPXV still have only one reference and still use config.ra, but RSV, Dengue and Influenza will become groups. Presentation is still kinda rough, just a loop on the original results output. The server commands part needs testing and will not work yet for groups (currently used only for SARS-CoV-2). diff --git src/hg/hgPhyloPlace/runUsher.c src/hg/hgPhyloPlace/runUsher.c index 8b98f78..6775016 100644 --- src/hg/hgPhyloPlace/runUsher.c +++ src/hg/hgPhyloPlace/runUsher.c @@ -1,18 +1,18 @@ /* Invoke usher to place user's uploaded samples in the phylogenetic tree & parse output files. */ -/* Copyright (C) 2020-2022 The Regents of the University of California */ +/* Copyright (C) 2020-2024 The Regents of the University of California */ #include "common.h" #include "dnautil.h" #include "hash.h" #include "hgConfig.h" #include "linefile.h" #include "obscure.h" #include "parsimonyProto.h" #include "phyloPlace.h" #include "regexHelper.h" #include "pipeline.h" #include "trackHub.h" #include "trashDir.h" #include <sys/socket.h> #include <sys/un.h> @@ -723,88 +723,88 @@ unlink(path); } dyStringFree(&dyScratch); rmdir(outDir); } static void runUsherCommand(char *cmd[], char *stderrFile, int *pStartTime) /* Run the standalone usher command with its stderr output redirected to stderrFile. 
*/ { char **cmds[] = { cmd, NULL }; struct pipeline *pl = pipelineOpen(cmds, pipelineRead, NULL, stderrFile, 0); pipelineClose(&pl); reportTiming(pStartTime, "run usher"); } -boolean serverIsConfigured(char *db) +boolean serverIsConfigured(char *org) /* Return TRUE if all necessary configuration settings are in place to run usher-sampled-server. */ { char *serverDir = cfgOption("hgPhyloPlaceServerDir"); if (isNotEmpty(serverDir)) { - char *usherServerEnabled = phyloPlaceDbSetting(db, "usherServerEnabled"); + char *usherServerEnabled = phyloPlaceOrgSetting(org, "usherServerEnabled"); if (isNotEmpty(usherServerEnabled) && SETTING_IS_ON(usherServerEnabled) && fileExists(PHYLOPLACE_DATA_DIR "/usher-sampled-server")) { return TRUE; } } return FALSE; } -static char *getUsherServerFilePath(char *db, char *file) -/* Alloc & return the path to special server file in $hgPhyloPlaceServerDir/$db/ . */ +static char *getUsherServerFilePath(char *org, char *file) +/* Alloc & return the path to special server file in $hgPhyloPlaceServerDir/$org/ . */ { char *serverDir = cfgOption("hgPhyloPlaceServerDir"); // We need host name in order to prevent clashes between hgw1 and hgw2 sharing the same hg.conf // contents and NFS filesystem. And not the vhost name from getHost(), the real base host name. char host[PATH_LEN]; int ret = gethostname(host, sizeof host); if (ret != 0) safecpy(host, sizeof host, "hostunknown"); char *p = strchr(host, '.'); if (p) *p = '\0'; struct dyString *dyPath = dyStringCreate("%s/%s/%s.%s", - serverDir, trackHubSkipHubName(db), host, file); + serverDir, trackHubSkipHubName(org), host, file); return dyStringCannibalize(&dyPath); } -static char *getUsherServerMfifoPath(char *db) -/* Alloc & return path to server manager fifo file for db. */ +static char *getUsherServerMfifoPath(char *org) +/* Alloc & return path to server manager fifo file for org. 
*/ { -return getUsherServerFilePath(db, "server.fifo"); +return getUsherServerFilePath(org, "server.fifo"); } -static char *getUsherServerSocketPath(char *db) -/* Alloc & return path to server socket file for db. */ +static char *getUsherServerSocketPath(char *org) +/* Alloc & return path to server socket file for org. */ { -return getUsherServerFilePath(db, "server.socket"); +return getUsherServerFilePath(org, "server.socket"); } -static char *getUsherServerLogPath(char *db) -/* Alloc & return path to server log file (very important because it tells us the pid) for db. */ +static char *getUsherServerLogPath(char *org) +/* Alloc & return path to server log file (very important because it tells us the pid) for org. */ { -return getUsherServerFilePath(db, "server.log"); +return getUsherServerFilePath(org, "server.log"); } -boolean serverIsRunning(char *db, FILE *errFile) +boolean serverIsRunning(char *org, FILE *errFile) /* Return TRUE if logFile exists and its first line specifies a pid as expected from daemonishSpawn, * and that pid looks alive according to /proc. */ { boolean pidIsLive = FALSE; -char *logFile = getUsherServerLogPath(db); +char *logFile = getUsherServerLogPath(org); struct lineFile *lf = lineFileMayOpen(logFile, TRUE); if (lf) { char *line; if (lineFileNext(lf, &line, NULL)) { if (isAllDigits(line)) { int pid = atol(line); if (pid > 1) { char procStatus[PATH_LEN]; safef(procStatus, sizeof procStatus, "/proc/%d/status", pid); struct lineFile *psLf = lineFileMayOpen(procStatus, TRUE); if (psLf) @@ -972,302 +972,302 @@ { // Second fork child sets stdin and stdout to /dev/null, stderr to errLogFile (after // writing pid), then becomes cmd using execvp. 
int newStderr = openWrite(errlogFile, FALSE); char pidBuf[16]; safef(pidBuf, sizeof pidBuf, "%llu\n", (long long unsigned)getpid()); write(newStderr, pidBuf, strlen(pidBuf)); daemonishInit(newStderr); execvp(cmd[0], cmd); } } } return fork1; } -boolean startServer(char *db, struct treeChoices *treeChoices, FILE *errFile) +boolean startServer(char *org, struct treeChoices *treeChoices, FILE *errFile) /* Start up an usher-sampled-server process to run in the background. */ { boolean success = FALSE; -if (serverIsConfigured(db)) +if (serverIsConfigured(org)) { char *serverDir = cfgOption("hgPhyloPlaceServerDir"); if (! fileExists(serverDir)) makeDir(serverDir); char path[PATH_LEN]; - safef(path, sizeof path, "%s/%s", serverDir, db); + safef(path, sizeof path, "%s/%s", serverDir, org); if (! fileExists(path)) makeDir(path); char *usherServerPath = PHYLOPLACE_DATA_DIR "/usher-sampled-server"; // Each protobuf file from treeChoices must appear as a separate arg after -l, // so we have to dynamically build up serverCmd, but can use a static for the first args. 
char *serverCmdBase[] = { usherServerPath, - "-m", getUsherServerMfifoPath(db), - "-s", getUsherServerSocketPath(db), + "-m", getUsherServerMfifoPath(org), + "-s", getUsherServerSocketPath(org), "-T", USHER_NUM_THREADS, "-t", USHER_SERVER_CHILD_TIMEOUT, "-l" }; size_t serverCmdBaseSize = ArraySize(serverCmdBase); size_t serverCmdSize = serverCmdBaseSize + treeChoices->count + 1; char *serverCmd[serverCmdSize]; int ix; for (ix = 0; ix < serverCmdBaseSize; ix++) serverCmd[ix] = serverCmdBase[ix]; for (ix = 0; ix < treeChoices->count; ix++) serverCmd[serverCmdBaseSize + ix] = treeChoices->protobufFiles[ix]; serverCmd[serverCmdBaseSize + ix] = NULL; fputs("Spawning server with command: ", errFile); for (ix = 0; serverCmd[ix] != NULL; ix++) { fputc(' ', errFile); fputs(serverCmd[ix], errFile); } fputc('\n', errFile); - success = (daemonishSpawn(serverCmd, getUsherServerLogPath(db)) > 0); + success = (daemonishSpawn(serverCmd, getUsherServerLogPath(org)) > 0); if (success) fputs("Server spawned; first fork returned successfully.\n", errFile); else fputs("Server spawn failed at first fork!\n", errFile); } else - errAbort("Usher server is not configured for %s, not starting", db); + errAbort("Usher server is not configured for %s, not starting", org); return success; } -void serverReloadProtobufs(char *db, struct treeChoices *treeChoices) -/* Send a reload command and list of protobufs for db to usher server. */ +void serverReloadProtobufs(char *org, struct treeChoices *treeChoices) +/* Send a reload command and list of protobufs for org to usher server. 
*/ { -char *usherServerMfifoPath = getUsherServerMfifoPath(db); +char *usherServerMfifoPath = getUsherServerMfifoPath(org); FILE *mf = fopen(usherServerMfifoPath, "a"); if (mf) { fprintf(mf, "reload\n"); int ix; for (ix = 0; ix < treeChoices->count; ix++) fprintf(mf, "%s\n", treeChoices->protobufFiles[ix]); fputc('\n', mf); carefulClose(&mf); } else warn("serverReload: unable to open '%s', command not sent", usherServerMfifoPath); } -void serverStop(char *db) +void serverStop(char *org) /* Send stop command to usher server. */ { -char *usherServerMfifoPath = getUsherServerMfifoPath(db); +char *usherServerMfifoPath = getUsherServerMfifoPath(org); FILE *mf = fopen(usherServerMfifoPath, "a"); if (mf) { fprintf(mf, "stop\n"); carefulClose(&mf); } else warn("serverStop: unable to open '%s', command not sent", usherServerMfifoPath); } -void serverSetThreadCount(char *db, int val) +void serverSetThreadCount(char *org, int val) /* Send thread command and value to usher server. */ { -char *usherServerMfifoPath = getUsherServerMfifoPath(db); +char *usherServerMfifoPath = getUsherServerMfifoPath(org); FILE *mf = fopen(usherServerMfifoPath, "a"); if (mf) { if (val > 0) fprintf(mf, "thread %d\n", val); else errAbort("Bad value %d passed to serverSetThreadCount, not sending", val); carefulClose(&mf); } else warn("serverSetTimeout: unable to open '%s', command not sent", usherServerMfifoPath); } -void serverSetTimeout(char *db, int val) +void serverSetTimeout(char *org, int val) /* Send timeout command and value (in seconds) to usher server. 
*/ { -char *usherServerMfifoPath = getUsherServerMfifoPath(db); +char *usherServerMfifoPath = getUsherServerMfifoPath(org); FILE *mf = fopen(usherServerMfifoPath, "a"); if (mf) { if (val > 0) fprintf(mf, "timeout %d\n", val); else errAbort("Bad value %d passed to serverSetTimeout, not sending", val); carefulClose(&mf); } else warn("serverSetTimeout: unable to open '%s', command not sent", usherServerMfifoPath); } -static int getServerSocket(char *db, struct treeChoices *treeChoices, FILE *errFile) +static int getServerSocket(char *org, struct treeChoices *treeChoices, FILE *errFile) /* Try to connect to server; attempt to restart server and then connect if it seems like the server * is down. Return -1 if unable to connect. */ { int socketFd = socket(AF_UNIX, SOCK_STREAM, 0); if (socketFd < 0) { // OS-level failure, not anything we can do about it fprintf(errFile, "Failed to create a UNIX socket: %s\n", strerror(errno)); } else { // From "man 7 unix": // struct sockaddr_un { // sa_family_t sun_family; /* AF_UNIX */ // char sun_path[UNIX_PATH_MAX]; /* pathname */ // }; struct sockaddr_un addr; addr.sun_family=AF_UNIX; - char *usherServerSocketPath = getUsherServerSocketPath(db); + char *usherServerSocketPath = getUsherServerSocketPath(org); safecpy(addr.sun_path, sizeof(addr.sun_path), usherServerSocketPath); int ret = connect(socketFd, (struct sockaddr *)&addr, sizeof(addr)); if (ret < 0) { fprintf(errFile, "Cannot connect socket %d to path '%s': %s\n", socketFd, usherServerSocketPath, strerror(errno)); // Does the server need to be restarted, or (in case another web request prompted some // other hgPhyloPlace process to restart it but it isn't quite up yet) should we wait a // few seconds and try again? 
- if (serverIsRunning(db, errFile) || - startServer(db, treeChoices, errFile)) + if (serverIsRunning(org, errFile) || + startServer(org, treeChoices, errFile)) { sleep(5); ret = connect(socketFd, (struct sockaddr *)&addr, sizeof(addr)); if (ret < 0) { // Give up - fall back on regular usher command fprintf(errFile, "Second attempt to connect socket %d to path '%s' failed: %s\n", socketFd, usherServerSocketPath, strerror(errno)); socketFd = -1; } } } } return socketFd; } // Server sends ASCII EOT character (4) when done. Sadly I can't find a header file that defines EOT. #define EOT 4 -static boolean sendQuery(int socketFd, char *cmd[], char *db, struct treeChoices *treeChoices, +static boolean sendQuery(int socketFd, char *cmd[], char *org, struct treeChoices *treeChoices, FILE *errFile, boolean addNoIgnorePrefix) /* Send command to socket, read response on socket, return TRUE if we get a successful response. */ { boolean success = FALSE; struct dyString *dyMessage = dyStringNew(0); int ix; for (ix = 0; cmd[ix] != NULL; ix++) { // Don't include args from -T onward; server rejects requests with -T or --optimization_radius if (sameString("-T", cmd[ix])) break; dyStringPrintf(dyMessage, "%s\n", cmd[ix]); } if (addNoIgnorePrefix) // Needed when placing uploaded sequences, but not when finding uploaded names dyStringPrintf(dyMessage, "--no-ignore-prefix\n"USHER_DEDUP_PREFIX"\n"); dyStringAppendC(dyMessage, '\n'); boolean serverError = FALSE; int bytesWritten = write(socketFd, dyMessage->string, dyMessage->stringSize); if (bytesWritten == dyMessage->stringSize) { struct lineFile *lf = lineFileAttach("server socket", TRUE, socketFd); if (lf) { char *line; while (lineFileNext(lf, &line, NULL)) { if (startsWith("Tree", line) && endsWith(line, "not found")) { // Tell the server to reload the latest protobufs - serverReloadProtobufs(getUsherServerMfifoPath(db), treeChoices); + serverReloadProtobufs(getUsherServerMfifoPath(org), treeChoices); // Reloading multiple trees takes 
a while, so fall back on standalone usher(-sampled) serverError = TRUE; // Continue reading output from server. } else if (line[0] == EOT) { success = ! serverError; break; } else if (isNotEmpty(line)) fprintf(errFile, "%s\n", line); } } else fprintf(errFile, "Failed to attach linefile to socket %d.\n", socketFd); } else fprintf(errFile, "Failed to send query to socket %d: attempted to write %ld bytes, ret=%d\n", socketFd, dyMessage->stringSize, bytesWritten); dyStringFree(&dyMessage); return success; } -static boolean runUsherServer(char *db, char *cmd[], char *stderrFile, +static boolean runUsherServer(char *org, char *cmd[], char *stderrFile, struct treeChoices *treeChoices, int *pStartTime) /* Start the server if necessary, connect to it, send a query, get response and return TRUE if. * all goes well. If unsuccessful, write reasons to errFile and return FALSE. */ { boolean success = FALSE; -if (serverIsConfigured(db)) +if (serverIsConfigured(org)) { FILE *errFile = mustOpen(stderrFile, "w"); - int serverSocket = getServerSocket(db, treeChoices, errFile); + int serverSocket = getServerSocket(org, treeChoices, errFile); reportTiming(pStartTime, "get socket"); if (serverSocket > 0) { - success = sendQuery(serverSocket, cmd, db, treeChoices, errFile, TRUE); + success = sendQuery(serverSocket, cmd, org, treeChoices, errFile, TRUE); close(serverSocket); reportTiming(pStartTime, "send query"); } carefulClose(&errFile); } return success; } #define MAX_SUBTREES 1000 -struct usherResults *runUsher(char *db, char *usherPath, char *usherAssignmentsPath, char *vcfFile, +struct usherResults *runUsher(char *org, char *usherPath, char *usherAssignmentsPath, char *vcfFile, int subtreeSize, struct slName **pUserSampleIds, struct treeChoices *treeChoices, int *pStartTime) /* Open a pipe from Yatish Turakhia's usher program, save resulting big trees and * subtrees to trash files, return list of slRef to struct tempName for the trash files * and parse other results out of stderr 
output. The usher-sampled version of usher might * modify userSampleIds, adding a prefix if a sample with the same name is already in the tree. */ { struct usherResults *results = usherResultsNew(); char subtreeSizeStr[16]; safef(subtreeSizeStr, sizeof subtreeSizeStr, "%d", subtreeSize); struct tempName tnOutDir; trashDirFile(&tnOutDir, "ct", "usher_outdir", ".dir"); char *cmd[] = { usherPath, "-v", vcfFile, "-i", usherAssignmentsPath, "-d", tnOutDir.forCgi, "-k", subtreeSizeStr, "-K", SINGLE_SUBTREE_SIZE, "-u", "-T", USHER_NUM_THREADS, // Don't pass args from -T onward to server "--optimization_radius", "0", // Don't pass these to original usher, only -sampled "--no-ignore-prefix", USHER_DEDUP_PREFIX, NULL }; struct tempName tnStderr; trashDirFile(&tnStderr, "ct", "usher_stderr", ".txt"); struct tempName tnServerStderr; trashDirFile(&tnServerStderr, "ct", "usher_server_stderr", ".txt"); char *stderrFile = tnServerStderr.forCgi; -if (! runUsherServer(db, cmd, tnServerStderr.forCgi, treeChoices, pStartTime)) +if (! runUsherServer(org, cmd, tnServerStderr.forCgi, treeChoices, pStartTime)) { if (!endsWith(usherPath, "-sampled")) { // Truncate cmd for original usher: remove usher-sampled-specific option int ix = stringArrayIx("--optimization_radius", cmd, ArraySize(cmd)-1); if (ix > 0) cmd[ix] = NULL; } runUsherCommand(cmd, tnStderr.forCgi, pStartTime); stderrFile = tnStderr.forCgi; } struct tempName *singleSubtreeTn = NULL, *subtreeTns[MAX_SUBTREES]; struct variantPathNode *singleSubtreeMuts = NULL, *subtreeMuts[MAX_SUBTREES]; parsePlacements(tnOutDir.forCgi, stderrFile, results->samplePlacements, pUserSampleIds); @@ -1313,80 +1313,80 @@ * containing sampleIds, save resulting subtrees to trash files, return subtree results. * Caller must ensure that sampleIds are names of leaves in the protobuf tree. 
*/ { char *cmd[] = { matUtilsPath, "extract", "-i", protobufPath, "-d", tnOutDir->forCgi, "-s", tnSamples->forCgi, "-x", subtreeSizeStr, "-X", SINGLE_SUBTREE_SIZE, "-T", USHER_NUM_THREADS, "--usher-clades-txt", NULL }; char **cmds[] = { cmd, NULL }; struct tempName tnStderr; trashDirFile(&tnStderr, "ct", "matUtils_stderr", ".txt"); struct pipeline *pl = pipelineOpen(cmds, pipelineRead, NULL, tnStderr.forCgi, 0); pipelineClose(&pl); reportTiming(pStartTime, "run matUtils"); } -static boolean runMatUtilsServer(char *db, char *protobufPath, char *subtreeSizeStr, +static boolean runMatUtilsServer(char *org, char *protobufPath, char *subtreeSizeStr, struct tempName *tnSamples, struct tempName *tnOutDir, struct treeChoices *treeChoices, int *pStartTime) /* Cheng Ye added a 'matUtils mode' to usher-sampled-server so we can get subtrees super-fast * for uploaded sample names too. */ { boolean success = FALSE; char *cmd[] = { "usher-sampled-server", "-i", protobufPath, "-d", tnOutDir->forCgi, "-k", subtreeSizeStr, "-K", SINGLE_SUBTREE_SIZE, "--existing_samples", tnSamples->forCgi, "-D", NULL }; struct tempName tnErrFile; trashDirFile(&tnErrFile, "ct", "matUtils_server_stderr", ".txt"); -if (serverIsConfigured(db)) +if (serverIsConfigured(org)) { FILE *errFile = mustOpen(tnErrFile.forCgi, "w"); - int serverSocket = getServerSocket(db, treeChoices, errFile); + int serverSocket = getServerSocket(org, treeChoices, errFile); reportTiming(pStartTime, "get socket"); if (serverSocket > 0) { - success = sendQuery(serverSocket, cmd, db, treeChoices, errFile, FALSE); + success = sendQuery(serverSocket, cmd, org, treeChoices, errFile, FALSE); close(serverSocket); reportTiming(pStartTime, "send query"); } carefulClose(&errFile); } return success; } -struct usherResults *runMatUtilsExtractSubtrees(char *db, char *matUtilsPath, char *protobufPath, +struct usherResults *runMatUtilsExtractSubtrees(char *org, char *matUtilsPath, char *protobufPath, int subtreeSize, struct slName *sampleIds, 
struct treeChoices *treeChoices, int *pStartTime) /* Open a pipe from Yatish Turakhia and Jakob McBroome's matUtils extract to extract subtrees * containing sampleIds, save resulting subtrees to trash files, return subtree results. * Caller must ensure that sampleIds are names of leaves in the protobuf tree. */ { struct usherResults *results = usherResultsNew(); char subtreeSizeStr[16]; safef(subtreeSizeStr, sizeof subtreeSizeStr, "%d", subtreeSize); struct tempName tnSamples; trashDirFile(&tnSamples, "ct", "matUtilsExtractSamples", ".txt"); FILE *f = mustOpen(tnSamples.forCgi, "w"); struct slName *sample; for (sample = sampleIds; sample != NULL; sample = sample->next) fprintf(f, "%s\n", sample->name); carefulClose(&f); struct tempName tnOutDir; trashDirFile(&tnOutDir, "ct", "matUtils_outdir", ".dir"); -if (! runMatUtilsServer(db, protobufPath, subtreeSizeStr, &tnSamples, &tnOutDir, treeChoices, +if (! runMatUtilsServer(org, protobufPath, subtreeSizeStr, &tnSamples, &tnOutDir, treeChoices, pStartTime)) runMatUtilsExtractCommand(matUtilsPath, protobufPath, subtreeSizeStr, &tnSamples, &tnOutDir, pStartTime); addEmptyPlacements(sampleIds, results->samplePlacements); struct tempName *singleSubtreeTn = NULL, *subtreeTns[MAX_SUBTREES]; struct variantPathNode *singleSubtreeMuts = NULL, *subtreeMuts[MAX_SUBTREES]; int subtreeCount = processOutDirFiles(results, tnOutDir.forCgi, &singleSubtreeTn, &singleSubtreeMuts, subtreeTns, subtreeMuts, MAX_SUBTREES); results->subtreeInfoList = parseSubtrees(subtreeCount, singleSubtreeTn, singleSubtreeMuts, subtreeTns, subtreeMuts, sampleIds); results->singleSubtreeInfo = results->subtreeInfoList; results->subtreeInfoList = results->subtreeInfoList->next; reportTiming(pStartTime, "process results from matUtils"); return results; }