0b186effb2d2b536d2c280ec31bec6e153e6bd7e angie Tue May 21 13:26:39 2024 -0700 Add support for uploaded names/IDs for the multi-ref/tree organism.ra case. Add new config option anchorSamples, pass to usher-sampled/matUtils/server if present. anchorSamples is a file with names of sequences that should always be included in the subtree to provide some larger-scale context, e.g. well-known vaccine or reference material strains. Influenza user request. diff --git src/hg/hgPhyloPlace/runUsher.c src/hg/hgPhyloPlace/runUsher.c index 074ba4d..11401cc 100644 --- src/hg/hgPhyloPlace/runUsher.c +++ src/hg/hgPhyloPlace/runUsher.c @@ -714,31 +714,31 @@ static void removeOutDir(char *outDir) /* Remove outDir and its files. */ { struct slName *outDirFiles = listDir(outDir, "*"), *file; struct dyString *dyScratch = dyStringNew(0); for (file = outDirFiles; file != NULL; file = file->next) { char *path = dirPlusFile(dyScratch, outDir, file->name); unlink(path); } dyStringFree(&dyScratch); rmdir(outDir); } -static void runUsherCommand(char *cmd[], char *stderrFile, int *pStartTime) +static void runUsherCommand(char *cmd[], char *stderrFile, char *anchorFile, int *pStartTime) /* Run the standalone usher command with its stderr output redirected to stderrFile. */ { char **cmds[] = { cmd, NULL }; struct pipeline *pl = pipelineOpen(cmds, pipelineRead, NULL, stderrFile, 0); pipelineClose(&pl); reportTiming(pStartTime, "run usher command"); } boolean serverIsConfigured(char *org) /* Return TRUE if all necessary configuration settings are in place to run usher-sampled-server. 
*/ { char *serverDir = cfgOption("hgPhyloPlaceServerDir"); if (isNotEmpty(serverDir)) { char *usherServerEnabled = phyloPlaceOrgSetting(org, "usherServerEnabled"); @@ -1140,46 +1140,48 @@ // Give up - fall back on regular usher command fprintf(errFile, "Second attempt to connect socket %d to path '%s' failed: %s\n", socketFd, usherServerSocketPath, strerror(errno)); socketFd = -1; } } } } return socketFd; } // Server sends ASCII EOT character (4) when done. Sadly I can't find a header file that defines EOT. #define EOT 4 static boolean sendQuery(int socketFd, char *cmd[], char *org, struct treeChoices *treeChoices, - FILE *errFile, boolean addNoIgnorePrefix) + FILE *errFile, boolean addNoIgnorePrefix, char *anchorFile) /* Send command to socket, read response on socket, return TRUE if we get a successful response. */ { boolean success = FALSE; struct dyString *dyMessage = dyStringNew(0); int ix; for (ix = 0; cmd[ix] != NULL; ix++) { // Don't include args from -T onward; server rejects requests with -T or --optimization_radius if (sameString("-T", cmd[ix])) break; dyStringPrintf(dyMessage, "%s\n", cmd[ix]); } if (addNoIgnorePrefix) // Needed when placing uploaded sequences, but not when finding uploaded names dyStringPrintf(dyMessage, "--no-ignore-prefix\n"USHER_DEDUP_PREFIX"\n"); +if (isNotEmpty(anchorFile)) + dyStringPrintf(dyMessage, "--anchor-samples\n%s\n", anchorFile); dyStringAppendC(dyMessage, '\n'); boolean serverError = FALSE; int bytesWritten = write(socketFd, dyMessage->string, dyMessage->stringSize); if (bytesWritten == dyMessage->stringSize) { struct lineFile *lf = lineFileAttach("server socket", TRUE, socketFd); if (lf) { char *line; while (lineFileNext(lf, &line, NULL)) { if (startsWith("Tree", line) && endsWith(line, "not found")) { // Tell the server to reload the latest protobufs serverReloadProtobufs(getUsherServerMfifoPath(org), treeChoices); @@ -1195,91 +1197,99 @@ else if (isNotEmpty(line)) fprintf(errFile, "%s\n", line); } } else 
fprintf(errFile, "Failed to attach linefile to socket %d.\n", socketFd); } else fprintf(errFile, "Failed to send query to socket %d: attempted to write %ld bytes, ret=%d\n", socketFd, dyMessage->stringSize, bytesWritten); dyStringFree(&dyMessage); return success; } static boolean runUsherServer(char *org, char *cmd[], char *stderrFile, - struct treeChoices *treeChoices, int *pStartTime) + struct treeChoices *treeChoices, char *anchorFile, int *pStartTime) /* Start the server if necessary, connect to it, send a query, get response and return TRUE if * all goes well. If unsuccessful, write reasons to errFile and return FALSE. */ { boolean success = FALSE; if (serverIsConfigured(org)) { FILE *errFile = mustOpen(stderrFile, "w"); int serverSocket = getServerSocket(org, treeChoices, errFile); reportTiming(pStartTime, "get socket"); if (serverSocket > 0) { - success = sendQuery(serverSocket, cmd, org, treeChoices, errFile, TRUE); + success = sendQuery(serverSocket, cmd, org, treeChoices, errFile, TRUE, anchorFile); close(serverSocket); if (success) reportTiming(pStartTime, "send query and get response (successful)"); else reportTiming(pStartTime, "send query and get response (failed)"); } carefulClose(&errFile); } return success; } #define MAX_SUBTREES 1000 struct usherResults *runUsher(char *org, char *usherPath, char *usherAssignmentsPath, char *vcfFile, int subtreeSize, struct slName **pUserSampleIds, - struct treeChoices *treeChoices, int *pStartTime) + struct treeChoices *treeChoices, char *anchorFile, int *pStartTime) /* Open a pipe from Yatish Turakhia's usher program, save resulting big trees and * subtrees to trash files, return list of slRef to struct tempName for the trash files * and parse other results out of stderr output. The usher-sampled version of usher might * modify userSampleIds, adding a prefix if a sample with the same name is already in the tree. 
*/ { struct usherResults *results = usherResultsNew(); char subtreeSizeStr[16]; safef(subtreeSizeStr, sizeof subtreeSizeStr, "%d", subtreeSize); struct tempName tnOutDir; trashDirFile(&tnOutDir, "ct", "usher_outdir", ".dir"); char *cmd[] = { usherPath, "-v", vcfFile, "-i", usherAssignmentsPath, "-d", tnOutDir.forCgi, "-k", subtreeSizeStr, "-K", SINGLE_SUBTREE_SIZE, "-u", "-T", USHER_NUM_THREADS, // Don't pass args from -T onward to server "--optimization_radius", "0", // Don't pass these to original usher, only -sampled "--no-ignore-prefix", USHER_DEDUP_PREFIX, + "--anchor-samples", anchorFile, NULL }; struct tempName tnStderr; trashDirFile(&tnStderr, "ct", "usher_stderr", ".txt"); struct tempName tnServerStderr; trashDirFile(&tnServerStderr, "ct", "usher_server_stderr", ".txt"); char *stderrFile = tnServerStderr.forCgi; -if (! runUsherServer(org, cmd, tnServerStderr.forCgi, treeChoices, pStartTime)) +if (! runUsherServer(org, cmd, tnServerStderr.forCgi, treeChoices, anchorFile, pStartTime)) { if (!endsWith(usherPath, "-sampled")) { // Truncate cmd for original usher: remove usher-sampled-specific option int ix = stringArrayIx("--optimization_radius", cmd, ArraySize(cmd)-1); if (ix > 0) cmd[ix] = NULL; } - runUsherCommand(cmd, tnStderr.forCgi, pStartTime); + else if (isEmpty(anchorFile)) + { + // Don't pass --anchor-samples option unless it's configured + int ix = stringArrayIx("--anchor-samples", cmd, ArraySize(cmd)-1); + if (ix > 0) + cmd[ix] = NULL; + } + runUsherCommand(cmd, tnStderr.forCgi, anchorFile, pStartTime); stderrFile = tnStderr.forCgi; } struct tempName *singleSubtreeTn = NULL, *subtreeTns[MAX_SUBTREES]; struct variantPathNode *singleSubtreeMuts = NULL, *subtreeMuts[MAX_SUBTREES]; parsePlacements(tnOutDir.forCgi, stderrFile, results->samplePlacements, pUserSampleIds); int subtreeCount = processOutDirFiles(results, tnOutDir.forCgi, &singleSubtreeTn, &singleSubtreeMuts, subtreeTns, subtreeMuts, MAX_SUBTREES); if (singleSubtreeTn) { 
results->subtreeInfoList = parseSubtrees(subtreeCount, singleSubtreeTn, singleSubtreeMuts, subtreeTns, subtreeMuts, *pUserSampleIds); results->singleSubtreeInfo = results->subtreeInfoList; results->subtreeInfoList = results->subtreeInfoList->next; removeOutDir(tnOutDir.forCgi); @@ -1299,100 +1309,108 @@ * have placementInfo for each sample. When running usher, those are added when we parse * placement_stats.tsv; when running matUtils, just allocate one for each sample. */ { struct slName *sample; for (sample = sampleIds; sample != NULL; sample = sample->next) { struct placementInfo *info; AllocVar(info); hashAdd(samplePlacements, sample->name, info); info->sampleId = cloneString(sample->name); } } static void runMatUtilsExtractCommand(char *matUtilsPath, char *protobufPath, char *subtreeSizeStr, struct tempName *tnSamples, - struct tempName *tnOutDir, int *pStartTime) + struct tempName *tnOutDir, char *anchorFile, int *pStartTime) /* Open a pipe from Yatish Turakhia and Jakob McBroome's matUtils extract to extract subtrees * containing sampleIds, save resulting subtrees to trash files, return subtree results. * Caller must ensure that sampleIds are names of leaves in the protobuf tree. 
*/ { char *cmd[] = { matUtilsPath, "extract", "-i", protobufPath, "-d", tnOutDir->forCgi, "-s", tnSamples->forCgi, "-x", subtreeSizeStr, "-X", SINGLE_SUBTREE_SIZE, "-T", USHER_NUM_THREADS, - "--usher-clades-txt", NULL }; + "--usher-clades-txt", "--usher-anchor-samples", anchorFile, NULL }; char **cmds[] = { cmd, NULL }; +// Don't pass --usher-anchor-samples option unless it's configured +if (isEmpty(anchorFile)) + { + int ix = stringArrayIx("--usher-anchor-samples", cmd, ArraySize(cmd)-1); + if (ix > 0) + cmd[ix] = NULL; + } struct tempName tnStderr; trashDirFile(&tnStderr, "ct", "matUtils_stderr", ".txt"); struct pipeline *pl = pipelineOpen(cmds, pipelineRead, NULL, tnStderr.forCgi, 0); pipelineClose(&pl); reportTiming(pStartTime, "run matUtils command"); } static boolean runMatUtilsServer(char *org, char *protobufPath, char *subtreeSizeStr, struct tempName *tnSamples, struct tempName *tnOutDir, - struct treeChoices *treeChoices, int *pStartTime) + struct treeChoices *treeChoices, char *anchorFile, int *pStartTime) /* Cheng Ye added a 'matUtils mode' to usher-sampled-server so we can get subtrees super-fast * for uploaded sample names too. 
*/ { boolean success = FALSE; char *cmd[] = { "usher-sampled-server", "-i", protobufPath, "-d", tnOutDir->forCgi, "-k", subtreeSizeStr, "-K", SINGLE_SUBTREE_SIZE, "--existing_samples", tnSamples->forCgi, "-D", NULL }; struct tempName tnErrFile; trashDirFile(&tnErrFile, "ct", "matUtils_server_stderr", ".txt"); if (serverIsConfigured(org)) { FILE *errFile = mustOpen(tnErrFile.forCgi, "w"); int serverSocket = getServerSocket(org, treeChoices, errFile); reportTiming(pStartTime, "get socket"); if (serverSocket > 0) { - success = sendQuery(serverSocket, cmd, org, treeChoices, errFile, FALSE); + success = sendQuery(serverSocket, cmd, org, treeChoices, errFile, FALSE, anchorFile); close(serverSocket); if (success) reportTiming(pStartTime, "send query and get response (successful)"); else reportTiming(pStartTime, "send query and get response (failed)"); } carefulClose(&errFile); } return success; } struct usherResults *runMatUtilsExtractSubtrees(char *org, char *matUtilsPath, char *protobufPath, int subtreeSize, struct slName *sampleIds, - struct treeChoices *treeChoices, int *pStartTime) + struct treeChoices *treeChoices, char *anchorFile, + int *pStartTime) /* Open a pipe from Yatish Turakhia and Jakob McBroome's matUtils extract to extract subtrees * containing sampleIds, save resulting subtrees to trash files, return subtree results. * Caller must ensure that sampleIds are names of leaves in the protobuf tree. */ { struct usherResults *results = usherResultsNew(); char subtreeSizeStr[16]; safef(subtreeSizeStr, sizeof subtreeSizeStr, "%d", subtreeSize); struct tempName tnSamples; trashDirFile(&tnSamples, "ct", "matUtilsExtractSamples", ".txt"); FILE *f = mustOpen(tnSamples.forCgi, "w"); struct slName *sample; for (sample = sampleIds; sample != NULL; sample = sample->next) fprintf(f, "%s\n", sample->name); carefulClose(&f); struct tempName tnOutDir; trashDirFile(&tnOutDir, "ct", "matUtils_outdir", ".dir"); if (! 
runMatUtilsServer(org, protobufPath, subtreeSizeStr, &tnSamples, &tnOutDir, treeChoices, - pStartTime)) + anchorFile, pStartTime)) runMatUtilsExtractCommand(matUtilsPath, protobufPath, subtreeSizeStr, &tnSamples, &tnOutDir, - pStartTime); + anchorFile, pStartTime); addEmptyPlacements(sampleIds, results->samplePlacements); struct tempName *singleSubtreeTn = NULL, *subtreeTns[MAX_SUBTREES]; struct variantPathNode *singleSubtreeMuts = NULL, *subtreeMuts[MAX_SUBTREES]; int subtreeCount = processOutDirFiles(results, tnOutDir.forCgi, &singleSubtreeTn, &singleSubtreeMuts, subtreeTns, subtreeMuts, MAX_SUBTREES); results->subtreeInfoList = parseSubtrees(subtreeCount, singleSubtreeTn, singleSubtreeMuts, subtreeTns, subtreeMuts, sampleIds); results->singleSubtreeInfo = results->subtreeInfoList; results->subtreeInfoList = results->subtreeInfoList->next; reportTiming(pStartTime, "process results from matUtils"); return results; }