827de614bca5613228664b8944048f61c61f336a angie Fri Dec 2 09:44:45 2022 -0800 Use @yceh's new --existing_samples option to usher-sampled-server as a faster alternative to running matUtils. diff --git src/hg/hgPhyloPlace/runUsher.c src/hg/hgPhyloPlace/runUsher.c index d35c987..05f688d 100644 --- src/hg/hgPhyloPlace/runUsher.c +++ src/hg/hgPhyloPlace/runUsher.c @@ -1136,44 +1136,45 @@ // Give up - fall back on regular usher command fprintf(errFile, "Second attempt to connect socket %d to path '%s' failed: %s\n", socketFd, usherServerSocketPath, strerror(errno)); socketFd = -1; } } } } return socketFd; } // Server sends ASCII EOT character (4) when done. Sadly I can't find a header file that defines EOT. #define EOT 4 static boolean sendQuery(int socketFd, char *cmd[], char *db, struct treeChoices *treeChoices, - FILE *errFile) + FILE *errFile, boolean addNoIgnorePrefix) /* Send command to socket, read response on socket, return TRUE if we get a successful response. */ { boolean success = FALSE; struct dyString *dyMessage = dyStringNew(0); int ix; for (ix = 0; cmd[ix] != NULL; ix++) { // Don't include args from -T onward; server rejects requests with -T or --optimization_radius if (sameString("-T", cmd[ix])) break; dyStringPrintf(dyMessage, "%s\n", cmd[ix]); } -// But we do need --no-ignore-prefix: +if (addNoIgnorePrefix) + // Needed when placing uploaded sequences, but not when finding uploaded names dyStringPrintf(dyMessage, "--no-ignore-prefix\n"USHER_DEDUP_PREFIX"\n"); dyStringAppendC(dyMessage, '\n'); boolean serverError = FALSE; int bytesWritten = write(socketFd, dyMessage->string, dyMessage->stringSize); if (bytesWritten == dyMessage->stringSize) { struct lineFile *lf = lineFileAttach("server socket", TRUE, socketFd); if (lf) { char *line; while (lineFileNext(lf, &line, NULL)) { if (startsWith("Tree", line) && endsWith(line, "not found")) { // Tell the server to reload the latest protobufs @@ -1203,31 +1204,31 @@ static boolean runUsherServer(char *db, char *cmd[], char *stderrFile, struct treeChoices *treeChoices, int *pStartTime) /* Start the server if necessary, connect to it, send a query, get response and return TRUE if. * all goes well. If unsuccessful, write reasons to errFile and return FALSE. */ { boolean success = FALSE; if (serverIsConfigured(db)) { FILE *errFile = mustOpen(stderrFile, "w"); int serverSocket = getServerSocket(db, treeChoices, errFile); reportTiming(pStartTime, "get socket"); if (serverSocket > 0) { - success = sendQuery(serverSocket, cmd, db, treeChoices, errFile); + success = sendQuery(serverSocket, cmd, db, treeChoices, errFile, TRUE); close(serverSocket); reportTiming(pStartTime, "send query"); } carefulClose(&errFile); } return success; } #define MAX_SUBTREES 1000 struct usherResults *runUsher(char *db, char *usherPath, char *usherAssignmentsPath, char *vcfFile, int subtreeSize, struct slName **pUserSampleIds, struct treeChoices *treeChoices, int *pStartTime) /* Open a pipe from Yatish Turakhia's usher program, save resulting big trees and * subtrees to trash files, return list of slRef to struct tempName for the trash files @@ -1277,69 +1278,111 @@ removeOutDir(tnOutDir.forCgi); } else { results = NULL; warn("Sorry, there was a problem running usher. " "Please ask genome-www@soe.ucsc.edu to take a look at %s.", stderrFile); } reportTiming(pStartTime, "parse results from usher"); return results; } static void addEmptyPlacements(struct slName *sampleIds, struct hash *samplePlacements) /* Parsing an usher-style clades.txt file from matUtils extract requires samplePlacements to * have placementInfo for each sample. When running usher, those are added when we parse - * usher stderr; when running matUtils, just allocate one for each sample. */ + * placement_stats.tsv; when running matUtils, just allocate one for each sample. */ { struct slName *sample; for (sample = sampleIds; sample != NULL; sample = sample->next) { struct placementInfo *info; AllocVar(info); hashAdd(samplePlacements, sample->name, info); info->sampleId = cloneString(sample->name); } } -struct usherResults *runMatUtilsExtractSubtrees(char *matUtilsPath, char *protobufPath, +static void runMatUtilsExtractCommand(char *matUtilsPath, char *protobufPath, + char *subtreeSizeStr, struct tempName *tnSamples, + struct tempName *tnOutDir, int *pStartTime) +/* Open a pipe from Yatish Turakhia and Jakob McBroome's matUtils extract to extract subtrees + * containing sampleIds, save resulting subtrees to trash files, return subtree results. + * Caller must ensure that sampleIds are names of leaves in the protobuf tree. */ +{ +char *cmd[] = { matUtilsPath, "extract", "-i", protobufPath, "-d", tnOutDir->forCgi, + "-s", tnSamples->forCgi, + "-x", subtreeSizeStr, "-X", SINGLE_SUBTREE_SIZE, "-T", USHER_NUM_THREADS, + "--usher-clades-txt", NULL }; +char **cmds[] = { cmd, NULL }; +struct tempName tnStderr; +trashDirFile(&tnStderr, "ct", "matUtils_stderr", ".txt"); +struct pipeline *pl = pipelineOpen(cmds, pipelineRead, NULL, tnStderr.forCgi, 0); +pipelineClose(&pl); +reportTiming(pStartTime, "run matUtils"); +} + +static boolean runMatUtilsServer(char *db, char *protobufPath, char *subtreeSizeStr, + struct tempName *tnSamples, struct tempName *tnOutDir, + struct treeChoices *treeChoices, int *pStartTime) +/* Cheng Ye added a 'matUtils mode' to usher-sampled-server so we can get subtrees super-fast + * for uploaded sample names too. */ +{ +boolean success = FALSE; +char *cmd[] = { "usher-sampled-server", "-i", protobufPath, "-d", tnOutDir->forCgi, + "-k", subtreeSizeStr, "-K", SINGLE_SUBTREE_SIZE, + "--existing_samples", tnSamples->forCgi, "-D", + NULL }; +struct tempName tnErrFile; +trashDirFile(&tnErrFile, "ct", "matUtils_server_stderr", ".txt"); +if (serverIsConfigured(db)) + { + FILE *errFile = mustOpen(tnErrFile.forCgi, "w"); + int serverSocket = getServerSocket(db, treeChoices, errFile); + + reportTiming(pStartTime, "get socket"); + if (serverSocket > 0) + { + success = sendQuery(serverSocket, cmd, db, treeChoices, errFile, FALSE); + close(serverSocket); + reportTiming(pStartTime, "send query"); + } + carefulClose(&errFile); + } +return success; +} + +struct usherResults *runMatUtilsExtractSubtrees(char *db, char *matUtilsPath, char *protobufPath, int subtreeSize, struct slName *sampleIds, - int *pStartTime) + struct treeChoices *treeChoices, int *pStartTime) /* Open a pipe from Yatish Turakhia and Jakob McBroome's matUtils extract to extract subtrees * containing sampleIds, save resulting subtrees to trash files, return subtree results. * Caller must ensure that sampleIds are names of leaves in the protobuf tree. */ { struct usherResults *results = usherResultsNew(); char subtreeSizeStr[16]; safef(subtreeSizeStr, sizeof subtreeSizeStr, "%d", subtreeSize); -char *numThreadsStr = "16"; struct tempName tnSamples; trashDirFile(&tnSamples, "ct", "matUtilsExtractSamples", ".txt"); FILE *f = mustOpen(tnSamples.forCgi, "w"); struct slName *sample; for (sample = sampleIds; sample != NULL; sample = sample->next) fprintf(f, "%s\n", sample->name); carefulClose(&f); struct tempName tnOutDir; trashDirFile(&tnOutDir, "ct", "matUtils_outdir", ".dir"); -char *cmd[] = { matUtilsPath, "extract", "-i", protobufPath, "-d", tnOutDir.forCgi, - "-s", tnSamples.forCgi, - "-x", subtreeSizeStr, "-X", SINGLE_SUBTREE_SIZE, "-T", numThreadsStr, - "--usher-clades-txt", NULL }; -char **cmds[] = { cmd, NULL }; -struct tempName tnStderr; -trashDirFile(&tnStderr, "ct", "matUtils_stderr", ".txt"); -struct pipeline *pl = pipelineOpen(cmds, pipelineRead, NULL, tnStderr.forCgi, 0); -pipelineClose(&pl); -reportTiming(pStartTime, "run matUtils"); +if (! runMatUtilsServer(db, protobufPath, subtreeSizeStr, &tnSamples, &tnOutDir, treeChoices, + pStartTime)) + runMatUtilsExtractCommand(matUtilsPath, protobufPath, subtreeSizeStr, &tnSamples, &tnOutDir, + pStartTime); addEmptyPlacements(sampleIds, results->samplePlacements); struct tempName *singleSubtreeTn = NULL, *subtreeTns[MAX_SUBTREES]; struct variantPathNode *singleSubtreeMuts = NULL, *subtreeMuts[MAX_SUBTREES]; int subtreeCount = processOutDirFiles(results, tnOutDir.forCgi, &singleSubtreeTn, &singleSubtreeMuts, subtreeTns, subtreeMuts, MAX_SUBTREES); results->subtreeInfoList = parseSubtrees(subtreeCount, singleSubtreeTn, singleSubtreeMuts, subtreeTns, subtreeMuts, sampleIds); results->singleSubtreeInfo = results->subtreeInfoList; results->subtreeInfoList = results->subtreeInfoList->next; reportTiming(pStartTime, "process results from matUtils"); return results; }