d903a7409978ea81cc0f2e6e34687319f7d9d222 angie Wed Jul 28 15:50:24 2021 -0700 When extracting samples from the tree with matUtils, add the new --usher-clades-txt option and create empty placementInfo's to hold the results of parsing clades.txt so we can show clades/lineages according to the tree in the result summary table. diff --git src/hg/hgPhyloPlace/runUsher.c src/hg/hgPhyloPlace/runUsher.c index cfcd2fe..3052cf9 100644 --- src/hg/hgPhyloPlace/runUsher.c +++ src/hg/hgPhyloPlace/runUsher.c @@ -854,55 +854,71 @@ int subtreeCount = processOutDirFiles(results, tnOutDir.forCgi, &singleSubtreeTn, &singleSubtreeMuts, subtreeTns, subtreeMuts, MAX_SUBTREES); if (singleSubtreeTn == NULL) { warn("Sorry, there was a problem running usher. " "Please ask genome-www@soe.ucsc.edu to take a look at %s.", tnStderr.forCgi); return NULL; } results->subtreeInfoList = parseSubtrees(subtreeCount, singleSubtreeTn, singleSubtreeMuts, subtreeTns, subtreeMuts, userSampleIds, condensedNodes); results->singleSubtreeInfo = results->subtreeInfoList; results->subtreeInfoList = results->subtreeInfoList->next; return results; } +static void addEmptyPlacements(struct slName *sampleIds, struct hash *samplePlacements) +/* Parsing an usher-style clades.txt file from matUtils extract requires samplePlacements to + * have placementInfo for each sample. When running usher, those are added when we parse + * usher stderr; when running matUtils, just allocate one for each sample. */ +{ +struct slName *sample; +for (sample = sampleIds; sample != NULL; sample = sample->next) + { + struct placementInfo *info; + AllocVar(info); + hashAdd(samplePlacements, sample->name, info); + info->sampleId = cloneString(sample->name); + } +} + struct usherResults *runMatUtilsExtractSubtrees(char *matUtilsPath, char *protobufPath, int subtreeSize, struct slName *sampleIds, struct hash *condensedNodes, int *pStartTime) /* Open a pipe from Yatish Turakhia and Jakob McBroome's matUtils extract to extract subtrees * containing sampleIds, save resulting subtrees to trash files, return subtree results. * Caller must ensure that sampleIds are names of leaves in the protobuf tree. */ { struct usherResults *results = usherResultsNew(); char subtreeSizeStr[16]; safef(subtreeSizeStr, sizeof subtreeSizeStr, "%d", subtreeSize); char *numThreadsStr = "16"; struct tempName tnSamples; trashDirFile(&tnSamples, "ct", "matUtilsExtractSamples", ".txt"); FILE *f = mustOpen(tnSamples.forCgi, "w"); struct slName *sample; for (sample = sampleIds; sample != NULL; sample = sample->next) fprintf(f, "%s\n", sample->name); carefulClose(&f); struct tempName tnOutDir; trashDirFile(&tnOutDir, "ct", "matUtils_outdir", ".dir"); char *cmd[] = { matUtilsPath, "extract", "-i", protobufPath, "-d", tnOutDir.forCgi, "-s", tnSamples.forCgi, "-x", subtreeSizeStr, "-X", SINGLE_SUBTREE_SIZE, "-T", numThreadsStr, - NULL }; + "--usher-clades-txt", NULL }; char **cmds[] = { cmd, NULL }; struct tempName tnStderr; trashDirFile(&tnStderr, "ct", "matUtils_stderr", ".txt"); struct pipeline *pl = pipelineOpen(cmds, pipelineRead, NULL, tnStderr.forCgi); pipelineClose(&pl); reportTiming(pStartTime, "run matUtils"); +addEmptyPlacements(sampleIds, results->samplePlacements); struct tempName *singleSubtreeTn = NULL, *subtreeTns[MAX_SUBTREES]; struct variantPathNode *singleSubtreeMuts = NULL, *subtreeMuts[MAX_SUBTREES]; int subtreeCount = processOutDirFiles(results, tnOutDir.forCgi, &singleSubtreeTn, &singleSubtreeMuts, subtreeTns, subtreeMuts, MAX_SUBTREES); results->subtreeInfoList = parseSubtrees(subtreeCount, singleSubtreeTn, singleSubtreeMuts, subtreeTns, subtreeMuts, sampleIds, condensedNodes); results->singleSubtreeInfo = results->subtreeInfoList; results->subtreeInfoList = results->subtreeInfoList->next; return results; }