e14c2761ca01d8175578df87bd5311b5c0442bfb angie Thu Nov 19 13:04:01 2020 -0800 New CGI hgPhyloPlace: place SARS-CoV-2 genomes in fasta or VCF on phylogenetic tree using Yatish Turakhia's UShER program; add subtree custom tracks and Nextstrain linkouts to visualize results. refs #25943 diff --git src/hg/hgPhyloPlace/runUsher.c src/hg/hgPhyloPlace/runUsher.c new file mode 100644 index 0000000..946ac87 --- /dev/null +++ src/hg/hgPhyloPlace/runUsher.c @@ -0,0 +1,767 @@ +/* Invoke usher to place user's uploaded samples in the phylogenetic tree & parse output files. */ + +/* Copyright (C) 2020 The Regents of the University of California */ + +#include "common.h" +#include "dnautil.h" +#include "hash.h" +#include "linefile.h" +#include "obscure.h" +#include "parsimonyProto.h" +#include "phyloPlace.h" +#include "regexHelper.h" +#include "pipeline.h" +#include "trashDir.h" + +// Keywords in stderr output of usher: +#define sampleIdPrefix "Sample name:" +#define pScorePrefix "Parsimony score:" +#define numPlacementsPrefix "Number of parsimony-optimal placements:" +#define bestNodePrefix "Best node (" +#define mutationsPrefix "Mutations: " +#define sampleMutsPrefix "Sample mutations:" +#define imputedMutsPrefix "Imputed mutations:" + +static void parseSampleIdAndParsimonyScore(char **words, char **retSampleId, + struct hash *samplePlacements) +/* If words[] seems to contain columns of the line that gives sample ID and parsimony score, + * then parse out those values. */ +{ +// Example line: +// words[0] = Current tree size (#nodes): 70775 +// words[1] = Sample name: MyLabSequence2 +// words[2] = Parsimony score: 1 +// words[3] = Number of parsimony-optimal placements: 1 +char *p = stringIn(sampleIdPrefix, words[1]); +if (p) + { + *retSampleId = cloneString(trimSpaces(p + strlen(sampleIdPrefix))); + struct placementInfo *info; + AllocVar(info); + hashAdd(samplePlacements, *retSampleId, info); + info->sampleId = *retSampleId; + p = stringIn(pScorePrefix, words[2]); + if (p) + info->parsimonyScore = atoi(skipToSpaces(p + strlen(pScorePrefix))); + else + errAbort("Problem parsing stderr output of usher: " + "expected '" sampleIdPrefix "...' to be followed by '" + pScorePrefix "...' but could not find the latter." ); + p = stringIn(numPlacementsPrefix, words[3]); + if (p) + info->bestNodeCount = atoi(skipToSpaces(p + strlen(numPlacementsPrefix))); + else + errAbort("Problem parsing stderr output of usher: " + "expected '" sampleIdPrefix "... " pScorePrefix " ...' to be followed by '" + numPlacementsPrefix "...' but could not find the latter." ); + } +else + errAbort("Unexpected format of sample ID line:\n%s\t%s\t%s\t%s", + words[0], words[1], words[2], words[3]); +} + +static struct singleNucChange *parseSnc(char *sncStr) +/* If sncStr looks like a <old><pos><new>-style single nucleotide change then parse out those + * values & return singleNucChange (with parBase and newBase; no refBase), otherwise return NULL. */ +{ +struct singleNucChange *snc = NULL; +regmatch_t substrs[4]; +if (regexMatchSubstr(sncStr, "^([ACGT])([0-9]+)([ACGT])$", substrs, ArraySize(substrs))) + { + int chromStart = regexSubstringInt(sncStr, substrs[2]) - 1; + snc = sncNew(chromStart, '\0', sncStr[0], sncStr[substrs[3].rm_so]); + } +return snc; +} + +static struct variantPathNode *parsePipeyPath(char *pipeyPath) +/* Parse something like "|C8782T|T28144C| > |C29095T| > |G2494A|T11083G|C18501T|" + * into a variant path with unknown node names. */ +{ +if (isEmpty(pipeyPath)) + return NULL; +struct variantPathNode *variantPath = NULL; +char *words[strlen(pipeyPath) / 5]; +int wordCount = chopString(pipeyPath, " > ", words, ArraySize(words)); +int i; +for (i = 0; i < wordCount; i++) + { + struct variantPathNode *vpn; + AllocVar(vpn); + vpn->nodeName = cloneString("?"); + char *mutStr = words[i]; + // Trim first and last pipes + if (mutStr[0] == '|') + mutStr++; + if (mutStr[strlen(mutStr)-1] == '|') + mutStr[strlen(mutStr)-1] = '\0'; + // Split by pipe + char *muts[strlen(mutStr) / 4]; + int sncCount = chopString(mutStr, "|", muts, ArraySize(muts)); + int j; + for (j = 0; j < sncCount; j++) + { + struct singleNucChange *snc = parseSnc(muts[j]); + if (!snc) + errAbort("Expected single-nucleotide change but got '%s' when parsing pipe-separated " + "best node path", muts[j]); + slAddHead(&vpn->sncList, snc); + } + slReverse(&vpn->sncList); + slAddHead(&variantPath, vpn); + } +slReverse(&variantPath); +return variantPath; +} + +static boolean parseBestNode(char **words, struct placementInfo *info) +/* If the line starts with "Best node", parse out the name and path of the node + * (or one of the nodes) with the lowest parsimony distance from the sample being placed; + * add to info->bestNodes and return TRUE. */ +{ +// Example line (* means this node was chosen for placement): +// words[0] = Best node (child)*: 1239 +// words[1] = Mutations: |C8782T|T28144C| > |C29095T| > |T8782C| > |T29095C| > |C28144T| +// or +// words[0] = Best node (sibling): SomeLeafName +// words[1] = Mutations: |C8782T|T28144C| > |C29095T| > |T8782C| > |T29095C| > |C28144T| > |G11083T| > |G2494A|T11083G|C18501T| +boolean matches = FALSE; +if (stringIn(bestNodePrefix, words[0])) + { + matches = TRUE; + struct bestNodeInfo *bn; + AllocVar(bn); + char *p = words[0] + strlen(bestNodePrefix); + if (startsWith("sibling", p)) + bn->isSibling = TRUE; + boolean isChosen = (stringIn(")*:", words[0]) != NULL); + p = stringIn(": ", words[0]); + if (p) + bn->name = cloneString(p + strlen(": ")); + else + errAbort("parseBestNode: expected first column to have ': ' followed by name, but got '%s'", + words[0]); + if (startsWith(mutationsPrefix, words[1])) + bn->variantPath = parsePipeyPath(words[1] + strlen(mutationsPrefix)); + else + errAbort("parseBestNode: expected second column to have '" mutationsPrefix"' followed by " + "path, but got '%s'", words[1]); + if (isChosen) + slAddHead(&info->bestNodes, bn); + else + slAddTail(&info->bestNodes, bn); + } +return matches; +} + + +static boolean parseSampleMutations(char **words, struct placementInfo *info) +/* If words[] looks like it defines the sample mutations relative to the reference genome, + * then parse out the list and add to info->sampleMuts and return TRUE. */ +{ +// Example line: +// words[0] = Sample mutations: +// words[1] = |C241T| |C3037T| |C14408T| |A23403G| +boolean matches = FALSE; +if (stringIn(sampleMutsPrefix, words[0])) + { + matches = TRUE; + char *mutStr = words[1]; + stripChar(mutStr, '|'); + info->sampleMuts = slNameListFromString(mutStr, ' '); + } +return matches; +} + +static boolean parseImputedMutations(char **words, struct placementInfo *info) +/* If words[] looks like it defines imputed mutations of the most recently named sample, + * then parse out the list and add to info->imputedBases and return TRUE. */ +{ +// Example line: +// words[0] = Imputed mutations: +// words[1] = 6709:A;23403:G +boolean matches = FALSE; +if (stringIn(imputedMutsPrefix, words[0])) + { + matches = TRUE; + char *muts[strlen(words[1]) / 4]; + int mutCount = chopString(words[1], ";", muts, ArraySize(muts)); + struct baseVal *bvList = NULL; + int i; + for (i = 0; i < mutCount; i++) + { + boolean problem = FALSE; + char *colon = strchr(muts[i], ':'); + if (colon) + { + int pos = atoi(muts[i]); + char *val = cloneString(colon+1); + if (pos < 1) + problem = TRUE; + else if (!isAllNt(val, strlen(val))) + problem = TRUE; + else + { + struct baseVal *bv; + AllocVar(bv); + bv->chromStart = pos - 1; + bv->val = val; + slAddHead(&bvList, bv); + } + } + if (problem) + errAbort("Problem parsing stderr output of usher: " + "expected imputed mutation to be number:base, but got '%s'", muts[i]); + } + slReverse(&bvList); + info->imputedBases = bvList; + } +return matches; +} + +static void parseStderr(char *amsStderrFile, struct hash *samplePlacements) +/* The stderr output of usher is where we find important info for each sample: + * the path of variants on nodes from root to sample leaf, imputed values of ambiguous bases + * (if any), and parsimony score. */ +{ +struct lineFile *lf = lineFileOpen(amsStderrFile, TRUE); +char *sampleId = NULL; +int size; +char *line; +while (lineFileNext(lf, &line, &size)) + { + char lineCpy[size+1]; + safencpy(lineCpy, sizeof lineCpy, line, size); + char *words[16]; + int wordCount = chopTabs(lineCpy, words); + if (wordCount == 4) + parseSampleIdAndParsimonyScore(words, &sampleId, samplePlacements); + else if (wordCount == 2) + { + if (! sampleId) + errAbort("Problem parsing stderr output of usher: " + "Got line starting with '%s' that was not preceded by a line that " + "defines sample ID.:\n%s", words[0], line); + struct placementInfo *info = hashFindVal(samplePlacements, sampleId); + if (!info) + errAbort("Problem parsing stderr output of usher: " + "Can't find placement info for sample '%s'", sampleId); + if (! parseBestNode(words, info) && + ! parseSampleMutations(words, info)) + parseImputedMutations(words, info); + } + } +} + +static void parseVariantPaths(char *filename, struct hash *samplePlacements) +/* Parse out space-sep list of {node ID, ':', node-associated ,-sep variant list} into + * variantPathNode list and associate with sample ID. */ +{ +// Example line (note the back-mutation at 28144T... may want to highlight those): +// words[0] = MySeq +// words[1] = 1:C8782T,T28144C 2309:C29095T 2340:T8782C 2342:T29095C 2588:C28144T MySeq:C29867T +struct lineFile *lf = lineFileOpen(filename, TRUE); +char *line; +while (lineFileNext(lf, &line, NULL)) + { + char *words[3]; + int wordCount = chopTabs(line, words); + lineFileExpectWords(lf, 2, wordCount); + char *sampleId = words[0]; + char *nodePath = words[1]; + char *nodes[strlen(nodePath) / 4]; + int nodeCount = chopString(nodePath, " ", nodes, ArraySize(nodes)); + struct variantPathNode *vpNodeList = NULL; + int i; + for (i = 0; i < nodeCount; i++) + { + struct variantPathNode *vpn; + AllocVar(vpn); + char *nodeVariants = nodes[i]; + // First there is a node ID followed by ':'. If node ID is numeric, ignore; + // otherwise it is a sample ID, indicating a mutation specific to the sample ID, + // so include it. + char *colon = strrchr(nodeVariants, ':'); + if (colon) + { + *colon = '\0'; + char *nodeName = nodeVariants; + nodeVariants = colon+1; + vpn->nodeName = cloneString(nodeName); + } + // Next there should be a comma-sep list of mutations/variants. + char *variants[strlen(nodeVariants) / 4]; + int varCount = chopCommas(nodeVariants, variants); + int j; + for (j = 0; j < varCount; j++) + { + variants[j] = trimSpaces(variants[j]); + struct singleNucChange *snc = parseSnc(variants[j]); + if (snc) + slAddHead(&vpn->sncList, snc); + else + errAbort("parseVariantPath: Expected variant path for %s to specify " + "single-nucleotide changes but got '%s'", sampleId, variants[j]); + } + slReverse(&vpn->sncList); + slAddHead(&vpNodeList, vpn); + } + slReverse(&vpNodeList); + struct placementInfo *info = hashFindVal(samplePlacements, sampleId); + if (!info) + errAbort("parseVariantPath: can't find placementInfo for sample '%s'", sampleId); + info->variantPath = vpNodeList; + } +lineFileClose(&lf); +} + +static struct variantPathNode *parseSubtreeMut(char *line) +/* Parse a line with a node name and list of mutations. Examples: + * ROOT->1: C241T,C14408T,A23403G,C3037T,A20268G,C28854T,T24076C + * 1: C20759T + * USA/CA-CZB-4019/2020|EPI_ISL_548621|20-08-01: G17608T,G22199T + * node_6849_condensed_4_leaves: + */ +{ +struct variantPathNode *vpn = NULL; +char *colon = strrchr(line, ':'); +if (colon) + { + AllocVar(vpn); + char *nodeName = line; + *colon = '\0'; + vpn->nodeName = cloneString(nodeName); + char *mutString = trimSpaces(colon+1); + char *mutWords[strlen(mutString)/4]; + int mutCount = chopCommas(mutString, mutWords); + int i; + for (i = 0; i < mutCount; i++) + { + struct singleNucChange *snc = parseSnc(mutWords[i]); + if (snc) + slAddHead(&vpn->sncList, snc); + else + errAbort("parseSubtreeMut: Expected subtree mutation list to specify single-nucleotide " + "changes but got '%s'", mutWords[i]); + } + slReverse(&vpn->sncList); + } +else + errAbort("parseSubtreeMut: Expected line to contain colon but got '%s'", line); +return vpn; +} + +static struct variantPathNode *parseSubtreeMutations(char *filename) +/* Parse subtree node mutation lists out of usher subtree-N-mutations.txt file. */ +{ +struct variantPathNode *subtreeMutList = NULL; +struct lineFile *lf = lineFileOpen(filename, TRUE); +char *line; +while (lineFileNext(lf, &line, NULL)) + { + struct variantPathNode *vpn = parseSubtreeMut(line); + if (vpn) + { + // The ROOT->1 (subtree ancestor) sncList needs to be prepended to the subtree + // root's sncList because Auspice doesn't seem to display mutations on nodes + // with only one child. Discard the subtree ancestor element. + if (subtreeMutList && subtreeMutList->next == NULL && + startsWith("ROOT->", subtreeMutList->nodeName)) + { + vpn->sncList = slCat(subtreeMutList->sncList, vpn->sncList); + subtreeMutList = NULL; + } + slAddHead(&subtreeMutList, vpn); + } + } +slReverse(&subtreeMutList); +lineFileClose(&lf); +return subtreeMutList; +} + +static struct usherResults *usherResultsNew() +/* Allocate & return usherResults (just bigTreePlusTn and samplePlacements, subtrees come later). */ +{ +struct usherResults *results; +AllocVar(results); +AllocVar(results->bigTreePlusTn); +trashDirFile(results->bigTreePlusTn, "ct", "phyloPlusSamples", ".nwk"); +results->samplePlacements = hashNew(0); +return results; +} + +static void rPhyloLeafNames(struct phyloTree *node, struct slName **pList) +/* Build up a list of leaf names under node in reverse depth-first-search order. */ +{ +if (node->numEdges > 0) + { + int i; + for (i = 0; i < node->numEdges; i++) + rPhyloLeafNames(node->edges[i], pList); + } +else + slAddHead(pList, slNameNew(node->ident->name)); +} + +static struct slName *phyloLeafNames(struct phyloTree *tree) +/* Return a list of leaf names in tree in depth-first-search order. */ +{ +struct slName *list = NULL; +rPhyloLeafNames(tree, &list); +slReverse(&list); +return list; +} + +static struct hash *slNameListToIxHash(struct slName *list) +/* Given a list of names, add each name to a hash of ints with the index of the name in the list. */ +{ +struct hash *hash = hashNew(0); +struct slName *sln; +int ix; +for (ix = 0, sln = list; sln != NULL; ix++, sln = sln->next) + hashAddInt(hash, sln->name, ix); +return hash; +} + +static struct slName *getSubtreeSampleIds(struct slName *sampleIds, struct hash *subtreeIds) +/* Return a list of sampleIds that are found in subtreeIds. */ +{ +struct slName *subtreeSampleIds = NULL; +struct slName *id; +for (id = sampleIds; id != NULL; id = id->next) + if (hashLookup(subtreeIds, id->name)) + slNameAddHead(&subtreeSampleIds, id->name); +slReverse(&subtreeSampleIds); +return subtreeSampleIds; +} + +static int hashElIntCmpDesc(const void *va, const void *vb) +/* Compare two hashEl by integer value, descending. */ +{ +const struct hashEl *a = *((struct hashEl **)va); +const struct hashEl *b = *((struct hashEl **)vb); +return b->val - a->val; +} + +static char *summarizeCountries(struct slName *idList) +/* If enough sample names in idList have identifiable countries, then return a summary string, + * else NULL. */ +{ +char *summary = NULL; +int idCount = slCount(idList); +int idWithCountryCount = 0; +struct hash *countryCounts = hashNew(0); +struct slName *id; +for (id = idList; id != NULL; id = id->next) + { + regmatch_t substrs[2]; + if (regexMatchSubstr(id->name, "([A-Za-z_]+)/[^/]+/20", substrs, ArraySize(substrs))) + { + char country[256]; + regexSubstringCopy(id->name, substrs[1], country, sizeof country); + idWithCountryCount++; + hashIncInt(countryCounts, country); + } + } +if ((double)idWithCountryCount / idCount >= 0.5) + { + struct dyString *dy = dyStringCreate("%d from ", idCount); + struct hashEl *helList = hashElListHash(countryCounts); + if (helList->next == NULL) + dyStringAppend(dy, helList->name); + else + { + slSort(&helList, hashElIntCmpDesc); + struct hashEl *hel; + for (hel = helList; hel != NULL; hel = hel->next) + { + if (hel != helList) + dyStringAppend(dy, "+"); + dyStringAppend(dy, hel->name); + int count = ptToInt(hel->val); + if (count > 1) + dyStringPrintf(dy, "[%d]", count); + if (dyStringLen(dy) > 100) + { + dyStringAppend(dy, "+..."); + break; + } + } + } + // Labels have to be unique -- tack on an example ID + dyStringPrintf(dy, " eg %s", idList->name); + summary = dyStringCannibalize(&dy); + subChar(summary, ' ', '_'); + } +return summary; +} + +static char *expandCondensedNodeName(struct hash *condensedNodes, char *name) +/* If name is found in condensedNodes, then return an expanded name that indicates countries of + * origin (if we can find them) and an example ID. Otherwise return NULL. */ +{ +char *expandedName = NULL; +struct slName *nodeList = hashFindVal(condensedNodes, name); +if (nodeList) + { + char *summary = summarizeCountries(nodeList); + if (summary) + expandedName = summary; + else + { + int nodeCount = slCount(nodeList); + struct dyString *dy = dyStringCreate("%s_and_%d_others", nodeList->name, nodeCount-1); + expandedName= dyStringCannibalize(&dy); + } + } +return expandedName; +} + +static struct hash *expandCondensedNodeNames(struct hash *condensedNodes, struct slName *idList) +/* If idList contains names found in condensedNodes, then hash those names to new names that are + * _-sep strings from the lists in condensedNodes. Return hash (which may have no elements + * if nothing in idList is found in condensedNodes). */ +{ +struct hash *hash = hashNew(0); +struct slName *id; +for (id = idList; id != NULL; id = id->next) + { + char *expandedName = expandCondensedNodeName(condensedNodes, id->name); + if (expandedName) + hashAdd(hash, id->name, expandedName); + } +return hash; +} + +static void rSubstTreeNames(struct phyloTree *node, struct hash *nameSubstitutions) +/* If node or descendants have names in nameSubstitutions, then substitute those names. */ +{ +if (node->ident->name) + { + char *subst = hashFindVal(nameSubstitutions, node->ident->name); + if (subst) + node->ident->name = subst; + } +int i; +for (i = 0; i < node->numEdges; i++) + rSubstTreeNames(node->edges[i], nameSubstitutions); +} + +static struct tempName *substituteTreeNames(struct phyloTree *tree, struct hash *nameSubstitutions) +/* If tree has any nodes whose names are in nameSubstitutions, then substitute those names. + * Write tree out to a trash file and return its location. */ +{ +rSubstTreeNames(tree, nameSubstitutions); +struct tempName *newTn; +AllocVar(newTn); +trashDirFile(newTn, "ct", "treeNameSubst", ".nwk"); +FILE *f = mustOpen(newTn->forCgi, "w"); +phyloPrintTree(tree, f); +carefulClose(&f); +return newTn; +} + +static struct slName *substituteNameList(struct slName *idList, struct hash *nameSubstitutions) +/* Return a new list that is just like idList, except if any item in idList has a value in + * nameSubstitutions, then the item is replaced by the substitution. */ +{ +struct slName *newList = NULL; +struct slName *id; +for (id = idList; id != NULL; id = id->next) + { + char *subst = hashFindVal(nameSubstitutions, id->name); + slNameAddHead(&newList, subst ? subst : id->name); + } +slReverse(&newList); +return newList; +} + +static void addMutationsToTree(struct phyloTree *node, struct variantPathNode **pNodeMutList) +/* Store the list of mutations associated with each node in node->priv. */ +{ +struct variantPathNode *nodeMuts = slPopHead(pNodeMutList); +if (! nodeMuts) + errAbort("addMutationsToTree: subtree mutation list has fewer nodes than subtree"); +if (! sameString(nodeMuts->nodeName, node->ident->name)) + errAbort("addMutationsToTree: subtree node name is '%s' but subtree mutation list item is '%s'", + node->ident->name, nodeMuts->nodeName); +if (node->priv != NULL) + errAbort("addMutationsToTree: node already has mutations assigned"); +node->priv = nodeMuts->sncList; +int i; +for (i = 0; i < node->numEdges; i++) + addMutationsToTree(node->edges[i], pNodeMutList); +} + +static struct subtreeInfo *parseSubtrees(int subtreeCount, struct tempName *subtreeTns[], + struct variantPathNode *subtreeMuts[], + struct slName *userSampleIds, struct hash *condensedNodes) +/* Parse usher's subtree output, figure out which user samples are in each subtree, expand names + * of condensed nodes, and write auspice json. */ +{ +struct subtreeInfo *subtreeInfoList = NULL; +int sIx; +for (sIx = 0; sIx < subtreeCount; sIx++) + { + struct subtreeInfo *ti; + AllocVar(ti); + ti->subtreeTn = subtreeTns[sIx]; + ti->subtree = phyloOpenTree(ti->subtreeTn->forCgi); + addMutationsToTree(ti->subtree, &subtreeMuts[sIx]); + if (subtreeMuts[sIx] != NULL) + errAbort("addMutationsToTree: subtreeMutationList has more nodes than subtree"); + struct slName *subtreeIdList = phyloLeafNames(ti->subtree); + // Don't do name substitutions on condensed node names in subtreeIdToIx since the IDs have to + // match those in the original tree from protobuf. + ti->subtreeIdToIx = slNameListToIxHash(subtreeIdList); + ti->subtreeUserSampleIds = getSubtreeSampleIds(userSampleIds, ti->subtreeIdToIx); + if (slCount(ti->subtreeUserSampleIds) == 0) + errAbort("No user sample IDs found in subtree file %s", ti->subtreeTn->forCgi); + // Substitute in nicer node names for condensed nodes for displaying to the user in + // custom tracks and Nextstrain/auspice JSON. + struct hash *nameSubstitutions = expandCondensedNodeNames(condensedNodes, subtreeIdList); + if (nameSubstitutions->elCount > 0) + ti->subtreeTn = substituteTreeNames(ti->subtree, nameSubstitutions); + ti->subtreeNameList = substituteNameList(subtreeIdList, nameSubstitutions); + slAddHead(&subtreeInfoList, ti); + hashFree(&nameSubstitutions); + slFreeList(&subtreeIdList); + } +slReverse(&subtreeInfoList); +return subtreeInfoList; +} + +static char *dirPlusFile(struct dyString *dy, char *dir, char *file) +/* Write dir/file into dy and return pointer to dy->string. */ +{ +dyStringClear(dy); +dyStringPrintf(dy, "%s/%s", dir, file); +return dy->string; +} + +static int processOutDirFiles(struct usherResults *results, char *outDir, + struct tempName *subtreeTns[], struct variantPathNode *subtreeMuts[], + int maxSubtrees) +/* Get paths to files in outDir; parse them, move files that we'll keep up to trash/ct/, + * remove outDir. */ +{ +int subtreeCount = 0; +memset(subtreeTns, 0, maxSubtrees * sizeof(*subtreeTns)); +memset(subtreeMuts, 0, maxSubtrees * sizeof(*subtreeMuts)); +struct dyString *dyScratch = dyStringNew(0); +struct slName *outDirFiles = listDir(outDir, "*"), *file; +for (file = outDirFiles; file != NULL; file = file->next) + { + char *path = dirPlusFile(dyScratch, outDir, file->name); + if (sameString(file->name, "uncondensed-final-tree.nh")) + { + mustRename(path, results->bigTreePlusTn->forCgi); + } + else if (sameString(file->name, "mutation-paths.txt")) + { + parseVariantPaths(path, results->samplePlacements); + } + else if (startsWith("subtree-", file->name)) + { + char fnameCpy[strlen(file->name)+1]; + safecpy(fnameCpy, sizeof fnameCpy, file->name); + char *parts[4]; + int partCount = chopString(fnameCpy, "-", parts, ArraySize(parts)); + if (partCount == 2) + { + // Expect "subtree-N.nh" + char *dot = strstr(parts[1], ".nh"); + if (dot) + { + *dot = '\0'; + int subtreeIx = atol(parts[1]) - 1; + if (subtreeIx >= maxSubtrees) + errAbort("Too many subtrees in usher output (max %d)", maxSubtrees); + if (subtreeIx >= subtreeCount) + subtreeCount = subtreeIx + 1; + char sname[32]; + safef(sname, sizeof sname, "subtree%d", subtreeIx+1); + AllocVar(subtreeTns[subtreeIx]); + trashDirFile(subtreeTns[subtreeIx], "ct", sname, ".nwk"); + mustRename(path, subtreeTns[subtreeIx]->forCgi); + } + else + warn("Unexpected filename '%s' from usher, ignoring", file->name); + } + else if (partCount == 3) + { + // Expect "subtree-N-mutations.txt" or "subtree-N-expanded.txt" + int subtreeIx = atol(parts[1]) - 1; + if (subtreeIx >= maxSubtrees) + errAbort("Too many subtrees in usher output (max %d)", maxSubtrees); + if (subtreeIx >= subtreeCount) + subtreeCount = subtreeIx + 1; + if (sameString(parts[2], "mutations.txt")) + { + subtreeMuts[subtreeIx] = parseSubtreeMutations(path); + } + else if (sameString(parts[2], "expanded.txt")) + { + // Don't need this, just remove it + } + else + warn("Unexpected filename '%s' from usher, ignoring", file->name); + } + else + warn("Unexpected filename '%s' from usher, ignoring", file->name); + } + else if (sameString(file->name, "final-tree.nh")) + { + // Don't need this, just remove it. + } + else + warn("Unexpected filename '%s' from usher, ignoring", file->name); + unlink(path); + } +rmdir(outDir); +// Make sure we got a complete range of subtrees [0..subtreeCount-1] +int i; +for (i = 0; i < subtreeCount; i++) + { + if (subtreeTns[i] == NULL) + errAbort("Missing file subtree-%d.nh in usher results", i+1); + if (subtreeMuts[i] == NULL) + errAbort("Missing file subtree-%d-mutations.txt in usher results", i+1); + } +return subtreeCount; +} + +#define MAX_SUBTREES 1000 + +struct usherResults *runUsher(char *usherPath, char *usherAssignmentsPath, char *vcfFile, + int subtreeSize, struct slName *userSampleIds, + struct hash *condensedNodes, int *pStartTime) +/* Open a pipe from Yatish Turakhia's usher program, save resulting big trees and + * subtrees to trash files, return list of slRef to struct tempName for the trash files + * and parse other results out of stderr output. */ +{ +struct usherResults *results = usherResultsNew(); +char subtreeSizeStr[16]; +safef(subtreeSizeStr, sizeof subtreeSizeStr, "%d", subtreeSize); +char *numThreadsStr = "16"; +struct tempName tnOutDir; +trashDirFile(&tnOutDir, "ct", "usher_outdir", ".dir"); +char *cmd[] = { usherPath, "-v", vcfFile, "-i", usherAssignmentsPath, "-d", tnOutDir.forCgi, + "-k", subtreeSizeStr, "-T", numThreadsStr, "-u", "-l", NULL }; +char **cmds[] = { cmd, NULL }; +struct tempName tnStderr; +trashDirFile(&tnStderr, "ct", "usher_stderr", ".txt"); +struct pipeline *pl = pipelineOpen(cmds, pipelineRead, NULL, tnStderr.forCgi); +pipelineClose(&pl); +reportTiming(pStartTime, "run usher"); +parseStderr(tnStderr.forCgi, results->samplePlacements); + +struct tempName *subtreeTns[MAX_SUBTREES]; +struct variantPathNode *subtreeMuts[MAX_SUBTREES]; +int subtreeCount = processOutDirFiles(results, tnOutDir.forCgi, subtreeTns, subtreeMuts, + MAX_SUBTREES); +results->subtreeInfoList = parseSubtrees(subtreeCount, subtreeTns, subtreeMuts, userSampleIds, + condensedNodes); +return results; +} +