0fffa3c31de4845a9bd3f06c0992f971e4d8a7a3 angie Fri Oct 28 15:08:06 2022 -0700 Performance improvements for trees with millions of sequences: * Use @yceh's usher-sampled-server if configured; it preloads protobufs and can start placing sequences immediately using usher-sampled, a faster version of usher * Use usher-sampled instead of usher if server is not configured but usher-sampled is available * Load sample metadata file in a pthread while usher(-sampled(-server)) or matUtils is running * Skip checking for sample name clashes in uploaded fasta when using usher-sampled(-server)'s new --no-ignore-prefix option (but look for the prefix when parsing results) * Avoid parsing the protobuf and traversing the big tree unless absolutely necessary ** Subtrees from usher/matUtils have not included condensed nodes in a long time; remove lots of condensedNodes/summarization code from phyloPlace.c, runUsher.c, writeCustomTracks.c ** Use subtrees instead of big tree when possible (in findNearestNeighbor, treeToBaseAlleles, uploadedSamplesTree) ** Skip the informativeBases stuff that inhibits masking of sites from Problematic Sites set when the tree was built with an earlier version -- that pretty much never applies anymore now that only daily-updated trees are offered, not a range from old to new. ** Allow config.ra to specify a flat file of sample names (needed for searching user's uploaded names/IDs before calling matUtils) instead of getting names from the big tree diff --git src/hg/hgPhyloPlace/runUsher.c src/hg/hgPhyloPlace/runUsher.c index 3c04bdc..d35c987 100644 --- src/hg/hgPhyloPlace/runUsher.c +++ src/hg/hgPhyloPlace/runUsher.c @@ -1,857 +1,1345 @@ /* Invoke usher to place user's uploaded samples in the phylogenetic tree & parse output files. */ -/* Copyright (C) 2020 The Regents of the University of California */ +/* Copyright (C) 2020-2022 The Regents of the University of California */ #include "common.h" #include "dnautil.h" #include "hash.h" +#include "hgConfig.h" #include "linefile.h" #include "obscure.h" #include "parsimonyProto.h" #include "phyloPlace.h" #include "regexHelper.h" #include "pipeline.h" +#include "trackHub.h" #include "trashDir.h" +#include <signal.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/un.h> // Keywords in stderr output of usher: #define sampleIdPrefix "Sample name:" #define pScorePrefix "Parsimony score:" #define numPlacementsPrefix "Number of parsimony-optimal placements:" #define imputedMutsPrefix "Imputed mutations:" static void parseSampleIdAndParsimonyScore(char **words, char **retSampleId, struct hash *samplePlacements) /* If words[] seems to contain columns of the line that gives sample ID and parsimony score, * then parse out those values. */ { // Example line: // words[0] = Current tree size (#nodes): 70775 // words[1] = Sample name: MyLabSequence2 // words[2] = Parsimony score: 1 // words[3] = Number of parsimony-optimal placements: 1 char *p = stringIn(sampleIdPrefix, words[1]); if (p) { *retSampleId = cloneString(trimSpaces(p + strlen(sampleIdPrefix))); struct placementInfo *info; AllocVar(info); hashAdd(samplePlacements, *retSampleId, info); info->sampleId = *retSampleId; p = stringIn(pScorePrefix, words[2]); if (p) info->parsimonyScore = atoi(skipToSpaces(p + strlen(pScorePrefix))); else errAbort("Problem parsing stderr output of usher: " "expected '" sampleIdPrefix "...' to be followed by '" pScorePrefix "...' but could not find the latter." ); p = stringIn(numPlacementsPrefix, words[3]); if (p) info->bestNodeCount = atoi(skipToSpaces(p + strlen(numPlacementsPrefix))); else errAbort("Problem parsing stderr output of usher: " "expected '" sampleIdPrefix "... " pScorePrefix " ...' to be followed by '" numPlacementsPrefix "...' but could not find the latter." ); } else errAbort("Unexpected format of sample ID line:\n%s\t%s\t%s\t%s", words[0], words[1], words[2], words[3]); } static struct singleNucChange *parseSnc(char *sncStr) /* If sncStr looks like a <old><pos><new>-style single nucleotide change then parse out those * values & return singleNucChange (with parBase and newBase; no refBase), otherwise return NULL. */ { struct singleNucChange *snc = NULL; regmatch_t substrs[4]; if (regexMatchSubstr(sncStr, "^([ACGT])([0-9]+)([ACGT])$", substrs, ArraySize(substrs))) { int chromStart = regexSubstringInt(sncStr, substrs[2]) - 1; snc = sncNew(chromStart, '\0', sncStr[0], sncStr[substrs[3].rm_so]); } return snc; } static struct baseVal *bvListFromSemiColonSep(char *mutStr) /* Parse a string of ;-sep'd position:allele & return a list of struct baseVal. */ { char *muts[strlen(mutStr) / 4]; int mutCount = chopString(mutStr, ";", muts, ArraySize(muts)); struct baseVal *bvList = NULL; int i; for (i = 0; i < mutCount; i++) { boolean problem = FALSE; char *colon = strchr(muts[i], ':'); if (colon) { int pos = atoi(muts[i]); char *val = cloneString(colon+1); if (pos < 1) problem = TRUE; else if (!isAllNt(val, strlen(val))) problem = TRUE; else { struct baseVal *bv; AllocVar(bv); bv->chromStart = pos - 1; bv->val = val; slAddHead(&bvList, bv); } } if (problem) errAbort("Problem parsing stderr output of usher: " "expected imputed mutation to be number:base, but got '%s'", muts[i]); } slReverse(&bvList); return bvList; } static boolean parseImputedMutations(char **words, struct placementInfo *info) /* If words[] looks like it defines imputed mutations of the most recently named sample, * then parse out the list and add to info->imputedBases and return TRUE. */ { // Example line: // words[0] = Imputed mutations: // words[1] = 6709:A;23403:G boolean matches = FALSE; if (stringIn(imputedMutsPrefix, words[0])) { matches = TRUE; info->imputedBases = bvListFromSemiColonSep(words[1]); } return matches; } static void parseStderr(char *amsStderrFile, struct hash *samplePlacements) /* The stderr output of usher is where we find important info for each sample: * the path of variants on nodes from root to sample leaf, imputed values of ambiguous bases * (if any), and parsimony score. */ { struct lineFile *lf = lineFileOpen(amsStderrFile, TRUE); char *sampleId = NULL; int size; char *line; while (lineFileNext(lf, &line, &size)) { char lineCpy[size+1]; safencpy(lineCpy, sizeof lineCpy, line, size); char *words[16]; int wordCount = chopTabs(lineCpy, words); if (wordCount == 4) parseSampleIdAndParsimonyScore(words, &sampleId, samplePlacements); else if (wordCount == 2) { if (! sampleId) errAbort("Problem parsing stderr output of usher: " "Got line starting with '%s' that was not preceded by a line that " "defines sample ID.:\n%s", words[0], line); struct placementInfo *info = hashFindVal(samplePlacements, sampleId); if (!info) errAbort("Problem parsing stderr output of usher: " "Can't find placement info for sample '%s'", sampleId); parseImputedMutations(words, info); } } } -static void parsePlacements(char *outDirName, char *stderrName, struct hash *samplePlacements) +static void parsePlacements(char *outDirName, char *stderrName, struct hash *samplePlacements, + struct slName **pSampleIds) /* If usher created the file outdir/placement_stats.tsv then parse its contents into - * samplePlacements; otherwise parse the same info out of the stderr output. */ + * samplePlacements; otherwise parse the same info out of the stderr output. + * Check placements vs. *pSampleIds; if usher renamed a sample with prefix to avoid clash with + * sample already in the tree then rename the sample in *pSampleIds. */ { char placementsFileName[PATH_LEN]; safef(placementsFileName, sizeof placementsFileName, "%s/placement_stats.tsv", outDirName); +int placementCount = 0; if (fileExists(placementsFileName)) { + // Frustratingly, the file from server can exist but appear to be empty for a couple seconds! + // Worse, it might have a multiple-of-4096 size yet still be incomplete! + // If we don't get this file's data, then we'll run into errors when parsing subsequent files. + // So wait a while if the size seems suspicious. + int maxTries = 30; + int nx = 0; + long long size = 0; + while ((size = fileSize(placementsFileName)) % 4096 == 0 && nx < maxTries) + { + fprintf(stderr, "parsePlacements: after %ds, %s has multiple-of-4096 size %lld.\n", + nx, placementsFileName, size); + sleep(1); + nx++; + } + if ((size % 4096) == 0) + { + fprintf(stderr, "parsePlacements: %s still has multiple-of-4096 size %lld but we've waited " + "a long time (%ds), proceed.\n", + placementsFileName, size, nx); + } + else if (nx > 0) + fprintf(stderr, "parsePlacements: after %ds, %s size is %lld, getting started.\n", + nx, placementsFileName, size); struct lineFile *lf = lineFileOpen(placementsFileName, TRUE); char *line; while (lineFileNext(lf, &line, NULL)) { char *words[5]; int wordCount = chopTabs(line, words); lineFileExpectAtLeast(lf, 4, wordCount); char *sampleId = words[0]; struct placementInfo *info; AllocVar(info); hashAdd(samplePlacements, sampleId, info); info->sampleId = cloneString(sampleId); info->parsimonyScore = atoi(words[1]); info->bestNodeCount = atoi(words[2]); info->imputedBases = bvListFromSemiColonSep(words[3]); + placementCount++; } lineFileClose(&lf); } else + { + fprintf(stderr, "parsePlacements: No %s, reading stderr %s\n", placementsFileName, stderrName); parseStderr(stderrName, samplePlacements); } +// Check whether we placed all samples, and whether any sample names have the USHER_DEDUP_PREFIX +// in usher results files. +struct slName *newSampleIds = NULL; +struct slName *sample, *next = NULL; +for (sample = *pSampleIds; sample != NULL; sample = next) + { + next = sample->next; + if (!hashLookup(samplePlacements, sample->name)) + { + // Usher might have added a prefix to distinguish from a sequence with the same name + // already in the tree. + char nameWithPrefix[strlen(USHER_DEDUP_PREFIX) + strlen(sample->name) + 1]; + safef(nameWithPrefix, sizeof nameWithPrefix, "%s%s", USHER_DEDUP_PREFIX, sample->name); + if (!hashLookup(samplePlacements, nameWithPrefix)) + { + warn("parsePlacements: did not find placement for sample '%s'", sample->name); + slAddHead(&newSampleIds, sample); + } + else + { + warn("Renamed %s to %s to distinguish it from sequence already in the tree", + sample->name, nameWithPrefix); + slNameAddHead(&newSampleIds, nameWithPrefix); + } + } + else + slAddHead(&newSampleIds, sample); + } +slReverse(&newSampleIds); +*pSampleIds = newSampleIds; +} static void parseVariantPaths(char *filename, struct hash *samplePlacements) /* Parse out space-sep list of {node ID, ':', node-associated ,-sep variant list} into * variantPathNode list and associate with sample ID. */ { // Example line (note the back-mutation at 28144T... may want to highlight those): // words[0] = MySeq // words[1] = 1:C8782T,T28144C 2309:C29095T 2340:T8782C 2342:T29095C 2588:C28144T MySeq:C29867T struct lineFile *lf = lineFileOpen(filename, TRUE); char *line; while (lineFileNext(lf, &line, NULL)) { char *words[3]; int wordCount = chopTabs(line, words); lineFileExpectWords(lf, 2, wordCount); char *sampleId = words[0]; char *nodePath = words[1]; char *nodes[strlen(nodePath) / 4]; int nodeCount = chopString(nodePath, " ", nodes, ArraySize(nodes)); struct variantPathNode *vpNodeList = NULL; int i; for (i = 0; i < nodeCount; i++) { struct variantPathNode *vpn; AllocVar(vpn); char *nodeVariants = nodes[i]; // First there is a node ID followed by ':'. If node ID is numeric, ignore; // otherwise it is a sample ID, indicating a mutation specific to the sample ID, // so include it. char *colon = strrchr(nodeVariants, ':'); if (colon) { *colon = '\0'; char *nodeName = nodeVariants; nodeVariants = colon+1; vpn->nodeName = cloneString(nodeName); } // Next there should be a comma-sep list of mutations/variants. char *variants[strlen(nodeVariants) / 4]; int varCount = chopCommas(nodeVariants, variants); int j; for (j = 0; j < varCount; j++) { variants[j] = trimSpaces(variants[j]); struct singleNucChange *snc = parseSnc(variants[j]); if (snc) slAddHead(&vpn->sncList, snc); else errAbort("parseVariantPath: Expected variant path for %s to specify " "single-nucleotide changes but got '%s'", sampleId, variants[j]); } slReverse(&vpn->sncList); slAddHead(&vpNodeList, vpn); } slReverse(&vpNodeList); struct placementInfo *info = hashFindVal(samplePlacements, sampleId); if (!info) errAbort("parseVariantPath: can't find placementInfo for sample '%s'", sampleId); info->variantPath = vpNodeList; } lineFileClose(&lf); } static struct variantPathNode *parseSubtreeMut(char *line) /* Parse a line with a node name and list of mutations. Examples: * ROOT->1: C241T,C14408T,A23403G,C3037T,A20268G,C28854T,T24076C * 1: C20759T * USA/CA-CZB-4019/2020|EPI_ISL_548621|20-08-01: G17608T,G22199T * node_6849_condensed_4_leaves: */ { struct variantPathNode *vpn = NULL; char *colon = strrchr(line, ':'); if (colon) { AllocVar(vpn); char *nodeName = line; *colon = '\0'; vpn->nodeName = cloneString(nodeName); char *mutString = trimSpaces(colon+1); char *mutWords[strlen(mutString)/4]; int mutCount = chopCommas(mutString, mutWords); int i; for (i = 0; i < mutCount; i++) { struct singleNucChange *snc = parseSnc(mutWords[i]); if (snc) slAddHead(&vpn->sncList, snc); else errAbort("parseSubtreeMut: Expected subtree mutation list to specify single-nucleotide " "changes but got '%s'", mutWords[i]); } slReverse(&vpn->sncList); } else errAbort("parseSubtreeMut: Expected line to contain colon but got '%s'", line); return vpn; } static struct variantPathNode *parseSubtreeMutations(char *filename) /* Parse subtree node mutation lists out of usher subtree-N-mutations.txt file. */ { struct variantPathNode *subtreeMutList = NULL; struct lineFile *lf = lineFileOpen(filename, TRUE); char *line; while (lineFileNext(lf, &line, NULL)) { struct variantPathNode *vpn = parseSubtreeMut(line); if (vpn) { // The ROOT->1 (subtree ancestor) sncList needs to be prepended to the subtree // root's sncList because Auspice doesn't seem to display mutations on nodes // with only one child. Discard the subtree ancestor element. if (subtreeMutList && subtreeMutList->next == NULL && startsWith("ROOT->", subtreeMutList->nodeName)) { vpn->sncList = slCat(subtreeMutList->sncList, vpn->sncList); subtreeMutList = NULL; } slAddHead(&subtreeMutList, vpn); } } slReverse(&subtreeMutList); lineFileClose(&lf); return subtreeMutList; } static struct usherResults *usherResultsNew() /* Allocate & return usherResults (just bigTreePlusTn and samplePlacements, subtrees come later). */ { struct usherResults *results; AllocVar(results); AllocVar(results->bigTreePlusTn); trashDirFile(results->bigTreePlusTn, "ct", "phyloPlusSamples", ".nwk"); results->samplePlacements = hashNew(0); return results; } static void rPhyloLeafNames(struct phyloTree *node, struct slName **pList) /* Build up a list of leaf names under node in reverse depth-first-search order. */ { if (node->numEdges > 0) { int i; for (i = 0; i < node->numEdges; i++) rPhyloLeafNames(node->edges[i], pList); } else slAddHead(pList, slNameNew(node->ident->name)); } static struct slName *phyloLeafNames(struct phyloTree *tree) /* Return a list of leaf names in tree in depth-first-search order. */ { struct slName *list = NULL; rPhyloLeafNames(tree, &list); slReverse(&list); return list; } static struct hash *slNameListToIxHash(struct slName *list) /* Given a list of names, add each name to a hash of ints with the index of the name in the list. */ { struct hash *hash = hashNew(0); struct slName *sln; int ix; for (ix = 0, sln = list; sln != NULL; ix++, sln = sln->next) hashAddInt(hash, sln->name, ix); return hash; } static struct slName *getSubtreeSampleIds(struct slName *sampleIds, struct hash *subtreeIds) /* Return a list of sampleIds that are found in subtreeIds. */ { struct slName *subtreeSampleIds = NULL; struct slName *id; for (id = sampleIds; id != NULL; id = id->next) if (hashLookup(subtreeIds, id->name)) slNameAddHead(&subtreeSampleIds, id->name); slReverse(&subtreeSampleIds); return subtreeSampleIds; } -static int hashElIntCmpDesc(const void *va, const void *vb) -/* Compare two hashEl by integer value, descending. */ -{ -const struct hashEl *a = *((struct hashEl **)va); -const struct hashEl *b = *((struct hashEl **)vb); -return b->val - a->val; -} - -static char *summarizeCountries(struct slName *idList) -/* If enough sample names in idList have identifiable countries, then return a summary string, - * else NULL. */ -{ -char *summary = NULL; -int idCount = slCount(idList); -int idWithCountryCount = 0; -struct hash *countryCounts = hashNew(0); -struct slName *id; -for (id = idList; id != NULL; id = id->next) - { - regmatch_t substrs[2]; - if (regexMatchSubstr(id->name, "([A-Za-z_]+)/[^/]+/20", substrs, ArraySize(substrs))) - { - char country[256]; - regexSubstringCopy(id->name, substrs[1], country, sizeof country); - idWithCountryCount++; - hashIncInt(countryCounts, country); - } - } -if ((double)idWithCountryCount / idCount >= 0.5) - { - struct dyString *dy = dyStringCreate("%d from ", idCount); - struct hashEl *helList = hashElListHash(countryCounts); - if (helList->next == NULL) - dyStringAppend(dy, helList->name); - else - { - slSort(&helList, hashElIntCmpDesc); - struct hashEl *hel; - for (hel = helList; hel != NULL; hel = hel->next) - { - if (hel != helList) - dyStringAppend(dy, "+"); - dyStringAppend(dy, hel->name); - int count = ptToInt(hel->val); - if (count > 1) - dyStringPrintf(dy, "[%d]", count); - if (dyStringLen(dy) > 100) - { - dyStringAppend(dy, "+..."); - break; - } - } - } - // Labels have to be unique -- tack on an example ID - dyStringPrintf(dy, " eg %s", idList->name); - summary = dyStringCannibalize(&dy); - subChar(summary, ' ', '_'); - } -return summary; -} - -static char *expandCondensedNodeName(struct hash *condensedNodes, char *name) -/* If name is found in condensedNodes, then return an expanded name that indicates countries of - * origin (if we can find them) and an example ID. Otherwise return NULL. */ -{ -char *expandedName = NULL; -struct slName *nodeList = hashFindVal(condensedNodes, name); -if (nodeList) +static struct hash *hashVpnList(struct variantPathNode *vpnList) +/* Return a hash of nodeName to sncList for each variantPathNode in vpnList. */ { - char *summary = summarizeCountries(nodeList); - if (summary) - expandedName = summary; - else - { - int nodeCount = slCount(nodeList); - struct dyString *dy = dyStringCreate("%s_and_%d_others", nodeList->name, nodeCount-1); - expandedName= dyStringCannibalize(&dy); - } - } -return expandedName; -} - -static struct hash *expandCondensedNodeNames(struct hash *condensedNodes, struct slName *idList) -/* If idList contains names found in condensedNodes, then hash those names to new names that are - * _-sep strings from the lists in condensedNodes. Return hash (which may have no elements - * if nothing in idList is found in condensedNodes). */ -{ -struct hash *hash = hashNew(0); -struct slName *id; -for (id = idList; id != NULL; id = id->next) - { - char *expandedName = expandCondensedNodeName(condensedNodes, id->name); - if (expandedName) - hashAdd(hash, id->name, expandedName); - } -return hash; -} - -static void rSubstTreeNames(struct phyloTree *node, struct hash *nameSubstitutions) -/* If node or descendants have names in nameSubstitutions, then substitute those names. */ -{ -if (node->ident->name) - { - char *subst = hashFindVal(nameSubstitutions, node->ident->name); - if (subst) - node->ident->name = subst; - } -int i; -for (i = 0; i < node->numEdges; i++) - rSubstTreeNames(node->edges[i], nameSubstitutions); -} - -static struct tempName *substituteTreeNames(struct phyloTree *tree, char *treeName, - struct hash *nameSubstitutions) -/* If tree has any nodes whose names are in nameSubstitutions, then substitute those names. - * Write tree out to a trash file and return its location. */ -{ -rSubstTreeNames(tree, nameSubstitutions); -struct tempName *newTn; -AllocVar(newTn); -trashDirFile(newTn, "ct", treeName, ".nwk"); -FILE *f = mustOpen(newTn->forCgi, "w"); -phyloPrintTree(tree, f); -carefulClose(&f); -return newTn; -} - -static struct slName *substituteNameList(struct slName *idList, struct hash *nameSubstitutions) -/* Return a new list that is just like idList, except if any item in idList has a value in - * nameSubstitutions, then the item is replaced by the substitution. */ -{ -struct slName *newList = NULL; -struct slName *id; -for (id = idList; id != NULL; id = id->next) - { - char *subst = hashFindVal(nameSubstitutions, id->name); - slNameAddHead(&newList, subst ? subst : id->name); - } -slReverse(&newList); -return newList; +struct hash *vpnHash = hashNew(13); +struct variantPathNode *vpn; +for (vpn = vpnList; vpn != NULL; vpn = vpn->next) + hashAdd(vpnHash, vpn->nodeName, vpn); +return vpnHash; } -static void addMutationsToTree(struct phyloTree *node, struct variantPathNode **pNodeMutList) -/* Store the list of mutations associated with each node in node->priv. */ +static void addMutationsToTree(struct phyloTree *node, struct hash *subtreeMutHash) +/* Store the list of mutations associated with each node in node->priv, removing node from + * subtreeMutHash so we can check that it's empty when done with tree. */ { -struct variantPathNode *nodeMuts = slPopHead(pNodeMutList); -if (! nodeMuts) - errAbort("addMutationsToTree: subtree mutation list has fewer nodes than subtree"); -if (node->ident->name && ! sameString(nodeMuts->nodeName, node->ident->name)) - errAbort("addMutationsToTree: subtree node name is '%s' but subtree mutation list item is '%s'", - node->ident->name, nodeMuts->nodeName); +if (! node->ident->name) + errAbort("addMutationsToTree: node name is NULL but I need a node name to find mutations"); if (node->priv != NULL) - errAbort("addMutationsToTree: node already has mutations assigned"); + errAbort("addMutationsToTree: node '%s' already has mutations assigned (duplicated in tree?)", + node->ident->name); +struct variantPathNode *nodeMuts = hashFindVal(subtreeMutHash, node->ident->name); +if (! nodeMuts) + errAbort("addMutationsToTree: can't find node '%s' in subtree mutations file", + node->ident->name); +hashRemove(subtreeMutHash, node->ident->name); node->priv = nodeMuts->sncList; int i; for (i = 0; i < node->numEdges; i++) - addMutationsToTree(node->edges[i], pNodeMutList); + addMutationsToTree(node->edges[i], subtreeMutHash); } static struct subtreeInfo *parseOneSubtree(struct tempName *subtreeTn, char *subtreeName, struct variantPathNode *subtreeMuts, - struct slName *userSampleIds, struct hash *condensedNodes) -/* Parse usher's subtree output, figure out which user samples are in subtree and expand names - * of condensed nodes. */ + struct slName *userSampleIds) +/* Parse usher's subtree output and figure out which user samples are in subtree. */ { struct subtreeInfo *ti; AllocVar(ti); ti->subtreeTn = subtreeTn; ti->subtree = phyloOpenTree(ti->subtreeTn->forCgi); -addMutationsToTree(ti->subtree, &subtreeMuts); -if (subtreeMuts != NULL) - errAbort("addMutationsToTree: subtreeMutationList has more nodes than subtree"); -struct slName *subtreeIdList = phyloLeafNames(ti->subtree); -// Don't do name substitutions on condensed node names in subtreeIdToIx since the IDs have to -// match those in the original tree from protobuf. -ti->subtreeIdToIx = slNameListToIxHash(subtreeIdList); +struct hash *subtreeMutHash = hashVpnList(subtreeMuts); +addMutationsToTree(ti->subtree, subtreeMutHash); +if (hashNumEntries(subtreeMutHash) != 0) + errAbort("addMutationsToTree: subtree has fewer nodes than defined in subtree mutations file"); +ti->subtreeNameList = phyloLeafNames(ti->subtree); +ti->subtreeIdToIx = slNameListToIxHash(ti->subtreeNameList); ti->subtreeUserSampleIds = getSubtreeSampleIds(userSampleIds, ti->subtreeIdToIx); if (slCount(ti->subtreeUserSampleIds) == 0) errAbort("No user sample IDs found in subtree file %s", ti->subtreeTn->forCgi); -// Substitute in nicer node names for condensed nodes for displaying to the user in -// custom tracks and Nextstrain/auspice JSON. -struct hash *nameSubstitutions = expandCondensedNodeNames(condensedNodes, subtreeIdList); -if (nameSubstitutions->elCount > 0) - ti->subtreeTn = substituteTreeNames(ti->subtree, subtreeName, nameSubstitutions); -ti->subtreeNameList = substituteNameList(subtreeIdList, nameSubstitutions); -hashFree(&nameSubstitutions); -slFreeList(&subtreeIdList); +hashFree(&subtreeMutHash); return ti; } static struct subtreeInfo *parseSubtrees(int subtreeCount, struct tempName *singleSubtreeTn, struct variantPathNode *singleSubtreeMuts, struct tempName *subtreeTns[], struct variantPathNode *subtreeMuts[], - struct slName *userSampleIds, struct hash *condensedNodes) -/* Parse usher's subtree output, figure out which user samples are in each subtree, expand names - * of condensed nodes. Add parsed singleSubtree at head of list, followed by numbered subtrees. */ + struct slName *userSampleIds) +/* Parse usher's subtree output and figure out which user samples are in each subtree. + * Add parsed singleSubtree at head of list, followed by numbered subtrees. */ { struct subtreeInfo *subtreeInfoList = NULL; int sIx; for (sIx = 0; sIx < subtreeCount; sIx++) { char subtreeName[512]; safef(subtreeName, sizeof(subtreeName), "subtree%d", sIx+1); struct subtreeInfo *ti = parseOneSubtree(subtreeTns[sIx], subtreeName, subtreeMuts[sIx], - userSampleIds, condensedNodes); + userSampleIds); slAddHead(&subtreeInfoList, ti); } slReverse(&subtreeInfoList); struct subtreeInfo *ti = parseOneSubtree(singleSubtreeTn, "singleSubtree", singleSubtreeMuts, - userSampleIds, condensedNodes); + userSampleIds); slAddHead(&subtreeInfoList, ti); return subtreeInfoList; } static void parseClades(char *filename, struct hash *samplePlacements) /* Parse usher's clades.txt, which might have {sample, clade} or {sample, clade, lineage}. */ { struct hash *wordStore = hashNew(0); struct lineFile *lf = lineFileOpen(filename, TRUE); char *line; while (lineFileNext(lf, &line, NULL)) { char *words[3]; int wordCount = chopTabs(line, words); char *sampleId = words[0]; struct placementInfo *info = hashFindVal(samplePlacements, sampleId); if (!info) errAbort("parseClades: can't find placementInfo for sample '%s'", sampleId); if (wordCount > 1) { // Nextstrain's clade "20E (EU1)" has to be tweaked to "20E.EU1" for matUtils to avoid // whitespace trouble; tweak it back. if (sameString(words[1], "20E.EU1")) words[1] = "20E (EU1)"; + // Chop extra chars in usher-sampled clades.txt output + char *ptr = strstr(words[1], "*|"); + if (ptr) + *ptr = '\0'; info->nextClade = hashStoreName(wordStore, words[1]); } if (wordCount > 2) + { + // Chop extra chars in usher-sampled clades.txt output + char *ptr = strstr(words[2], "*|"); + if (ptr) + *ptr = '\0'; info->pangoLineage = hashStoreName(wordStore, words[2]); } + } lineFileClose(&lf); } static char *dirPlusFile(struct dyString *dy, char *dir, char *file) -/* Write dir/file into dy and return pointer to dy->string. */ +/* Write dir/file into dy and return pointer to dy->string. Do not free result! */ { dyStringClear(dy); dyStringPrintf(dy, "%s/%s", dir, file); return dy->string; } +#define USHER_SAMPLED_TREE_PREFIX "tree-0-" + static int processOutDirFiles(struct usherResults *results, char *outDir, struct tempName **retSingleSubtreeTn, struct variantPathNode **retSingleSubtreeMuts, struct tempName *subtreeTns[], struct variantPathNode *subtreeMuts[], int maxSubtrees) -/* Get paths to files in outDir; parse them, move files that we'll keep up to trash/ct/, - * remove outDir. */ +/* Get paths to files in outDir; parse them and move files that we'll keep up to trash/ct/, + * leaving behind files that we will remove when done. */ { int subtreeCount = 0; memset(subtreeTns, 0, maxSubtrees * sizeof(*subtreeTns)); memset(subtreeMuts, 0, maxSubtrees * sizeof(*subtreeMuts)); struct dyString *dyScratch = dyStringNew(0); struct slName *outDirFiles = listDir(outDir, "*"), *file; for (file = outDirFiles; file != NULL; file = file->next) { char *path = dirPlusFile(dyScratch, outDir, file->name); if (sameString(file->name, "uncondensed-final-tree.nh")) { mustRename(path, results->bigTreePlusTn->forCgi); } - else if (sameString(file->name, "mutation-paths.txt")) + else if (sameString(file->name, "mutation-paths.txt") || // original usher + sameString(file->name, "mutation-paths-1.txt")) // usher-sampled(-server) { parseVariantPaths(path, results->samplePlacements); } - else if (startsWith("subtree-", file->name)) + else if (startsWith("subtree-", file->name) || + startsWith(USHER_SAMPLED_TREE_PREFIX "subtree-", file->name)) { char fnameCpy[strlen(file->name)+1]; + if (startsWith(USHER_SAMPLED_TREE_PREFIX, file->name)) + safecpy(fnameCpy, sizeof fnameCpy, file->name + strlen(USHER_SAMPLED_TREE_PREFIX)); + else safecpy(fnameCpy, sizeof fnameCpy, file->name); char *parts[4]; int partCount = chopString(fnameCpy, "-", parts, ArraySize(parts)); if (partCount == 2) { // Expect "subtree-N.nh" char *dot = strstr(parts[1], ".nh"); if (dot) { *dot = '\0'; int subtreeIx = atol(parts[1]) - 1; + if (subtreeIx < 0) + errAbort("processOutDirFiles: expected subtree file names to have numbers " + "starting with 1, but found one with 0 (%s)", file->name); if (subtreeIx >= maxSubtrees) errAbort("Too many subtrees in usher output (max %d)", maxSubtrees); if (subtreeIx >= subtreeCount) subtreeCount = subtreeIx + 1; char sname[32]; safef(sname, sizeof sname, "subtree%d", subtreeIx+1); AllocVar(subtreeTns[subtreeIx]); trashDirFile(subtreeTns[subtreeIx], "ct", sname, ".nwk"); mustRename(path, subtreeTns[subtreeIx]->forCgi); } else warn("Unexpected filename '%s' from usher, ignoring", file->name); } else if (partCount == 3) { // Expect "subtree-N-mutations.txt" or "subtree-N-expanded.txt" int subtreeIx = atol(parts[1]) - 1; + if (subtreeIx < 0) + errAbort("processOutDirFiles: expected subtree file names to have numbers " + "starting with 1, but found one with 0 (%s)", file->name); if (subtreeIx >= maxSubtrees) errAbort("Too many subtrees in usher output (max %d)", maxSubtrees); if (subtreeIx >= subtreeCount) subtreeCount = subtreeIx + 1; if (sameString(parts[2], "mutations.txt")) { subtreeMuts[subtreeIx] = parseSubtreeMutations(path); } else if (sameString(parts[2], "expanded.txt")) { // Don't need this, just remove it } else warn("Unexpected filename '%s' from usher, ignoring", file->name); } else warn("Unexpected filename '%s' from usher, ignoring", file->name); } - else if (startsWith("single-subtree", file->name)) + else if (startsWith("single-subtree", file->name) || + startsWith(USHER_SAMPLED_TREE_PREFIX "single-subtree", file->name)) { // One subtree to bind them all char fnameCpy[strlen(file->name)+1]; + if (startsWith(USHER_SAMPLED_TREE_PREFIX, file->name)) + safecpy(fnameCpy, sizeof fnameCpy, file->name + strlen(USHER_SAMPLED_TREE_PREFIX)); + else safecpy(fnameCpy, sizeof fnameCpy, file->name); char *parts[4]; int partCount = chopString(fnameCpy, "-", parts, ArraySize(parts)); if (partCount == 2) { // Expect single-subtree.nh if (!endsWith(parts[1], ".nh")) warn("Unexpected filename '%s' from usher, ignoring", file->name); else if (retSingleSubtreeTn) { struct tempName *tn; AllocVar(tn); trashDirFile(tn, "ct", "subtree", ".nwk"); mustRename(path, tn->forCgi); *retSingleSubtreeTn = tn; } } else if (partCount == 3) { // Expect single-subtree-mutations.txt or single-subtree-expanded.txt if (sameString(parts[2], "mutations.txt")) { if (retSingleSubtreeMuts) *retSingleSubtreeMuts = parseSubtreeMutations(path); } else if (sameString(parts[2], "expanded.txt")) { // Don't need this, just remove it } else warn("Unexpected filename '%s' from usher, ignoring", file->name); } else warn("Unexpected filename '%s' from usher, ignoring", file->name); } else if (sameString(file->name, "clades.txt")) { parseClades(path, results->samplePlacements); } else if (sameString(file->name, "final-tree.nh") || + sameString(file->name, "current-tree.nh") || sameString(file->name, "placement_stats.tsv")) { - // Don't need this, just remove it. + // Don't need this (or already parsed it elsewhere not here), just remove it. } else warn("Unexpected filename '%s' from usher, ignoring", file->name); - unlink(path); } -rmdir(outDir); +dyStringFree(&dyScratch); // Make sure we got a complete range of subtrees [0..subtreeCount-1] int i; for (i = 0; i < subtreeCount; i++) { if (subtreeTns[i] == NULL) errAbort("Missing file subtree-%d.nh in usher results", i+1); if (subtreeMuts[i] == NULL) errAbort("Missing file subtree-%d-mutations.txt in usher results", i+1); } return subtreeCount; } +static void removeOutDir(char *outDir) +/* Remove outDir and its files. */ +{ +struct slName *outDirFiles = listDir(outDir, "*"), *file; +struct dyString *dyScratch = dyStringNew(0); +for (file = outDirFiles; file != NULL; file = file->next) + { + char *path = dirPlusFile(dyScratch, outDir, file->name); + unlink(path); + } +dyStringFree(&dyScratch); +rmdir(outDir); +} + +static void runUsherCommand(char *cmd[], char *stderrFile, int *pStartTime) +/* Run the standalone usher command with its stderr output redirected to stderrFile. */ +{ +char **cmds[] = { cmd, NULL }; +struct pipeline *pl = pipelineOpen(cmds, pipelineRead, NULL, stderrFile, 0); +pipelineClose(&pl); +reportTiming(pStartTime, "run usher"); +} + +boolean serverIsConfigured(char *db) +/* Return TRUE if all necessary configuration settings are in place to run usher-sampled-server. */ +{ +char *serverDir = cfgOption("hgPhyloPlaceServerDir"); +if (isNotEmpty(serverDir)) + { + char *usherServerEnabled = phyloPlaceDbSetting(db, "usherServerEnabled"); + if (isNotEmpty(usherServerEnabled) && SETTING_IS_ON(usherServerEnabled) && + fileExists(PHYLOPLACE_DATA_DIR "/usher-sampled-server")) + { + return TRUE; + } + } +return FALSE; +} + +static char *getUsherServerFilePath(char *db, char *file) +/* Alloc & return the path to special server file in $hgPhyloPlaceServerDir/$db/ . */ +{ +char *serverDir = cfgOption("hgPhyloPlaceServerDir"); +// We need host name in order to prevent clashes between hgw1 and hgw2 sharing the same hg.conf +// contents and NFS filesystem. And not the vhost name from getHost(), the real base host name. +char host[PATH_LEN]; +int ret = gethostname(host, sizeof host); +if (ret != 0) + safecpy(host, sizeof host, "hostunknown"); +char *p = strchr(host, '.'); +if (p) + *p = '\0'; +struct dyString *dyPath = dyStringCreate("%s/%s/%s.%s", + serverDir, trackHubSkipHubName(db), host, file); +return dyStringCannibalize(&dyPath); +} + +static char *getUsherServerMfifoPath(char *db) +/* Alloc & return path to server manager fifo file for db. */ +{ +return getUsherServerFilePath(db, "server.fifo"); +} + +static char *getUsherServerSocketPath(char *db) +/* Alloc & return path to server socket file for db. */ +{ +return getUsherServerFilePath(db, "server.socket"); +} + +static char *getUsherServerLogPath(char *db) +/* Alloc & return path to server log file (very important because it tells us the pid) for db. */ +{ +return getUsherServerFilePath(db, "server.log"); +} + +boolean serverIsRunning(char *db, FILE *errFile) +/* Return TRUE if logFile exists and its first line specifies a pid as expected from daemonishSpawn, + * and that pid looks alive according to /proc. */ +{ +boolean pidIsLive = FALSE; +char *logFile = getUsherServerLogPath(db); +struct lineFile *lf = lineFileMayOpen(logFile, TRUE); +if (lf) + { + char *line; + if (lineFileNext(lf, &line, NULL)) + { + if (isAllDigits(line)) + { + int pid = atol(line); + if (pid > 1) + { + char procStatus[PATH_LEN]; + safef(procStatus, sizeof procStatus, "/proc/%d/status", pid); + struct lineFile *psLf = lineFileMayOpen(procStatus, TRUE); + if (psLf) + { + const static char *statePrefix ="State:\t"; + boolean foundState = FALSE; + while (lineFileNext(psLf, &line, NULL)) + { + if (startsWith(statePrefix, line)) + { + foundState = TRUE; + char state = *(line + strlen(statePrefix)); + // State Running, Sleep or Deep (uninterruptible) sleep is OK + if (state == 'R' || state == 'S' || state == 'D') + { + pidIsLive = TRUE; + fprintf(errFile, "serverIsRunning: Found pid %d with state %c.\n", + pid, state); + } + else + { + // State sTopped or Zombie not OK + fprintf(errFile, "serverIsRunning: WARNING: pid %d has state %c.\n", + pid, state); + } + } + } + lineFileClose(&psLf); + if (!foundState) + fprintf(errFile, + "serverIsRunning: No line of %s begins with '%s', " + "can't find process state.\n", + procStatus, statePrefix); + } + else + fprintf(errFile, "serverIsRunning: Can't open %s, looks like server is dead.\n", + procStatus); + } + else + fprintf(errFile, "serverIsRunning: expected number greater than 1 (pid) in " + "first line of log file %s, but got '%s'\n", logFile, line); + } + else + { + fprintf(errFile, "serverIsRunning: First line of log file %s has unexpected format '%s' " + "(expected to begin with pid)\n", + logFile, line); + } + } + else + fprintf(errFile, "serverIsRunning: log file '%s' appears empty\n", logFile); + lineFileClose(&lf); + } +else + fprintf(errFile, "serverIsRunning: Can't open log file '%s'\n", logFile); +return pidIsLive; +} + + +//******************* BEGIN copied from lib/pipeline.c, consider libifying these *************** +static void closeNonStdDescriptors(void) +/* close non-standard file descriptors */ +{ +long maxFd = sysconf(_SC_OPEN_MAX); +if (maxFd < 0) + maxFd = 4096; // shouldn't really happen +int fd; +for (fd = STDERR_FILENO+1; fd < maxFd; fd++) + close(fd); +} + +static int openRead(char *fname) +/* open a file for reading */ +{ +int fd = open(fname, O_RDONLY); +if (fd < 0) + errnoAbort("can't open for read access: %s", fname); +return fd; +} + +static int openWrite(char *fname, boolean append) +/* open a file for write access */ +{ +int flags = O_WRONLY|O_CREAT; +if (append) + flags |= O_APPEND; +else + flags |= O_TRUNC; +int fd = open(fname, flags, 0666); +if (fd < 0) + errnoAbort("can't open for write access: %s", fname); +return fd; +} +//******************* END copied from lib/pipeline.c, consider libifying these *************** + + +static void daemonishInit(int newStderr) +/* Isolate a double-fork-spawned process from its parent processes by closing file descriptors, + * setting stderr to errlogFile, resetting umask etc. + * See "man 7 daemon". This function is "daemonish" because I'm not doing 100% of the recommended + * initialization things from "man 7 daemon", because I'm not running a Linux system service, + * just a program that needs to remain running in the background after this CGI exits. + * This errAborts if setting stdin, stdout or stderr fails, and ignores any other failures. */ +{ +// Set stdin and stdout to /dev/null, stderr to errlogFile, close all other possibly open fds. +int newStdin = openRead("/dev/null"); +if (dup2(newStdin, STDIN_FILENO) < 0) + errnoAbort("daemonishInit: can't dup2 to stdin"); +int newStdout = openWrite("/dev/null", TRUE); +if (dup2(newStdout, STDOUT_FILENO) < 0) + errnoAbort("daemonishInit: can't dup2 to stdout"); +if (dup2(newStderr, STDERR_FILENO) < 0) + errnoAbort("daemonishInit: can't dup2 to stderr"); +closeNonStdDescriptors(); + +// Reset all signal handlers to their default. +int ix; +for (ix = 0; ix <= SIGRTMAX; ix++) + signal(ix, SIG_DFL); + +// Reset the signal mask +sigset_t sigset; +sigemptyset(&sigset); +sigprocmask(SIG_SETMASK, &sigset, NULL); + +// Reset the umask +umask(0); +} + +int daemonishSpawn(char *cmd[], char *errlogFile) +/* Use a double fork & setgid to create a process with parent process group = initd so it has + * no controlling terminal and so the CGI won't become a zombie process when done. + * See "man 7 daemon". This function is "daemonish" because I'm not doing 100% of the recommended + * initialization things from "man 7 daemon", because I'm not running a Linux system service, + * just a program that needs to remain running in the background after this CGI exits. + * Double-fork grandchild execs cmd, uses /dev/null for stdin and stdout, and sends stderr + * to errlogFile -- after first writing its pid to errlogFile so that later processes can use + * errlogFile to check whether the spawned process is still running. + * Calling process/first fork parent returns result of first fork (pid for success, -1 for error) */ +{ +int fork1 = fork(); +if (fork1 < 0) + errnoWarn("first fork failed (%d)", fork1); +else if (fork1 == 0) + { + // First fork child will setsid and fork again + if (setsid() < 0) + errnoAbort("setsid failed; first fork child errAborting"); + else + { + int fork2 = fork(); + if (fork2 < 0) + errnoAbort("second fork failed (%d); first fork child errAborting", fork2); + else if (fork2 > 0) + { + // First fork child / second fork parent is all done. Use _exit not exit so we + // don't close mysql connections etc. + _exit(EXIT_SUCCESS); + } + else + { + // Second fork child sets stdin and stdout to /dev/null, stderr to errLogFile (after + // writing pid), then becomes cmd using execvp. + int newStderr = openWrite(errlogFile, FALSE); + char pidBuf[16]; + safef(pidBuf, sizeof pidBuf, "%llu\n", (long long unsigned)getpid()); + write(newStderr, pidBuf, strlen(pidBuf)); + daemonishInit(newStderr); + execvp(cmd[0], cmd); + } + } + } +return fork1; +} + +boolean startServer(char *db, struct treeChoices *treeChoices, FILE *errFile) +/* Start up an usher-sampled-server process to run in the background. */ +{ +boolean success = FALSE; +if (serverIsConfigured(db)) + { + char *serverDir = cfgOption("hgPhyloPlaceServerDir"); + if (! fileExists(serverDir)) + makeDir(serverDir); + char path[PATH_LEN]; + safef(path, sizeof path, "%s/%s", serverDir, db); + if (! fileExists(path)) + makeDir(path); + + char *usherServerPath = PHYLOPLACE_DATA_DIR "/usher-sampled-server"; + // Each protobuf file from treeChoices must appear as a separate arg after -l, + // so we have to dynamically build up serverCmd, but can use a static for the first args. + char *serverCmdBase[] = { usherServerPath, + "-m", getUsherServerMfifoPath(db), + "-s", getUsherServerSocketPath(db), + "-T", USHER_NUM_THREADS, + "-t", USHER_SERVER_CHILD_TIMEOUT, + "-l" }; + size_t serverCmdBaseSize = ArraySize(serverCmdBase); + size_t serverCmdSize = serverCmdBaseSize + treeChoices->count + 1; + char *serverCmd[serverCmdSize]; + int ix; + for (ix = 0; ix < serverCmdBaseSize; ix++) + serverCmd[ix] = serverCmdBase[ix]; + for (ix = 0; ix < treeChoices->count; ix++) + serverCmd[serverCmdBaseSize + ix] = treeChoices->protobufFiles[ix]; + serverCmd[serverCmdBaseSize + ix] = NULL; + + fputs("Spawning server with command: ", errFile); + for (ix = 0; serverCmd[ix] != NULL; ix++) + { + fputc(' ', errFile); + fputs(serverCmd[ix], errFile); + } + fputc('\n', errFile); + success = (daemonishSpawn(serverCmd, getUsherServerLogPath(db)) > 0); + if (success) + fputs("Server spawned; first fork returned successfully.\n", errFile); + else + fputs("Server spawn failed at first fork!\n", errFile); + } +else + errAbort("Usher server is not configured for %s, not starting", db); +return success; +} + +void serverReloadProtobufs(char *db, struct treeChoices *treeChoices) +/* Send a reload command and list of protobufs for db to usher server. */ +{ +char *usherServerMfifoPath = getUsherServerMfifoPath(db); +FILE *mf = fopen(usherServerMfifoPath, "a"); +if (mf) + { + fprintf(mf, "reload\n"); + int ix; + for (ix = 0; ix < treeChoices->count; ix++) + fprintf(mf, "%s\n", treeChoices->protobufFiles[ix]); + fputc('\n', mf); + carefulClose(&mf); + } +else + warn("serverReload: unable to open '%s', command not sent", usherServerMfifoPath); +} + +void serverStop(char *db) +/* Send stop command to usher server. */ +{ +char *usherServerMfifoPath = getUsherServerMfifoPath(db); +FILE *mf = fopen(usherServerMfifoPath, "a"); +if (mf) + { + fprintf(mf, "stop\n"); + carefulClose(&mf); + } +else + warn("serverStop: unable to open '%s', command not sent", usherServerMfifoPath); +} + +void serverSetThreadCount(char *db, int val) +/* Send thread command and value to usher server. */ +{ +char *usherServerMfifoPath = getUsherServerMfifoPath(db); +FILE *mf = fopen(usherServerMfifoPath, "a"); +if (mf) + { + if (val > 0) + fprintf(mf, "thread %d\n", val); + else + errAbort("Bad value %d passed to serverSetThreadCount, not sending", val); + carefulClose(&mf); + } +else + warn("serverSetTimeout: unable to open '%s', command not sent", usherServerMfifoPath); +} + +void serverSetTimeout(char *db, int val) +/* Send timeout command and value (in seconds) to usher server. */ +{ +char *usherServerMfifoPath = getUsherServerMfifoPath(db); +FILE *mf = fopen(usherServerMfifoPath, "a"); +if (mf) + { + if (val > 0) + fprintf(mf, "timeout %d\n", val); + else + errAbort("Bad value %d passed to serverSetTimeout, not sending", val); + carefulClose(&mf); + } +else + warn("serverSetTimeout: unable to open '%s', command not sent", usherServerMfifoPath); +} + +static int getServerSocket(char *db, struct treeChoices *treeChoices, FILE *errFile) +/* Try to connect to server; attempt to restart server and then connect if it seems like the server + * is down. Return -1 if unable to connect. */ +{ +int socketFd = socket(AF_UNIX, SOCK_STREAM, 0); +if (socketFd < 0) + { + // OS-level failure, not anything we can do about it + fprintf(errFile, "Failed to create a UNIX socket: %s\n", strerror(errno)); + } +else + { + // From "man 7 unix": + // struct sockaddr_un { + // sa_family_t sun_family; /* AF_UNIX */ + // char sun_path[UNIX_PATH_MAX]; /* pathname */ + // }; + struct sockaddr_un addr; + addr.sun_family=AF_UNIX; + char *usherServerSocketPath = getUsherServerSocketPath(db); + safecpy(addr.sun_path, sizeof(addr.sun_path), usherServerSocketPath); + int ret = connect(socketFd, (struct sockaddr *)&addr, sizeof(addr)); + if (ret < 0) + { + fprintf(errFile, "Cannot connect socket %d to path '%s': %s\n", + socketFd, usherServerSocketPath, strerror(errno)); + // Does the server need to be restarted, or (in case another web request prompted some + // other hgPhyloPlace process to restart it but it isn't quite up yet) should we wait a + // few seconds and try again? + if (serverIsRunning(db, errFile) || + startServer(db, treeChoices, errFile)) + { + sleep(5); + ret = connect(socketFd, (struct sockaddr *)&addr, sizeof(addr)); + if (ret < 0) + { + // Give up - fall back on regular usher command + fprintf(errFile, "Second attempt to connect socket %d to path '%s' failed: %s\n", + socketFd, usherServerSocketPath, strerror(errno)); + socketFd = -1; + } + } + } + } +return socketFd; +} + +// Server sends ASCII EOT character (4) when done. Sadly I can't find a header file that defines EOT. +#define EOT 4 + +static boolean sendQuery(int socketFd, char *cmd[], char *db, struct treeChoices *treeChoices, + FILE *errFile) +/* Send command to socket, read response on socket, return TRUE if we get a successful response. */ +{ +boolean success = FALSE; +struct dyString *dyMessage = dyStringNew(0); +int ix; +for (ix = 0; cmd[ix] != NULL; ix++) + { + // Don't include args from -T onward; server rejects requests with -T or --optimization_radius + if (sameString("-T", cmd[ix])) + break; + dyStringPrintf(dyMessage, "%s\n", cmd[ix]); + } +// But we do need --no-ignore-prefix: +dyStringPrintf(dyMessage, "--no-ignore-prefix\n"USHER_DEDUP_PREFIX"\n"); +dyStringAppendC(dyMessage, '\n'); +boolean serverError = FALSE; +int bytesWritten = write(socketFd, dyMessage->string, dyMessage->stringSize); +if (bytesWritten == dyMessage->stringSize) + { + struct lineFile *lf = lineFileAttach("server socket", TRUE, socketFd); + if (lf) + { + char *line; + while (lineFileNext(lf, &line, NULL)) + { + if (startsWith("Tree", line) && endsWith(line, "not found")) + { + // Tell the server to reload the latest protobufs + serverReloadProtobufs(getUsherServerMfifoPath(db), treeChoices); + // Reloading multiple trees takes a while, so fall back on standalone usher(-sampled) + serverError = TRUE; + // Continue reading output from server. + } + else if (line[0] == EOT) + { + success = ! serverError; + break; + } + else if (isNotEmpty(line)) + fprintf(errFile, "%s\n", line); + } + } + else + fprintf(errFile, "Failed to attach linefile to socket %d.\n", socketFd); + } +else + fprintf(errFile, "Failed to send query to socket %d: attempted to write %ld bytes, ret=%d\n", + socketFd, dyMessage->stringSize, bytesWritten); +dyStringFree(&dyMessage); +return success; +} + +static boolean runUsherServer(char *db, char *cmd[], char *stderrFile, + struct treeChoices *treeChoices, int *pStartTime) +/* Start the server if necessary, connect to it, send a query, get response and return TRUE if. + * all goes well. If unsuccessful, write reasons to errFile and return FALSE. */ +{ +boolean success = FALSE; +if (serverIsConfigured(db)) + { + FILE *errFile = mustOpen(stderrFile, "w"); + int serverSocket = getServerSocket(db, treeChoices, errFile); + + reportTiming(pStartTime, "get socket"); + if (serverSocket > 0) + { + success = sendQuery(serverSocket, cmd, db, treeChoices, errFile); + close(serverSocket); + reportTiming(pStartTime, "send query"); + } + carefulClose(&errFile); + } +return success; +} + #define MAX_SUBTREES 1000 -struct usherResults *runUsher(char *usherPath, char *usherAssignmentsPath, char *vcfFile, - int subtreeSize, struct slName *userSampleIds, - struct hash *condensedNodes, int *pStartTime) +struct usherResults *runUsher(char *db, char *usherPath, char *usherAssignmentsPath, char *vcfFile, + int subtreeSize, struct slName **pUserSampleIds, + struct treeChoices *treeChoices, int *pStartTime) /* Open a pipe from Yatish Turakhia's usher program, save resulting big trees and * subtrees to trash files, return list of slRef to struct tempName for the trash files - * and parse other results out of stderr output. */ + * and parse other results out of stderr output. The usher-sampled version of usher might + * modify userSampleIds, adding a prefix if a sample with the same name is already in the tree. */ { struct usherResults *results = usherResultsNew(); char subtreeSizeStr[16]; safef(subtreeSizeStr, sizeof subtreeSizeStr, "%d", subtreeSize); struct tempName tnOutDir; trashDirFile(&tnOutDir, "ct", "usher_outdir", ".dir"); char *cmd[] = { usherPath, "-v", vcfFile, "-i", usherAssignmentsPath, "-d", tnOutDir.forCgi, - "-k", subtreeSizeStr, "-K", SINGLE_SUBTREE_SIZE, "-T", USHER_NUM_THREADS, "-u", + "-k", subtreeSizeStr, "-K", SINGLE_SUBTREE_SIZE, "-u", + "-T", USHER_NUM_THREADS, // Don't pass args from -T onward to server + "--optimization_radius", "0", // Don't pass these to original usher, only -sampled + "--no-ignore-prefix", USHER_DEDUP_PREFIX, NULL }; -char **cmds[] = { cmd, NULL }; struct tempName tnStderr; trashDirFile(&tnStderr, "ct", "usher_stderr", ".txt"); -struct pipeline *pl = pipelineOpen(cmds, pipelineRead, NULL, tnStderr.forCgi, 0); -pipelineClose(&pl); -reportTiming(pStartTime, "run usher"); -parseStderr(tnStderr.forCgi, results->samplePlacements); +struct tempName tnServerStderr; +trashDirFile(&tnServerStderr, "ct", "usher_server_stderr", ".txt"); +char *stderrFile = tnServerStderr.forCgi; +if (! runUsherServer(db, cmd, tnServerStderr.forCgi, treeChoices, pStartTime)) + { + if (!endsWith(usherPath, "-sampled")) + { + // Truncate cmd for original usher: remove usher-sampled-specific option + int ix = stringArrayIx("--optimization_radius", cmd, ArraySize(cmd)-1); + if (ix > 0) + cmd[ix] = NULL; + } + runUsherCommand(cmd, tnStderr.forCgi, pStartTime); + stderrFile = tnStderr.forCgi; + } struct tempName *singleSubtreeTn = NULL, *subtreeTns[MAX_SUBTREES]; struct variantPathNode *singleSubtreeMuts = NULL, *subtreeMuts[MAX_SUBTREES]; -parsePlacements(tnOutDir.forCgi, tnStderr.forCgi, results->samplePlacements); +parsePlacements(tnOutDir.forCgi, stderrFile, results->samplePlacements, pUserSampleIds); int subtreeCount = processOutDirFiles(results, tnOutDir.forCgi, &singleSubtreeTn, &singleSubtreeMuts, subtreeTns, subtreeMuts, MAX_SUBTREES); -if (singleSubtreeTn == NULL) +if (singleSubtreeTn) { - warn("Sorry, there was a problem running usher. " - "Please ask genome-www@soe.ucsc.edu to take a look at %s.", tnStderr.forCgi); - return NULL; - } results->subtreeInfoList = parseSubtrees(subtreeCount, singleSubtreeTn, singleSubtreeMuts, - subtreeTns, subtreeMuts, userSampleIds, condensedNodes); + subtreeTns, subtreeMuts, *pUserSampleIds); results->singleSubtreeInfo = results->subtreeInfoList; results->subtreeInfoList = results->subtreeInfoList->next; + removeOutDir(tnOutDir.forCgi); + } +else + { + results = NULL; + warn("Sorry, there was a problem running usher. " + "Please ask genome-www@soe.ucsc.edu to take a look at %s.", stderrFile); + } +reportTiming(pStartTime, "parse results from usher"); return results; } static void addEmptyPlacements(struct slName *sampleIds, struct hash *samplePlacements) /* Parsing an usher-style clades.txt file from matUtils extract requires samplePlacements to * have placementInfo for each sample. When running usher, those are added when we parse * usher stderr; when running matUtils, just allocate one for each sample. */ { struct slName *sample; for (sample = sampleIds; sample != NULL; sample = sample->next) { struct placementInfo *info; AllocVar(info); hashAdd(samplePlacements, sample->name, info); info->sampleId = cloneString(sample->name); } } struct usherResults *runMatUtilsExtractSubtrees(char *matUtilsPath, char *protobufPath, int subtreeSize, struct slName *sampleIds, - struct hash *condensedNodes, int *pStartTime) + int *pStartTime) /* Open a pipe from Yatish Turakhia and Jakob McBroome's matUtils extract to extract subtrees * containing sampleIds, save resulting subtrees to trash files, return subtree results. * Caller must ensure that sampleIds are names of leaves in the protobuf tree. */ { struct usherResults *results = usherResultsNew(); char subtreeSizeStr[16]; safef(subtreeSizeStr, sizeof subtreeSizeStr, "%d", subtreeSize); char *numThreadsStr = "16"; struct tempName tnSamples; trashDirFile(&tnSamples, "ct", "matUtilsExtractSamples", ".txt"); FILE *f = mustOpen(tnSamples.forCgi, "w"); struct slName *sample; for (sample = sampleIds; sample != NULL; sample = sample->next) fprintf(f, "%s\n", sample->name); carefulClose(&f); struct tempName tnOutDir; trashDirFile(&tnOutDir, "ct", "matUtils_outdir", ".dir"); char *cmd[] = { matUtilsPath, "extract", "-i", protobufPath, "-d", tnOutDir.forCgi, "-s", tnSamples.forCgi, "-x", subtreeSizeStr, "-X", SINGLE_SUBTREE_SIZE, "-T", numThreadsStr, "--usher-clades-txt", NULL }; char **cmds[] = { cmd, NULL }; struct tempName tnStderr; trashDirFile(&tnStderr, "ct", "matUtils_stderr", ".txt"); struct pipeline *pl = pipelineOpen(cmds, pipelineRead, NULL, tnStderr.forCgi, 0); pipelineClose(&pl); reportTiming(pStartTime, "run matUtils"); addEmptyPlacements(sampleIds, results->samplePlacements); struct tempName *singleSubtreeTn = NULL, *subtreeTns[MAX_SUBTREES]; struct variantPathNode *singleSubtreeMuts = NULL, *subtreeMuts[MAX_SUBTREES]; int subtreeCount = processOutDirFiles(results, tnOutDir.forCgi, &singleSubtreeTn, &singleSubtreeMuts, subtreeTns, subtreeMuts, MAX_SUBTREES); results->subtreeInfoList = parseSubtrees(subtreeCount, singleSubtreeTn, singleSubtreeMuts, - subtreeTns, subtreeMuts, sampleIds, condensedNodes); + subtreeTns, subtreeMuts, sampleIds); results->singleSubtreeInfo = results->subtreeInfoList; results->subtreeInfoList = results->subtreeInfoList->next; +reportTiming(pStartTime, "process results from matUtils"); return results; }