0fffa3c31de4845a9bd3f06c0992f971e4d8a7a3 angie Fri Oct 28 15:08:06 2022 -0700 Performance improvements for trees with millions of sequences: * Use @yceh's usher-sampled-server if configured; it preloads protobufs and can start placing sequences immediately using usher-sampled, a faster version of usher * Use usher-sampled instead of usher if server is not configured but usher-sampled is available * Load sample metadata file in a pthread while usher(-sampled(-server)) or matUtils is running * Skip checking for sample name clashes in uploaded fasta when using usher-sampled(-server)'s new --no-ignore-prefix option (but look for the prefix when parsing results) * Avoid parsing the protobuf and traversing the big tree unless absolutely necessary ** Subtrees from usher/matUtils have not included condensed nodes in a long time; remove lots of condensedNodes/summarization code from phyloPlace.c, runUsher.c, writeCustomTracks.c ** Use subtrees instead of big tree when possible (in findNearestNeighbor, treeToBaseAlleles, uploadedSamplesTree) ** Skip the informativeBases stuff that inhibits masking of sites from Problematic Sites set when the tree was built with an earlier version -- that pretty much never applies anymore now that only daily-updated trees are offered, not a range from old to new. ** Allow config.ra to specify a flat file of sample names (needed for searching user's uploaded names/IDs before calling matUtils) instead of getting names from the big tree diff --git src/hg/hgPhyloPlace/phyloPlace.c src/hg/hgPhyloPlace/phyloPlace.c index 7cf8d44..fe1e4a4 100644 --- src/hg/hgPhyloPlace/phyloPlace.c +++ src/hg/hgPhyloPlace/phyloPlace.c @@ -1,42 +1,43 @@ /* Place SARS-CoV-2 sequences in phylogenetic tree using usher program. */ -/* Copyright (C) 2020 The Regents of the University of California */ +/* Copyright (C) 2020-2022 The Regents of the University of California */ #include "common.h" #include "bigBed.h" #include "cart.h" #include "cheapcgi.h" #include "errCatch.h" #include "fa.h" #include "genePred.h" #include "grp.h" #include "hCommon.h" #include "hash.h" #include "hgConfig.h" #include "htmshell.h" #include "hubConnect.h" #include "hui.h" #include "iupac.h" #include "jsHelper.h" #include "linefile.h" #include "obscure.h" #include "parsimonyProto.h" #include "phyloPlace.h" #include "phyloTree.h" #include "pipeline.h" #include "psl.h" +#include "pthreadWrap.h" #include "ra.h" #include "regexHelper.h" #include "trackHub.h" #include "trashDir.h" #include "vcf.h" // Globals: static boolean measureTiming = FALSE; // Parameter constants: int maxGenotypes = 1000; // Upper limit on number of samples user can upload at once. boolean showParsimonyScore = FALSE; struct slName *phyloPlaceDbList(struct cart *cart) @@ -141,36 +142,43 @@ { char *fileName = phyloPlaceDbSetting(db, settingName); if (isNotEmpty(fileName) && fileName[0] != '/' && !isUrl(fileName) && !fileExists(fileName)) { struct dyString *dy = dyStringCreate(PHYLOPLACE_DATA_DIR "/%s/%s", trackHubSkipHubName(db), fileName); if (fileExists(dy->string)) return dyStringCannibalize(&dy); else return NULL; } return fileName; } char *getUsherPath(boolean abortIfNotFound) -/* Return hgPhyloPlaceData/usher if it exists, else NULL. Do not free the returned value. */ +/* Return hgPhyloPlaceData/usher-sampled if it exists, else try hgPhyloPlaceData/usher, else NULL. + * Do not free the returned value. */ { -char *usherPath = PHYLOPLACE_DATA_DIR "/usher"; +char *usherPath = PHYLOPLACE_DATA_DIR "/usher-sampled"; if (fileExists(usherPath)) return usherPath; -else if (abortIfNotFound) +else + { + usherPath = PHYLOPLACE_DATA_DIR "/usher"; + if (fileExists(usherPath)) + return usherPath; + } +if (abortIfNotFound) errAbort("Missing usher executable (expected to be at %s)", usherPath); return NULL; } char *getMatUtilsPath(boolean abortIfNotFound) /* Return hgPhyloPlaceData/matUtils if it exists, else NULL. Do not free the returned value. */ { char *matUtilsPath = PHYLOPLACE_DATA_DIR "/matUtils"; if (fileExists(matUtilsPath)) return matUtilsPath; else if (abortIfNotFound) errAbort("Missing matUtils executable (expected to be at %s)", matUtilsPath); return NULL; } @@ -211,64 +219,70 @@ struct treeChoices *loadTreeChoices(char *db) /* If /config.ra specifies a treeChoices file, load it up, else return NULL. */ { struct treeChoices *treeChoices = NULL; char *filename = phyloPlaceDbSettingPath(db, "treeChoices"); if (isNotEmpty(filename) && fileExists(filename)) { AllocVar(treeChoices); int maxChoices = 128; AllocArray(treeChoices->protobufFiles, maxChoices); AllocArray(treeChoices->metadataFiles, maxChoices); AllocArray(treeChoices->sources, maxChoices); AllocArray(treeChoices->descriptions, maxChoices); AllocArray(treeChoices->aliasFiles, maxChoices); + AllocArray(treeChoices->sampleNameFiles, maxChoices); struct lineFile *lf = lineFileOpen(filename, TRUE); char *line; while (lineFileNextReal(lf, &line)) { - char *words[6]; + char *words[7]; int wordCount = chopTabs(line, words); lineFileExpectAtLeast(lf, 4, wordCount); if (treeChoices->count >= maxChoices) { warn("File %s has too many lines, only showing first %d phylogenetic tree choices", filename, maxChoices); break; } struct dyString *dy = dyStringNew(0); addPathIfNecessary(dy, db, words[0]); treeChoices->protobufFiles[treeChoices->count] = cloneString(dy->string); addPathIfNecessary(dy, db, words[1]); treeChoices->metadataFiles[treeChoices->count] = cloneString(dy->string); treeChoices->sources[treeChoices->count] = cloneString(words[2]); // Description can be either a file or just some text. addPathIfNecessary(dy, db, words[3]); if (fileExists(dy->string)) { char *desc = NULL; readInGulp(dy->string, &desc, NULL); treeChoices->descriptions[treeChoices->count] = desc; } else treeChoices->descriptions[treeChoices->count] = cloneString(words[3]); if (wordCount > 4) { addPathIfNecessary(dy, db, words[4]); treeChoices->aliasFiles[treeChoices->count] = cloneString(dy->string); } + if (wordCount > 5) + { + addPathIfNecessary(dy, db, words[5]); + treeChoices->sampleNameFiles[treeChoices->count] = cloneString(dy->string); + } treeChoices->count++; dyStringFree(&dy); } lineFileClose(&lf); } return treeChoices; } static char *urlFromTn(struct tempName *tn) /* Make a full URL to a trash file that our net.c code will be able to follow, for when we can't * just leave it up to the user's web browser to do the right thing with "../". */ { struct dyString *dy = dyStringCreate("%s%s", hLocalHostCgiBinUrl(), tn->forHtml); return dyStringCannibalize(&dy); } @@ -555,33 +569,35 @@ struct seqInfo *si = hashFindVal(seqInfoHash, pi->sampleId); if (si) { struct slName *sampleMuts = NULL; struct singleNucChange *snc; for (snc = si->sncList; snc != NULL; snc = snc->next) { char mutStr[64]; safef(mutStr, sizeof mutStr, "%c%d%c", snc->refBase, snc->chromStart+1, snc->newBase); slNameAddHead(&sampleMuts, mutStr); } slReverse(&sampleMuts); pi->sampleMuts = sampleMuts; } else + { errAbort("addSampleMutsFromSeqInfo: no seqInfo for placed sequence '%s'", pi->sampleId); } } +} static void displaySampleMuts(struct placementInfo *info, char *refAcc) { printf("

Differences from the reference genome " "(%s): ", refAcc, refAcc); if (info->sampleMuts == NULL) printf("(None; identical to reference)"); else { struct slName *sln; for (sln = info->sampleMuts; sln != NULL; sln = sln->next) { if (sln != info->sampleMuts) printf(", "); @@ -624,127 +640,91 @@ static void displayVariantPath(struct variantPathNode *variantPath, char *sampleId) /* Display mutations on the path to this sample. */ { printf("

Mutations along the path from the root of the phylogenetic tree to %s:
", sampleId); if (variantPath) { variantPathPrint(variantPath); puts("
"); } else puts("(None; your sample was placed at the root of the phylogenetic tree)"); puts("

"); } -static struct variantPathNode *findLastInternalNode(struct variantPathNode *variantPath, - int minNewNode) -/* Return the last node in variantPath with a numeric name less than minNewNode, or NULL. */ -{ -if (!variantPath || !isInternalNodeName(variantPath->nodeName, minNewNode)) - return NULL; -while (variantPath->next && isInternalNodeName(variantPath->next->nodeName, minNewNode)) - variantPath = variantPath->next; -if (variantPath && isInternalNodeName(variantPath->nodeName, minNewNode)) - return variantPath; -return NULL; -} - static int mutCountCmp(const void *a, const void *b) /* Compare number of mutations of phyloTree nodes a and b. */ { const struct phyloTree *nodeA = *(struct phyloTree * const *)a; const struct phyloTree *nodeB = *(struct phyloTree * const *)b; return slCount(nodeA->priv) - slCount(nodeB->priv); } -static char *leafWithLeastMuts(struct phyloTree *node, struct mutationAnnotatedTree *bigTree) -/* If node has a leaf child with no mutations of its own, return the name of that leaf. +static char *leafWithLeastMuts(struct phyloTree *node, struct hash *excludeHash) +/* If node has a leaf child (not in excludeHash) with no mutations of its own, return that leaf name. * Otherwise, if node has leaf children, return the name of the leaf with the fewest mutations. * Otherwise return NULL. */ { char *leafName = NULL; int leafCount = 0; int i; for (i = 0; i < node->numEdges; i++) { struct phyloTree *kid = node->edges[i]; - if (kid->numEdges == 0) + if (kid->numEdges == 0 && !hashLookup(excludeHash, kid->ident->name)) { leafCount++; struct singleNucChange *kidMuts = kid->priv; if (!kidMuts) { - struct slName *nodeList = hashFindVal(bigTree->condensedNodes, kid->ident->name); - if (nodeList) - leafName = cloneString(nodeList->name); - else leafName = cloneString(kid->ident->name); break; } } } if (leafName == NULL && leafCount) { // Pick the leaf with the fewest mutations. struct phyloTree *leafKids[leafCount]; int leafIx = 0; for (i = 0; i < node->numEdges; i++) { struct phyloTree *kid = node->edges[i]; - if (kid->numEdges == 0) + if (kid->numEdges == 0 && !hashLookup(excludeHash, kid->ident->name)) leafKids[leafIx++] = kid; } qsort(leafKids, leafCount, sizeof(leafKids[0]), mutCountCmp); leafName = cloneString(leafKids[0]->ident->name); } return leafName; } -static char *findNearestNeighbor(struct mutationAnnotatedTree *bigTree, char *sampleId, - struct variantPathNode *variantPath) -/* Use the sequence of mutations in variantPath to find sampleId's parent node in bigTree, - * then look for most similar leaf sibling(s). */ -{ -char *nearestNeighbor = NULL; -int bigTreeINodeCount = phyloCountInternalNodes(bigTree->tree); -int minNewNode = bigTreeINodeCount + 1; // 1-based -struct variantPathNode *lastOldNode = findLastInternalNode(variantPath, minNewNode); -struct phyloTree *node = lastOldNode ? hashFindVal(bigTree->nodeHash, lastOldNode->nodeName) : - bigTree->tree; -if (lastOldNode && !node) - { - if (startsWith(USHER_NODE_PREFIX, lastOldNode->nodeName)) - // protobuf still has number even if usher prepends USHER_NODE_PREFIX when parsing. - node = hashFindVal(bigTree->nodeHash, lastOldNode->nodeName+strlen(USHER_NODE_PREFIX)); - if (node == NULL) - errAbort("Can't find last internal node %s for sample %s", lastOldNode->nodeName, sampleId); - } -// Look for a leaf kid with no mutations relative to the parent, should be closest. -if (node->numEdges == 0) +static char *findNearestNeighbor(struct phyloTree *tree, char *sampleId, + struct hash *samplePlacements) +/* Find sampleId's parent node in bigTree, then look for most similar leaf sibling(s), + * excluding other uploaded samples (which can be found in samplePlacements). */ { - struct slName *nodeList = hashFindVal(bigTree->condensedNodes, node->ident->name); - if (nodeList) - nearestNeighbor = cloneString(nodeList->name); - else - nearestNeighbor = cloneString(node->ident->name); - } -else +struct phyloTree *sampleNode = phyloFindName(tree, sampleId); +if (!sampleNode || ! sampleNode->parent) + return NULL; +struct phyloTree *node = sampleNode->parent; +char *nearestNeighbor = leafWithLeastMuts(node, samplePlacements); +while (nearestNeighbor == NULL && node->parent != NULL) { - nearestNeighbor = leafWithLeastMuts(node, bigTree); - if (nearestNeighbor == NULL && node->parent != NULL) - nearestNeighbor = leafWithLeastMuts(node->parent, bigTree); + node = node->parent; + nearestNeighbor = leafWithLeastMuts(node, samplePlacements); } return nearestNeighbor; } static void printVariantPathNoNodeNames(FILE *f, struct variantPathNode *variantPath) /* Print out variant path with no node names (even if non-numeric) to f. */ { struct variantPathNode *vpn; for (vpn = variantPath; vpn != NULL; vpn = vpn->next) { if (vpn != variantPath) fprintf(f, " > "); struct singleNucChange *snc; for (snc = vpn->sncList; snc != NULL; snc = snc->next) { @@ -942,48 +922,30 @@ met = hashFindVal(sampleMetadata, eg+strlen("_eg_")); } return met; } static char *lineageForSample(struct hash *sampleMetadata, char *sampleId) /* Look up sampleId's lineage in epiToLineage file. Return NULL if we don't find a match. */ { char *lineage = NULL; struct sampleMetadata *met = metadataForSample(sampleMetadata, sampleId); if (met) lineage = met->lineage ? met->lineage : met->nLineage; return lineage; } -static void findNearestNeighbors(struct hash *samplePlacements, struct hash *sampleMetadata, - struct mutationAnnotatedTree *bigTree) -/* For each placed sample, find the nearest neighbor in the bigTree and its assigned lineage, - * and fill in those two fields of placementInfo. */ -{ -if (!bigTree) - return; -struct hashCookie cookie = hashFirst(samplePlacements); -struct hashEl *hel; -while ((hel = hashNext(&cookie)) != NULL) - { - struct placementInfo *info = hel->val; - info->nearestNeighbor = findNearestNeighbor(bigTree, info->sampleId, info->variantPath); - if (isNotEmpty(info->nearestNeighbor)) - info->neighborLineage = lineageForSample(sampleMetadata, info->nearestNeighbor); - } -} - static void printLineageLink(char *lineage, char *db) /* Print lineage with link to outbreak.info or pango-designation issue if appropriate. * Caller must handle case of empty/NULL lineage. */ { boolean isSarsCov2 = sameString(db, "wuhCor1"); if (isEmpty(lineage)) errAbort("programmer error:printLineageLink called with empty lineage"); else if (isSarsCov2 && startsWith("proposed", lineage)) printf("%s", lineage+strlen("proposed"), lineage); else if (isSarsCov2 && differentString(lineage, "None") && differentString(lineage, "Unassigned")) { // Trim _suffix that I add to extra tree annotations for problematic branches, if present. char lineageCopy[strlen(lineage)+1]; safecpy(lineageCopy, sizeof lineageCopy, lineage); @@ -1040,32 +1002,40 @@ break; } } return diff; } static struct slRef *getPlacementRefList(struct slName *sampleIds, struct hash *samplePlacements) /* Look up sampleIds in samplePlacements and return ref list of placements. */ { struct slRef *placementRefs = NULL; struct slName *sample; for (sample = sampleIds; sample != NULL; sample = sample->next) { struct placementInfo *info = hashFindVal(samplePlacements, sample->name); if (!info) + { + // Usher might have added a prefix to distinguish from a sequence with the same name + // already in the tree. + char nameWithPrefix[strlen(USHER_DEDUP_PREFIX) + strlen(sample->name) + 1]; + safef(nameWithPrefix, sizeof nameWithPrefix, "%s%s", USHER_DEDUP_PREFIX, sample->name); + info = hashFindVal(samplePlacements, nameWithPrefix); + if (!info) errAbort("getPlacementRefList: can't find placement info for sample '%s'", sample->name); + } slAddHead(&placementRefs, slRefNew(info)); } slReverse(&placementRefs); return placementRefs; } static int countIdentical(struct slRef *placementRefs) /* Return the number of placements that have identical sampleMuts lists. */ { int clumpCount = 0; struct slRef *ref; for (ref = placementRefs; ref != NULL; ref = ref->next) { clumpCount++; if (ref->next == NULL || placementInfoRefCmpSampleMuts(&ref, &ref->next)) @@ -1225,51 +1195,74 @@ if (kidCount > 1) { slReverse(&prunedKids); // There is no phyloTreeFree, but if we ever add one, should use it here. node->numEdges = kidCount; struct phyloTree *kid; for (i = 0, kid = prunedKids; i < kidCount; i++, kid = kid->next) { node->edges[i] = kid; kid->parent = node; } } else return prunedKids; } -else if (! (node->ident->name && slNameInList(sampleIds, node->ident->name))) +else if (! (node->ident->name && slNameInListUseCase(sampleIds, node->ident->name))) node = NULL; return node; } +// NOTE: it would be more efficient to associate a subtree with each sample after parsing +// and sorting subtrees, e.g. hash and/or store a link in placementInfo so we don't have to search +// subtrees for every sample like this, but this will do for now. static struct subtreeInfo *subtreeInfoForSample(struct subtreeInfo *subtreeInfoList, char *name, int *retIx) /* Find the subtree that contains sample name and set *retIx to its index in the list. * If we can't find it, return NULL and set *retIx to -1. */ { struct subtreeInfo *ti; int ix; for (ti = subtreeInfoList, ix = 0; ti != NULL; ti = ti->next, ix++) - if (slNameInList(ti->subtreeUserSampleIds, name)) + if (slNameInListUseCase(ti->subtreeUserSampleIds, name)) break; if (ti == NULL) ix = -1; *retIx = ix; return ti; } +static void findNearestNeighbors(struct usherResults *results, struct hash *sampleMetadata) +/* For each placed sample, find the nearest neighbor in the subtree and its assigned lineage, + * and fill in those two fields of placementInfo. */ +{ +struct hashCookie cookie = hashFirst(results->samplePlacements); +struct hashEl *hel; +while ((hel = hashNext(&cookie)) != NULL) + { + struct placementInfo *info = hel->val; + int ix; + struct subtreeInfo *ti = subtreeInfoForSample(results->subtreeInfoList, info->sampleId, &ix); + if (!ti) + continue; + info->nearestNeighbor = findNearestNeighbor(ti->subtree, info->sampleId, + results->samplePlacements); + if (isNotEmpty(info->nearestNeighbor)) + info->neighborLineage = lineageForSample(sampleMetadata, info->nearestNeighbor); + } +} + static void lookForCladesAndLineages(struct hash *samplePlacements, boolean *retGotClades, boolean *retGotLineages) /* See if UShER has annotated any clades and/or lineages for seqs. */ { boolean gotClades = FALSE, gotLineages = FALSE; struct hashEl *hel; struct hashCookie cookie = hashFirst(samplePlacements); while ((hel = hashNext(&cookie)) != NULL) { struct placementInfo *pi = hel->val; if (pi) { if (isNotEmpty(pi->nextClade)) gotClades = TRUE; if (isNotEmpty(pi->pangoLineage)) @@ -1832,31 +1825,31 @@ if (gotClades) printf("n/a"); if (gotLineages) printf("n/a"); printf("n/an/an/an/an/a"); } printSubtreeTd(ur->subtreeInfoList, jsonTns, si->seq->name); puts(""); } puts("

"); } } static void summarizeSubtrees(struct slName *sampleIds, struct usherResults *results, struct hash *sampleMetadata, struct tempName *jsonTns[], - struct mutationAnnotatedTree *bigTree, char *db) + char *db) /* Print a summary table of pasted/uploaded identifiers and subtrees */ { boolean gotClades = FALSE, gotLineages = FALSE; lookForCladesAndLineages(results->samplePlacements, &gotClades, &gotLineages); puts(""); puts(""); if (gotClades) puts(""); if (gotLineages) puts("
SequenceNextstrain clade (UShER)" TOOLTIP("The Nextstrain clade " "assigned to the sequence by UShER according to its place in the phylogenetic tree") "Pango lineage (UShER)" TOOLTIP("The numEdges) { int i; for (i = 0; i < node->numEdges; i++) rWriteTsvSummaryTreeOrder(node->edges[i], f, results, seqInfoHash, geneInfoList, gSeqWin, spikeChanges); } else { writeOneTsvRow(f, node->ident->name, results, seqInfoHash, geneInfoList, gSeqWin, spikeChanges); } } -static struct hash *hashFromSeqInfoList(struct seqInfo *seqInfoList) -/* Hash sequence name to seqInfo for quick lookup. */ +static struct hash *hashFromSeqInfoListAndIds(struct seqInfo *seqInfoList, struct slName *sampleIds) +/* Hash sequence name (possibly prefixed by usher) to seqInfo for quick lookup. */ { struct hash *hash = hashNew(0); struct seqInfo *si; for (si = seqInfoList; si != NULL; si = si->next) + { + if (! slNameInListUseCase(sampleIds, si->seq->name)) + { + // Usher might have added a prefix to distinguish from a sequence with the same name + // already in the tree. + char nameWithPrefix[strlen(USHER_DEDUP_PREFIX) + strlen(si->seq->name) + 1]; + safef(nameWithPrefix, sizeof nameWithPrefix, "%s%s", USHER_DEDUP_PREFIX, si->seq->name); + if (slNameInListUseCase(sampleIds, nameWithPrefix)) + si->seq->name = cloneString(nameWithPrefix); + } hashAdd(hash, si->seq->name, si); + } return hash; } static struct tempName *writeTsvSummary(struct usherResults *results, struct phyloTree *sampleTree, struct slName *sampleIds, struct hash *seqInfoHash, struct geneInfo *geneInfoList, struct seqWindow *gSeqWin, struct hash *spikeChanges, int *pStartTime) /* Write a tab-separated summary file for download. If the user uploaded enough samples to make * a tree, then write out samples in tree order; otherwise use sampleIds list. * Accumulate S gene changes. */ { struct tempName *tsvTn = NULL; AllocVar(tsvTn); trashDirFile(tsvTn, "ct", "usher_samples", ".tsv"); FILE *f = mustOpen(tsvTn->forCgi, "w"); @@ -2444,66 +2448,70 @@ printf("Global phylogenetic tree with your sequences | ", treeFile); printf("TSV summary of sequences and placements | ", sampleSummaryFile); printf("TSV summary of Spike mutations | ", spikeSummaryFile); printf("ZIP file of subtree JSON and Newick files | ", subtreeZipFile); puts("

"); } static int subTreeInfoUserSampleCmp(const void *pa, const void *pb) /* Compare subtreeInfo by number of user sample IDs (highest number first). */ { struct subtreeInfo *tiA = *(struct subtreeInfo **)pa; struct subtreeInfo *tiB = *(struct subtreeInfo **)pb; return slCount(tiB->subtreeUserSampleIds) - slCount(tiA->subtreeUserSampleIds); } -static void getProtobufMetadataSource(char *db, char *protobufFile, char **retProtobufPath, - char **retMetadataFile, char **retSource, char **retAliasFile) +static void getProtobufMetadataSource(char *db, struct treeChoices *treeChoices, + char *protobufFile, char **retProtobufPath, + char **retMetadataFile, char **retSource, + char **retAliasFile, char **retSampleNameFile) /* If the config file specifies a list of tree choices, and protobufFile is a valid choice, then * set ret* to the files associated with that choice. Otherwise fall back on older conf settings. * Return the selected treeChoice if there is one. */ { -struct treeChoices *treeChoices = loadTreeChoices(db); if (treeChoices) { *retProtobufPath = protobufFile; if (isEmpty(*retProtobufPath)) *retProtobufPath = treeChoices->protobufFiles[0]; int i; for (i = 0; i < treeChoices->count; i++) if (sameString(treeChoices->protobufFiles[i], *retProtobufPath)) { *retMetadataFile = treeChoices->metadataFiles[i]; *retSource = treeChoices->sources[i]; *retAliasFile = treeChoices->aliasFiles[i]; + *retSampleNameFile = treeChoices->sampleNameFiles[i]; break; } if (i == treeChoices->count) { *retProtobufPath = treeChoices->protobufFiles[0]; *retMetadataFile = treeChoices->metadataFiles[0]; *retSource = treeChoices->sources[0]; *retAliasFile = treeChoices->aliasFiles[0]; + *retSampleNameFile = treeChoices->sampleNameFiles[0]; } } else { // Fall back on old settings *retProtobufPath = getUsherAssignmentsPath(db, TRUE); *retMetadataFile = phyloPlaceDbSettingPath(db, "metadataFile"); *retSource = "GISAID"; *retAliasFile = NULL; + *retSampleNameFile = NULL; } } static void addModVersion(struct hash *nameHash, char *id, char *fullName) /* Map id to fullName. If id has .version, strip that (modifying id!) and map versionless id * to fullName. */ { hashAdd(nameHash, id, fullName); char *p = strchr(id, '.'); if (p) { *p = '\0'; hashAdd(nameHash, id, fullName); } } @@ -2598,39 +2606,61 @@ int wordCount = chopTabs(line, words); lineFileExpectWords(lf, 2, wordCount); char *fullName = hashFindVal(nameHash, words[1]); if (fullName) hashAdd(nameHash, words[0], fullName); else { if (missExample == NULL) missExample = cloneString(words[1]); } } lineFileClose(&lf); } } -static struct hash *getTreeNames(struct mutationAnnotatedTree *bigTree, char *aliasFile, - boolean addComponents) -/* Make a hash of full names of leaves of bigTree; if addComponents, also map components of those - * names to the full names in case the user gives us partial names/IDs. */ +static struct hash *getTreeNames(char *nameFile, char *protobufFile, + struct mutationAnnotatedTree **pBigTree, + boolean addComponents, int *pStartTime) +/* Make a hash of full names of leaves of bigTree, using nameFile if it exists, otherwise + * falling back on the bigTree itself, loading it if necessary. If addComponents, also map + * components of those names to the full names in case the user gives us partial names/IDs. */ { +struct hash *nameHash = NULL; +if (isNotEmpty(nameFile) && fileExists(nameFile)) + { + nameHash = hashNew(26); + struct lineFile *lf = lineFileOpen(nameFile, TRUE); + char *line; + while (lineFileNext(lf, &line, NULL)) + addNameMaybeComponents(nameHash, line, addComponents); + lineFileClose(&lf); + } +else + { + if (*pBigTree == NULL) + { + *pBigTree = parseParsimonyProtobuf(protobufFile); + reportTiming(pStartTime, "parse protobuf file (at startup, because sample names file was " + "not provided)"); + } + struct mutationAnnotatedTree *bigTree = *pBigTree; int nodeCount = bigTree->nodeHash->elCount; -struct hash *nameHash = hashNew(digitsBaseTwo(nodeCount) + 3); + nameHash = hashNew(digitsBaseTwo(nodeCount) + 3); rAddLeafNames(bigTree->tree, bigTree->condensedNodes, nameHash, addComponents); -addAliases(nameHash, aliasFile); + } +reportTiming(pStartTime, "get tree names"); return nameHash; } static char *matchName(struct hash *nameHash, char *name) /* Look for a possibly partial name or ID provided by the user in nameHash. Return the result, * possibly NULL. If the full name doesn't match, try components of the name. */ { name = trimSpaces(name); // GISAID fasta headers all have hCoV-19/country/isolate/year|EPI_ISL_#|date; strip the hCoV-19 // because Nextstrain strips it in nextmeta/nextfasta download files, and so do I when building // UCSC's tree. if (startsWithNoCase("hCoV-19/", name)) name += strlen("hCoV-19/"); char *match = hashFindVal(nameHash, name); int minWordSize=5; @@ -2739,39 +2769,37 @@ } else { // Just one ID char oneId[strlen(line)+1]; safef(oneId, sizeof oneId, "%s%s", prefixA, numberA); char *match = hashFindVal(nameHash, oneId); foundAny |= tallyMatch(match, oneId, retMatches, retUnmatched); } // Skip past this match to see if the line has another range next. line += (substrArr[0].rm_eo - substrArr[0].rm_so); } return foundAny; } -static struct slName *readSampleIds(struct lineFile *lf, struct mutationAnnotatedTree *bigTree, - char *aliasFile) +static struct slName *readSampleIds(struct lineFile *lf, struct hash *nameHash) /* Read a file of sample names/IDs from the user; typically these will not be exactly the same * as the protobuf's (UCSC protobuf names are typically country/isolate/year|ID|date), so attempt * to find component matches if an exact match isn't found. */ { struct slName *sampleIds = NULL; struct slName *unmatched = NULL; -struct hash *nameHash = getTreeNames(bigTree, aliasFile, TRUE); char *line; while (lineFileNext(lf, &line, NULL)) { // If tab-sep or comma-sep, just try first word in line char *tab = strchr(line, '\t'); if (tab) *tab = '\0'; else { char *comma = strchr(line, ','); if (comma) *comma = '\0'; } char *match = matchName(nameHash, line); if (match) @@ -2798,203 +2826,253 @@ i++, example = example->next) { dyStringAppendSep(firstFew, ", "); dyStringPrintf(firstFew, "'%s'", example->name); } warn("Unable to find %d of your sequences in the tree, e.g. %s", slCount(unmatched), firstFew->string); dyStringFree(&firstFew); } else if (sampleIds == NULL) warn("Could not find any names in input; empty file uploaded?"); slNameFreeList(&unmatched); return sampleIds; } +void *loadMetadataWorker(void *arg) +/* pthread worker function to read a tab-sep metadata file and return it hashed by name. */ +{ +char *metadataFile = arg; +int startTime = clock1000(); +struct hash *sampleMetadata = getSampleMetadata(metadataFile); +reportTiming(&startTime, "read sample metadata (in a pthread)"); +return sampleMetadata; +} + +static pthread_t *mayStartLoaderPthread(char *filename, void *(*workerFunction)(void *)) +/* Fork off a child process that parses a file and returns the resulting data structure. */ +{ +pthread_t *pt; +AllocVar(pt); +if (! pthreadMayCreate(pt, NULL, workerFunction, filename)) + pt = NULL; +return pt; +} + char *phyloPlaceSamples(struct lineFile *lf, char *db, char *defaultProtobuf, boolean doMeasureTiming, int subtreeSize, int fontHeight, boolean *retSuccess) /* Given a lineFile that contains either FASTA, VCF, or a list of sequence names/ids: * If FASTA/VCF, then prepare VCF for usher; if that goes well then run usher, report results, * make custom track files and return the top-level custom track file. * If list of seq names/ids, then attempt to find their full names in the protobuf, run matUtils * to make subtrees, show subtree results, and return NULL. Set retSuccess to TRUE if we were * able to get at least some results for the user's input. */ { char *ctFile = NULL; if (retSuccess) *retSuccess = FALSE; measureTiming = doMeasureTiming; int startTime = clock1000(); struct tempName *vcfTn = NULL; struct slName *sampleIds = NULL; char *usherPath = getUsherPath(TRUE); char *protobufPath = NULL; char *source = NULL; char *metadataFile = NULL; char *aliasFile = NULL; -getProtobufMetadataSource(db, defaultProtobuf, &protobufPath, &metadataFile, &source, &aliasFile); -struct mutationAnnotatedTree *bigTree = parseParsimonyProtobuf(protobufPath); -reportTiming(&startTime, "parse protobuf file"); -if (! bigTree) - { - warn("Problem parsing %s; can't make subtree subtracks.", protobufPath); - } +char *sampleNameFile = NULL; +struct treeChoices *treeChoices = loadTreeChoices(db); +getProtobufMetadataSource(db, treeChoices, defaultProtobuf, + &protobufPath, &metadataFile, &source, &aliasFile, &sampleNameFile); +reportTiming(&startTime, "start up and find the tree etc. files"); +struct mutationAnnotatedTree *bigTree = NULL; lineFileCarefulNewlines(lf); char *chrom = hDefaultChrom(db); int chromSize = hChromSize(db, chrom); struct slName **maskSites = getProblematicSites(db, chrom, chromSize); -boolean *informativeBases = informativeBasesFromTree(bigTree->tree, maskSites, chromSize); +//#*** TODO: add CGI param option for this almost-never-needed tweak: +if (0) + { + bigTree = parseParsimonyProtobuf(protobufPath); + reportTiming(&startTime, "parse protobuf file (at startup, for excluding informativeBases " + "from maskSites)"); + informativeBasesFromTree(bigTree->tree, maskSites, chromSize); + reportTiming(&startTime, "remove any informative bases in tree from maskSites"); + } struct dnaSeq *refGenome = hChromSeq(db, chrom, 0, chromSize); boolean isFasta = FALSE; boolean subtreesOnly = FALSE; struct seqInfo *seqInfoList = NULL; if (lfLooksLikeFasta(lf)) { struct slPair *failedSeqs; struct slPair *failedPsls; - struct hash *treeNames = getTreeNames(bigTree, NULL, FALSE); - vcfTn = vcfFromFasta(lf, db, refGenome, informativeBases, maskSites, treeNames, + struct hash *treeNames = NULL; + // We need to check uploaded names in fasta only for original usher, not usher-sampled(-server). + if (!serverIsConfigured(db) && !endsWith(usherPath, "-sampled")) + treeNames = getTreeNames(sampleNameFile, protobufPath, &bigTree, FALSE, &startTime); + vcfTn = vcfFromFasta(lf, db, refGenome, maskSites, treeNames, &sampleIds, &seqInfoList, &failedSeqs, &failedPsls, &startTime); if (failedSeqs) { puts("

"); struct slPair *fail; for (fail = failedSeqs; fail != NULL; fail = fail->next) printf("%s
\n", fail->name); puts("

"); } if (failedPsls) { puts("

"); struct slPair *fail; for (fail = failedPsls; fail != NULL; fail = fail->next) printf("%s
\n", fail->name); puts("

"); } if (seqInfoList == NULL) printf("

Sorry, could not align any sequences to reference well enough to place in " "the phylogenetic tree.

\n"); isFasta = TRUE; } else if (lfLooksLikeVcf(lf)) { vcfTn = checkAndSaveVcf(lf, refGenome, maskSites, &seqInfoList, &sampleIds); reportTiming(&startTime, "check uploaded VCF"); } else { subtreesOnly = TRUE; - sampleIds = readSampleIds(lf, bigTree, aliasFile); + struct hash *treeNames = getTreeNames(sampleNameFile, protobufPath, &bigTree, TRUE, &startTime); + addAliases(treeNames, aliasFile); + reportTiming(&startTime, "load sample name aliases"); + sampleIds = readSampleIds(lf, treeNames); + reportTiming(&startTime, "look up uploaded sample names"); } lineFileClose(&lf); -struct hash *seqInfoHash = hashFromSeqInfoList(seqInfoList); if (sampleIds == NULL) { return ctFile; } + +// Kick off child thread to load metadata simultaneously with running usher or matUtils. +pthread_t *metadataPthread = mayStartLoaderPthread(metadataFile, loadMetadataWorker); + struct usherResults *results = NULL; if (vcfTn) { fflush(stdout); - results = runUsher(usherPath, protobufPath, vcfTn->forCgi, - subtreeSize, sampleIds, bigTree->condensedNodes, - &startTime); - if (results) - addSampleMutsFromSeqInfo(results->samplePlacements, seqInfoHash); + results = runUsher(db, usherPath, protobufPath, vcfTn->forCgi, subtreeSize, &sampleIds, + treeChoices, &startTime); } else if (subtreesOnly) { char *matUtilsPath = getMatUtilsPath(TRUE); results = runMatUtilsExtractSubtrees(matUtilsPath, protobufPath, subtreeSize, - sampleIds, bigTree->condensedNodes, - &startTime); + sampleIds, &startTime); + } + +struct hash *sampleMetadata = NULL; +if (metadataPthread) + { + pthreadJoin(metadataPthread, (void **)(&sampleMetadata)); + reportTiming(&startTime, "wait for sample metadata loading thread to finish"); } +else + { + // We really need metadata -- load it the slow way. + sampleMetadata = getSampleMetadata(metadataFile); + reportTiming(&startTime, "load sample metadata without pthread"); + } + if (results && results->singleSubtreeInfo) { if (retSuccess) *retSuccess = TRUE; puts("

"); readQcThresholds(db); int subtreeCount = slCount(results->subtreeInfoList); // Sort subtrees by number of user samples (largest first). slSort(&results->subtreeInfoList, subTreeInfoUserSampleCmp); // Make Nextstrain/auspice JSON file for each subtree. char *bigGenePredFile = phyloPlaceDbSettingPath(db, "bigGenePredFile"); struct geneInfo *geneInfoList = getGeneInfoList(bigGenePredFile, refGenome); struct seqWindow *gSeqWin = memSeqWindowNew(chrom, refGenome->dna); - struct hash *sampleMetadata = getSampleMetadata(metadataFile); struct hash *sampleUrls = hashNew(0); struct tempName *jsonTns[subtreeCount]; struct subtreeInfo *ti; int ix; for (ix = 0, ti = results->subtreeInfoList; ti != NULL; ti = ti->next, ix++) { AllocVar(jsonTns[ix]); char subtreeName[512]; safef(subtreeName, sizeof(subtreeName), "subtreeAuspice%d", ix+1); trashDirFile(jsonTns[ix], "ct", subtreeName, ".json"); treeToAuspiceJson(ti, db, geneInfoList, gSeqWin, sampleMetadata, NULL, jsonTns[ix]->forCgi, source); // Add a link for every sample to this subtree, so the single-subtree JSON can // link to subtree JSONs char *subtreeUrl = nextstrainUrlFromTn(jsonTns[ix]); struct slName *sample; for (sample = ti->subtreeUserSampleIds; sample != NULL; sample = sample->next) hashAdd(sampleUrls, sample->name, subtreeUrl); } struct tempName *singleSubtreeJsonTn; AllocVar(singleSubtreeJsonTn); trashDirFile(singleSubtreeJsonTn, "ct", "singleSubtreeAuspice", ".json"); treeToAuspiceJson(results->singleSubtreeInfo, db, geneInfoList, gSeqWin, sampleMetadata, sampleUrls, singleSubtreeJsonTn->forCgi, source); + reportTiming(&startTime, "make Auspice JSON"); struct subtreeInfo *subtreeInfoForButtons = results->subtreeInfoList; if (subtreeCount > MAX_SUBTREE_BUTTONS) subtreeInfoForButtons = NULL; makeButtonRow(singleSubtreeJsonTn, jsonTns, subtreeInfoForButtons, subtreeSize, isFasta, !subtreesOnly); printf("

If you have metadata you wish to display, click a 'view subtree in " "Nextstrain' button, and then you can drag on a CSV file to " "add it to the tree view." "

\n"); puts("

Note: " "The Nextstrain subtree views, and Download files below, are temporary files and will " "expire within two days. " "Please download the Nextstrain subtree JSON files if you will want to view them " "again in the future. The JSON files can be drag-dropped onto " "https://auspice.us/." "

"); struct tempName *tsvTn = NULL, *sTsvTn = NULL; struct tempName *zipTn = makeSubtreeZipFile(results, jsonTns, singleSubtreeJsonTn, &startTime); struct tempName *ctTn = NULL; if (subtreesOnly) { - summarizeSubtrees(sampleIds, results, sampleMetadata, jsonTns, bigTree, db); + summarizeSubtrees(sampleIds, results, sampleMetadata, jsonTns, db); reportTiming(&startTime, "describe subtrees"); } else { - findNearestNeighbors(results->samplePlacements, sampleMetadata, bigTree); + findNearestNeighbors(results, sampleMetadata); + reportTiming(&startTime, "find nearest neighbors"); // Make custom tracks for uploaded samples and subtree(s). struct phyloTree *sampleTree = NULL; - ctTn = writeCustomTracks(db, vcfTn, results, sampleIds, bigTree, source, fontHeight, + ctTn = writeCustomTracks(db, vcfTn, results, sampleIds, source, fontHeight, &sampleTree, &startTime); // Make a sample summary TSV file and accumulate S gene changes + struct hash *seqInfoHash = hashFromSeqInfoListAndIds(seqInfoList, sampleIds); + addSampleMutsFromSeqInfo(results->samplePlacements, seqInfoHash); struct hash *spikeChanges = hashNew(0); tsvTn = writeTsvSummary(results, sampleTree, sampleIds, seqInfoHash, geneInfoList, gSeqWin, spikeChanges, &startTime); sTsvTn = writeSpikeChangeSummary(spikeChanges, slCount(sampleIds)); downloadsRow(results->bigTreePlusTn->forHtml, tsvTn->forHtml, sTsvTn->forHtml, zipTn->forHtml); int seqCount = slCount(seqInfoList); if (seqCount <= MAX_SEQ_DETAILS) { char *refAcc = cloneString(chrom); if (regexMatch(refAcc, "v[0-9]+$")) { char *v = strrchr(refAcc, 'v'); assert(v != NULL);