be4311c07e14feb728abc6425ee606ffaa611a58 markd Fri Jan 22 06:46:58 2021 -0800 merge with master diff --git src/hg/hgPhyloPlace/phyloPlace.c src/hg/hgPhyloPlace/phyloPlace.c index b2e5c13..2a44636 100644 --- src/hg/hgPhyloPlace/phyloPlace.c +++ src/hg/hgPhyloPlace/phyloPlace.c @@ -96,30 +96,80 @@ else if (abortIfNotFound) errAbort("Missing required file %s", usherAssignmentsPath); return NULL; } //#*** This needs to go in a lib so CGIs know whether to include it in the menu. needs better name. boolean hgPhyloPlaceEnabled() /* Return TRUE if hgPhyloPlace is enabled in hg.conf and db wuhCor1 exists. */ { char *cfgSetting = cfgOption("hgPhyloPlaceEnabled"); boolean isEnabled = (isNotEmpty(cfgSetting) && differentWord(cfgSetting, "off") && differentWord(cfgSetting, "no")); return (isEnabled && hDbExists("wuhCor1")); } +static void addPathIfNecessary(struct dyString *dy, char *db, char *fileName) +/* If fileName exists, copy it into dy, else try hgPhyloPlaceData/<db>/fileName */ +{ +dyStringClear(dy); +if (fileExists(fileName)) + dyStringAppend(dy, fileName); +else + dyStringPrintf(dy, PHYLOPLACE_DATA_DIR "/%s/%s", db, fileName); +} + +struct treeChoices *loadTreeChoices(char *db) +/* If <db>/config.ra specifies a treeChoices file, load it up, else return NULL. */ +{ +struct treeChoices *treeChoices = NULL; +char *filename = phyloPlaceDbSettingPath(db, "treeChoices"); +if (isNotEmpty(filename) && fileExists(filename)) + { + AllocVar(treeChoices); + int maxChoices = 128; + AllocArray(treeChoices->protobufFiles, maxChoices); + AllocArray(treeChoices->metadataFiles, maxChoices); + AllocArray(treeChoices->sources, maxChoices); + AllocArray(treeChoices->descriptions, maxChoices); + struct lineFile *lf = lineFileOpen(filename, TRUE); + char *line; + while (lineFileNextReal(lf, &line)) + { + char *words[5]; + int wordCount = chopTabs(line, words); + lineFileExpectWords(lf, 4, wordCount); + if (treeChoices->count >= maxChoices) + { + warn("File %s has too many lines, only showing first %d phylogenetic tree choices", + filename, maxChoices); + break; + } + struct dyString *dy = dyStringNew(0); + addPathIfNecessary(dy, db, words[0]); + treeChoices->protobufFiles[treeChoices->count] = cloneString(dy->string); + addPathIfNecessary(dy, db, words[1]); + treeChoices->metadataFiles[treeChoices->count] = dyStringCannibalize(&dy); + treeChoices->sources[treeChoices->count] = cloneString(words[2]); + treeChoices->descriptions[treeChoices->count] = cloneString(words[3]); + treeChoices->count++; + } + lineFileClose(&lf); + } +return treeChoices; +} + static char *urlFromTn(struct tempName *tn) /* Make a full URL to a trash file that our net.c code will be able to follow, for when we can't * just leave it up to the user's web browser to do the right thing with "../". */ { struct dyString *dy = dyStringCreate("%s%s", hLocalHostCgiBinUrl(), tn->forHtml); return dyStringCannibalize(&dy); } void reportTiming(int *pStartTime, char *message) /* Print out a report to stderr of how much time something took. */ { if (measureTiming) { int now = clock1000(); fprintf(stderr, "%dms to %s\n", now - *pStartTime, message); @@ -712,37 +762,37 @@ met = hashFindVal(sampleMetadata, words[1]); } return met; } static char *lineageForSample(struct hash *sampleMetadata, char *sampleId) /* Look up sampleId's lineage in epiToLineage file. Return NULL if we don't find a match. */ { char *lineage = NULL; struct sampleMetadata *met = metadataForSample(sampleMetadata, sampleId); if (met) lineage = met->lineage; return lineage; } -static void displayNearestNeighbors(struct mutationAnnotatedTree *bigTree, +static void displayNearestNeighbors(struct mutationAnnotatedTree *bigTree, char *source, struct placementInfo *info, struct hash *sampleMetadata) /* Use info->variantPaths to find sample's nearest neighbor(s) in tree. */ { if (bigTree) { - printf("<p>Nearest neighboring GISAID sequence already in phylogenetic tree: "); + printf("<p>Nearest neighboring %s sequence already in phylogenetic tree: ", source); struct slName *neighbors = findNearestNeighbor(bigTree, info->sampleId, info->variantPath); struct slName *neighbor; for (neighbor = neighbors; neighbor != NULL; neighbor = neighbor->next) { if (neighbor != neighbors) printf(", "); printf("%s", neighbor->name); char *lineage = lineageForSample(sampleMetadata, neighbor->name); if (isNotEmpty(lineage)) printf(": lineage %s", lineage); } puts("</p>"); } } @@ -855,31 +905,31 @@ else safecpy(indentForKids+indentLen - 4, 4 + 1, "| "); } if (node->numEdges > 0) { char kidIndent[strlen(indent)+5]; safef(kidIndent, sizeof kidIndent, "%s%s", indentForKids, "+---"); int i; for (i = 0; i < node->numEdges; i++) asciiTree(node->edges[i], kidIndent, (i == node->numEdges - 1)); } } static void describeSamplePlacements(struct slName *sampleIds, struct hash *samplePlacements, struct phyloTree *subtree, struct hash *sampleMetadata, - struct mutationAnnotatedTree *bigTree) + struct mutationAnnotatedTree *bigTree, char *source) /* Report how each sample fits into the big tree. */ { // Sort sample placements by sampleMuts so we can group identical samples. struct slRef *placementRefs = getPlacementRefList(sampleIds, samplePlacements); slSort(&placementRefs, placementInfoRefCmpSampleMuts); int relatedCount = slCount(placementRefs); int clumpSize = countIdentical(placementRefs); if (clumpSize < relatedCount && relatedCount > 2) { // Not all of the related sequences are identical, so they will be broken down into // separate "clumps". List all related samples first to avoid confusion. puts("<pre>"); asciiTree(subtree, "", TRUE); puts("</pre>"); } @@ -912,31 +962,31 @@ } refsToGo = ref; displaySampleMuts(info); if (info->imputedBases) { puts("<p>Base values imputed by parsimony:\n<ul>"); struct baseVal *bv; for (bv = info->imputedBases; bv != NULL; bv = bv->next) printf("<li>%d: %s\n", bv->chromStart+1, bv->val); puts("</ul>"); puts("</p>"); } displayVariantPath(info->variantPath, clumpSize == 1 ? info->sampleId : "samples"); displayBestNodes(info, sampleMetadata); if (!showBestNodePaths) - displayNearestNeighbors(bigTree, info, sampleMetadata); + displayNearestNeighbors(bigTree, source, info, sampleMetadata); if (showParsimonyScore && info->parsimonyScore > 0) printf("<p>Parsimony score added by your sample: %d</p>\n", info->parsimonyScore); //#*** TODO: explain parsimony score } } struct phyloTree *phyloPruneToIds(struct phyloTree *node, struct slName *sampleIds) /* Prune all descendants of node that have no leaf descendants in sampleIds. */ { if (node->numEdges) { struct phyloTree *prunedKids = NULL; int i; for (i = 0; i < node->numEdges; i++) { @@ -1422,43 +1472,73 @@ slNameAddHead(&pSites[i], reason); } bigBedFileClose(&bbi); } return pSites; } static int subTreeInfoUserSampleCmp(const void *pa, const void *pb) /* Compare subtreeInfo by number of user sample IDs (highest number first). */ { struct subtreeInfo *tiA = *(struct subtreeInfo **)pa; struct subtreeInfo *tiB = *(struct subtreeInfo **)pb; return slCount(tiB->subtreeUserSampleIds) - slCount(tiA->subtreeUserSampleIds); } -char *phyloPlaceSamples(struct lineFile *lf, char *db, boolean doMeasureTiming, int subtreeSize, - int fontHeight) +char *phyloPlaceSamples(struct lineFile *lf, char *db, char *defaultProtobuf, + boolean doMeasureTiming, int subtreeSize, int fontHeight) /* Given a lineFile that contains either FASTA or VCF, prepare VCF for usher; * if that goes well then run usher, report results, make custom track files * and return the top-level custom track file; otherwise return NULL. */ { char *ctFile = NULL; measureTiming = doMeasureTiming; int startTime = clock1000(); struct tempName *vcfTn = NULL; struct slName *sampleIds = NULL; char *usherPath = getUsherPath(TRUE); -char *usherAssignmentsPath = getUsherAssignmentsPath(db, TRUE); +char *usherAssignmentsPath = NULL; +char *source = NULL; +char *metadataFile = NULL; +struct treeChoices *treeChoices = loadTreeChoices(db); +if (treeChoices) + { + usherAssignmentsPath = defaultProtobuf; + if (isEmpty(usherAssignmentsPath)) + usherAssignmentsPath = treeChoices->protobufFiles[0]; + int i; + for (i = 0; i < treeChoices->count; i++) + if (sameString(treeChoices->protobufFiles[i], usherAssignmentsPath)) + { + metadataFile = treeChoices->metadataFiles[i]; + source = treeChoices->sources[i]; + break; + } + if (i == treeChoices->count) + { + usherAssignmentsPath = treeChoices->protobufFiles[0]; + metadataFile = treeChoices->metadataFiles[0]; + source = treeChoices->sources[0]; + } + } +else + { + // Fall back on old settings + usherAssignmentsPath = getUsherAssignmentsPath(db, TRUE); + metadataFile = phyloPlaceDbSettingPath(db, "metadataFile"); + source = "GISAID"; + } struct mutationAnnotatedTree *bigTree = parseParsimonyProtobuf(usherAssignmentsPath); reportTiming(&startTime, "parse protobuf file"); if (! bigTree) { warn("Problem parsing %s; can't make subtree subtracks.", usherAssignmentsPath); } lineFileCarefulNewlines(lf); struct slName **maskSites = getProblematicSites(db); struct dnaSeq *refGenome = hChromSeq(db, chrom, 0, chromSize); boolean isFasta = FALSE; struct seqInfo *seqInfoList = NULL; if (lfLooksLikeFasta(lf)) { boolean *informativeBases = informativeBasesFromTree(bigTree->tree, maskSites); struct slPair *failedSeqs; @@ -1499,72 +1579,71 @@ warn("Sorry, can't recognize your uploaded data as FASTA or VCF.\n"); } lineFileClose(&lf); if (vcfTn) { struct usherResults *results = runUsher(usherPath, usherAssignmentsPath, vcfTn->forCgi, subtreeSize, sampleIds, bigTree->condensedNodes, &startTime); if (results->subtreeInfoList) { int subtreeCount = slCount(results->subtreeInfoList); // Sort subtrees by number of user samples (largest first). slSort(&results->subtreeInfoList, subTreeInfoUserSampleCmp); // Make Nextstrain/auspice JSON file for each subtree. char *bigGenePredFile = phyloPlaceDbSettingPath(db, "bigGenePredFile"); - char *metadataFile = phyloPlaceDbSettingPath(db, "metadataFile"); struct hash *sampleMetadata = getSampleMetadata(metadataFile); struct tempName *jsonTns[subtreeCount]; struct subtreeInfo *ti; int ix; for (ix = 0, ti = results->subtreeInfoList; ti != NULL; ti = ti->next, ix++) { AllocVar(jsonTns[ix]); trashDirFile(jsonTns[ix], "ct", "subtreeAuspice", ".json"); treeToAuspiceJson(ti, db, refGenome, bigGenePredFile, sampleMetadata, - jsonTns[ix]->forCgi); + jsonTns[ix]->forCgi, source); } puts("<p></p>"); makeButtonRow(jsonTns, subtreeCount, isFasta); printf("<p>If you have metadata you wish to display, click a 'view subtree in Nextstrain' " "button, and then you can drag on a CSV file to " "<a href='"NEXTSTRAIN_DRAG_DROP_DOC"' target=_blank>add it to the tree view</a>." "</p>\n"); summarizeSequences(seqInfoList, isFasta, results, jsonTns, sampleMetadata, bigTree); reportTiming(&startTime, "write summary table (including reading in lineages)"); for (ix = 0, ti = results->subtreeInfoList; ti != NULL; ti = ti->next, ix++) { int subtreeUserSampleCount = slCount(ti->subtreeUserSampleIds); printf("<h3>Subtree %d: ", ix+1); if (subtreeUserSampleCount > 1) printf("%d related samples", subtreeUserSampleCount); else if (subtreeCount > 1) printf("Unrelated sample"); printf("</h3>\n"); makeNextstrainButton("viewNextstrainSub", ix, jsonTns); puts("<br>"); // Make a sub-subtree with only user samples for display: struct phyloTree *subtree = phyloOpenTree(ti->subtreeTn->forCgi); subtree = phyloPruneToIds(subtree, ti->subtreeUserSampleIds); describeSamplePlacements(ti->subtreeUserSampleIds, results->samplePlacements, subtree, - sampleMetadata, bigTree); + sampleMetadata, bigTree, source); } reportTiming(&startTime, "describe placements"); // Make custom tracks for uploaded samples and subtree(s). struct tempName *ctTn = writeCustomTracks(vcfTn, results, sampleIds, bigTree->tree, - fontHeight, &startTime); + source, fontHeight, &startTime); // Offer big tree w/new samples for download puts("<h3>Downloads</h3>"); puts("<ul>"); printf("<li><a href='%s' download>SARS-CoV-2 phylogenetic tree " "with your samples (Newick file)</a>\n", results->bigTreePlusTn->forHtml); for (ix = 0, ti = results->subtreeInfoList; ti != NULL; ti = ti->next, ix++) { printf("<li><a href='%s' download>Subtree with %s", ti->subtreeTn->forHtml, ti->subtreeUserSampleIds->name); struct slName *sln; for (sln = ti->subtreeUserSampleIds->next; sln != NULL; sln = sln->next) printf(", %s", sln->name); puts(" (Newick file)</a>"); printf("<li><a href='%s' download>Auspice JSON for subtree with %s",