b7c2dcc865207dd23298429f93057c9068acef68
angie
Wed Dec 9 15:48:34 2020 -0800
hgPhyloPlace: Add config option treeChoices: tab-sep file of {protobuf, metadata, source, description} so we can offer the user a choice between GISAID and public sequence trees (possibly different releases for reproducibility).
diff --git src/hg/hgPhyloPlace/phyloPlace.c src/hg/hgPhyloPlace/phyloPlace.c
index b2e5c13..79dc952 100644
--- src/hg/hgPhyloPlace/phyloPlace.c
+++ src/hg/hgPhyloPlace/phyloPlace.c
@@ -96,30 +96,80 @@
else if (abortIfNotFound)
errAbort("Missing required file %s", usherAssignmentsPath);
return NULL;
}
//#*** This needs to go in a lib so CGIs know whether to include it in the menu. needs better name.
boolean hgPhyloPlaceEnabled()
/* Return TRUE if hgPhyloPlace is enabled in hg.conf and db wuhCor1 exists. */
{
char *cfgSetting = cfgOption("hgPhyloPlaceEnabled");
boolean isEnabled = (isNotEmpty(cfgSetting) &&
differentWord(cfgSetting, "off") && differentWord(cfgSetting, "no"));
return (isEnabled && hDbExists("wuhCor1"));
}
+static void addPathIfNecessary(struct dyString *dy, char *db, char *fileName)
+/* If fileName exists, copy it into dy, else try hgPhyloPlaceData/<db>/fileName */
+{
+dyStringClear(dy);
+if (fileExists(fileName))
+ dyStringAppend(dy, fileName);
+else
+ dyStringPrintf(dy, PHYLOPLACE_DATA_DIR "/%s/%s", db, fileName);
+}
+
+struct treeChoices *loadTreeChoices(char *db)
+/* If <db>/config.ra specifies a treeChoices file, load it up, else return NULL. */
+{
+struct treeChoices *treeChoices = NULL;
+char *filename = phyloPlaceDbSettingPath(db, "treeChoices");
+if (isNotEmpty(filename) && fileExists(filename))
+ {
+ AllocVar(treeChoices);
+ int maxChoices = 128;
+ AllocArray(treeChoices->protobufFiles, maxChoices);
+ AllocArray(treeChoices->metadataFiles, maxChoices);
+ AllocArray(treeChoices->sources, maxChoices);
+ AllocArray(treeChoices->descriptions, maxChoices);
+ struct lineFile *lf = lineFileOpen(filename, TRUE);
+ char *line;
+ while (lineFileNextReal(lf, &line))
+ {
+ char *words[5];
+ int wordCount = chopTabs(line, words);
+ lineFileExpectWords(lf, 4, wordCount);
+ if (treeChoices->count >= maxChoices)
+ {
+ warn("File %s has too many lines, only showing first %d phylogenetic tree choices",
+ filename, maxChoices);
+ break;
+ }
+ struct dyString *dy = dyStringNew(0);
+ addPathIfNecessary(dy, db, words[0]);
+ treeChoices->protobufFiles[treeChoices->count] = cloneString(dy->string);
+ addPathIfNecessary(dy, db, words[1]);
+ treeChoices->metadataFiles[treeChoices->count] = dyStringCannibalize(&dy);
+ treeChoices->sources[treeChoices->count] = cloneString(words[2]);
+ treeChoices->descriptions[treeChoices->count] = cloneString(words[3]);
+ treeChoices->count++;
+ }
+ lineFileClose(&lf);
+ }
+return treeChoices;
+}
+
static char *urlFromTn(struct tempName *tn)
/* Make a full URL to a trash file that our net.c code will be able to follow, for when we can't
* just leave it up to the user's web browser to do the right thing with "../". */
{
struct dyString *dy = dyStringCreate("%s%s", hLocalHostCgiBinUrl(), tn->forHtml);
return dyStringCannibalize(&dy);
}
void reportTiming(int *pStartTime, char *message)
/* Print out a report to stderr of how much time something took. */
{
if (measureTiming)
{
int now = clock1000();
fprintf(stderr, "%dms to %s\n", now - *pStartTime, message);
@@ -712,37 +762,37 @@
met = hashFindVal(sampleMetadata, words[1]);
}
return met;
}
static char *lineageForSample(struct hash *sampleMetadata, char *sampleId)
/* Look up sampleId's lineage in epiToLineage file. Return NULL if we don't find a match. */
{
char *lineage = NULL;
struct sampleMetadata *met = metadataForSample(sampleMetadata, sampleId);
if (met)
lineage = met->lineage;
return lineage;
}
-static void displayNearestNeighbors(struct mutationAnnotatedTree *bigTree,
+static void displayNearestNeighbors(struct mutationAnnotatedTree *bigTree, char *source,
struct placementInfo *info, struct hash *sampleMetadata)
/* Use info->variantPaths to find sample's nearest neighbor(s) in tree. */
{
if (bigTree)
{
- printf("<p>Nearest neighboring GISAID sequence already in phylogenetic tree: ");
+ printf("<p>Nearest neighboring %s sequence already in phylogenetic tree: ", source);
struct slName *neighbors = findNearestNeighbor(bigTree, info->sampleId, info->variantPath);
struct slName *neighbor;
for (neighbor = neighbors; neighbor != NULL; neighbor = neighbor->next)
{
if (neighbor != neighbors)
printf(", ");
printf("%s", neighbor->name);
char *lineage = lineageForSample(sampleMetadata, neighbor->name);
if (isNotEmpty(lineage))
printf(": lineage %s", lineage);
}
puts("</p>");
}
}
@@ -855,31 +905,31 @@
else
safecpy(indentForKids+indentLen - 4, 4 + 1, "| ");
}
if (node->numEdges > 0)
{
char kidIndent[strlen(indent)+5];
safef(kidIndent, sizeof kidIndent, "%s%s", indentForKids, "+---");
int i;
for (i = 0; i < node->numEdges; i++)
asciiTree(node->edges[i], kidIndent, (i == node->numEdges - 1));
}
}
static void describeSamplePlacements(struct slName *sampleIds, struct hash *samplePlacements,
struct phyloTree *subtree, struct hash *sampleMetadata,
- struct mutationAnnotatedTree *bigTree)
+ struct mutationAnnotatedTree *bigTree, char *source)
/* Report how each sample fits into the big tree. */
{
// Sort sample placements by sampleMuts so we can group identical samples.
struct slRef *placementRefs = getPlacementRefList(sampleIds, samplePlacements);
slSort(&placementRefs, placementInfoRefCmpSampleMuts);
int relatedCount = slCount(placementRefs);
int clumpSize = countIdentical(placementRefs);
if (clumpSize < relatedCount && relatedCount > 2)
{
// Not all of the related sequences are identical, so they will be broken down into
// separate "clumps". List all related samples first to avoid confusion.
puts("<pre>");
asciiTree(subtree, "", TRUE);
puts("</pre>");
}
@@ -912,31 +962,31 @@
}
refsToGo = ref;
displaySampleMuts(info);
if (info->imputedBases)
{
puts("<p>Base values imputed by parsimony:\n<ul>");
struct baseVal *bv;
for (bv = info->imputedBases; bv != NULL; bv = bv->next)
printf("<li>%d: %s\n", bv->chromStart+1, bv->val);
puts("</ul>");
puts("</p>");
}
displayVariantPath(info->variantPath, clumpSize == 1 ? info->sampleId : "samples");
displayBestNodes(info, sampleMetadata);
if (!showBestNodePaths)
- displayNearestNeighbors(bigTree, info, sampleMetadata);
+ displayNearestNeighbors(bigTree, source, info, sampleMetadata);
if (showParsimonyScore && info->parsimonyScore > 0)
printf("<p>Parsimony score added by your sample: %d</p>\n", info->parsimonyScore);
//#*** TODO: explain parsimony score
}
}
struct phyloTree *phyloPruneToIds(struct phyloTree *node, struct slName *sampleIds)
/* Prune all descendants of node that have no leaf descendants in sampleIds. */
{
if (node->numEdges)
{
struct phyloTree *prunedKids = NULL;
int i;
for (i = 0; i < node->numEdges; i++)
{
@@ -1422,43 +1472,73 @@
slNameAddHead(&pSites[i], reason);
}
bigBedFileClose(&bbi);
}
return pSites;
}
static int subTreeInfoUserSampleCmp(const void *pa, const void *pb)
/* Compare subtreeInfo by number of user sample IDs (highest number first). */
{
struct subtreeInfo *tiA = *(struct subtreeInfo **)pa;
struct subtreeInfo *tiB = *(struct subtreeInfo **)pb;
return slCount(tiB->subtreeUserSampleIds) - slCount(tiA->subtreeUserSampleIds);
}
-char *phyloPlaceSamples(struct lineFile *lf, char *db, boolean doMeasureTiming, int subtreeSize,
- int fontHeight)
+char *phyloPlaceSamples(struct lineFile *lf, char *db, char *defaultProtobuf,
+ boolean doMeasureTiming, int subtreeSize, int fontHeight)
/* Given a lineFile that contains either FASTA or VCF, prepare VCF for usher;
* if that goes well then run usher, report results, make custom track files
* and return the top-level custom track file; otherwise return NULL. */
{
char *ctFile = NULL;
measureTiming = doMeasureTiming;
int startTime = clock1000();
struct tempName *vcfTn = NULL;
struct slName *sampleIds = NULL;
char *usherPath = getUsherPath(TRUE);
-char *usherAssignmentsPath = getUsherAssignmentsPath(db, TRUE);
+char *usherAssignmentsPath = NULL;
+char *source = NULL;
+char *metadataFile = NULL;
+struct treeChoices *treeChoices = loadTreeChoices(db);
+if (treeChoices)
+ {
+ usherAssignmentsPath = defaultProtobuf;
+ if (isEmpty(usherAssignmentsPath))
+ usherAssignmentsPath = treeChoices->protobufFiles[0];
+ int i;
+ for (i = 0; i < treeChoices->count; i++)
+ if (sameString(treeChoices->protobufFiles[i], usherAssignmentsPath))
+ {
+ metadataFile = treeChoices->metadataFiles[i];
+ source = treeChoices->sources[i];
+ break;
+ }
+ if (i == treeChoices->count)
+ {
+ usherAssignmentsPath = treeChoices->protobufFiles[0];
+ metadataFile = treeChoices->metadataFiles[0];
+ source = treeChoices->sources[0];
+ }
+ }
+else
+ {
+ // Fall back on old settings
+ usherAssignmentsPath = getUsherAssignmentsPath(db, TRUE);
+ metadataFile = phyloPlaceDbSettingPath(db, "metadataFile");
+ source = "GISAID";
+ }
struct mutationAnnotatedTree *bigTree = parseParsimonyProtobuf(usherAssignmentsPath);
reportTiming(&startTime, "parse protobuf file");
if (! bigTree)
{
warn("Problem parsing %s; can't make subtree subtracks.", usherAssignmentsPath);
}
lineFileCarefulNewlines(lf);
struct slName **maskSites = getProblematicSites(db);
struct dnaSeq *refGenome = hChromSeq(db, chrom, 0, chromSize);
boolean isFasta = FALSE;
struct seqInfo *seqInfoList = NULL;
if (lfLooksLikeFasta(lf))
{
boolean *informativeBases = informativeBasesFromTree(bigTree->tree, maskSites);
struct slPair *failedSeqs;
@@ -1499,66 +1579,65 @@
warn("Sorry, can't recognize your uploaded data as FASTA or VCF.\n");
}
lineFileClose(&lf);
if (vcfTn)
{
struct usherResults *results = runUsher(usherPath, usherAssignmentsPath, vcfTn->forCgi,
subtreeSize, sampleIds, bigTree->condensedNodes,
&startTime);
if (results->subtreeInfoList)
{
int subtreeCount = slCount(results->subtreeInfoList);
// Sort subtrees by number of user samples (largest first).
slSort(&results->subtreeInfoList, subTreeInfoUserSampleCmp);
// Make Nextstrain/auspice JSON file for each subtree.
char *bigGenePredFile = phyloPlaceDbSettingPath(db, "bigGenePredFile");
- char *metadataFile = phyloPlaceDbSettingPath(db, "metadataFile");
struct hash *sampleMetadata = getSampleMetadata(metadataFile);
struct tempName *jsonTns[subtreeCount];
struct subtreeInfo *ti;
int ix;
for (ix = 0, ti = results->subtreeInfoList; ti != NULL; ti = ti->next, ix++)
{
AllocVar(jsonTns[ix]);
trashDirFile(jsonTns[ix], "ct", "subtreeAuspice", ".json");
treeToAuspiceJson(ti, db, refGenome, bigGenePredFile, sampleMetadata,
- jsonTns[ix]->forCgi);
+ jsonTns[ix]->forCgi, source);
}
puts("<p></p>");
makeButtonRow(jsonTns, subtreeCount, isFasta);
printf("<p>If you have metadata you wish to display, click a 'view subtree in Nextstrain' "
"button, and then you can drag on a CSV file to "
"<a href='"NEXTSTRAIN_DRAG_DROP_DOC"' target=_blank>add it to the tree view</a>."
"</p>\n");
summarizeSequences(seqInfoList, isFasta, results, jsonTns, sampleMetadata, bigTree);
reportTiming(&startTime, "write summary table (including reading in lineages)");
for (ix = 0, ti = results->subtreeInfoList; ti != NULL; ti = ti->next, ix++)
{
int subtreeUserSampleCount = slCount(ti->subtreeUserSampleIds);
printf("<h3>Subtree %d: ", ix+1);
if (subtreeUserSampleCount > 1)
printf("%d related samples", subtreeUserSampleCount);
else if (subtreeCount > 1)
printf("Unrelated sample");
printf("</h3>\n");
makeNextstrainButton("viewNextstrainSub", ix, jsonTns);
puts("<br>");
// Make a sub-subtree with only user samples for display:
struct phyloTree *subtree = phyloOpenTree(ti->subtreeTn->forCgi);
subtree = phyloPruneToIds(subtree, ti->subtreeUserSampleIds);
describeSamplePlacements(ti->subtreeUserSampleIds, results->samplePlacements, subtree,
- sampleMetadata, bigTree);
+ sampleMetadata, bigTree, source);
}
reportTiming(&startTime, "describe placements");
// Make custom tracks for uploaded samples and subtree(s).
struct tempName *ctTn = writeCustomTracks(vcfTn, results, sampleIds, bigTree->tree,
fontHeight, &startTime);
// Offer big tree w/new samples for download
puts("<h3>Downloads</h3>");
puts("<ul>");
printf("<li><a href='%s' download>SARS-CoV-2 phylogenetic tree "
"with your samples (Newick file)</a>\n", results->bigTreePlusTn->forHtml);
for (ix = 0, ti = results->subtreeInfoList; ti != NULL; ti = ti->next, ix++)
{
printf("<li><a href='%s' download>Subtree with %s", ti->subtreeTn->forHtml,