be4311c07e14feb728abc6425ee606ffaa611a58
markd
  Fri Jan 22 06:46:58 2021 -0800
merge with master

diff --git src/hg/hgPhyloPlace/phyloPlace.c src/hg/hgPhyloPlace/phyloPlace.c
index b2e5c13..2a44636 100644
--- src/hg/hgPhyloPlace/phyloPlace.c
+++ src/hg/hgPhyloPlace/phyloPlace.c
@@ -96,30 +96,80 @@
 else if (abortIfNotFound)
     errAbort("Missing required file %s", usherAssignmentsPath);
 return NULL;
 }
 
 //#*** This needs to go in a lib so CGIs know whether to include it in the menu. needs better name.
 boolean hgPhyloPlaceEnabled()
 /* Return TRUE if hgPhyloPlace is enabled in hg.conf and db wuhCor1 exists. */
 {
 char *cfgSetting = cfgOption("hgPhyloPlaceEnabled");
 boolean isEnabled = (isNotEmpty(cfgSetting) &&
                      differentWord(cfgSetting, "off") && differentWord(cfgSetting, "no"));
 return (isEnabled && hDbExists("wuhCor1"));
 }
 
+static void addPathIfNecessary(struct dyString *dy, char *db, char *fileName)
+/* If fileName exists, copy it into dy, else try hgPhyloPlaceData/<db>/fileName */
+{
+dyStringClear(dy);
+if (fileExists(fileName))
+    dyStringAppend(dy, fileName);
+else
+    dyStringPrintf(dy, PHYLOPLACE_DATA_DIR "/%s/%s", db, fileName);
+}
+
+struct treeChoices *loadTreeChoices(char *db)
+/* If <db>/config.ra specifies a treeChoices file, load it up, else return NULL. */
+{
+struct treeChoices *treeChoices = NULL;
+char *filename = phyloPlaceDbSettingPath(db, "treeChoices");
+if (isNotEmpty(filename) && fileExists(filename))
+    {
+    AllocVar(treeChoices);
+    int maxChoices = 128;
+    AllocArray(treeChoices->protobufFiles, maxChoices);
+    AllocArray(treeChoices->metadataFiles, maxChoices);
+    AllocArray(treeChoices->sources, maxChoices);
+    AllocArray(treeChoices->descriptions, maxChoices);
+    struct lineFile *lf = lineFileOpen(filename, TRUE);
+    char *line;
+    while (lineFileNextReal(lf, &line))
+        {
+        char *words[5];
+        int wordCount = chopTabs(line, words);
+        lineFileExpectWords(lf, 4, wordCount);
+        if (treeChoices->count >= maxChoices)
+            {
+            warn("File %s has too many lines, only showing first %d phylogenetic tree choices",
+                 filename, maxChoices);
+            break;
+            }
+        struct dyString *dy = dyStringNew(0);
+        addPathIfNecessary(dy, db, words[0]);
+        treeChoices->protobufFiles[treeChoices->count] = cloneString(dy->string);
+        addPathIfNecessary(dy, db, words[1]);
+        treeChoices->metadataFiles[treeChoices->count] = dyStringCannibalize(&dy);
+        treeChoices->sources[treeChoices->count] = cloneString(words[2]);
+        treeChoices->descriptions[treeChoices->count] = cloneString(words[3]);
+        treeChoices->count++;
+        }
+    lineFileClose(&lf);
+    }
+return treeChoices;
+}
+
 static char *urlFromTn(struct tempName *tn)
 /* Make a full URL to a trash file that our net.c code will be able to follow, for when we can't
  * just leave it up to the user's web browser to do the right thing with "../". */
 {
 struct dyString *dy = dyStringCreate("%s%s", hLocalHostCgiBinUrl(), tn->forHtml);
 return dyStringCannibalize(&dy);
 }
 
 void reportTiming(int *pStartTime, char *message)
 /* Print out a report to stderr of how much time something took. */
 {
 if (measureTiming)
     {
     int now = clock1000();
     fprintf(stderr, "%dms to %s\n", now - *pStartTime, message);
@@ -712,37 +762,37 @@
         met = hashFindVal(sampleMetadata, words[1]);
     }
 return met;
 }
 
 static char *lineageForSample(struct hash *sampleMetadata, char *sampleId)
 /* Look up sampleId's lineage in epiToLineage file. Return NULL if we don't find a match. */
 {
 char *lineage = NULL;
 struct sampleMetadata *met = metadataForSample(sampleMetadata, sampleId);
 if (met)
     lineage = met->lineage;
 return lineage;
 }
 
-static void displayNearestNeighbors(struct mutationAnnotatedTree *bigTree,
+static void displayNearestNeighbors(struct mutationAnnotatedTree *bigTree, char *source,
                                     struct placementInfo *info, struct hash *sampleMetadata)
 /* Use info->variantPaths to find sample's nearest neighbor(s) in tree. */
 {
 if (bigTree)
     {
-    printf("<p>Nearest neighboring GISAID sequence already in phylogenetic tree: ");
+    printf("<p>Nearest neighboring %s sequence already in phylogenetic tree: ", source);
     struct slName *neighbors = findNearestNeighbor(bigTree, info->sampleId, info->variantPath);
     struct slName *neighbor;
     for (neighbor = neighbors;  neighbor != NULL;  neighbor = neighbor->next)
         {
         if (neighbor != neighbors)
             printf(", ");
         printf("%s", neighbor->name);
         char *lineage = lineageForSample(sampleMetadata, neighbor->name);
         if (isNotEmpty(lineage))
             printf(": lineage %s", lineage);
         }
     puts("</p>");
     }
 }
 
@@ -855,31 +905,31 @@
     else
         safecpy(indentForKids+indentLen - 4, 4 + 1, "|   ");
     }
 if (node->numEdges > 0)
     {
     char kidIndent[strlen(indent)+5];
     safef(kidIndent, sizeof kidIndent, "%s%s", indentForKids, "+---");
     int i;
     for (i = 0;  i < node->numEdges;  i++)
         asciiTree(node->edges[i], kidIndent, (i == node->numEdges - 1));
     }
 }
 
 static void describeSamplePlacements(struct slName *sampleIds, struct hash *samplePlacements,
                                      struct phyloTree *subtree, struct hash *sampleMetadata,
-                                     struct mutationAnnotatedTree *bigTree)
+                                     struct mutationAnnotatedTree *bigTree, char *source)
 /* Report how each sample fits into the big tree. */
 {
 // Sort sample placements by sampleMuts so we can group identical samples.
 struct slRef *placementRefs = getPlacementRefList(sampleIds, samplePlacements);
 slSort(&placementRefs, placementInfoRefCmpSampleMuts);
 int relatedCount = slCount(placementRefs);
 int clumpSize = countIdentical(placementRefs);
 if (clumpSize < relatedCount && relatedCount > 2)
     {
     // Not all of the related sequences are identical, so they will be broken down into
     // separate "clumps".  List all related samples first to avoid confusion.
     puts("<pre>");
     asciiTree(subtree, "", TRUE);
     puts("</pre>");
     }
@@ -912,31 +962,31 @@
         }
     refsToGo = ref;
     displaySampleMuts(info);
     if (info->imputedBases)
         {
         puts("<p>Base values imputed by parsimony:\n<ul>");
         struct baseVal *bv;
         for (bv = info->imputedBases;  bv != NULL;  bv = bv->next)
             printf("<li>%d: %s\n", bv->chromStart+1, bv->val);
         puts("</ul>");
         puts("</p>");
         }
     displayVariantPath(info->variantPath, clumpSize == 1 ? info->sampleId : "samples");
     displayBestNodes(info, sampleMetadata);
     if (!showBestNodePaths)
-        displayNearestNeighbors(bigTree, info, sampleMetadata);
+        displayNearestNeighbors(bigTree, source, info, sampleMetadata);
     if (showParsimonyScore && info->parsimonyScore > 0)
         printf("<p>Parsimony score added by your sample: %d</p>\n", info->parsimonyScore);
         //#*** TODO: explain parsimony score
     }
 }
 
 struct phyloTree *phyloPruneToIds(struct phyloTree *node, struct slName *sampleIds)
 /* Prune all descendants of node that have no leaf descendants in sampleIds. */
 {
 if (node->numEdges)
     {
     struct phyloTree *prunedKids = NULL;
     int i;
     for (i = 0;  i < node->numEdges;  i++)
         {
@@ -1422,43 +1472,73 @@
             slNameAddHead(&pSites[i], reason);
         }
     bigBedFileClose(&bbi);
     }
 return pSites;
 }
 
 static int subTreeInfoUserSampleCmp(const void *pa, const void *pb)
 /* Compare subtreeInfo by number of user sample IDs (highest number first). */
 {
 struct subtreeInfo *tiA = *(struct subtreeInfo **)pa;
 struct subtreeInfo *tiB = *(struct subtreeInfo **)pb;
 return slCount(tiB->subtreeUserSampleIds) - slCount(tiA->subtreeUserSampleIds);
 }
 
-char *phyloPlaceSamples(struct lineFile *lf, char *db, boolean doMeasureTiming, int subtreeSize,
-                        int fontHeight)
+char *phyloPlaceSamples(struct lineFile *lf, char *db, char *defaultProtobuf,
+                        boolean doMeasureTiming, int subtreeSize, int fontHeight)
 /* Given a lineFile that contains either FASTA or VCF, prepare VCF for usher;
  * if that goes well then run usher, report results, make custom track files
  * and return the top-level custom track file; otherwise return NULL. */
 {
 char *ctFile = NULL;
 measureTiming = doMeasureTiming;
 int startTime = clock1000();
 struct tempName *vcfTn = NULL;
 struct slName *sampleIds = NULL;
 char *usherPath = getUsherPath(TRUE);
-char *usherAssignmentsPath = getUsherAssignmentsPath(db, TRUE);
+char *usherAssignmentsPath = NULL;
+char *source = NULL;
+char *metadataFile = NULL;
+struct treeChoices *treeChoices = loadTreeChoices(db);
+if (treeChoices)
+    {
+    usherAssignmentsPath = defaultProtobuf;
+    if (isEmpty(usherAssignmentsPath))
+        usherAssignmentsPath = treeChoices->protobufFiles[0];
+    int i;
+    for (i = 0;  i < treeChoices->count;  i++)
+        if (sameString(treeChoices->protobufFiles[i], usherAssignmentsPath))
+            {
+            metadataFile = treeChoices->metadataFiles[i];
+            source = treeChoices->sources[i];
+            break;
+            }
+    if (i == treeChoices->count)
+        {
+        usherAssignmentsPath = treeChoices->protobufFiles[0];
+        metadataFile = treeChoices->metadataFiles[0];
+        source = treeChoices->sources[0];
+        }
+    }
+else
+    {
+    // Fall back on old settings
+    usherAssignmentsPath = getUsherAssignmentsPath(db, TRUE);
+    metadataFile = phyloPlaceDbSettingPath(db, "metadataFile");
+    source = "GISAID";
+    }
 struct mutationAnnotatedTree *bigTree = parseParsimonyProtobuf(usherAssignmentsPath);
 reportTiming(&startTime, "parse protobuf file");
 if (! bigTree)
     {
     warn("Problem parsing %s; can't make subtree subtracks.", usherAssignmentsPath);
     }
 lineFileCarefulNewlines(lf);
 struct slName **maskSites = getProblematicSites(db);
 struct dnaSeq *refGenome = hChromSeq(db, chrom, 0, chromSize);
 boolean isFasta = FALSE;
 struct seqInfo *seqInfoList = NULL;
 if (lfLooksLikeFasta(lf))
     {
     boolean *informativeBases = informativeBasesFromTree(bigTree->tree, maskSites);
     struct slPair *failedSeqs;
@@ -1499,72 +1579,71 @@
         warn("Sorry, can't recognize your uploaded data as FASTA or VCF.\n");
     }
 lineFileClose(&lf);
 if (vcfTn)
     {
     struct usherResults *results = runUsher(usherPath, usherAssignmentsPath, vcfTn->forCgi,
                                             subtreeSize, sampleIds, bigTree->condensedNodes,
                                             &startTime);
     if (results->subtreeInfoList)
         {
         int subtreeCount = slCount(results->subtreeInfoList);
         // Sort subtrees by number of user samples (largest first).
         slSort(&results->subtreeInfoList, subTreeInfoUserSampleCmp);
         // Make Nextstrain/auspice JSON file for each subtree.
         char *bigGenePredFile = phyloPlaceDbSettingPath(db, "bigGenePredFile");
-        char *metadataFile = phyloPlaceDbSettingPath(db, "metadataFile");
         struct hash *sampleMetadata = getSampleMetadata(metadataFile);
         struct tempName *jsonTns[subtreeCount];
         struct subtreeInfo *ti;
         int ix;
         for (ix = 0, ti = results->subtreeInfoList;  ti != NULL;  ti = ti->next, ix++)
             {
             AllocVar(jsonTns[ix]);
             trashDirFile(jsonTns[ix], "ct", "subtreeAuspice", ".json");
             treeToAuspiceJson(ti, db, refGenome, bigGenePredFile, sampleMetadata,
-                              jsonTns[ix]->forCgi);
+                              jsonTns[ix]->forCgi, source);
             }
         puts("<p></p>");
         makeButtonRow(jsonTns, subtreeCount, isFasta);
         printf("<p>If you have metadata you wish to display, click a 'view subtree in Nextstrain' "
                "button, and then you can drag on a CSV file to "
                "<a href='"NEXTSTRAIN_DRAG_DROP_DOC"' target=_blank>add it to the tree view</a>."
                "</p>\n");
         summarizeSequences(seqInfoList, isFasta, results, jsonTns, sampleMetadata, bigTree);
         reportTiming(&startTime, "write summary table (including reading in lineages)");
         for (ix = 0, ti = results->subtreeInfoList;  ti != NULL;  ti = ti->next, ix++)
             {
             int subtreeUserSampleCount = slCount(ti->subtreeUserSampleIds);
             printf("<h3>Subtree %d: ", ix+1);
             if (subtreeUserSampleCount > 1)
                 printf("%d related samples", subtreeUserSampleCount);
             else if (subtreeCount > 1)
                 printf("Unrelated sample");
             printf("</h3>\n");
             makeNextstrainButton("viewNextstrainSub", ix, jsonTns);
             puts("<br>");
             // Make a sub-subtree with only user samples for display:
             struct phyloTree *subtree = phyloOpenTree(ti->subtreeTn->forCgi);
             subtree = phyloPruneToIds(subtree, ti->subtreeUserSampleIds);
             describeSamplePlacements(ti->subtreeUserSampleIds, results->samplePlacements, subtree,
-                                     sampleMetadata, bigTree);
+                                     sampleMetadata, bigTree, source);
             }
         reportTiming(&startTime, "describe placements");
 
         // Make custom tracks for uploaded samples and subtree(s).
         struct tempName *ctTn = writeCustomTracks(vcfTn, results, sampleIds, bigTree->tree,
-                                                  fontHeight, &startTime);
+                                                  source, fontHeight, &startTime);
                                
         // Offer big tree w/new samples for download
         puts("<h3>Downloads</h3>");
         puts("<ul>");
         printf("<li><a href='%s' download>SARS-CoV-2 phylogenetic tree "
                "with your samples (Newick file)</a>\n", results->bigTreePlusTn->forHtml);
         for (ix = 0, ti = results->subtreeInfoList;  ti != NULL;  ti = ti->next, ix++)
            {
             printf("<li><a href='%s' download>Subtree with %s", ti->subtreeTn->forHtml,
                    ti->subtreeUserSampleIds->name);
             struct slName *sln;
             for (sln = ti->subtreeUserSampleIds->next;  sln != NULL;  sln = sln->next)
                 printf(", %s", sln->name);
             puts(" (Newick file)</a>");
             printf("<li><a href='%s' download>Auspice JSON for subtree with %s",