src/hg/hgPhyloPlace/phyloPlace.c f22f0b5f1d467e570dad2407813748889408d9bd

f22f0b5f1d467e570dad2407813748889408d9bd
angie
  Tue Sep 14 09:38:11 2021 -0700
Don't rely on outputs printed only when usher is compiled with -DDEBUG=1; those are going away in server mode.  We already weren't showing the 'Best nodes' output, so just remove that code.  We already have sample mutations in seqInfo->sncList, so just reformat that into placementInfo->sampleMuts.

diff --git src/hg/hgPhyloPlace/phyloPlace.c src/hg/hgPhyloPlace/phyloPlace.c
index a00a4c0..d583c05 100644
--- src/hg/hgPhyloPlace/phyloPlace.c
+++ src/hg/hgPhyloPlace/phyloPlace.c
@@ -24,31 +24,30 @@
 #include "psl.h"
 #include "ra.h"
 #include "regexHelper.h"
 #include "trashDir.h"
 #include "vcf.h"
 
 // Globals:
 static boolean measureTiming = FALSE;
 
 // wuhCor1-specific:
 char *chrom = "NC_045512v2";
 int chromSize = 29903;
 
 // Parameter constants:
 int maxGenotypes = 1000;        // Upper limit on number of samples user can upload at once.
-boolean showBestNodePaths = FALSE;
 boolean showParsimonyScore = FALSE;
 
 
 char *phyloPlaceDbSetting(char *db, char *settingName)
 /* Return a setting from hgPhyloPlaceData/<db>/config.ra or NULL if not found. */
 {
 static struct hash *configHash = NULL;
 static char *configDb = NULL;
 if (!sameOk(db, configDb))
     {
     char configFile[1024];
     safef(configFile, sizeof configFile, PHYLOPLACE_DATA_DIR "/%s/config.ra", db);
     if (fileExists(configFile))
         {
         configHash = raReadSingle(configFile);
@@ -451,30 +450,59 @@
 if (errCatch->gotError)
     {
     warn("%s", errCatch->message->string);
     unlink(tn->forCgi);
     freez(&tn);
     }
 errCatchFree(&errCatch); 
 struct seqInfo *si;
 for (si = seqInfoList;  si != NULL;  si = si->next)
     slReverse(&si->sncList);
 *retSeqInfoList = seqInfoList;
 *retSampleIds = sampleIds;
 return tn;
 }
 
+static void addSampleMutsFromSeqInfo(struct hash *samplePlacements, struct hash *seqInfoHash)
+/* We used to get placementInfo->sampleMuts from DEBUG mode usher output in runUsher.c, but
+ * the DEBUG mode output went away with the change to server-mode usher.  Instead, now we fill
+ * it in from seqInfo->sncList which has the same info. */
+{
+struct hashEl *hel;
+struct hashCookie cookie = hashFirst(samplePlacements);
+while ((hel = hashNext(&cookie)) != NULL)
+    {
+    struct placementInfo *pi = hel->val;
+    struct seqInfo *si = hashFindVal(seqInfoHash, pi->sampleId);
+    if (si)
+        {
+        struct slName *sampleMuts = NULL;
+        struct singleNucChange *snc;
+        for (snc = si->sncList;  snc != NULL;  snc = snc->next)
+            {
+            char mutStr[64];
+            safef(mutStr, sizeof mutStr, "%c%d%c", snc->refBase, snc->chromStart+1, snc->newBase);
+            slNameAddHead(&sampleMuts, mutStr);
+            }
+        slReverse(&sampleMuts);
+        pi->sampleMuts = sampleMuts;
+        }
+    else
+        errAbort("addSampleMutsFromSeqInfo: no seqInfo for placed sequence '%s'", pi->sampleId);
+    }
+}
+
 static void displaySampleMuts(struct placementInfo *info)
 {
 printf("<p>Differences from the reference genome "
        "(<a href='https://www.ncbi.nlm.nih.gov/nuccore/NC_045512.2' target=_blank>"
        "NC_045512.2</a>): ");
 if (info->sampleMuts == NULL)
     printf("(None; identical to reference)");
 else
     {
     struct slName *sln;
     for (sln = info->sampleMuts;  sln != NULL;  sln = sln->next)
         {
         if (sln != info->sampleMuts)
             printf(", ");
         printf("%s", sln->name);
@@ -869,65 +897,30 @@
     }
 }
 
 static void displayNearestNeighbors(struct placementInfo *info, char *source)
 /* Use info->variantPaths to find sample's nearest neighbor(s) in tree. */
 {
 if (isNotEmpty(info->nearestNeighbor))
     {
     printf("<p>Nearest neighboring %s sequence already in phylogenetic tree: %s",
            source, info->nearestNeighbor);
     printLineageUrl(info->neighborLineage);
     puts("</p>");
     }
 }
 
-static void displayBestNodes(struct placementInfo *info, struct hash *sampleMetadata)
-/* Show the node(s) most closely related to sample. */
-{
-if (info->bestNodeCount == 1)
-    printf("<p>The placement in the tree is unambiguous; "
-           "there are no other parsimony-optimal placements in the phylogenetic tree.</p>\n");
-else if (info->bestNodeCount > 1)
-    printf("<p>This placement is not the only parsimony-optimal placement in the tree; "
-           "%d other placements exist.</p>\n", info->bestNodeCount - 1);
-if (showBestNodePaths && info->bestNodes)
-    {
-    if (info->bestNodeCount != slCount(info->bestNodes))
-        errAbort("Inconsistent bestNodeCount (%d) and number of bestNodes (%d)",
-                 info->bestNodeCount, slCount(info->bestNodes));
-    if (info->bestNodeCount > 1)
-        printf("<ul><li><b>used for placement</b>: ");
-    if (differentString(info->bestNodes->name, "?") && !isInternalNodeName(info->bestNodes->name, 0))
-        printf("%s ", info->bestNodes->name);
-    printVariantPathNoNodeNames(stdout, info->bestNodes->variantPath);
-    struct bestNodeInfo *bn;
-    for (bn = info->bestNodes->next;  bn != NULL;  bn = bn->next)
-        {
-        printf("\n<li>");
-        if (differentString(bn->name, "?") && !isInternalNodeName(bn->name, 0))
-            printf("%s ", bn->name);
-        printVariantPathNoNodeNames(stdout, bn->variantPath);
-        char *lineage = lineageForSample(sampleMetadata, bn->name);
-        printLineageUrl(lineage);
-        }
-    if (info->bestNodeCount > 1)
-        puts("</ul>");
-    puts("</p>");
-    }
-}
-
 static int placementInfoRefCmpSampleMuts(const void *va, const void *vb)
 /* Compare slRef->placementInfo->sampleMuts lists.  Shorter lists first.  Using alpha sort
  * to distinguish between different sampleMuts contents arbitrarily; the purpose is to
  * clump samples with identical lists. */
 {
 struct slRef * const *rra = va;
 struct slRef * const *rrb = vb;
 struct placementInfo *pa = (*rra)->val;
 struct placementInfo *pb = (*rrb)->val;
 int diff = slCount(pa->sampleMuts) - slCount(pb->sampleMuts);
 if (diff == 0)
     {
     struct slName *slnA, *slnB;
     for (slnA = pa->sampleMuts, slnB = pb->sampleMuts;  slnA != NULL;
          slnA = slnA->next, slnB = slnB->next)
@@ -1085,32 +1078,30 @@
         printf("<b>%s</b>\n", info->sampleId);
         ref = ref->next;
         }
     refsToGo = ref;
     displaySampleMuts(info);
     if (info->imputedBases)
         {
         puts("<p>Base values imputed by parsimony:\n<ul>");
         struct baseVal *bv;
         for (bv = info->imputedBases;  bv != NULL;  bv = bv->next)
             printf("<li>%d: %s\n", bv->chromStart+1, bv->val);
         puts("</ul>");
         puts("</p>");
         }
     displayVariantPath(info->variantPath, clumpSize == 1 ? info->sampleId : "samples");
-    displayBestNodes(info, sampleMetadata);
-    if (!showBestNodePaths)
     displayNearestNeighbors(info, source);
     if (showParsimonyScore && info->parsimonyScore > 0)
         printf("<p>Parsimony score added by your sample: %d</p>\n", info->parsimonyScore);
         //#*** TODO: explain parsimony score
     }
 }
 
 struct phyloTree *phyloPruneToIds(struct phyloTree *node, struct slName *sampleIds)
 /* Prune all descendants of node that have no leaf descendants in sampleIds. */
 {
 if (node->numEdges)
     {
     struct phyloTree *prunedKids = NULL;
     int i;
     for (i = 0;  i < node->numEdges;  i++)
@@ -2180,60 +2171,58 @@
     }
 }
 
 
 static struct hash *hashFromSeqInfoList(struct seqInfo *seqInfoList)
 /* Hash sequence name to seqInfo for quick lookup. */
 {
 struct hash *hash = hashNew(0);
 struct seqInfo *si;
 for (si = seqInfoList;  si != NULL;  si = si->next)
     hashAdd(hash, si->seq->name, si);
 return hash;
 }
 
 static struct tempName *writeTsvSummary(struct usherResults *results, struct phyloTree *sampleTree,
-                                        struct slName *sampleIds, struct seqInfo *seqInfoList,
+                                        struct slName *sampleIds, struct hash *seqInfoHash,
                                         struct geneInfo *geneInfoList, struct seqWindow *gSeqWin,
                                         struct hash *spikeChanges, int *pStartTime)
 /* Write a tab-separated summary file for download.  If the user uploaded enough samples to make
  * a tree, then write out samples in tree order; otherwise use sampleIds list.
  * Accumulate S gene changes. */
 {
 struct tempName *tsvTn = NULL;
 AllocVar(tsvTn);
 trashDirFile(tsvTn, "ct", "usher_samples", ".tsv");
 FILE *f = mustOpen(tsvTn->forCgi, "w");
 fprintf(f, "name\tnuc_mutations\taa_mutations\timputed_bases\tmutation_path"
         "\tplacement_count\tparsimony_score_increase\tlength\taligned_bases"
         "\tins_bases\tins_ranges\tdel_bases\tdel_ranges\tmasked_mutations"
         "\tnearest_neighbor\tneighbor_lineage\tnextstrain_clade\tpango_lineage"
         "\n");
-struct hash *seqInfoHash = hashFromSeqInfoList(seqInfoList);
 if (sampleTree)
     {
     rWriteTsvSummaryTreeOrder(sampleTree, f, results, seqInfoHash, geneInfoList, gSeqWin,
                               spikeChanges);
     }
 else
     {
     struct slName *sample;
     for (sample = sampleIds;  sample != NULL;  sample = sample->next)
         writeOneTsvRow(f, sample->name, results, seqInfoHash, geneInfoList, gSeqWin, spikeChanges);
     }
 carefulClose(&f);
-hashFree(&seqInfoHash);
 reportTiming(pStartTime, "write tsv summary");
 return tsvTn;
 }
 
 static struct tempName *writeSpikeChangeSummary(struct hash *spikeChanges, int totalSampleCount)
 /* Write a tab-separated summary of S (Spike) gene changes for download. */
 {
 struct tempName *tsvTn = NULL;
 AllocVar(tsvTn);
 trashDirFile(tsvTn, "ct", "usher_S_muts", ".tsv");
 FILE *f = mustOpen(tsvTn->forCgi, "w");
 fprintf(f, "aa_mutation\tsample_count\tsample_frequency\tsample_ids\tcategory"
         "\n");
 struct aaMutInfo *sChanges[spikeChanges->elCount];
 struct hashCookie cookie = hashFirst(spikeChanges);
@@ -2682,41 +2671,43 @@
         printf("<p>Sorry, could not align any sequences to reference well enough to place in "
                "the phylogenetic tree.</p>\n");
     isFasta = TRUE;
     }
 else if (lfLooksLikeVcf(lf))
     {
     vcfTn = checkAndSaveVcf(lf, refGenome, maskSites, &seqInfoList, &sampleIds);
     reportTiming(&startTime, "check uploaded VCF");
     }
 else
     {
     subtreesOnly = TRUE;
     sampleIds = readSampleIds(lf, bigTree, aliasFile);
     }
 lineFileClose(&lf);
+struct hash *seqInfoHash = hashFromSeqInfoList(seqInfoList);
 if (sampleIds == NULL)
     {
     return ctFile;
     }
 struct usherResults *results = NULL;
 if (vcfTn)
     {
     fflush(stdout);
     results = runUsher(usherPath, protobufPath, vcfTn->forCgi,
                        subtreeSize, sampleIds, bigTree->condensedNodes,
                        &startTime);
+    addSampleMutsFromSeqInfo(results->samplePlacements, seqInfoHash);
     }
 else if (subtreesOnly)
     {
     char *matUtilsPath = getMatUtilsPath(TRUE);
     results = runMatUtilsExtractSubtrees(matUtilsPath, protobufPath, subtreeSize,
                                          sampleIds, bigTree->condensedNodes,
                                          &startTime);
     }
 if (results && results->singleSubtreeInfo)
     {
     if (retSuccess)
         *retSuccess = TRUE;
     puts("<p></p>");
     readQcThresholds(db);
     int subtreeCount = slCount(results->subtreeInfoList);
@@ -2769,31 +2760,31 @@
         {
         summarizeSubtrees(sampleIds, results, sampleMetadata, jsonTns, bigTree);
         reportTiming(&startTime, "describe subtrees");
         }
     else
         {
         findNearestNeighbors(results->samplePlacements, sampleMetadata, bigTree);
 
         // Make custom tracks for uploaded samples and subtree(s).
         struct phyloTree *sampleTree = NULL;
         ctTn = writeCustomTracks(vcfTn, results, sampleIds, bigTree,
                                  source, fontHeight, &sampleTree, &startTime);
 
         // Make a sample summary TSV file and accumulate S gene changes
         struct hash *spikeChanges = hashNew(0);
-        tsvTn = writeTsvSummary(results, sampleTree, sampleIds, seqInfoList,
+        tsvTn = writeTsvSummary(results, sampleTree, sampleIds, seqInfoHash,
                                                  geneInfoList, gSeqWin, spikeChanges, &startTime);
         sTsvTn = writeSpikeChangeSummary(spikeChanges, slCount(sampleIds));
         downloadsRow(results->bigTreePlusTn->forHtml, tsvTn->forHtml, sTsvTn->forHtml,
                      zipTn->forHtml);
 
         int seqCount = slCount(seqInfoList);
         if (seqCount <= MAX_SEQ_DETAILS)
             {
             summarizeSequences(seqInfoList, isFasta, results, jsonTns, sampleMetadata, refGenome);
             reportTiming(&startTime, "write summary table (including reading in lineages)");
             for (ix = 0, ti = results->subtreeInfoList;  ti != NULL;  ti = ti->next, ix++)
                 {
                 int subtreeUserSampleCount = slCount(ti->subtreeUserSampleIds);
                 printf("<h3>Subtree %d: ", ix+1);
                 if (subtreeUserSampleCount > 1)