68f95eec388eb3895787fc885fa01fe7ee3447ac
angie
  Thu Aug 26 11:36:26 2021 -0700
Recently usher started prepending node_ to numeric internal node names stored in the protobuf, so node names in usher outputs are now node_<number> instead of just <number>.  However, the protobuf still has numeric names, so it may be necessary to strip the prefix when looking up bigTree nodes.

diff --git src/hg/hgPhyloPlace/phyloPlace.c src/hg/hgPhyloPlace/phyloPlace.c
index 2308f95..abe222f 100644
--- src/hg/hgPhyloPlace/phyloPlace.c
+++ src/hg/hgPhyloPlace/phyloPlace.c
@@ -470,73 +470,75 @@
 if (info->sampleMuts == NULL)
     printf("(None; identical to reference)");
 else
     {
     struct slName *sln;
     for (sln = info->sampleMuts;  sln != NULL;  sln = sln->next)
         {
         if (sln != info->sampleMuts)
             printf(", ");
         printf("%s", sln->name);
         }
     }
 puts("</p>");
 }
 
+boolean isInternalNodeName(char *nodeName, int minNewNode)
+/* Return TRUE if nodeName looks like an internal node ID from the protobuf tree, i.e. is numeric
+ * or <USHER_NODE_PREFIX>_<number> and, if minNewNode > 0, number is less than minNewNode. */
+{
+if (startsWith(USHER_NODE_PREFIX, nodeName))
+    nodeName += strlen(USHER_NODE_PREFIX);
+return isAllDigits(nodeName) && (minNewNode <= 0 || (atoi(nodeName) < minNewNode));
+}
+
 static void variantPathPrint(struct variantPathNode *variantPath)
 /* Print out a variantPath; print nodeName only if non-numeric
  * (i.e. a sample ID not internal node) */
 {
 struct variantPathNode *vpn;
 for (vpn = variantPath;  vpn != NULL;  vpn = vpn->next)
     {
     if (vpn != variantPath)
         printf(" > ");
-    if (!isAllDigits(vpn->nodeName))
+    if (!isInternalNodeName(vpn->nodeName, 0))
         printf("%s: ", vpn->nodeName);
     struct singleNucChange *snc;
     for (snc = vpn->sncList;  snc != NULL;  snc = snc->next)
         {
         if (snc != vpn->sncList)
             printf(", ");
         printf("%c%d%c", snc->parBase, snc->chromStart+1, snc->newBase);
         }
     }
 }
 
 static void displayVariantPath(struct variantPathNode *variantPath, char *sampleId)
 /* Display mutations on the path to this sample. */
 {
 printf("<p>Mutations along the path from the root of the phylogenetic tree to %s:<br>",
        sampleId);
 if (variantPath)
     {
     variantPathPrint(variantPath);
     puts("<br>");
     }
 else
     puts("(None; your sample was placed at the root of the phylogenetic tree)");
 puts("</p>");
 }
 
-static boolean isInternalNodeName(char *nodeName, int minNewNode)
-/* Return TRUE if nodeName looks like an internal node ID from the protobuf tree, i.e. is numeric
- * and less than minNewNode. */
-{
-return isAllDigits(nodeName) && (atoi(nodeName) < minNewNode);
-}
-
 static struct variantPathNode *findLastInternalNode(struct variantPathNode *variantPath,
                                                     int minNewNode)
 /* Return the last node in variantPath with a numeric name less than minNewNode, or NULL. */
 {
 if (!variantPath || !isInternalNodeName(variantPath->nodeName, minNewNode))
     return NULL;
 while (variantPath->next && isInternalNodeName(variantPath->next->nodeName, minNewNode))
     variantPath = variantPath->next;
 if (variantPath && isInternalNodeName(variantPath->nodeName, minNewNode))
     return variantPath;
 return NULL;
 }
 
 static int mutCountCmp(const void *a, const void *b)
 /* Compare number of mutations of phyloTree nodes a and b. */
@@ -589,31 +591,37 @@
 return leafName;
 }
 
 static char *findNearestNeighbor(struct mutationAnnotatedTree *bigTree, char *sampleId,
                                  struct variantPathNode *variantPath)
 /* Use the sequence of mutations in variantPath to find sampleId's parent node in bigTree,
  * then look for most similar leaf sibling(s). */
 {
 char *nearestNeighbor = NULL;
 int bigTreeINodeCount = phyloCountInternalNodes(bigTree->tree);
 int minNewNode = bigTreeINodeCount + 1; // 1-based
 struct variantPathNode *lastOldNode = findLastInternalNode(variantPath, minNewNode);
 struct phyloTree *node = lastOldNode ? hashFindVal(bigTree->nodeHash, lastOldNode->nodeName) :
                                        bigTree->tree;
 if (lastOldNode && !node)
-    errAbort("Can't find last internal node for sample %s", sampleId);
+    {
+    if (startsWith(USHER_NODE_PREFIX, lastOldNode->nodeName))
+        // protobuf still has number even if usher prepends USHER_NODE_PREFIX when parsing.
+        node = hashFindVal(bigTree->nodeHash, lastOldNode->nodeName+strlen(USHER_NODE_PREFIX));
+    if (node == NULL)
+        errAbort("Can't find last internal node %s for sample %s", lastOldNode->nodeName, sampleId);
+    }
 // Look for a leaf kid with no mutations relative to the parent, should be closest.
 if (node->numEdges == 0)
     {
     struct slName *nodeList = hashFindVal(bigTree->condensedNodes, node->ident->name);
     if (nodeList)
         nearestNeighbor = cloneString(nodeList->name);
     else
         nearestNeighbor = cloneString(node->ident->name);
     }
 else
     {
     nearestNeighbor = leafWithLeastMuts(node, bigTree);
     if (nearestNeighbor == NULL && node->parent != NULL)
         nearestNeighbor = leafWithLeastMuts(node->parent, bigTree);
     }
@@ -870,38 +878,38 @@
 /* Show the node(s) most closely related to sample. */
 {
 if (info->bestNodeCount == 1)
     printf("<p>The placement in the tree is unambiguous; "
            "there are no other parsimony-optimal placements in the phylogenetic tree.</p>\n");
 else if (info->bestNodeCount > 1)
     printf("<p>This placement is not the only parsimony-optimal placement in the tree; "
            "%d other placements exist.</p>\n", info->bestNodeCount - 1);
 if (showBestNodePaths && info->bestNodes)
     {
     if (info->bestNodeCount != slCount(info->bestNodes))
         errAbort("Inconsistent bestNodeCount (%d) and number of bestNodes (%d)",
                  info->bestNodeCount, slCount(info->bestNodes));
     if (info->bestNodeCount > 1)
         printf("<ul><li><b>used for placement</b>: ");
-    if (differentString(info->bestNodes->name, "?") && !isAllDigits(info->bestNodes->name))
+    if (differentString(info->bestNodes->name, "?") && !isInternalNodeName(info->bestNodes->name, 0))
         printf("%s ", info->bestNodes->name);
     printVariantPathNoNodeNames(stdout, info->bestNodes->variantPath);
     struct bestNodeInfo *bn;
     for (bn = info->bestNodes->next;  bn != NULL;  bn = bn->next)
         {
         printf("\n<li>");
-        if (differentString(bn->name, "?") && !isAllDigits(bn->name))
+        if (differentString(bn->name, "?") && !isInternalNodeName(bn->name, 0))
             printf("%s ", bn->name);
         printVariantPathNoNodeNames(stdout, bn->variantPath);
         char *lineage = lineageForSample(sampleMetadata, bn->name);
         printLineageUrl(lineage);
         }
     if (info->bestNodeCount > 1)
         puts("</ul>");
     puts("</p>");
     }
 }
 
 static int placementInfoRefCmpSampleMuts(const void *va, const void *vb)
 /* Compare slRef->placementInfo->sampleMuts lists.  Shorter lists first.  Using alpha sort
  * to distinguish between different sampleMuts contents arbitrarily; the purpose is to
  * clump samples with identical lists. */
@@ -951,31 +959,31 @@
     {
     clumpCount++;
     if (ref->next == NULL || placementInfoRefCmpSampleMuts(&ref, &ref->next))
         break;
     }
 return clumpCount;
 }
 
 static void asciiTree(struct phyloTree *node, char *indent, boolean isLast,
                       struct dyString *dyLine, struct slPair **pRowList)
 /* Until we can make a real graphic, at least print an ascii tree build up a (reversed) list of
  * lines so that we can add some text to the right later. */
 {
 if (isNotEmpty(indent) || isNotEmpty(node->ident->name))
     {
-    if (node->ident->name && !isAllDigits(node->ident->name))
+    if (node->ident->name && !isInternalNodeName(node->ident->name, 0))
         {
         dyStringPrintf(dyLine, "%s %s", indent, node->ident->name);
         slPairAdd(pRowList, node->ident->name, cloneString(dyLine->string));
         dyStringClear(dyLine);
         }
     }
 int indentLen = strlen(indent);
 char indentForKids[indentLen+1];
 safecpy(indentForKids, sizeof indentForKids, indent);
 if (indentLen >= 4)
     {
     if (isLast)
         safecpy(indentForKids+indentLen - 4, 4 + 1, "    ");
     else
         safecpy(indentForKids+indentLen - 4, 4 + 1, "|   ");
@@ -2128,30 +2136,32 @@
         struct singleNucChange *snc;
         for (snc = si->maskedSncList;  snc != NULL;  snc = snc->next)
             {
             if (snc != si->maskedSncList)
                 fputc(',', f);
             fprintf(f, "%c%d%c", snc->refBase, snc->chromStart+1, snc->newBase);
             }
         }
     else
         {
         warn("writeOneTsvRow: no sequenceInfo for sample '%s'", sampleId);
         fprintf(f, "\tn/a\tn/a\tn/a\tn/a\tn/a\tn/a\tn/a");
         }
     fprintf(f, "\t%s", isNotEmpty(info->nearestNeighbor) ? info->nearestNeighbor : "n/a");
     fprintf(f, "\t%s", isNotEmpty(info->neighborLineage) ? info->neighborLineage : "n/a");
+    fprintf(f, "\t%s", isNotEmpty(info->nextClade) ? info->nextClade : "n/a");
+    fprintf(f, "\t%s", isNotEmpty(info->pangoLineage) ? info->pangoLineage : "n/a");
     fputc('\n', f);
     }
 }
 
 static void rWriteTsvSummaryTreeOrder(struct phyloTree *node, FILE *f, struct usherResults *results,
                                       struct hash *seqInfoHash, struct geneInfo *geneInfoList,
                                       struct seqWindow *gSeqWin, struct hash *spikeChanges)
 /* As we encounter leaves (user-uploaded samples) in depth-first search order, write out a line
  * of TSV summary for each one. */
 {
 if (node->numEdges)
     {
     int i;
     for (i = 0;  i < node->numEdges;  i++)
         rWriteTsvSummaryTreeOrder(node->edges[i], f, results, seqInfoHash, geneInfoList, gSeqWin,
@@ -2177,31 +2187,31 @@
 static struct tempName *writeTsvSummary(struct usherResults *results, struct phyloTree *sampleTree,
                                         struct slName *sampleIds, struct seqInfo *seqInfoList,
                                         struct geneInfo *geneInfoList, struct seqWindow *gSeqWin,
                                         struct hash *spikeChanges, int *pStartTime)
 /* Write a tab-separated summary file for download.  If the user uploaded enough samples to make
  * a tree, then write out samples in tree order; otherwise use sampleIds list.
  * Accumulate S gene changes. */
 {
 struct tempName *tsvTn = NULL;
 AllocVar(tsvTn);
 trashDirFile(tsvTn, "ct", "usher_samples", ".tsv");
 FILE *f = mustOpen(tsvTn->forCgi, "w");
 fprintf(f, "name\tnuc_mutations\taa_mutations\timputed_bases\tmutation_path"
         "\tplacement_count\tparsimony_score_increase\tlength\taligned_bases"
         "\tins_bases\tins_ranges\tdel_bases\tdel_ranges\tmasked_mutations"
-        "\tnearest_neighbor\tneighbor_lineage"
+        "\tnearest_neighbor\tneighbor_lineage\tnextstrain_clade\tpango_lineage"
         "\n");
 struct hash *seqInfoHash = hashFromSeqInfoList(seqInfoList);
 if (sampleTree)
     {
     rWriteTsvSummaryTreeOrder(sampleTree, f, results, seqInfoHash, geneInfoList, gSeqWin,
                               spikeChanges);
     }
 else
     {
     struct slName *sample;
     for (sample = sampleIds;  sample != NULL;  sample = sample->next)
         writeOneTsvRow(f, sample->name, results, seqInfoHash, geneInfoList, gSeqWin, spikeChanges);
     }
 carefulClose(&f);
 hashFree(&seqInfoHash);
@@ -2375,31 +2385,31 @@
     {
     *p = '\0';
     hashAdd(nameHash, id, fullName);
     }
 }
 
 static void maybeAddIsolate(struct hash *nameHash, char *name, char *fullName)
 /* If name looks like country/isolate/year and isolate looks sufficiently distinctive
  * then also map isolate to fullName. */
 {
 regmatch_t substrs[2];
 if (regexMatchSubstr(name, "^[A-Za-z _]+/(.*)/[0-9]{4}$", substrs, ArraySize(substrs)))
     {
     char isolate[substrs[1].rm_eo - substrs[1].rm_so + 1];
     regexSubstringCopy(name, substrs[1], isolate, sizeof isolate);
-    if (! isAllDigits(isolate) &&
+    if (! isInternalNodeName(isolate, 0) &&
         (regexMatch(isolate, "[A-Za-z0-9]{4}") ||
          regexMatch(isolate, "[A-Za-z0-9][A-Za-z0-9]+[-_][A-Za-z0-9][A-Za-z0-9]+")))
         {
         hashAdd(nameHash, isolate, fullName);
         }
     }
 }
 static void addNameMaybeComponents(struct hash *nameHash, char *fullName, boolean addComponents)
 /* Add entries to nameHash mapping fullName to itself, and components of fullName to fullName.
  * If addComponents and fullName is |-separated name|ID|date, add name and ID to nameHash. */
 {
 char *fullNameHashStored = hashStoreName(nameHash, fullName);
 // Now that we have hash storage for fullName, make it point to itself.
 struct hashEl *hel = hashLookup(nameHash, fullName);
 if (hel == NULL)