b4460ce8ea85171693750dd217ba805870b0b93d
angie
  Thu Mar 21 08:54:51 2024 -0700
Branch attributes (used for coloring branches by majority lineage/clade) are now specified by a new config setting branchAttributes instead of being hardcoded.

diff --git src/hg/hgPhyloPlace/treeToAuspiceJson.c src/hg/hgPhyloPlace/treeToAuspiceJson.c
index 613fc83..1db6673 100644
--- src/hg/hgPhyloPlace/treeToAuspiceJson.c
+++ src/hg/hgPhyloPlace/treeToAuspiceJson.c
@@ -354,131 +354,140 @@
 
 static void makeLineageUrl(char *lineage, char *lineageUrl, size_t lineageUrlSize)
 /* If lineage is not "uploaded sample", make an outbreak.info link to it, otherwise just copy
  * lineage. */
 {
 if (sameString(lineage, "uploaded sample"))
     safecpy(lineageUrl, lineageUrlSize, lineage);
 else
     safef(lineageUrl, lineageUrlSize, OUTBREAK_INFO_URLBASE "%s", lineage);
 }
 
 static void jsonWriteLeafNodeAttributes(struct jsonWrite *jw, char *name,
                                         struct sampleMetadata *met, boolean isUserSample,
                                         char *source, struct hash *sampleUrls,
                                         struct hash *samplePlacements, boolean isRsv,
-                                        char **retUserOrOld, char **retNClade, char **retGClade,
-                                        char **retLineage, char **retNLineage,
-                                        char **retNCladeUsher, char **retLineageUsher)
+                                        int branchAttrCount, char **branchAttrCols,
+                                        char **branchAttrVals)
 /* Write elements of node_attrs for a sample which may be preexisting and in our metadata hash,
  * or may be a new sample from the user.  Set rets for color categories so parent branches can
  * determine their color categories. */
 {
-*retUserOrOld = isUserSample ? "uploaded sample" : source;
-jsonWriteObjectValue(jw, "userOrOld", *retUserOrOld);
-*retNClade = *retGClade = *retLineage = *retNLineage = *retNCladeUsher = *retLineageUsher = "";
+char *userOrOld = isUserSample ? "uploaded sample" : source;
+jsonWriteObjectValue(jw, "userOrOld", userOrOld);
+int i;
+for (i = 0;  i < branchAttrCount;  i++)
+    branchAttrVals[i] = "";
+if (branchAttrCount > 0 && sameString(branchAttrCols[0], "userOrOld"))
+    branchAttrVals[0] = userOrOld;
 if (met != NULL)
     {
     int i;
     for (i = 0;  i < met->columnCount;  i++)
         {
         char *colName = met->columnNames[i];
+        // Tweak old column name if found
         if (sameString(colName, "pangolin_lineage"))
-            {
             colName = "pango_lineage";
+        // Link out to outbreak.info for Pango lineages
+        if (startsWith("pango_lineage", colName))
+            {
             if (isNotEmpty(met->columnValues[i]))
                 {
                 char lineageUrl[1024];
                 makeLineageUrl(met->columnValues[i], lineageUrl, sizeof lineageUrl);
                 jsonWriteObjectValueUrl(jw, colName, met->columnValues[i], lineageUrl);
                 }
             else if (isNotEmpty(met->columnValues[i]))
                 jsonWriteObjectValue(jw, colName, met->columnValues[i]);
             }
         else
             jsonWriteObjectValue(jw, colName, met->columnValues[i]);
-        // Some columns get passed up for aggregation so we can color internal nodes/branches.
-        if (sameString(colName, "Nextstrain_clade") || sameString(colName, "goya_nextclade"))
-            *retNClade = met->columnValues[i];
-        else if (sameString(colName, "GISAID_clade") || sameString(colName, "GCC_assigned_2023-11"))
-            *retGClade = met->columnValues[i];
-        else if (sameString(colName, "pango_lineage") || sameString(colName, "GCC_nextclade"))
-            *retLineage = met->columnValues[i];
-        else if (sameString(colName, "Nextstrain_clade_usher") || sameString(colName, "goya_usher"))
-            *retNCladeUsher = met->columnValues[i];
-        else if (sameString(colName, "pango_lineage_usher") || sameString(colName, "GCC_usher"))
-            *retLineageUsher = met->columnValues[i];
+        // Some columns get passed upwards for aggregation so we can color internal nodes/branches.
+        int j;
+        for (j = 0;  j < branchAttrCount;  j++)
+            {
+            if (sameString(colName, branchAttrCols[j]))
+                {
+                branchAttrVals[j] = met->columnValues[i];
+                break;
+                }
+            }
         }
     }
 else if (isUserSample)
     {
     struct placementInfo *pi = name ? hashFindVal(samplePlacements, name) : NULL;
-    //#*** Really need to know what columns are present in the absence of met, so we can avoid
-    //#*** writing objects that shouldn't be there for this org.
-    *retNClade = *retGClade = *retLineage = *retNLineage = "uploaded sample";
-    jsonWriteObjectValue(jw, isRsv ? "goya_nextclade" : "Nextstrain_clade", "uploaded sample");
-    jsonWriteObjectValue(jw, isRsv ? "GCC_assigned_2023-11" : "GISAID_clade", "uploaded sample");
-    jsonWriteObjectValue(jw, isRsv ? "GCC_nextclade" : "pango_lineage", "uploaded sample");
-    jsonWriteObjectValue(jw, "Nextstrain_lineage", "uploaded sample");
-    *retNCladeUsher = (pi && pi->nextClade) ? pi->nextClade : "uploaded sample";
-    jsonWriteObjectValue(jw, (isRsv ? "goya_usher" : "Nextstrain_clade_usher"), *retNCladeUsher);
-    *retLineageUsher = (pi && pi->pangoLineage) ? pi->pangoLineage : "uploaded sample";
-    if (isRsv)
-        jsonWriteObjectValue(jw, "GCC_usher", *retLineageUsher);
-    else
+    int i;
+    for (i = 0;  i < branchAttrCount;  i++)
+        {
+        branchAttrVals[i] = "uploaded sample";
+        // Special cases for using placementInfo of user sample for _usher lineage/clade calls
+        // and outbreak.info link for Pango lineage
+        //#*** TODO: think of a way to make this config-driven
+        boolean wroteLink = FALSE;
+        if (pi)
             {
+            if (pi->nextClade && (sameString(branchAttrCols[i], "Nextstrain_clade_usher") ||
+                                  sameString(branchAttrCols[i], "goya_usher")))
+                branchAttrVals[i] = pi->nextClade;
+            else if (pi->pangoLineage)
+                {
+                if (sameString(branchAttrCols[i], "pango_lineage_usher"))
+                    {
+                    branchAttrVals[i] = pi->pangoLineage;
                     char lineageUrl[1024];
-        makeLineageUrl(*retLineageUsher, lineageUrl, sizeof lineageUrl);
-        jsonWriteObjectValueUrl(jw, "pango_lineage_usher", *retLineageUsher, lineageUrl);
+                    makeLineageUrl(pi->pangoLineage, lineageUrl, sizeof lineageUrl);
+                    jsonWriteObjectValueUrl(jw, branchAttrCols[i], branchAttrVals[i], lineageUrl);
+                    wroteLink = TRUE;
+                    }
+                else if (sameString(branchAttrCols[i], "GCC_usher"))
+                    branchAttrVals[i] = pi->pangoLineage;
+                }
+            }
+        if (!wroteLink)
+            jsonWriteObjectValue(jw, branchAttrCols[i], branchAttrVals[i]);
         }
     }
 char *sampleUrl = (sampleUrls && name) ? hashFindVal(sampleUrls, name) : NULL;
 if (isNotEmpty(sampleUrl))
     {
     char *p = strstr(sampleUrl, "subtreeAuspice");
     char *subtreeNum = p + strlen("subtreeAuspice");
     if (p && isdigit(*subtreeNum))
         {
         int num = atoi(subtreeNum);
         char subtreeLabel[1024];
         safef(subtreeLabel, sizeof subtreeLabel, "view subtree %d", num);
         jsonWriteObjectValueUrl(jw, "subtree", subtreeLabel, sampleUrl);
         }
     else
         jsonWriteObjectValueUrl(jw, "subtree", sampleUrl, sampleUrl);
     }
 }
 
-static void jsonWriteBranchNodeAttributes(struct jsonWrite *jw, boolean isRsv, char *userOrOld,
-                                          char *nClade, char *gClade, char *lineage, char *nLineage,
-                                          char *nCladeUsher, char *lineageUsher)
+static void jsonWriteBranchNodeAttributes(struct jsonWrite *jw, boolean isRsv,
+                                          int branchAttrCount, char **branchAttrCols,
+                                          char **branchAttrVals)
 /* Write elements of node_attrs for a branch. */
 {
-if (userOrOld)
-    jsonWriteObjectValue(jw, "userOrOld", userOrOld);
-if (nClade)
-    jsonWriteObjectValue(jw, (isRsv ? "goya_nextclade" : "Nextstrain_clade"), nClade);
-if (gClade)
-    jsonWriteObjectValue(jw, (isRsv ? "GCC_assigned_2023-11" : "GISAID_clade"), gClade);
-if (lineage)
-    jsonWriteObjectValue(jw, (isRsv ? "GCC_nextclade" : "pango_lineage"), lineage);
-if (nLineage)
-    jsonWriteObjectValue(jw, "Nextstrain_lineage", lineage);
-if (nCladeUsher)
-    jsonWriteObjectValue(jw, (isRsv ? "goya_usher" : "Nextstrain_clade_usher"), nCladeUsher);
-if (lineageUsher)
-    jsonWriteObjectValue(jw, (isRsv ? "GCC_usher" : "pango_lineage_usher"), lineageUsher);
+int i;
+for (i = 0;  i < branchAttrCount;  i++)
+    {
+    if (isNotEmpty(branchAttrVals[i]))
+        jsonWriteObjectValue(jw, branchAttrCols[i], branchAttrVals[i]);
+    }
 }
 
 INLINE char maybeComplement(char base, struct psl *psl)
 /* If psl is on '+' strand, return base, otherwise return the complement of base. */
 {
 return (pslOrientation(psl) > 0) ? base : ntCompTable[(int)base];
 }
 
 static struct slName *codonVpTxToAaChange(struct vpTx *codonVpTxList,
                                           struct singleNucChange *ancestorMuts,
                                           struct geneInfo *gi)
 /* Given a list of vpTx from the same codon, combine their changes with inherited mutations
  * in the same codon to get the amino acid change at this node.
  * Note: this assumes there is no UTR in transcript, only CDS.  True so far for pathogens... */
 {
@@ -725,104 +734,88 @@
         runLength++;
         if (runLength > maxRunLength)
             {
             maxRunLength = runLength;
             maxRunVal = array[i];
             }
         }
     else
         runLength = 1;
     }
 return (maxRunLength > (arraySize >> 1)) ? maxRunVal : NULL;
 }
 
 static void rTreeToAuspiceJson(struct phyloTree *node, int depth, struct auspiceJsonInfo *aji,
                                struct singleNucChange *ancestorMuts, boolean isRsv,
-                               char **retUserOrOld, char **retNClade, char **retGClade,
-                               char **retLineage, char **retNLineage,
-                               char **retNCladeUsher, char **retLineageUsher)
+                               int branchAttrCount, char **branchAttrCols, char **branchAttrVals)
 /* Write Augur/Auspice V2 JSON for tree.  Enclosing object start and end are written by caller. */
 {
 struct singleNucChange *sncList = node->priv;
 if (sncList)
     {
     depth += slCount(sncList);
     }
 boolean isUserSample = FALSE;
 if (node->ident->name)
     isUserSample = slNameInList(aji->subtreeUserSampleIds, node->ident->name);
 char *name = node->ident->name;
 struct sampleMetadata *met = name ? metadataForSample(aji->sampleMetadata, name) : NULL;
 if (name)
     jsonWriteString(aji->jw, "name", name);
 else
     jsonWriteStringf(aji->jw, "name", "NODE%d", aji->nodeNum++);
 jsonWriteBranchAttrs(aji->jw, node, ancestorMuts, aji->geneInfoList, aji->gSeqWin);
 if (node->numEdges > 0)
     {
     struct singleNucChange *allMuts = ancestorMuts;
     struct singleNucChange *ancLast = slLastEl(ancestorMuts);
     if (ancLast != NULL)
         ancLast->next = sncList;
     else
         allMuts = sncList;
     jsonWriteListStart(aji->jw, "children");
-    char *kidUserOrOld[node->numEdges];
-    char *kidNClade[node->numEdges];
-    char *kidGClade[node->numEdges];
-    char *kidLineage[node->numEdges];
-    char *kidNCladeUsher[node->numEdges];
-    char *kidLineageUsher[node->numEdges];
-    char *kidNLineage[node->numEdges];
+    char *kidAttrVals[branchAttrCount][node->numEdges];
     // Step through children in reverse order because nextstrain/Auspice draws upside-down. :)
     int i;
     for (i = node->numEdges - 1;  i >= 0;  i--)
         {
+        char *kidNodeAttrVals[branchAttrCount];
         jsonWriteObjectStart(aji->jw, NULL);
         rTreeToAuspiceJson(node->edges[i], depth, aji, allMuts, isRsv,
-                           &kidUserOrOld[i], &kidNClade[i], &kidGClade[i], &kidLineage[i],
-                           &kidNLineage[i], &kidNCladeUsher[i], &kidLineageUsher[i]);
+                           branchAttrCount, branchAttrCols, kidNodeAttrVals);
         jsonWriteObjectEnd(aji->jw);
+        int j;
+        for (j = 0;  j < branchAttrCount;  j++)
+            kidAttrVals[j][i] = kidNodeAttrVals[j];
         }
     jsonWriteListEnd(aji->jw);
-    if (retUserOrOld)
-        *retUserOrOld = majorityMaybe(kidUserOrOld, node->numEdges);
-    if (retNClade)
-        *retNClade = majorityMaybe(kidNClade, node->numEdges);
-    if (retGClade)
-        *retGClade = majorityMaybe(kidGClade, node->numEdges);
-    if (retLineage)
-        *retLineage = majorityMaybe(kidLineage, node->numEdges);
-    if (retNCladeUsher)
-        *retNCladeUsher = majorityMaybe(kidNCladeUsher, node->numEdges);
-    if (retLineageUsher)
-        *retLineageUsher = majorityMaybe(kidLineageUsher, node->numEdges);
-    if (retNLineage)
-        *retNLineage = majorityMaybe(kidNLineage, node->numEdges);
+    if (branchAttrVals)
+        {
+        for (i = 0;  i < branchAttrCount;  i++)
+            branchAttrVals[i] = majorityMaybe(kidAttrVals[i], node->numEdges);
+        }
     if (ancLast)
         ancLast->next = NULL;
     }
 jsonWriteObjectStart(aji->jw, "node_attrs");
 jsonWriteDouble(aji->jw, "div", depth);
 if (node->numEdges == 0)
     jsonWriteLeafNodeAttributes(aji->jw, name, met, isUserSample, aji->source, aji->sampleUrls,
                                 aji->samplePlacements, isRsv,
-                                retUserOrOld, retNClade, retGClade, retLineage, retNLineage,
-                                retNCladeUsher, retLineageUsher);
-else if (retUserOrOld && retGClade && retLineage)
-    jsonWriteBranchNodeAttributes(aji->jw, isRsv, *retUserOrOld, *retNClade, *retGClade, *retLineage,
-                                  *retNLineage, *retNCladeUsher, *retLineageUsher);
+                                branchAttrCount, branchAttrCols, branchAttrVals);
+else if (branchAttrVals)
+    jsonWriteBranchNodeAttributes(aji->jw, isRsv, branchAttrCount, branchAttrCols, branchAttrVals);
 jsonWriteObjectEnd(aji->jw);
 }
 
 struct phyloTree *phyloTreeNewNode(char *name)
 /* Alloc & return a new node with no children. */
 {
 struct phyloTree *node;
 AllocVar(node);
 AllocVar(node->ident);
 node->ident->name = cloneString(name);
 return node;
 }
 
 struct geneInfo *getGeneInfoList(char *bigGenePredFile, struct dnaSeq *refGenome)
 /* If config.ra has a source of gene annotations, then return the gene list. */
@@ -871,50 +864,76 @@
         genePredToCds((struct genePred *)gp, gi->cds);
         int cdsLen = gi->cds->end - gi->cds->start;
         // Skip genes with no CDS (like RNA genes) or obviously incomplete/incorrect CDS.
         if (cdsLen > 0 && (cdsLen % 3) == 0)
             {
             slAddHead(&geneInfoList, gi);
             }
         }
     lmCleanup(&lm);
     bigBedFileClose(&bbi);
     }
 slReverse(&geneInfoList);
 return geneInfoList;
 }
 
+static int getBranchAttrCols(char *db, char ***retBranchAttrCols)
+/* Alloc an array of metadata column names to use as branch attributes and return count.
+ * There will always be at least 1 (userOrOld / Sample type); others come from config setting. */
+{
+int branchAttrCount = 1;
+struct slName *attrList = NULL, *attr;
+char *branchAttrSetting = phyloPlaceDbSetting(db, "branchAttributes");
+if (isNotEmpty(branchAttrSetting))
+    {
+    attrList = slNameListFromComma(branchAttrSetting);
+    branchAttrCount += slCount(attrList);
+    }
+char **branchAttrCols = NULL;
+AllocArray(branchAttrCols, branchAttrCount);
+branchAttrCols[0] = cloneString("userOrOld");
+int i;
+for (i = 1, attr = attrList;  i < branchAttrCount && attr != NULL;  i++, attr = attr->next)
+    branchAttrCols[i] = cloneString(trimSpaces(attr->name));
+*retBranchAttrCols = branchAttrCols;
+return branchAttrCount;
+}
+
 void treeToAuspiceJson(struct subtreeInfo *sti, char *db, struct geneInfo *geneInfoList,
                        struct seqWindow *gSeqWin, struct hash *sampleMetadata,
                        struct hash *sampleUrls, struct hash *samplePlacements,
                        char *jsonFile, char *source)
 /* Write JSON for tree in Nextstrain's Augur/Auspice V2 JSON format
  * (https://github.com/nextstrain/augur/blob/master/augur/data/schema-export-v2.json). */
 {
 struct phyloTree *tree = sti->subtree;
 FILE *outF = mustOpen(jsonFile, "w");
 struct jsonWrite *jw = jsonWriteNew();
 jsonWriteObjectStart(jw, NULL);
 jsonWriteString(jw, "version", "v2");
 boolean isRsv = (stringIn("GCF_000855545", db) || stringIn("GCF_002815475", db) ||
                  startsWith("RGCC", db));
 boolean isFlu = (stringIn("GCF_000865085", db) || stringIn("GCF_001343785", db));
 writeAuspiceMeta(jw, sti->subtreeUserSampleIds, source, db, geneInfoList,
                  gSeqWin->end, isRsv, isFlu);
 jsonWriteObjectStart(jw, "tree");
 int nodeNum = 10000; // Auspice.us starting node number for newick -> json
 int depth = 0;
 
 // Add an extra root node because otherwise Auspice won't draw branch from big tree root to subtree
 struct phyloTree *root = phyloTreeNewNode("wrapper");
 phyloAddEdge(root, tree);
 tree = root;
 struct auspiceJsonInfo aji = { jw, sti->subtreeUserSampleIds, geneInfoList, gSeqWin,
                                sampleMetadata, sampleUrls, samplePlacements, nodeNum, source };
-rTreeToAuspiceJson(tree, depth, &aji, NULL, isRsv, NULL, NULL, NULL, NULL, NULL, NULL, NULL);
+
+
+char **branchAttrCols = NULL;
+int branchAttrCount = getBranchAttrCols(db, &branchAttrCols);
+rTreeToAuspiceJson(tree, depth, &aji, NULL, isRsv, branchAttrCount, branchAttrCols, NULL);
 jsonWriteObjectEnd(jw); // tree
 jsonWriteObjectEnd(jw); // top-level object
 fputs(jw->dy->string, outF);
 jsonWriteFree(&jw);
 carefulClose(&outF);
 }