ff94dc36ad4584d7ccdeeb545b2b92825735d225
angie
  Fri Dec 8 09:55:19 2023 -0800
If an auspiceMeta.json file is specified in config.ra then use it for organism-specific tree colors & filters.  Also use CDS coords when writing genome annotations in Auspice output; not all viral genes are CDS-only.

diff --git src/hg/hgPhyloPlace/treeToAuspiceJson.c src/hg/hgPhyloPlace/treeToAuspiceJson.c
index b5e945f..1b2d661 100644
--- src/hg/hgPhyloPlace/treeToAuspiceJson.c
+++ src/hg/hgPhyloPlace/treeToAuspiceJson.c
@@ -1,27 +1,28 @@
 /* Convert a (sub)tree with condensed nodes to JSON for Nextstrain to display, adding in sample
  * mutations, protein changes and metadata. */
 
 /* Copyright (C) 2020 The Regents of the University of California */
 
 #include "common.h"
 #include "dnaseq.h"
 #include "errCatch.h"
 #include "hash.h"
 #include "hui.h"
 #include "jsonWrite.h"
 #include "linefile.h"
+#include "obscure.h"
 #include "parsimonyProto.h"
 #include "phyloPlace.h"
 #include "phyloTree.h"
 #include "variantProjector.h"
 
 
 static void auspiceMetaColoringCategoricalStart(struct jsonWrite *jw, char *key, char *title)
 /* Write key, title and type of a "categorical" coloring spec, but leave it open in case a
  * scale list needs to be added. */
 {
 jsonWriteObjectStart(jw, NULL);
 jsonWriteString(jw, "key", key);
 jsonWriteString(jw, "title", title);
 jsonWriteString(jw, "type", "categorical");
 }
@@ -36,30 +37,37 @@
 /* Write a "categorical" coloring spec with no scale component. */
 {
 auspiceMetaColoringCategoricalStart(jw, key, title);
 auspiceMetaColoringCategoricalEnd(jw);
 }
 
 static void jsonWritePair(struct jsonWrite *jw, char *valA, char *valB)
 /* Write a list with two string values. */
 {
 jsonWriteListStart(jw, NULL);
 jsonWriteString(jw, NULL, valA);
 jsonWriteString(jw, NULL, valB);
 jsonWriteListEnd(jw);
 }
 
+static void jsonWriteAppendData(struct jsonWrite *jw, char *data)
+/* Append data to jw->dy, trusting that it will maintain well-formed JSON results. */
+{
+dyStringAppend(jw->dy, data);
+}
+
+
 static void auspiceMetaColoringSarsCov2Nextclade(struct jsonWrite *jw, char *key, char *title)
 /* Write a coloring spec for SARS-CoV-2 Nextstrain clades.  (Non-VoC clades are omitted and
  * will be assigned grayscale values by Auspice. */
 {
 auspiceMetaColoringCategoricalStart(jw, key, title);
 jsonWriteListStart(jw, "scale");
 // Color hex values are words from line N of
 // https://github.com/nextstrain/ncov/blob/master/defaults/color_schemes.tsv
 // where N = number of clade_membership lines in
 // https://github.com/nextstrain/ncov/blob/master/defaults/color_ordering.tsv
 jsonWritePair(jw, "20H (Beta, V2)", "#5E1D9D");
 jsonWritePair(jw, "20I (Alpha, V1)", "#492AB5");
 jsonWritePair(jw, "20J (Gamma, V3)", "#4042C7");
 jsonWritePair(jw, "21A (Delta)", "#3E5DD0");
 jsonWritePair(jw, "21I (Delta)", "#4377CD");
@@ -86,74 +94,120 @@
 static void writeAuspiceMetaGenomeAnnotations(struct jsonWrite *jw, struct geneInfo *geneInfoList,
                                               uint genomeSize)
 /* Write out auspice genome annotations (protein-coding gene CDS and "nuc"). */
 {
 jsonWriteObjectStart(jw, "genome_annotations");
 struct geneInfo *gi;
 for (gi = geneInfoList;  gi != NULL;  gi = gi->next)
     {
     jsonWriteObjectStart(jw, gi->psl->qName);
     if (gi->psl->blockCount > 1)
         {
         jsonWriteListStart(jw, "segments");
         int i;
         for (i = 0;  i < gi->psl->blockCount;  i++)
             {
+            int cStart = gi->psl->tStarts[i];
+            int cEnd = gi->psl->tStarts[i] + gi->psl->blockSizes[i];
+            if (cEnd < gi->cdsStart)
+                continue;
+            if (cStart > gi->cdsEnd)
+                break;
+            if (cStart < gi->cdsStart)
+                cStart = gi->cdsStart;
+            if (cEnd > gi->cdsEnd)
+                cEnd = gi->cdsEnd;
             jsonWriteObjectStart(jw, NULL);
-            jsonWriteNumber(jw, "start", gi->psl->tStarts[i]+1);
-            jsonWriteNumber(jw, "end", gi->psl->tStarts[i] + gi->psl->blockSizes[i]);
+            jsonWriteNumber(jw, "start", cStart+1);
+            jsonWriteNumber(jw, "end", cEnd);
             jsonWriteObjectEnd(jw);
             }
         jsonWriteListEnd(jw);
         }
     else
         {
-        jsonWriteNumber(jw, "start", gi->psl->tStart+1);
-        jsonWriteNumber(jw, "end", gi->psl->tEnd);
+        jsonWriteNumber(jw, "start", gi->cdsStart+1);
+        jsonWriteNumber(jw, "end", gi->cdsEnd);
         }
     jsonWriteString(jw, "strand", (pslOrientation(gi->psl) > 0) ? "+" : "-");
     jsonWriteString(jw, "type", "CDS");
     jsonWriteObjectEnd(jw);
     }
 jsonWriteObjectStart(jw, "nuc");
 jsonWriteNumber(jw, "start", 1);
 jsonWriteNumber(jw, "end", genomeSize);
 jsonWriteString(jw, "strand", "+");
 jsonWriteString(jw, "type", "source");
 jsonWriteObjectEnd(jw);
 jsonWriteObjectEnd(jw);
 }
 
+//#*** TODO: in v459 or later, remove the organism-specific code.
+static struct slName *getColorFields(char *db, boolean isRsv, boolean isFlu)
+{
+struct slName *colorFields = NULL;
+if (sameString(db, "wuhCor1"))
+    {
+    slNameAddHead(&colorFields, "country");
+    slNameAddHead(&colorFields, "Nextstrain_clade_usher");
+    slNameAddHead(&colorFields, "pango_lineage_usher");
+    slNameAddHead(&colorFields, "Nextstrain_clade");
+    slNameAddHead(&colorFields, "pango_lineage");
+    }
+else if (isRsv)
+    {
+    slNameAddHead(&colorFields, "country");
+    slNameAddHead(&colorFields, "GCC_nextclade");
+    slNameAddHead(&colorFields, "GCC_usher");
+    slNameAddHead(&colorFields, "GCC_assigned_2023-11");
+    slNameAddHead(&colorFields, "goya_nextclade");
+    slNameAddHead(&colorFields, "goya_usher");
+    }
+else if (isFlu)
+    {
+    slNameAddHead(&colorFields, "country");
+    slNameAddHead(&colorFields, "Nextstrain_clade");
+    }
+else
+    {
+    slNameAddHead(&colorFields, "country");
+    slNameAddHead(&colorFields, "Nextstrain_lineage");
+    }
+return colorFields;
+}
+
+//#*** TODO: in v459 or later, remove the organism-specific stuff
 static char *getDefaultColor(struct slName *colorFields)
 /* Pick default color from available color fields from metadata.  Do not free returned string. */
 {
 char *colorDefault = NULL;
 if (slNameInList(colorFields, "pango_lineage_usher"))
     colorDefault = "pango_lineage_usher";
 else if (slNameInList(colorFields, "Nextstrain_lineage"))
     colorDefault = "Nextstrain_lineage";
 else if (slNameInList(colorFields, "Nextstrain_clade"))
     colorDefault = "Nextstrain_clade";
 else if (slNameInList(colorFields, "GCC_usher"))
     colorDefault = "GCC_usher";
 else if (colorFields != NULL)
     colorDefault = colorFields->name;
 else
     colorDefault = "userOrOld";
 return colorDefault;
 }
 
+//#*** TODO: in v459 or later, remove the organism-specific stuff
 static void auspiceMetaColorings(struct jsonWrite *jw, char *source, struct slName *colorFields,
                                  char *db)
 /* Write coloring specs for colorFields from metadata, locally added userOrOld, and
  * Auspice-automatic gt. */
 {
 jsonWriteListStart(jw, "colorings");
 auspiceMetaColoringCategoricalStart(jw, "userOrOld", "Sample type");
 jsonWriteListStart(jw, "scale");
 jsonWritePair(jw, "uploaded sample", "#CC0000");
 jsonWritePair(jw, source, "#000000");
 jsonWriteListEnd(jw);
 auspiceMetaColoringCategoricalEnd(jw);
 auspiceMetaColoringCategorical(jw, "gt", "Genotype");
 struct slName *col;
 for (col = colorFields;  col != NULL;  col = col->next)
@@ -181,99 +235,113 @@
     else if (sameString(col->name, "GCC_nextclade"))
         auspiceMetaColoringCategorical(jw, col->name, "RGCC lineage assigned by nextclade");
     else if (sameString(col->name, "GCC_usher"))
         auspiceMetaColoringCategorical(jw, col->name, "RGCC lineage assigned by UShER");
     else if (sameString(col->name, "GCC_assigned_2023-11"))
         auspiceMetaColoringCategorical(jw, col->name, "RGCC designated lineage");
     else if (sameString(col->name, "country"))
         auspiceMetaColoringCategorical(jw, col->name, "Country");
     else
         auspiceMetaColoringCategorical(jw, col->name, col->name);
     }
 jsonWriteListEnd(jw);
 }
 
 static void writeAuspiceMeta(struct jsonWrite *jw, struct slName *subtreeUserSampleIds, char *source,
-                             char *db, struct slName *colorFields, struct geneInfo *geneInfoList,
+                             char *db, struct geneInfo *geneInfoList,
                              uint genomeSize, boolean isRsv, boolean isFlu)
 /* Write metadata to configure Auspice display. */
 {
 jsonWriteObjectStart(jw, "meta");
 // Title
 struct dyString *dy = dyStringCreate("Subtree with %s", subtreeUserSampleIds->name);
 int sampleCount = slCount(subtreeUserSampleIds);
 if (sampleCount > 10)
     dyStringPrintf(dy, " and %d other uploaded samples", sampleCount - 1);
 else
     {
     struct slName *sln;
     for (sln = subtreeUserSampleIds->next;  sln != NULL;  sln = sln->next)
         dyStringPrintf(dy, ", %s", sln->name);
     }
 jsonWriteString(jw, "title", dy->string);
 // Description
 jsonWriteStringf(jw, "description", "Dataset generated by [UShER web interface]"
                  "(%shgPhyloPlace) using the "
                  "[usher](https://github.com/yatisht/usher/) program.  "
 //#*** TODO: describe input from which tree was generated: user sample, version of tree, etc.
                  "If you have metadata you wish to display, you can now drag on a CSV file and "
                  "it will be added into this view, [see here]("NEXTSTRAIN_DRAG_DROP_DOC") "
                  "for more info."
                  , hLocalHostCgiBinUrl());
 // Panels: just the tree and  entropy (no map)
 jsonWriteListStart(jw, "panels");
 jsonWriteString(jw, NULL, "tree");
 jsonWriteString(jw, NULL, "entropy");
 jsonWriteListEnd(jw);
+char *metaJsonFile = phyloPlaceDbSettingPath(db, "auspiceMeta");
+if (isNotEmpty(metaJsonFile) && fileExists(metaJsonFile))
+    {
+    char *metaJson = NULL;
+    size_t size = 0;
+    readInGulp(metaJsonFile, &metaJson, &size);
+    while (size > 0 && metaJson[size-1] == '\n')
+        metaJson[--size] = '\0';
+    jsonWriteAppendData(jw, ", ");
+    jsonWriteAppendData(jw, metaJson);
+    freeMem(metaJson);
+    }
+else
+    {
     // Default label & color
+    struct slName *colorFields = getColorFields(db, isRsv, isFlu);
     jsonWriteObjectStart(jw, "display_defaults");
     jsonWriteString(jw, "branch_label", "aa mutations");
     jsonWriteString(jw, "color_by", getDefaultColor(colorFields));
     jsonWriteObjectEnd(jw);
     // Colorings: userOrOld, gt and whatever we got from metadata
     auspiceMetaColorings(jw, source, colorFields, db);
-// Filters didn't work when I tried them a long time ago... revisit someday.
+    // Filters
     jsonWriteListStart(jw, "filters");
     jsonWriteString(jw, NULL, "userOrOld");
     jsonWriteString(jw, NULL, "country");
-//#*** FIXME: TODO: either pass in along with sampleMetadata, or take from JSON file specified
-//#*** in config, or better yet, compute while building tree object in memory, then write the
-//#*** header object, then write the tree.
+    //#*** TODO: in v459 or later, remove the organism-specific stuff
     if (sameString(db, "wuhCor1"))
         {
         jsonWriteString(jw, NULL, "pango_lineage_usher");
         jsonWriteString(jw, NULL, "pango_lineage");
         jsonWriteString(jw, NULL, "Nextstrain_clade_usher");
         jsonWriteString(jw, NULL, "Nextstrain_clade");
         }
     else if (isRsv)
         {
         jsonWriteString(jw, NULL, "GCC_usher");
         jsonWriteString(jw, NULL, "GCC_nextclade");
         jsonWriteString(jw, NULL, "GCC_assigned_2023-11");
         jsonWriteString(jw, NULL, "goya_usher");
         jsonWriteString(jw, NULL, "goya_nextclade");
         }
     else if (isFlu)
         {
         jsonWriteString(jw, NULL, "Nextstrain_clade");
         }
     else
         {
         jsonWriteString(jw, NULL, "Nextstrain_lineage");
         }
     jsonWriteListEnd(jw);
+    }
 // Annotations for coloring/filtering by base
 writeAuspiceMetaGenomeAnnotations(jw, geneInfoList, genomeSize);
 jsonWriteObjectEnd(jw);
 }
 
 static void jsonWriteObjectValueUrl(struct jsonWrite *jw, char *name, char *value, char *url)
 /* Write an object with member "value" set to value, and if url is non-empty, "url" set to url. */
 {
 jsonWriteObjectStart(jw, name);
 jsonWriteString(jw, "value", value);
 if (isNotEmpty(url))
     jsonWriteString(jw, "url", url);
 jsonWriteObjectEnd(jw);
 }
 
@@ -793,94 +861,64 @@
         char *seq = needMem(txLen+1);
         int txOffset = 0;
         for (ex = 0;  ex < gp->exonCount;  ex++)
             {
             int exonLen = gp->exonEnds[ex] - gp->exonStarts[ex];
             safencpy(seq+txOffset, txLen+1-txOffset, refGenome->dna+gp->exonStarts[ex], exonLen);
             txOffset += exonLen;
             }
         if (sameString(gp->strand, "-"))
             reverseComplement(seq, txLen);
         struct geneInfo *gi;
         AllocVar(gi);
         gi->psl = genePredToPsl((struct genePred *)gp, refGenome->size, txLen);
         gi->psl->qName = cloneString(gp->name2);
         gi->txSeq = newDnaSeq(seq, txLen, gp->name2);
+        gi->cdsStart = gp->cdsStart;
+        gi->cdsEnd = gp->cdsEnd;
         AllocVar(gi->cds);
         genePredToCds((struct genePred *)gp, gi->cds);
         int cdsLen = gi->cds->end - gi->cds->start;
         // Skip genes with no CDS (like RNA genes) or obviously incomplete/incorrect CDS.
         if (cdsLen > 0 && (cdsLen % 3) == 0)
             {
             slAddHead(&geneInfoList, gi);
             }
         }
     lmCleanup(&lm);
     bigBedFileClose(&bbi);
     }
 slReverse(&geneInfoList);
 return geneInfoList;
 }
 
 void treeToAuspiceJson(struct subtreeInfo *sti, char *db, struct geneInfo *geneInfoList,
                        struct seqWindow *gSeqWin, struct hash *sampleMetadata,
                        struct hash *sampleUrls, struct hash *samplePlacements,
                        char *jsonFile, char *source)
 /* Write JSON for tree in Nextstrain's Augur/Auspice V2 JSON format
  * (https://github.com/nextstrain/augur/blob/master/augur/data/schema-export-v2.json). */
 {
 struct phyloTree *tree = sti->subtree;
 FILE *outF = mustOpen(jsonFile, "w");
 struct jsonWrite *jw = jsonWriteNew();
 jsonWriteObjectStart(jw, NULL);
 jsonWriteString(jw, "version", "v2");
-//#*** FIXME: TODO: either pass in along with sampleMetadata, or take from JSON file specified
-//#*** in config, or better yet, compute while building tree object in memory, then write the
-//#*** header object, then write the tree.
 boolean isRsv = (stringIn("GCF_000855545", db) || stringIn("GCF_002815475", db) ||
                  startsWith("RGCC", db));
 boolean isFlu = (stringIn("GCF_000865085", db) || stringIn("GCF_001343785", db));
-struct slName *colorFields = NULL;
-if (sameString(db, "wuhCor1"))
-    {
-    slNameAddHead(&colorFields, "country");
-    slNameAddHead(&colorFields, "Nextstrain_clade_usher");
-    slNameAddHead(&colorFields, "pango_lineage_usher");
-    slNameAddHead(&colorFields, "Nextstrain_clade");
-    slNameAddHead(&colorFields, "pango_lineage");
-    }
-else if (isRsv)
-    {
-    slNameAddHead(&colorFields, "country");
-    slNameAddHead(&colorFields, "GCC_nextclade");
-    slNameAddHead(&colorFields, "GCC_usher");
-    slNameAddHead(&colorFields, "GCC_assigned_2023-11");
-    slNameAddHead(&colorFields, "goya_nextclade");
-    slNameAddHead(&colorFields, "goya_usher");
-    }
-else if (isFlu)
-    {
-    slNameAddHead(&colorFields, "country");
-    slNameAddHead(&colorFields, "Nextstrain_clade");
-    }
-else
-    {
-    slNameAddHead(&colorFields, "country");
-    slNameAddHead(&colorFields, "Nextstrain_lineage");
-    }
-//#*** END FIXME
-writeAuspiceMeta(jw, sti->subtreeUserSampleIds, source, db, colorFields, geneInfoList,
+writeAuspiceMeta(jw, sti->subtreeUserSampleIds, source, db, geneInfoList,
                  gSeqWin->end, isRsv, isFlu);
 jsonWriteObjectStart(jw, "tree");
 int nodeNum = 10000; // Auspice.us starting node number for newick -> json
 int depth = 0;
 
 // Add an extra root node because otherwise Auspice won't draw branch from big tree root to subtree
 struct phyloTree *root = phyloTreeNewNode("wrapper");
 phyloAddEdge(root, tree);
 tree = root;
 struct auspiceJsonInfo aji = { jw, sti->subtreeUserSampleIds, geneInfoList, gSeqWin,
                                sampleMetadata, sampleUrls, samplePlacements, nodeNum, source };
 rTreeToAuspiceJson(tree, depth, &aji, NULL, isRsv, NULL, NULL, NULL, NULL, NULL, NULL, NULL);
 jsonWriteObjectEnd(jw); // tree
 jsonWriteObjectEnd(jw); // top-level object
 fputs(jw->dy->string, outF);