e14c2761ca01d8175578df87bd5311b5c0442bfb
angie
  Thu Nov 19 13:04:01 2020 -0800
New CGI hgPhyloPlace: place SARS-CoV-2 genomes in fasta or VCF on phylogenetic tree using Yatish Turakhia's UShER program; add subtree custom tracks and Nextstrain linkouts to visualize results.  refs #25943

diff --git src/hg/hgPhyloPlace/treeToAuspiceJson.c src/hg/hgPhyloPlace/treeToAuspiceJson.c
new file mode 100644
index 0000000..9fb559d
--- /dev/null
+++ src/hg/hgPhyloPlace/treeToAuspiceJson.c
@@ -0,0 +1,555 @@
+/* Convert a (sub)tree with condensed nodes to JSON for Nextstrain to display, adding in sample
+ * mutations, protein changes and metadata. */
+
+/* Copyright (C) 2020 The Regents of the University of California */
+
+#include "common.h"
+#include "hash.h"
+#include "hui.h"
+#include "jsonWrite.h"
+#include "linefile.h"
+#include "parsimonyProto.h"
+#include "phyloPlace.h"
+#include "phyloTree.h"
+#include "variantProjector.h"
+
+// Globals
+extern char *chrom;
+extern int chromSize;
+
+static void writeAuspiceMeta(FILE *outF, struct slName *subtreeUserSampleIds)
+/* Write metadata to configure Auspice display. */
+{
+fprintf(outF,
+        "\"meta\": { "
+        "\"title\": \"Subtree with %s", subtreeUserSampleIds->name);
+struct slName *sln;
+for (sln = subtreeUserSampleIds->next;  sln != NULL;  sln = sln->next)
+    fprintf(outF, ", %s", sln->name);
+fputs("\", "
+      "\"panels\": [ \"tree\"] , "
+      "\"colorings\": [ "
+      "  { \"key\": \"pangolin_lineage\", "
+      "    \"title\": \"Pangolin lineage\", \"type\": \"categorical\" },"
+      "  { \"key\": \"GISAID_clade\","
+      "    \"scale\": [ [ \"S\", \"#EC676D\" ], [ \"L\", \"#F79E43\" ], [ \"O\", \"#F9D136\" ],"
+      "        [ \"V\", \"#FAEA95\" ], [ \"G\", \"#B6D77A\" ], [ \"GH\", \"#8FD4ED\" ],"
+      "        [ \"GR\", \"#A692C3\" ] ],"
+      "    \"title\": \"GISAID Clade\", \"type\": \"categorical\" },"
+      "  { \"key\": \"userOrOld\", "
+      "    \"scale\": [ [ \"uploaded sample\", \"#CC0000\"] , [ \"GISAID sample\", \"#000000\"] ],"
+      "    \"title\": \"Sample type\", \"type\": \"categorical\" }"
+      "  ] , "
+//#*** Filters didn't seem to work... maybe something about the new fetch feature, or do I need to spcify in some other way?
+//#***      "\"filters\": [ \"GISAID_clade\", \"region\", \"country\", \"division\", \"author\" ], "
+      "\"display_defaults\": { "
+      "  \"branch_label\": \"nuc mutations\" "
+      "}, "
+      , outF);
+fprintf(outF,
+        "\"description\": \"Dataset generated by [UShER web interface]"
+        "(%shgPhyloPlace) using the "
+        "[usher](https://github.com/yatisht/usher/) program."
+//#*** TODO: describe input from which tree was generated: user sample, version of tree, etc.
+        , hLocalHostCgiBinUrl());
+fputs("If you have metadata you wish to display, you can now drag on a CSV file and it will be "
+      "added into this view, [see here]"
+      "(https://nextstrain.github.io/auspice/advanced-functionality/drag-drop-csv-tsv) "
+      "for more info.\"} ,"
+      , outF);
+}
+
+struct sampleMetadata
+/* Information about a virus sample. */
+    {
+    char *strain;       // Strain name, usually of the form Country/ArbitraryId/YYYY-MM-DD
+    char *epiId;        // GISAID EPI_ISL_[0-9]+ ID
+    char *date;         // Sample collection date
+    char *author;       // Author(s) to credit
+    char *gClade;       // GISAID clade
+    char *lineage;      // Pangolin lineage
+    char *country;      // Country in which sample was collected
+    char *division;     // Administrative division in which sample was collected (country or state)
+    char *location;     // Location in which sample was collected (city)
+    char *countryExp;   // Country in which host was exposed to virus
+    char *divExp;       // Administrative division in which host was exposed to virus
+    char *origLab;      // Originating lab
+    char *subLab;       // Submitting lab
+    char *region;       // Continent on which sample was collected
+    };
+
+static void jsonWriteObjectValue(struct jsonWrite *jw, char *name, char *value)
+/* Write an object with one member, "value", set to value, as most Auspice node attributes are
+ * formatted. */
+{
+jsonWriteObjectStart(jw, name);
+jsonWriteString(jw, "value", value);
+jsonWriteObjectEnd(jw);
+}
+
+static void jsonWriteLeafNodeAttributes(struct jsonWrite *jw, char *db, char *name,
+                                        struct sampleMetadata *met, boolean isUserSample,
+                                        char **retUserOrOld, char **retGClade, char **retLineage)
+/* Write elements of node_attrs for a sample which may be preexisting and in our metadata hash,
+ * or may be a new sample from the user.  Set rets for color categories so parent branches can
+ * determine their color categories. */
+{
+*retUserOrOld = isUserSample ? "uploaded sample" : "GISAID sample";
+jsonWriteObjectValue(jw, "userOrOld", *retUserOrOld);
+if (met && met->date)
+    jsonWriteObjectValue(jw, "date", met->date);
+if (met && met->author)
+    {
+    jsonWriteObjectValue(jw, "author", met->author);
+    // Note: Nextstrain adds paper_url and title when available; they also add author and use
+    // a uniquified value (e.g. "author": "Wenjie Tan et al" / "value": "Wenjie Tan et al A")
+    }
+*retGClade = isUserSample ? "uploaded sample" : (met && met->gClade) ? met->gClade : NULL;
+if (isNotEmpty(*retGClade))
+    jsonWriteObjectValue(jw, "GISAID_clade", *retGClade);
+*retLineage = isUserSample ? "uploaded sample" :
+                               (met && met->lineage) ? met->lineage : lineageForSample(db, name);
+if (isNotEmpty(*retLineage))
+    jsonWriteObjectValue(jw, "pangolin_lineage", *retLineage);
+if (met && met->epiId)
+    jsonWriteObjectValue(jw, "gisaid_epi_isl", met->epiId);
+if (met && met->country)
+    jsonWriteObjectValue(jw, "country", met->country);
+if (met && met->division)
+    jsonWriteObjectValue(jw, "division", met->division);
+if (met && met->location)
+    jsonWriteObjectValue(jw, "location", met->location);
+if (met && met->countryExp)
+    jsonWriteObjectValue(jw, "country_exposure", met->countryExp);
+if (met && met->divExp)
+    jsonWriteObjectValue(jw, "division_exposure", met->divExp);
+if (met && met->origLab)
+    jsonWriteObjectValue(jw, "originating_lab", met->origLab);
+if (met && met->subLab)
+    jsonWriteObjectValue(jw, "submitting_lab", met->subLab);
+if (met && met->region)
+    jsonWriteObjectValue(jw, "region", met->region);
+}
+
+static void jsonWriteBranchNodeAttributes(struct jsonWrite *jw, char *userOrOld, char *gClade,
+                                          char *lineage)
+/* Write elements of node_attrs for a branch. */
+{
+if (userOrOld)
+    jsonWriteObjectValue(jw, "userOrOld", userOrOld);
+if (gClade)
+    jsonWriteObjectValue(jw, "GISAID_clade", gClade);
+if (lineage)
+    jsonWriteObjectValue(jw, "pangolin_lineage", lineage);
+}
+
+struct geneInfo
+/* Information sufficient to determine whether a genome change causes a coding change. */
+    {
+    struct geneInfo *next;
+    struct psl *psl;        // Alignment of transcript to genome
+    struct dnaSeq *txSeq;   // Transcript sequence
+    };
+
+static boolean changesProtein(struct singleNucChange *snc, struct geneInfo *gi,
+                              struct seqWindow *gSeqWin,
+                              int *retAaStart, char *retOldAa, char *retNewAa)
+/* If snc changes the coding sequence of gene, return TRUE and set ret values accordingly
+ * (note amino acid values are single-base not strings). */
+{
+boolean isCodingChange = FALSE;
+if (snc->chromStart < gi->psl->tEnd && snc->chromStart >= gi->psl->tStart)
+    {
+    struct bed3 gBed3 = { NULL, chrom, snc->chromStart, snc->chromStart+1 };
+    char gAlt[2];
+    safef(gAlt, sizeof(gAlt), "%c", snc->newBase);
+    if (!sameString(gi->psl->strand, "+"))
+        errAbort("changesProtein: only worlds for psl->strand='+', but gene '%s' has strand '%s'",
+                 gi->psl->qName, gi->psl->strand);
+    struct vpTx *vpTx = vpGenomicToTranscript(gSeqWin, &gBed3, gAlt, gi->psl, gi->txSeq);
+    if (vpTx->start.region == vpExon)
+        {
+        int aaStart = vpTx->start.txOffset / 3;
+        int codonStart = aaStart * 3;
+        char newCodon[4];
+        safencpy(newCodon, sizeof newCodon, gi->txSeq->dna + codonStart, 3);
+        int codonOffset = vpTx->start.txOffset - codonStart;
+        assert(codonOffset < sizeof newCodon);
+        newCodon[codonOffset] = snc->newBase;
+        char oldAa = lookupCodon(gi->txSeq->dna + codonStart);
+        char newAa = lookupCodon(newCodon);
+        if (newAa != oldAa)
+            {
+            isCodingChange = TRUE;
+            *retAaStart = aaStart;
+            *retOldAa = oldAa;
+            *retNewAa = newAa;
+            }
+        }
+    vpTxFree(&vpTx);
+    }
+return isCodingChange;
+}
+
+static struct slPair *getAaMutations(struct singleNucChange *sncList, struct geneInfo *geneInfoList,
+                                     struct seqWindow *gSeqWin)
+/* Given lists of SNVs and genes, return a list of pairs of { gene name, AA change list }. */
+{
+struct slPair *geneChangeList = NULL;
+struct geneInfo *gi;
+for (gi = geneInfoList;  gi != NULL;  gi = gi->next)
+    {
+    struct slName *aaChangeList = NULL;
+    struct singleNucChange *snc;
+    for (snc = sncList;  snc != NULL;  snc = snc->next)
+        {
+        if (snc->chromStart < gi->psl->tEnd && snc->chromStart >= gi->psl->tStart)
+            {
+            int aaStart;
+            char oldAa, newAa;
+            if (changesProtein(snc, gi, gSeqWin, &aaStart, &oldAa, &newAa))
+                {
+                char aaChange[32];
+                safef(aaChange, sizeof aaChange, "%c%d%c", oldAa, aaStart+1, newAa);
+                slNameAddHead(&aaChangeList, aaChange);
+                }
+            }
+        }
+    if (aaChangeList != NULL)
+        {
+        slReverse(&aaChangeList);
+        slAddHead(&geneChangeList, slPairNew(gi->psl->qName, aaChangeList));
+        }
+    }
+slReverse(&geneChangeList);
+return geneChangeList;
+}
+
+static void jsonWriteBranchAttrs(struct jsonWrite *jw, struct phyloTree *node,
+                                 struct geneInfo *geneInfoList, struct seqWindow *gSeqWin)
+/* Write mutations (if any).  If node is not a leaf, write label for mutations at this node. */
+{
+if (node->priv != NULL)
+    {
+    struct singleNucChange *sncList = node->priv;
+    struct slPair *geneAaMutations = getAaMutations(sncList, geneInfoList, gSeqWin);
+    jsonWriteObjectStart(jw, "branch_attrs");
+    if (node->numEdges > 0)
+        {
+        jsonWriteObjectStart(jw, "labels");
+        struct singleNucChange *snc = sncList;
+        struct dyString *dy = dyStringCreate("%c%d%c",
+                                             snc->parBase, snc->chromStart+1, snc->newBase);
+        for (snc = sncList->next;  snc != NULL;  snc = snc->next)
+            dyStringPrintf(dy, ",%c%d%c", snc->parBase, snc->chromStart+1, snc->newBase);
+        jsonWriteString(jw, "nuc mutations", dy->string);
+        dyStringClear(dy);
+        struct slPair *geneAaMut;
+        for (geneAaMut = geneAaMutations;  geneAaMut != NULL;  geneAaMut = geneAaMut->next)
+            {
+            struct slName *aaMut;
+            for (aaMut = geneAaMut->val;  aaMut != NULL;  aaMut = aaMut->next)
+                {
+                dyStringAppendSep(dy, ",");
+                dyStringPrintf(dy, "%s:%s", geneAaMut->name, aaMut->name);
+                }
+            }
+        if (isNotEmpty(dy->string))
+            jsonWriteString(jw, "aa mutations", dy->string);
+        dyStringFree(&dy);
+        jsonWriteObjectEnd(jw);
+        }
+    jsonWriteObjectStart(jw, "mutations");
+    struct slPair *geneAaMut;
+    for (geneAaMut = geneAaMutations;  geneAaMut != NULL;  geneAaMut = geneAaMut->next)
+        {
+        jsonWriteListStart(jw, geneAaMut->name);
+        struct slName *aaMut;
+        for (aaMut = geneAaMut->val;  aaMut != NULL;  aaMut = aaMut->next)
+            jsonWriteString(jw, NULL, aaMut->name);
+        jsonWriteListEnd(jw);
+        }
+    jsonWriteListStart(jw, "nuc");
+    struct singleNucChange *snc;
+    for (snc = sncList;  snc != NULL;  snc = snc->next)
+        jsonWriteStringf(jw, NULL, "%c%d%c", snc->parBase, snc->chromStart+1, snc->newBase);
+    jsonWriteListEnd(jw);
+    jsonWriteObjectEnd(jw);  // mutations
+    jsonWriteObjectEnd(jw); // branch_attrs
+    }
+}
+
+struct auspiceJsonInfo
+/* Collection of a bunch of things used when writing out auspice JSON for a subtree, so the
+ * recursive function doesn't need a dozen args. */
+    {
+    struct jsonWrite *jw;
+    char *db;
+    struct slName *subtreeUserSampleIds;  // Subtree node names for user samples (not from big tree)
+    struct geneInfo *geneInfoList;        // Transcript seq & alignment for predicting AA change
+    struct seqWindow *gSeqWin;            // Reference genome seq for predicting AA change
+    struct hash *sampleMetadata;          // Sample metadata for decorating tree
+    int nodeNum;                          // For generating sequential node ID (in absence of name)
+    };
+
+static int cmpstringp(const void *p1, const void *p2)
+/* strcmp on pointers to strings, as in 'man qsort' but tolerate NULLs */
+{
+char *s1 = *(char * const *)p1;
+char *s2 = *(char * const *)p2;
+if (s1 && s2)
+    return strcmp(s1, s2);
+else if (s1 && !s2)
+    return 1;
+else if (s2 && !s1)
+    return -1;
+return 0;
+}
+
+static char *majorityMaybe(char *array[], int arraySize)
+/* Sort array and if a majority of its values are the same then return that value; otherwise NULL. */
+{
+if (arraySize < 1)
+    return NULL;
+qsort(array, arraySize, sizeof array[0], cmpstringp);
+char *maxRunVal = array[0];
+int runLength = 1, maxRunLength = 1;
+int i;
+for (i = 1;  i < arraySize;  i++)
+    {
+    if (sameOk(array[i], array[i-1]))
+        {
+        runLength++;
+        if (runLength > maxRunLength)
+            {
+            maxRunLength = runLength;
+            maxRunVal = array[i];
+            }
+        }
+    else
+        runLength = 1;
+    }
+return (maxRunLength > (arraySize >> 1)) ? maxRunVal : NULL;
+}
+
+static void rTreeToAuspiceJson(struct phyloTree *node, int depth, struct auspiceJsonInfo *aji,
+                               char **retUserOrOld, char **retGClade, char **retLineage)
+/* Write Augur/Auspice V2 JSON for tree.  Enclosing object start and end are written by caller. */
+{
+if (node->priv)
+    {
+    struct singleNucChange *sncList = node->priv;
+    depth += slCount(sncList);
+    }
+boolean isUserSample = FALSE;
+if (node->ident->name)
+    isUserSample = slNameInList(aji->subtreeUserSampleIds, node->ident->name);
+char *name = node->ident->name;
+char *epiId = name ? epiIdFromSampleName(name) : NULL;
+struct sampleMetadata *met = NULL;
+if (epiId && aji->sampleMetadata)
+    met = hashFindVal(aji->sampleMetadata, epiId);
+if (!isUserSample && met && met->strain)
+    // Some of Rob's names are outdated; use latest from metadata.
+    name = met->strain;
+if (name)
+    jsonWriteString(aji->jw, "name", name);
+else
+    jsonWriteStringf(aji->jw, "name", "NODE%d", aji->nodeNum++);
+jsonWriteBranchAttrs(aji->jw, node, aji->geneInfoList, aji->gSeqWin);
+if (node->numEdges > 0)
+    {
+    jsonWriteListStart(aji->jw, "children");
+    char *kidUserOrOld[node->numEdges];
+    char *kidGClade[node->numEdges];
+    char *kidLineage[node->numEdges];
+    int i;
+    for (i = 0;  i < node->numEdges;  i++)
+        {
+        jsonWriteObjectStart(aji->jw, NULL);
+        rTreeToAuspiceJson(node->edges[i], depth, aji,
+                           &kidUserOrOld[i], &kidGClade[i], &kidLineage[i]);
+        jsonWriteObjectEnd(aji->jw);
+        }
+    jsonWriteListEnd(aji->jw);
+    if (retUserOrOld)
+        *retUserOrOld = majorityMaybe(kidUserOrOld, node->numEdges);
+    if (retGClade)
+        *retGClade = majorityMaybe(kidGClade, node->numEdges);
+    if (retLineage)
+        *retLineage = majorityMaybe(kidLineage, node->numEdges);
+    }
+jsonWriteObjectStart(aji->jw, "node_attrs");
+jsonWriteDouble(aji->jw, "div", depth);
+if (node->numEdges == 0)
+    jsonWriteLeafNodeAttributes(aji->jw, aji->db, name, met, isUserSample,
+                                retUserOrOld, retGClade, retLineage);
+else if (retUserOrOld && retGClade && retLineage)
+    jsonWriteBranchNodeAttributes(aji->jw, *retUserOrOld, *retGClade, *retLineage);
+jsonWriteObjectEnd(aji->jw);
+}
+
+struct phyloTree *phyloTreeNewNode(char *name)
+/* Alloc & return a new node with no children. */
+{
+struct phyloTree *node;
+AllocVar(node);
+AllocVar(node->ident);
+node->ident->name = cloneString(name);
+return node;
+}
+
+struct geneInfo *getGeneInfoList(char *bigGenePredFile, struct dnaSeq *refGenome)
+/* If config.ra has a source of gene annotations, then return the gene list. */
+{
+struct geneInfo *geneInfoList = NULL;
+if (isNotEmpty(bigGenePredFile) && fileExists(bigGenePredFile))
+    {
+    struct bbiFile *bbi = bigBedFileOpen(bigGenePredFile);
+    struct lm *lm = lmInit(0);
+    struct bigBedInterval *bb, *bbList = bigBedIntervalQuery(bbi, chrom, 0, chromSize, 0, lm);
+    for (bb = bbList;  bb != NULL;  bb = bb->next)
+        {
+        struct genePredExt *gp = genePredFromBigGenePred(chrom, bb);
+        if (gp->strand[0] != '+')
+            errAbort("getGeneInfoList: strand must be '+' but got '%s' for gene %s",
+                     gp->strand, gp->name);
+        int txLen = 0;
+        int ex;
+        for (ex = 0;  ex < gp->exonCount;  ex++)
+            txLen += (gp->exonEnds[ex] - gp->exonStarts[ex]);
+        char *seq = needMem(txLen);
+        int txOffset = 0;
+        for (ex = 0;  ex < gp->exonCount;  ex++)
+            {
+            int exonLen = gp->exonEnds[ex] - gp->exonStarts[ex];
+            safencpy(seq+txOffset, txLen+1-txOffset, refGenome->dna+gp->exonStarts[ex], exonLen);
+            txOffset += exonLen;
+            }
+        struct geneInfo *gi;
+        AllocVar(gi);
+        gi->psl = genePredToPsl((struct genePred *)gp, chromSize, txLen);
+        gi->txSeq = newDnaSeq(seq, txLen, gp->name2);
+        slAddHead(&geneInfoList, gi);
+        }
+    lmCleanup(&lm);
+    bigBedFileClose(&bbi);
+    }
+slReverse(&geneInfoList);
+return geneInfoList;
+}
+
+static struct hash *getSampleMetadata(char *metadataFile)
+/* If config.ra defines a metadataFile, load its contents into a hash indexed by EPI ID and return;
+ * otherwise return NULL. */
+{
+struct hash *sampleMetadata = NULL;
+if (isNotEmpty(metadataFile) && fileExists(metadataFile))
+    {
+    sampleMetadata = hashNew(0);
+    struct lineFile *lf = lineFileOpen(metadataFile, TRUE);
+    int headerWordCount = 0;
+    char **headerWords = NULL;
+    char *line;
+    // Check for header line
+    if (lineFileNext(lf, &line, NULL))
+        {
+        if (startsWithWord("strain", line))
+            {
+            char *headerLine = cloneString(line);
+            headerWordCount = chopString(headerLine, "\t", NULL, 0);
+            AllocArray(headerWords, headerWordCount);
+            chopString(headerLine, "\t", headerWords, headerWordCount);
+            }
+        else
+            errAbort("Missing header line from metadataFile %s", metadataFile);
+        }
+    int strainIx = stringArrayIx("strain", headerWords, headerWordCount);
+    int epiIdIx = stringArrayIx("gisaid_epi_isl", headerWords, headerWordCount);
+    int dateIx = stringArrayIx("date", headerWords, headerWordCount);
+    int authorIx = stringArrayIx("authors", headerWords, headerWordCount);
+    int gCladeIx = stringArrayIx("GISAID_clade", headerWords, headerWordCount);
+    int lineageIx = stringArrayIx("pangolin_lineage", headerWords, headerWordCount);
+    int countryIx = stringArrayIx("country", headerWords, headerWordCount);
+    int divisionIx = stringArrayIx("division", headerWords, headerWordCount);
+    int locationIx = stringArrayIx("location", headerWords, headerWordCount);
+    int countryExpIx = stringArrayIx("country_exposure", headerWords, headerWordCount);
+    int divExpIx = stringArrayIx("division_exposure", headerWords, headerWordCount);
+    int origLabIx = stringArrayIx("originating_lab", headerWords, headerWordCount);
+    int subLabIx = stringArrayIx("submitting_lab", headerWords, headerWordCount);
+    int regionIx = stringArrayIx("region", headerWords, headerWordCount);
+    while (lineFileNext(lf, &line, NULL))
+        {
+        char *words[headerWordCount];
+        int wordCount = chopTabs(line, words);
+        lineFileExpectWords(lf, headerWordCount, wordCount);
+        struct sampleMetadata *met;
+        AllocVar(met);
+        met->strain = cloneString(words[strainIx]);
+        if (epiIdIx >= 0)
+            met->epiId = cloneString(words[epiIdIx]);
+        if (dateIx >= 0)
+            met->date = cloneString(words[dateIx]);
+        if (authorIx >= 0)
+            met->author = cloneString(words[authorIx]);
+        if (gCladeIx >= 0)
+            met->gClade = cloneString(words[gCladeIx]);
+        if (lineageIx >= 0)
+            met->lineage = cloneString(words[lineageIx]);
+        if (countryIx >= 0)
+            met->country = cloneString(words[countryIx]);
+        if (divisionIx >= 0)
+            met->division = cloneString(words[divisionIx]);
+        if (locationIx >= 0)
+            met->location = cloneString(words[locationIx]);
+        if (countryExpIx >= 0)
+            met->countryExp = cloneString(words[countryExpIx]);
+        if (divExpIx >= 0)
+            met->divExp = cloneString(words[divExpIx]);
+        if (origLabIx >= 0)
+            met->origLab = cloneString(words[origLabIx]);
+        if (subLabIx >= 0)
+            met->subLab = cloneString(words[subLabIx]);
+        if (regionIx >= 0)
+            met->region = cloneString(words[regionIx]);
+        hashAdd(sampleMetadata, words[epiIdIx], met);
+        }
+    lineFileClose(&lf);
+    }
+return sampleMetadata;
+}
+
+void treeToAuspiceJson(struct subtreeInfo *sti, char *db, struct dnaSeq *ref,
+                       char *bigGenePredFile, char *metadataFile, char *jsonFile)
+/* Write JSON for tree in Nextstrain's Augur/Auspice V2 JSON format
+ * (https://github.com/nextstrain/augur/blob/master/augur/data/schema-export-v2.json). */
+{
+struct phyloTree *tree = sti->subtree;
+FILE *outF = mustOpen(jsonFile, "w");
+fputs("{ \"version\": \"v2\", ", outF);
+writeAuspiceMeta(outF, sti->subtreeUserSampleIds);
+// The meta part is mostly constant & easier to just write out, but jsonWrite is better for the
+// nested tree structure.
+struct jsonWrite *jw = jsonWriteNew();
+jsonWriteObjectStart(jw, "tree");
+int nodeNum = 10000; // Auspice.us starting node number for newick -> json
+int depth = 0;
+
+// Add an extra root node because otherwise Auspice won't draw branch from big tree root to subtree
+struct phyloTree *root = phyloTreeNewNode("wrapper");
+phyloAddEdge(root, tree);
+tree = root;
+struct geneInfo *geneInfoList = getGeneInfoList(bigGenePredFile, ref);
+struct seqWindow *gSeqWin = chromSeqWindowNew(db, chrom, 0, chromSize);
+struct hash *sampleMetadata = getSampleMetadata(metadataFile);
+struct auspiceJsonInfo aji = { jw, db, sti->subtreeUserSampleIds, geneInfoList, gSeqWin,
+                               sampleMetadata, nodeNum };
+rTreeToAuspiceJson(tree, depth, &aji, NULL, NULL, NULL);
+chromSeqWindowFree(&gSeqWin);
+jsonWriteObjectEnd(jw);
+fputs(jw->dy->string, outF);
+jsonWriteFree(&jw);
+fputs("}", outF);
+carefulClose(&outF);
+}
+