ff94dc36ad4584d7ccdeeb545b2b92825735d225 angie Fri Dec 8 09:55:19 2023 -0800 If an auspiceMeta.json file is specified in config.ra then use it for organism-specific tree colors & filters. Also use CDS coords when writing genome annotations in Auspice output; not all viral genes are CDS-only. diff --git src/hg/hgPhyloPlace/treeToAuspiceJson.c src/hg/hgPhyloPlace/treeToAuspiceJson.c index b5e945f..1b2d661 100644 --- src/hg/hgPhyloPlace/treeToAuspiceJson.c +++ src/hg/hgPhyloPlace/treeToAuspiceJson.c @@ -1,27 +1,28 @@ /* Convert a (sub)tree with condensed nodes to JSON for Nextstrain to display, adding in sample * mutations, protein changes and metadata. */ /* Copyright (C) 2020 The Regents of the University of California */ #include "common.h" #include "dnaseq.h" #include "errCatch.h" #include "hash.h" #include "hui.h" #include "jsonWrite.h" #include "linefile.h" +#include "obscure.h" #include "parsimonyProto.h" #include "phyloPlace.h" #include "phyloTree.h" #include "variantProjector.h" static void auspiceMetaColoringCategoricalStart(struct jsonWrite *jw, char *key, char *title) /* Write key, title and type of a "categorical" coloring spec, but leave it open in case a * scale list needs to be added. */ { jsonWriteObjectStart(jw, NULL); jsonWriteString(jw, "key", key); jsonWriteString(jw, "title", title); jsonWriteString(jw, "type", "categorical"); } @@ -36,30 +37,37 @@ /* Write a "categorical" coloring spec with no scale component. */ { auspiceMetaColoringCategoricalStart(jw, key, title); auspiceMetaColoringCategoricalEnd(jw); } static void jsonWritePair(struct jsonWrite *jw, char *valA, char *valB) /* Write a list with two string values. */ { jsonWriteListStart(jw, NULL); jsonWriteString(jw, NULL, valA); jsonWriteString(jw, NULL, valB); jsonWriteListEnd(jw); } +static void jsonWriteAppendData(struct jsonWrite *jw, char *data) +/* Append data to jw->dy, trusting that it will maintain well-formed JSON results. */ +{ +dyStringAppend(jw->dy, data); +} + + static void auspiceMetaColoringSarsCov2Nextclade(struct jsonWrite *jw, char *key, char *title) /* Write a coloring spec for SARS-CoV-2 Nextstrain clades. (Non-VoC clades are omitted and * will be assigned grayscale values by Auspice. */ { auspiceMetaColoringCategoricalStart(jw, key, title); jsonWriteListStart(jw, "scale"); // Color hex values are words from line N of // https://github.com/nextstrain/ncov/blob/master/defaults/color_schemes.tsv // where N = number of clade_membership lines in // https://github.com/nextstrain/ncov/blob/master/defaults/color_ordering.tsv jsonWritePair(jw, "20H (Beta, V2)", "#5E1D9D"); jsonWritePair(jw, "20I (Alpha, V1)", "#492AB5"); jsonWritePair(jw, "20J (Gamma, V3)", "#4042C7"); jsonWritePair(jw, "21A (Delta)", "#3E5DD0"); jsonWritePair(jw, "21I (Delta)", "#4377CD"); @@ -86,74 +94,120 @@ static void writeAuspiceMetaGenomeAnnotations(struct jsonWrite *jw, struct geneInfo *geneInfoList, uint genomeSize) /* Write out auspice genome annotations (protein-coding gene CDS and "nuc"). */ { jsonWriteObjectStart(jw, "genome_annotations"); struct geneInfo *gi; for (gi = geneInfoList; gi != NULL; gi = gi->next) { jsonWriteObjectStart(jw, gi->psl->qName); if (gi->psl->blockCount > 1) { jsonWriteListStart(jw, "segments"); int i; for (i = 0; i < gi->psl->blockCount; i++) { + int cStart = gi->psl->tStarts[i]; + int cEnd = gi->psl->tStarts[i] + gi->psl->blockSizes[i]; + if (cEnd < gi->cdsStart) + continue; + if (cStart > gi->cdsEnd) + break; + if (cStart < gi->cdsStart) + cStart = gi->cdsStart; + if (cEnd > gi->cdsEnd) + cEnd = gi->cdsEnd; jsonWriteObjectStart(jw, NULL); - jsonWriteNumber(jw, "start", gi->psl->tStarts[i]+1); - jsonWriteNumber(jw, "end", gi->psl->tStarts[i] + gi->psl->blockSizes[i]); + jsonWriteNumber(jw, "start", cStart+1); + jsonWriteNumber(jw, "end", cEnd); jsonWriteObjectEnd(jw); } jsonWriteListEnd(jw); } else { - jsonWriteNumber(jw, "start", gi->psl->tStart+1); - jsonWriteNumber(jw, "end", gi->psl->tEnd); + jsonWriteNumber(jw, "start", gi->cdsStart+1); + jsonWriteNumber(jw, "end", gi->cdsEnd); } jsonWriteString(jw, "strand", (pslOrientation(gi->psl) > 0) ? "+" : "-"); jsonWriteString(jw, "type", "CDS"); jsonWriteObjectEnd(jw); } jsonWriteObjectStart(jw, "nuc"); jsonWriteNumber(jw, "start", 1); jsonWriteNumber(jw, "end", genomeSize); jsonWriteString(jw, "strand", "+"); jsonWriteString(jw, "type", "source"); jsonWriteObjectEnd(jw); jsonWriteObjectEnd(jw); } +//#*** TODO: in v459 or later, remove the organism-specific code. +static struct slName *getColorFields(char *db, boolean isRsv, boolean isFlu) +{ +struct slName *colorFields = NULL; +if (sameString(db, "wuhCor1")) + { + slNameAddHead(&colorFields, "country"); + slNameAddHead(&colorFields, "Nextstrain_clade_usher"); + slNameAddHead(&colorFields, "pango_lineage_usher"); + slNameAddHead(&colorFields, "Nextstrain_clade"); + slNameAddHead(&colorFields, "pango_lineage"); + } +else if (isRsv) + { + slNameAddHead(&colorFields, "country"); + slNameAddHead(&colorFields, "GCC_nextclade"); + slNameAddHead(&colorFields, "GCC_usher"); + slNameAddHead(&colorFields, "GCC_assigned_2023-11"); + slNameAddHead(&colorFields, "goya_nextclade"); + slNameAddHead(&colorFields, "goya_usher"); + } +else if (isFlu) + { + slNameAddHead(&colorFields, "country"); + slNameAddHead(&colorFields, "Nextstrain_clade"); + } +else + { + slNameAddHead(&colorFields, "country"); + slNameAddHead(&colorFields, "Nextstrain_lineage"); + } +return colorFields; +} + +//#*** TODO: in v459 or later, remove the organism-specific stuff static char *getDefaultColor(struct slName *colorFields) /* Pick default color from available color fields from metadata. Do not free returned string. */ { char *colorDefault = NULL; if (slNameInList(colorFields, "pango_lineage_usher")) colorDefault = "pango_lineage_usher"; else if (slNameInList(colorFields, "Nextstrain_lineage")) colorDefault = "Nextstrain_lineage"; else if (slNameInList(colorFields, "Nextstrain_clade")) colorDefault = "Nextstrain_clade"; else if (slNameInList(colorFields, "GCC_usher")) colorDefault = "GCC_usher"; else if (colorFields != NULL) colorDefault = colorFields->name; else colorDefault = "userOrOld"; return colorDefault; } +//#*** TODO: in v459 or later, remove the organism-specific stuff static void auspiceMetaColorings(struct jsonWrite *jw, char *source, struct slName *colorFields, char *db) /* Write coloring specs for colorFields from metadata, locally added userOrOld, and * Auspice-automatic gt. */ { jsonWriteListStart(jw, "colorings"); auspiceMetaColoringCategoricalStart(jw, "userOrOld", "Sample type"); jsonWriteListStart(jw, "scale"); jsonWritePair(jw, "uploaded sample", "#CC0000"); jsonWritePair(jw, source, "#000000"); jsonWriteListEnd(jw); auspiceMetaColoringCategoricalEnd(jw); auspiceMetaColoringCategorical(jw, "gt", "Genotype"); struct slName *col; for (col = colorFields; col != NULL; col = col->next) @@ -181,99 +235,113 @@ else if (sameString(col->name, "GCC_nextclade")) auspiceMetaColoringCategorical(jw, col->name, "RGCC lineage assigned by nextclade"); else if (sameString(col->name, "GCC_usher")) auspiceMetaColoringCategorical(jw, col->name, "RGCC lineage assigned by UShER"); else if (sameString(col->name, "GCC_assigned_2023-11")) auspiceMetaColoringCategorical(jw, col->name, "RGCC designated lineage"); else if (sameString(col->name, "country")) auspiceMetaColoringCategorical(jw, col->name, "Country"); else auspiceMetaColoringCategorical(jw, col->name, col->name); } jsonWriteListEnd(jw); } static void writeAuspiceMeta(struct jsonWrite *jw, struct slName *subtreeUserSampleIds, char *source, - char *db, struct slName *colorFields, struct geneInfo *geneInfoList, + char *db, struct geneInfo *geneInfoList, uint genomeSize, boolean isRsv, boolean isFlu) /* Write metadata to configure Auspice display. */ { jsonWriteObjectStart(jw, "meta"); // Title struct dyString *dy = dyStringCreate("Subtree with %s", subtreeUserSampleIds->name); int sampleCount = slCount(subtreeUserSampleIds); if (sampleCount > 10) dyStringPrintf(dy, " and %d other uploaded samples", sampleCount - 1); else { struct slName *sln; for (sln = subtreeUserSampleIds->next; sln != NULL; sln = sln->next) dyStringPrintf(dy, ", %s", sln->name); } jsonWriteString(jw, "title", dy->string); // Description jsonWriteStringf(jw, "description", "Dataset generated by [UShER web interface]" "(%shgPhyloPlace) using the " "[usher](https://github.com/yatisht/usher/) program. " //#*** TODO: describe input from which tree was generated: user sample, version of tree, etc. "If you have metadata you wish to display, you can now drag on a CSV file and " "it will be added into this view, [see here]("NEXTSTRAIN_DRAG_DROP_DOC") " "for more info." , hLocalHostCgiBinUrl()); // Panels: just the tree and entropy (no map) jsonWriteListStart(jw, "panels"); jsonWriteString(jw, NULL, "tree"); jsonWriteString(jw, NULL, "entropy"); jsonWriteListEnd(jw); +char *metaJsonFile = phyloPlaceDbSettingPath(db, "auspiceMeta"); +if (isNotEmpty(metaJsonFile) && fileExists(metaJsonFile)) + { + char *metaJson = NULL; + size_t size = 0; + readInGulp(metaJsonFile, &metaJson, &size); + while (size > 0 && metaJson[size-1] == '\n') + metaJson[--size] = '\0'; + jsonWriteAppendData(jw, ", "); + jsonWriteAppendData(jw, metaJson); + freeMem(metaJson); + } +else + { // Default label & color + struct slName *colorFields = getColorFields(db, isRsv, isFlu); jsonWriteObjectStart(jw, "display_defaults"); jsonWriteString(jw, "branch_label", "aa mutations"); jsonWriteString(jw, "color_by", getDefaultColor(colorFields)); jsonWriteObjectEnd(jw); // Colorings: userOrOld, gt and whatever we got from metadata auspiceMetaColorings(jw, source, colorFields, db); -// Filters didn't work when I tried them a long time ago... revisit someday. + // Filters jsonWriteListStart(jw, "filters"); jsonWriteString(jw, NULL, "userOrOld"); jsonWriteString(jw, NULL, "country"); -//#*** FIXME: TODO: either pass in along with sampleMetadata, or take from JSON file specified -//#*** in config, or better yet, compute while building tree object in memory, then write the -//#*** header object, then write the tree. + //#*** TODO: in v459 or later, remove the organism-specific stuff if (sameString(db, "wuhCor1")) { jsonWriteString(jw, NULL, "pango_lineage_usher"); jsonWriteString(jw, NULL, "pango_lineage"); jsonWriteString(jw, NULL, "Nextstrain_clade_usher"); jsonWriteString(jw, NULL, "Nextstrain_clade"); } else if (isRsv) { jsonWriteString(jw, NULL, "GCC_usher"); jsonWriteString(jw, NULL, "GCC_nextclade"); jsonWriteString(jw, NULL, "GCC_assigned_2023-11"); jsonWriteString(jw, NULL, "goya_usher"); jsonWriteString(jw, NULL, "goya_nextclade"); } else if (isFlu) { jsonWriteString(jw, NULL, "Nextstrain_clade"); } else { jsonWriteString(jw, NULL, "Nextstrain_lineage"); } jsonWriteListEnd(jw); + } // Annotations for coloring/filtering by base writeAuspiceMetaGenomeAnnotations(jw, geneInfoList, genomeSize); jsonWriteObjectEnd(jw); } static void jsonWriteObjectValueUrl(struct jsonWrite *jw, char *name, char *value, char *url) /* Write an object with member "value" set to value, and if url is non-empty, "url" set to url. */ { jsonWriteObjectStart(jw, name); jsonWriteString(jw, "value", value); if (isNotEmpty(url)) jsonWriteString(jw, "url", url); jsonWriteObjectEnd(jw); } @@ -793,94 +861,64 @@ char *seq = needMem(txLen+1); int txOffset = 0; for (ex = 0; ex < gp->exonCount; ex++) { int exonLen = gp->exonEnds[ex] - gp->exonStarts[ex]; safencpy(seq+txOffset, txLen+1-txOffset, refGenome->dna+gp->exonStarts[ex], exonLen); txOffset += exonLen; } if (sameString(gp->strand, "-")) reverseComplement(seq, txLen); struct geneInfo *gi; AllocVar(gi); gi->psl = genePredToPsl((struct genePred *)gp, refGenome->size, txLen); gi->psl->qName = cloneString(gp->name2); gi->txSeq = newDnaSeq(seq, txLen, gp->name2); + gi->cdsStart = gp->cdsStart; + gi->cdsEnd = gp->cdsEnd; AllocVar(gi->cds); genePredToCds((struct genePred *)gp, gi->cds); int cdsLen = gi->cds->end - gi->cds->start; // Skip genes with no CDS (like RNA genes) or obviously incomplete/incorrect CDS. if (cdsLen > 0 && (cdsLen % 3) == 0) { slAddHead(&geneInfoList, gi); } } lmCleanup(&lm); bigBedFileClose(&bbi); } slReverse(&geneInfoList); return geneInfoList; } void treeToAuspiceJson(struct subtreeInfo *sti, char *db, struct geneInfo *geneInfoList, struct seqWindow *gSeqWin, struct hash *sampleMetadata, struct hash *sampleUrls, struct hash *samplePlacements, char *jsonFile, char *source) /* Write JSON for tree in Nextstrain's Augur/Auspice V2 JSON format * (https://github.com/nextstrain/augur/blob/master/augur/data/schema-export-v2.json). */ { struct phyloTree *tree = sti->subtree; FILE *outF = mustOpen(jsonFile, "w"); struct jsonWrite *jw = jsonWriteNew(); jsonWriteObjectStart(jw, NULL); jsonWriteString(jw, "version", "v2"); -//#*** FIXME: TODO: either pass in along with sampleMetadata, or take from JSON file specified -//#*** in config, or better yet, compute while building tree object in memory, then write the -//#*** header object, then write the tree. boolean isRsv = (stringIn("GCF_000855545", db) || stringIn("GCF_002815475", db) || startsWith("RGCC", db)); boolean isFlu = (stringIn("GCF_000865085", db) || stringIn("GCF_001343785", db)); -struct slName *colorFields = NULL; -if (sameString(db, "wuhCor1")) - { - slNameAddHead(&colorFields, "country"); - slNameAddHead(&colorFields, "Nextstrain_clade_usher"); - slNameAddHead(&colorFields, "pango_lineage_usher"); - slNameAddHead(&colorFields, "Nextstrain_clade"); - slNameAddHead(&colorFields, "pango_lineage"); - } -else if (isRsv) - { - slNameAddHead(&colorFields, "country"); - slNameAddHead(&colorFields, "GCC_nextclade"); - slNameAddHead(&colorFields, "GCC_usher"); - slNameAddHead(&colorFields, "GCC_assigned_2023-11"); - slNameAddHead(&colorFields, "goya_nextclade"); - slNameAddHead(&colorFields, "goya_usher"); - } -else if (isFlu) - { - slNameAddHead(&colorFields, "country"); - slNameAddHead(&colorFields, "Nextstrain_clade"); - } -else - { - slNameAddHead(&colorFields, "country"); - slNameAddHead(&colorFields, "Nextstrain_lineage"); - } -//#*** END FIXME -writeAuspiceMeta(jw, sti->subtreeUserSampleIds, source, db, colorFields, geneInfoList, +writeAuspiceMeta(jw, sti->subtreeUserSampleIds, source, db, geneInfoList, gSeqWin->end, isRsv, isFlu); jsonWriteObjectStart(jw, "tree"); int nodeNum = 10000; // Auspice.us starting node number for newick -> json int depth = 0; // Add an extra root node because otherwise Auspice won't draw branch from big tree root to subtree struct phyloTree *root = phyloTreeNewNode("wrapper"); phyloAddEdge(root, tree); tree = root; struct auspiceJsonInfo aji = { jw, sti->subtreeUserSampleIds, geneInfoList, gSeqWin, sampleMetadata, sampleUrls, samplePlacements, nodeNum, source }; rTreeToAuspiceJson(tree, depth, &aji, NULL, isRsv, NULL, NULL, NULL, NULL, NULL, NULL, NULL); jsonWriteObjectEnd(jw); // tree jsonWriteObjectEnd(jw); // top-level object fputs(jw->dy->string, outF);