2a9d39b83afd2a71f4a0d57042b4a6fcee81a556 angie Thu Jan 26 14:54:41 2023 -0800 Added two RSV clade systems (Goya et al. and Ramaekers et al.) and different sources (nextclade vs. tree vs. direct assignments) as coloring options in Nextstrain JSON output for RSV. diff --git src/hg/hgPhyloPlace/treeToAuspiceJson.c src/hg/hgPhyloPlace/treeToAuspiceJson.c index 2902bfe..4dc7614 100644 --- src/hg/hgPhyloPlace/treeToAuspiceJson.c +++ src/hg/hgPhyloPlace/treeToAuspiceJson.c @@ -104,30 +104,32 @@ jsonWriteString(jw, "type", "source"); jsonWriteObjectEnd(jw); jsonWriteObjectEnd(jw); } static char *getDefaultColor(struct slName *colorFields) /* Pick default color from available color fields from metadata. Do not free returned string. */ { char *colorDefault = NULL; if (slNameInList(colorFields, "pango_lineage_usher")) colorDefault = "pango_lineage_usher"; else if (slNameInList(colorFields, "Nextstrain_lineage")) colorDefault = "Nextstrain_lineage"; else if (slNameInList(colorFields, "Nextstrain_clade")) colorDefault = "Nextstrain_clade"; +else if (slNameInList(colorFields, "goya_usher")) + colorDefault = "goya_usher"; else if (colorFields != NULL) colorDefault = colorFields->name; else colorDefault = "userOrOld"; return colorDefault; } static void auspiceMetaColorings(struct jsonWrite *jw, char *source, struct slName *colorFields) /* Write coloring specs for colorFields from metadata, locally added userOrOld, and * Auspice-automatic gt. */ { jsonWriteListStart(jw, "colorings"); auspiceMetaColoringCategoricalStart(jw, "userOrOld", "Sample type"); jsonWriteListStart(jw, "scale"); jsonWritePair(jw, "uploaded sample", "#CC0000"); @@ -136,30 +138,40 @@ auspiceMetaColoringCategoricalEnd(jw); auspiceMetaColoringCategorical(jw, "gt", "Genotype"); struct slName *col; for (col = colorFields; col != NULL; col = col->next) { if (sameString(col->name, "Nextstrain_clade")) auspiceMetaColoringSarsCov2Nextclade(jw, col->name, "Nextstrain Clade"); else if (sameString(col->name, "Nextstrain_clade_usher")) auspiceMetaColoringSarsCov2Nextclade(jw, col->name, "Nextstrain Clade assigned by UShER"); else if (sameString(col->name, "pango_lineage")) auspiceMetaColoringCategorical(jw, col->name, "Pango lineage"); else if (sameString(col->name, "pango_lineage_usher")) auspiceMetaColoringCategorical(jw, col->name, "Pango lineage assigned by UShER"); else if (sameString(col->name, "Nextstrain_lineage")) auspiceMetaColoringCategorical(jw, col->name, "Nextstrain lineage"); + else if (sameString(col->name, "goya_nextclade")) + auspiceMetaColoringCategorical(jw, col->name, "Goya 2020 clade assigned by nextclade"); + else if (sameString(col->name, "goya_usher")) + auspiceMetaColoringCategorical(jw, col->name, "Goya 2020 clade assigned by UShER"); + else if (sameString(col->name, "ramaekers_nextclade")) + auspiceMetaColoringCategorical(jw, col->name, "Ramaekers 2020 clade assigned by nextclade"); + else if (sameString(col->name, "ramaekers_usher")) + auspiceMetaColoringCategorical(jw, col->name, "Ramaekers 2020 clade assigned by UShER"); + else if (sameString(col->name, "ramaekers_tableS1")) + auspiceMetaColoringCategorical(jw, col->name, "Ramaekers 2020 Table S1 designation"); else if (sameString(col->name, "country")) auspiceMetaColoringCategorical(jw, col->name, "Country"); else auspiceMetaColoringCategorical(jw, col->name, col->name); } jsonWriteListEnd(jw); } static void writeAuspiceMeta(struct jsonWrite *jw, struct slName *subtreeUserSampleIds, char *source, char *db, struct slName *colorFields, struct geneInfo *geneInfoList, uint genomeSize) /* Write metadata to configure Auspice display. */ { jsonWriteObjectStart(jw, "meta"); // Title @@ -188,38 +200,46 @@ jsonWriteString(jw, NULL, "tree"); jsonWriteString(jw, NULL, "entropy"); jsonWriteListEnd(jw); // Default label & color jsonWriteObjectStart(jw, "display_defaults"); jsonWriteString(jw, "branch_label", "aa mutations"); jsonWriteString(jw, "color_by", getDefaultColor(colorFields)); jsonWriteObjectEnd(jw); // Colorings: userOrOld, gt and whatever we got from metadata auspiceMetaColorings(jw, source, colorFields); // Filters didn't work when I tried them a long time ago... revisit someday. jsonWriteListStart(jw, "filters"); jsonWriteString(jw, NULL, "userOrOld"); jsonWriteString(jw, NULL, "country"); //#*** FIXME: TODO: either pass in along with sampleMetadata, or better yet, compute while building -//#*** tree object and then write the header object. +//#*** tree object in memory, then write the header object, then write the tree. if (sameString(db, "wuhCor1")) { jsonWriteString(jw, NULL, "pango_lineage_usher"); jsonWriteString(jw, NULL, "pango_lineage"); jsonWriteString(jw, NULL, "Nextstrain_clade_usher"); jsonWriteString(jw, NULL, "Nextstrain_clade"); } +else if (stringIn("GCF_000855545", db) || stringIn("GCF_002815475", db)) + { + jsonWriteString(jw, NULL, "goya_usher"); + jsonWriteString(jw, NULL, "goya_nextclade"); + jsonWriteString(jw, NULL, "ramaekers_tableS1"); + jsonWriteString(jw, NULL, "ramaekers_usher"); + jsonWriteString(jw, NULL, "ramaekers_nextclade"); + } else { jsonWriteString(jw, NULL, "Nextstrain_lineage"); } jsonWriteListEnd(jw); // Annotations for coloring/filtering by base writeAuspiceMetaGenomeAnnotations(jw, geneInfoList, genomeSize); jsonWriteObjectEnd(jw); } static void jsonWriteObjectValueUrl(struct jsonWrite *jw, char *name, char *value, char *url) /* Write an object with member "value" set to value, and if url is non-empty, "url" set to url. */ { jsonWriteObjectStart(jw, name); jsonWriteString(jw, "value", value); @@ -236,138 +256,140 @@ } static void makeLineageUrl(char *lineage, char *lineageUrl, size_t lineageUrlSize) /* If lineage is not "uploaded sample", make an outbreak.info link to it, otherwise just copy * lineage. */ { if (sameString(lineage, "uploaded sample")) safecpy(lineageUrl, lineageUrlSize, lineage); else safef(lineageUrl, lineageUrlSize, OUTBREAK_INFO_URLBASE "%s", lineage); } static void jsonWriteLeafNodeAttributes(struct jsonWrite *jw, char *name, struct sampleMetadata *met, boolean isUserSample, char *source, struct hash *sampleUrls, - struct hash *samplePlacements, + struct hash *samplePlacements, boolean isRsv, char **retUserOrOld, char **retNClade, char **retGClade, char **retLineage, char **retNLineage, char **retNCladeUsher, char **retLineageUsher) /* Write elements of node_attrs for a sample which may be preexisting and in our metadata hash, * or may be a new sample from the user. Set rets for color categories so parent branches can * determine their color categories. */ { *retUserOrOld = isUserSample ? "uploaded sample" : source; jsonWriteObjectValue(jw, "userOrOld", *retUserOrOld); if (met && met->date) jsonWriteObjectValue(jw, "date", met->date); if (met && met->author) { jsonWriteObjectValue(jw, "author", met->author); // Note: Nextstrain adds paper_url and title when available; they also add author and use // a uniquified value (e.g. "author": "Wenjie Tan et al" / "value": "Wenjie Tan et al A") } struct placementInfo *pi = (isUserSample && name) ? hashFindVal(samplePlacements, name) : NULL; *retNClade = (met && met->nClade) ? met->nClade : isUserSample ? "uploaded sample" : NULL; if (isNotEmpty(*retNClade)) - jsonWriteObjectValue(jw, "Nextstrain_clade", *retNClade); + jsonWriteObjectValue(jw, (isRsv ? "goya_nextclade" : "Nextstrain_clade"), *retNClade); *retGClade = (met && met->gClade) ? met->gClade : isUserSample ? "uploaded sample" : NULL; if (isNotEmpty(*retGClade)) - jsonWriteObjectValue(jw, "GISAID_clade", *retGClade); + jsonWriteObjectValue(jw, (isRsv ? "ramaekers_tableS1" : "GISAID_clade"), *retGClade); *retLineage = (met && met->lineage) ? met->lineage : isUserSample ? "uploaded sample" : NULL; if (isNotEmpty(*retLineage)) { char lineageUrl[1024]; makeLineageUrl(*retLineage, lineageUrl, sizeof lineageUrl); - jsonWriteObjectValueUrl(jw, "pango_lineage", *retLineage, lineageUrl); + jsonWriteObjectValueUrl(jw, (isRsv ? "ramaekers_nextclade" : "pango_lineage"), + *retLineage, lineageUrl); } *retNLineage = (met && met->nLineage) ? met->nLineage : isUserSample ? "uploaded sample" : NULL; if (isNotEmpty(*retNLineage)) { jsonWriteObjectValue(jw, "Nextstrain_lineage", *retNLineage); } if (met && met->epiId) jsonWriteObjectValue(jw, "gisaid_epi_isl", met->epiId); if (met && met->gbAcc) jsonWriteObjectValue(jw, "genbank_accession", met->gbAcc); if (met && met->country) jsonWriteObjectValue(jw, "country", met->country); if (met && met->division) jsonWriteObjectValue(jw, "division", met->division); if (met && met->location) jsonWriteObjectValue(jw, "location", met->location); if (met && met->countryExp) jsonWriteObjectValue(jw, "country_exposure", met->countryExp); if (met && met->divExp) jsonWriteObjectValue(jw, "division_exposure", met->divExp); if (met && met->origLab) jsonWriteObjectValue(jw, "originating_lab", met->origLab); if (met && met->subLab) jsonWriteObjectValue(jw, "submitting_lab", met->subLab); if (met && met->region) jsonWriteObjectValue(jw, "region", met->region); *retNCladeUsher = (pi && pi->nextClade) ? pi->nextClade : (met && met->nCladeUsher) ? met->nCladeUsher : isUserSample ? "uploaded sample" : NULL; if (isNotEmpty(*retNCladeUsher)) - jsonWriteObjectValue(jw, "Nextstrain_clade_usher", *retNCladeUsher); + jsonWriteObjectValue(jw, (isRsv ? "goya_usher" : "Nextstrain_clade_usher"), *retNCladeUsher); *retLineageUsher = (pi && pi->pangoLineage) ? pi->pangoLineage : (met && met->lineageUsher) ? met->lineageUsher : isUserSample ? "uploaded sample" : NULL; if (isNotEmpty(*retLineageUsher)) { char lineageUrl[1024]; makeLineageUrl(*retLineageUsher, lineageUrl, sizeof lineageUrl); - jsonWriteObjectValueUrl(jw, "pango_lineage_usher", *retLineageUsher, lineageUrl); + jsonWriteObjectValueUrl(jw, (isRsv ? "ramaekers_usher" : "pango_lineage_usher"), + *retLineageUsher, lineageUrl); } char *sampleUrl = (sampleUrls && name) ? hashFindVal(sampleUrls, name) : NULL; if (isNotEmpty(sampleUrl)) { char *p = strstr(sampleUrl, "subtreeAuspice"); char *subtreeNum = p + strlen("subtreeAuspice"); if (p && isdigit(*subtreeNum)) { int num = atoi(subtreeNum); char subtreeLabel[1024]; safef(subtreeLabel, sizeof subtreeLabel, "view subtree %d", num); jsonWriteObjectValueUrl(jw, "subtree", subtreeLabel, sampleUrl); } else jsonWriteObjectValueUrl(jw, "subtree", sampleUrl, sampleUrl); } } -static void jsonWriteBranchNodeAttributes(struct jsonWrite *jw, char *userOrOld, +static void jsonWriteBranchNodeAttributes(struct jsonWrite *jw, boolean isRsv, char *userOrOld, char *nClade, char *gClade, char *lineage, char *nLineage, char *nCladeUsher, char *lineageUsher) /* Write elements of node_attrs for a branch. */ { if (userOrOld) jsonWriteObjectValue(jw, "userOrOld", userOrOld); if (nClade) - jsonWriteObjectValue(jw, "Nextstrain_clade", nClade); + jsonWriteObjectValue(jw, (isRsv ? "goya_nextclade" : "Nextstrain_clade"), nClade); if (gClade) - jsonWriteObjectValue(jw, "GISAID_clade", gClade); + jsonWriteObjectValue(jw, (isRsv ? "ramaekers_tableS1" : "GISAID_clade"), gClade); if (lineage) - jsonWriteObjectValue(jw, "pango_lineage", lineage); + jsonWriteObjectValue(jw, (isRsv ? "ramaekers_nextclade" : "pango_lineage"), lineage); if (nLineage) jsonWriteObjectValue(jw, "Nextstrain_lineage", lineage); if (nCladeUsher) - jsonWriteObjectValue(jw, "Nextstrain_clade_usher", nCladeUsher); + jsonWriteObjectValue(jw, (isRsv ? "goya_usher" : "Nextstrain_clade_usher"), nCladeUsher); if (lineageUsher) - jsonWriteObjectValue(jw, "pango_lineage_usher", lineageUsher); + jsonWriteObjectValue(jw, (isRsv ? "ramaekers_usher" : "pango_lineage_usher"), lineageUsher); } static boolean changesProtein(struct singleNucChange *snc, struct geneInfo *gi, struct seqWindow *gSeqWin, int *retAaStart, char *retOldAa, char *retNewAa) /* If snc changes the coding sequence of gene, return TRUE and set ret values accordingly * (note amino acid values are single-base not strings). */ { boolean isCodingChange = FALSE; if (snc->chromStart < gi->psl->tEnd && snc->chromStart >= gi->psl->tStart) { struct bed3 gBed3 = { NULL, gSeqWin->seqName, snc->chromStart, snc->chromStart+1 }; char gAlt[2]; safef(gAlt, sizeof(gAlt), "%c", snc->newBase); struct vpTx *vpTx = vpGenomicToTranscript(gSeqWin, &gBed3, gAlt, gi->psl, gi->txSeq); @@ -569,30 +591,31 @@ { runLength++; if (runLength > maxRunLength) { maxRunLength = runLength; maxRunVal = array[i]; } } else runLength = 1; } return (maxRunLength > (arraySize >> 1)) ? maxRunVal : NULL; } static void rTreeToAuspiceJson(struct phyloTree *node, int depth, struct auspiceJsonInfo *aji, + boolean isRsv, char **retUserOrOld, char **retNClade, char **retGClade, char **retLineage, char **retNLineage, char **retNCladeUsher, char **retLineageUsher) /* Write Augur/Auspice V2 JSON for tree. Enclosing object start and end are written by caller. */ { if (node->priv) { struct singleNucChange *sncList = node->priv; depth += slCount(sncList); } boolean isUserSample = FALSE; if (node->ident->name) isUserSample = slNameInList(aji->subtreeUserSampleIds, node->ident->name); char *name = node->ident->name; struct sampleMetadata *met = name ? metadataForSample(aji->sampleMetadata, name) : NULL; @@ -604,60 +627,60 @@ if (node->numEdges > 0) { jsonWriteListStart(aji->jw, "children"); char *kidUserOrOld[node->numEdges]; char *kidNClade[node->numEdges]; char *kidGClade[node->numEdges]; char *kidLineage[node->numEdges]; char *kidNCladeUsher[node->numEdges]; char *kidLineageUsher[node->numEdges]; char *kidNLineage[node->numEdges]; // Step through children in reverse order because nextstrain/Auspice draws upside-down. :) int i; for (i = node->numEdges - 1; i >= 0; i--) { jsonWriteObjectStart(aji->jw, NULL); - rTreeToAuspiceJson(node->edges[i], depth, aji, + rTreeToAuspiceJson(node->edges[i], depth, aji, isRsv, &kidUserOrOld[i], &kidNClade[i], &kidGClade[i], &kidLineage[i], &kidNLineage[i], &kidNCladeUsher[i], &kidLineageUsher[i]); jsonWriteObjectEnd(aji->jw); } jsonWriteListEnd(aji->jw); if (retUserOrOld) *retUserOrOld = majorityMaybe(kidUserOrOld, node->numEdges); if (retNClade) *retNClade = majorityMaybe(kidNClade, node->numEdges); if (retGClade) *retGClade = majorityMaybe(kidGClade, node->numEdges); if (retLineage) *retLineage = majorityMaybe(kidLineage, node->numEdges); if (retNCladeUsher) *retNCladeUsher = majorityMaybe(kidNCladeUsher, node->numEdges); if (retLineageUsher) *retLineageUsher = majorityMaybe(kidLineageUsher, node->numEdges); if (retNLineage) *retNLineage = majorityMaybe(kidNLineage, node->numEdges); } jsonWriteObjectStart(aji->jw, "node_attrs"); jsonWriteDouble(aji->jw, "div", depth); if (node->numEdges == 0) jsonWriteLeafNodeAttributes(aji->jw, name, met, isUserSample, aji->source, aji->sampleUrls, - aji->samplePlacements, + aji->samplePlacements, isRsv, retUserOrOld, retNClade, retGClade, retLineage, retNLineage, retNCladeUsher, retLineageUsher); else if (retUserOrOld && retGClade && retLineage) - jsonWriteBranchNodeAttributes(aji->jw, *retUserOrOld, *retNClade, *retGClade, *retLineage, + jsonWriteBranchNodeAttributes(aji->jw, isRsv, *retUserOrOld, *retNClade, *retGClade, *retLineage, *retNLineage, *retNCladeUsher, *retLineageUsher); jsonWriteObjectEnd(aji->jw); } struct phyloTree *phyloTreeNewNode(char *name) /* Alloc & return a new node with no children. */ { struct phyloTree *node; AllocVar(node); AllocVar(node->ident); node->ident->name = cloneString(name); return node; } struct geneInfo *getGeneInfoList(char *bigGenePredFile, struct dnaSeq *refGenome) @@ -711,51 +734,61 @@ } void treeToAuspiceJson(struct subtreeInfo *sti, char *db, struct geneInfo *geneInfoList, struct seqWindow *gSeqWin, struct hash *sampleMetadata, struct hash *sampleUrls, struct hash *samplePlacements, char *jsonFile, char *source) /* Write JSON for tree in Nextstrain's Augur/Auspice V2 JSON format * (https://github.com/nextstrain/augur/blob/master/augur/data/schema-export-v2.json). */ { struct phyloTree *tree = sti->subtree; FILE *outF = mustOpen(jsonFile, "w"); struct jsonWrite *jw = jsonWriteNew(); jsonWriteObjectStart(jw, NULL); jsonWriteString(jw, "version", "v2"); //#*** FIXME: TODO: either pass in along with sampleMetadata, or better yet, compute while building -//#*** tree object and then write the header object. +//#*** tree object in memory, then write the header object, then write the tree. +boolean isRsv = (stringIn("GCF_000855545", db) || stringIn("GCF_002815475", db)); struct slName *colorFields = NULL; if (sameString(db, "wuhCor1")) { slNameAddHead(&colorFields, "country"); slNameAddHead(&colorFields, "Nextstrain_clade_usher"); slNameAddHead(&colorFields, "pango_lineage_usher"); slNameAddHead(&colorFields, "Nextstrain_clade"); slNameAddHead(&colorFields, "pango_lineage"); } +else if (isRsv) + { + slNameAddHead(&colorFields, "country"); + slNameAddHead(&colorFields, "ramaekers_nextclade"); + slNameAddHead(&colorFields, "ramaekers_usher"); + slNameAddHead(&colorFields, "ramaekers_tableS1"); + slNameAddHead(&colorFields, "goya_nextclade"); + slNameAddHead(&colorFields, "goya_usher"); + } else { slNameAddHead(&colorFields, "country"); slNameAddHead(&colorFields, "Nextstrain_lineage"); } //#*** END FIXME writeAuspiceMeta(jw, sti->subtreeUserSampleIds, source, db, colorFields, geneInfoList, gSeqWin->end); jsonWriteObjectStart(jw, "tree"); int nodeNum = 10000; // Auspice.us starting node number for newick -> json int depth = 0; // Add an extra root node because otherwise Auspice won't draw branch from big tree root to subtree struct phyloTree *root = phyloTreeNewNode("wrapper"); phyloAddEdge(root, tree); tree = root; struct auspiceJsonInfo aji = { jw, sti->subtreeUserSampleIds, geneInfoList, gSeqWin, sampleMetadata, sampleUrls, samplePlacements, nodeNum, source }; -rTreeToAuspiceJson(tree, depth, &aji, NULL, NULL, NULL, NULL, NULL, NULL, NULL); +rTreeToAuspiceJson(tree, depth, &aji, isRsv, NULL, NULL, NULL, NULL, NULL, NULL, NULL); jsonWriteObjectEnd(jw); // tree jsonWriteObjectEnd(jw); // top-level object fputs(jw->dy->string, outF); jsonWriteFree(&jw); carefulClose(&outF); }