1813d94a8d9108b9709cb48166b4980d0f02d839 angie Wed Sep 8 12:02:05 2021 -0700 Add coloring options for Nextstrain clade and Pango lineage assigned by usher (really by matUtils annotate) if metadata includes columns for those. diff --git src/hg/hgPhyloPlace/treeToAuspiceJson.c src/hg/hgPhyloPlace/treeToAuspiceJson.c index 9d09250..978e603 100644 --- src/hg/hgPhyloPlace/treeToAuspiceJson.c +++ src/hg/hgPhyloPlace/treeToAuspiceJson.c @@ -48,31 +48,59 @@ " [ \"21D (Eta)\", \"#BEBB48\" ]," " [ \"21E (Theta)\", \"#D9AE3E\" ]," " [ \"21F (Iota)\", \"#E69036\" ]," " [ \"21G (Lambda)\", \"#E35F2D\" ]," " [ \"21H\", \"#DB2823\" ]," " [ \"uploaded sample\", \"#FF0000\" ] ]," " \"title\": \"Nextstrain Clade\", \"type\": \"categorical\" }," , outF); if (sameString(source, "GISAID")) fputs(" { \"key\": \"GISAID_clade\"," " \"scale\": [ [ \"S\", \"#EC676D\" ], [ \"L\", \"#F79E43\" ], [ \"O\", \"#F9D136\" ]," " [ \"V\", \"#FAEA95\" ], [ \"G\", \"#B6D77A\" ], [ \"GH\", \"#8FD4ED\" ]," " [ \"GR\", \"#A692C3\" ] ]," " \"title\": \"GISAID Clade\", \"type\": \"categorical\" }," , outF); -fprintf(outF, " { \"key\": \"userOrOld\", " +fprintf(outF, + " { \"key\": \"pango_lineage_usher\", " + " \"title\": \"Pango lineage assigned by UShER\", \"type\": \"categorical\" }," + " { \"key\": \"Nextstrain_clade_usher\"," + " \"scale\": [ " + " [ \"20H(Beta, V2)\", \"#3F47C9\" ]," + " [ \"20I(Alpha, V1)\", \"#4274CE\" ]," + " [ \"20J(Gamma, V3)\", \"#4F97BB\" ]," + " [ \"21A(Delta)\", \"#64AC99\" ]," + " [ \"21B(Kappa)\", \"#7EB976\" ]," + " [ \"21C(Epsilon)\", \"#9EBE5A\" ]," + " [ \"21D(Eta)\", \"#BEBB48\" ]," + " [ \"21E(Theta)\", \"#D9AE3E\" ]," + " [ \"21F(Iota)\", \"#E69036\" ]," + " [ \"21G(Lambda)\", \"#E35F2D\" ]," + " [ \"20H (Beta, V2)\", \"#3F47C9\" ]," + " [ \"20I (Alpha, V1)\", \"#4274CE\" ]," + " [ \"20J (Gamma, V3)\", \"#4F97BB\" ]," + " [ \"21A (Delta)\", \"#64AC99\" ]," + " [ \"21B (Kappa)\", \"#7EB976\" ]," + " [ \"21C (Epsilon)\", \"#9EBE5A\" ]," + " [ \"21D (Eta)\", \"#BEBB48\" ]," + " [ \"21E (Theta)\", \"#D9AE3E\" ]," + " [ \"21F (Iota)\", \"#E69036\" ]," + " [ \"21G (Lambda)\", \"#E35F2D\" ]," + " [ \"21H\", \"#DB2823\" ]," + " [ \"uploaded sample\", \"#FF0000\" ] ]," + " \"title\": \"Nextstrain Clade assigned by UShER\", \"type\": \"categorical\" }," + " { \"key\": \"userOrOld\", " " \"scale\": [ [ \"uploaded sample\", \"#CC0000\"] , [ \"%s\", \"#000000\"] ]," " \"title\": \"Sample type\", \"type\": \"categorical\" }," " {\"key\": \"gt\", \"title\": \"Genotype\", \"type\": \"categorical\"}," " {\"key\": \"country\", \"title\": \"Country\", \"type\": \"categorical\"}" , source); fputs(" ] , " //#*** Filters didn't seem to work... maybe something about the new fetch feature, or do I need to spcify in some other way? //#*** "\"filters\": [ \"GISAID_clade\", \"region\", \"country\", \"division\", \"author\" ], " "\"filters\": [ ], " "\"genome_annotations\":" "{\"E\":{\"end\":26472,\"start\":26245,\"strand\":\"+\",\"type\":\"CDS\"}," " \"M\":{\"end\":27191,\"start\":26523,\"strand\":\"+\",\"type\":\"CDS\"}," " \"N\":{\"end\":29533,\"start\":28274,\"strand\":\"+\",\"type\":\"CDS\"}," " \"ORF1a\":{\"end\":13468,\"start\":266,\"strand\":\"+\",\"type\":\"CDS\"}," " \"ORF1b\":{\"end\":21555,\"start\":13468,\"strand\":\"+\",\"type\":\"CDS\"}," @@ -106,116 +134,140 @@ { jsonWriteObjectStart(jw, name); jsonWriteString(jw, "value", value); if (isNotEmpty(url)) jsonWriteString(jw, "url", url); jsonWriteObjectEnd(jw); } static void jsonWriteObjectValue(struct jsonWrite *jw, char *name, char *value) /* Write an object with one member, "value", set to value, as most Auspice node attributes are * formatted. */ { jsonWriteObjectValueUrl(jw, name, value, NULL); } +static void makeLineageUrl(char *lineage, char *lineageUrl, size_t lineageUrlSize) +/* If lineage is not "uploaded sample", make an outbreak.info link to it, otherwise just copy + * lineage. */ +{ +if (sameString(lineage, "uploaded sample")) + safecpy(lineageUrl, lineageUrlSize, lineage); +else + safef(lineageUrl, lineageUrlSize, OUTBREAK_INFO_URLBASE "%s", lineage); +} + static void jsonWriteLeafNodeAttributes(struct jsonWrite *jw, char *name, struct sampleMetadata *met, boolean isUserSample, char *source, struct hash *sampleUrls, char **retUserOrOld, char **retNClade, char **retGClade, - char **retLineage) + char **retLineage, + char **retNCladeUsher, char **retLineageUsher) /* Write elements of node_attrs for a sample which may be preexisting and in our metadata hash, * or may be a new sample from the user. Set rets for color categories so parent branches can * determine their color categories. */ { *retUserOrOld = isUserSample ? "uploaded sample" : source; jsonWriteObjectValue(jw, "userOrOld", *retUserOrOld); if (met && met->date) jsonWriteObjectValue(jw, "date", met->date); if (met && met->author) { jsonWriteObjectValue(jw, "author", met->author); // Note: Nextstrain adds paper_url and title when available; they also add author and use // a uniquified value (e.g. "author": "Wenjie Tan et al" / "value": "Wenjie Tan et al A") } *retNClade = isUserSample ? "uploaded sample" : (met && met->nClade) ? met->nClade : NULL; if (isNotEmpty(*retNClade)) jsonWriteObjectValue(jw, "Nextstrain_clade", *retNClade); *retGClade = isUserSample ? "uploaded sample" : (met && met->gClade) ? met->gClade : NULL; if (isNotEmpty(*retGClade)) jsonWriteObjectValue(jw, "GISAID_clade", *retGClade); *retLineage = isUserSample ? "uploaded sample" : (met && met->lineage) ? met->lineage : NULL; if (isNotEmpty(*retLineage)) { char lineageUrl[1024]; - if (sameString(*retLineage, "uploaded sample")) - safecpy(lineageUrl, sizeof lineageUrl, *retLineage); - else - safef(lineageUrl, sizeof lineageUrl, OUTBREAK_INFO_URLBASE "%s", - *retLineage); + makeLineageUrl(*retLineage, lineageUrl, sizeof lineageUrl); jsonWriteObjectValueUrl(jw, "pango_lineage", *retLineage, lineageUrl); } if (met && met->epiId) jsonWriteObjectValue(jw, "gisaid_epi_isl", met->epiId); if (met && met->gbAcc) jsonWriteObjectValue(jw, "genbank_accession", met->gbAcc); if (met && met->country) jsonWriteObjectValue(jw, "country", met->country); if (met && met->division) jsonWriteObjectValue(jw, "division", met->division); if (met && met->location) jsonWriteObjectValue(jw, "location", met->location); if (met && met->countryExp) jsonWriteObjectValue(jw, "country_exposure", met->countryExp); if (met && met->divExp) jsonWriteObjectValue(jw, "division_exposure", met->divExp); if (met && met->origLab) jsonWriteObjectValue(jw, "originating_lab", met->origLab); if (met && met->subLab) jsonWriteObjectValue(jw, "submitting_lab", met->subLab); if (met && met->region) jsonWriteObjectValue(jw, "region", met->region); +*retNCladeUsher = isUserSample ? "uploaded sample" : + (met && met->nCladeUsher) ? met->nCladeUsher : NULL; +if (isNotEmpty(*retNCladeUsher)) + jsonWriteObjectValue(jw, "Nextstrain_clade_usher", *retNCladeUsher); +*retLineageUsher = isUserSample ? "uploaded sample" : + (met && met->lineageUsher) ? met->lineageUsher : NULL; +if (isNotEmpty(*retLineageUsher)) + { + char lineageUrl[1024]; + makeLineageUrl(*retLineageUsher, lineageUrl, sizeof lineageUrl); + jsonWriteObjectValueUrl(jw, "pango_lineage_usher", *retLineageUsher, lineageUrl); + } char *sampleUrl = (sampleUrls && name) ? hashFindVal(sampleUrls, name) : NULL; if (isNotEmpty(sampleUrl)) { char *p = strstr(sampleUrl, "subtreeAuspice"); char *subtreeNum = p + strlen("subtreeAuspice"); if (p && isdigit(*subtreeNum)) { int num = atoi(subtreeNum); char subtreeLabel[1024]; safef(subtreeLabel, sizeof subtreeLabel, "view subtree %d", num); jsonWriteObjectValueUrl(jw, "subtree", subtreeLabel, sampleUrl); } else jsonWriteObjectValueUrl(jw, "subtree", sampleUrl, sampleUrl); } } static void jsonWriteBranchNodeAttributes(struct jsonWrite *jw, char *userOrOld, - char *nClade, char *gClade, char *lineage) + char *nClade, char *gClade, char *lineage, + char *nCladeUsher, char *lineageUsher) /* Write elements of node_attrs for a branch. */ { if (userOrOld) jsonWriteObjectValue(jw, "userOrOld", userOrOld); if (nClade) jsonWriteObjectValue(jw, "Nextstrain_clade", nClade); if (gClade) jsonWriteObjectValue(jw, "GISAID_clade", gClade); if (lineage) jsonWriteObjectValue(jw, "pango_lineage", lineage); +if (nCladeUsher) + jsonWriteObjectValue(jw, "Nextstrain_clade_usher", nCladeUsher); +if (lineageUsher) + jsonWriteObjectValue(jw, "pango_lineage_usher", lineageUsher); } static boolean changesProtein(struct singleNucChange *snc, struct geneInfo *gi, struct seqWindow *gSeqWin, int *retAaStart, char *retOldAa, char *retNewAa) /* If snc changes the coding sequence of gene, return TRUE and set ret values accordingly * (note amino acid values are single-base not strings). */ { boolean isCodingChange = FALSE; if (snc->chromStart < gi->psl->tEnd && snc->chromStart >= gi->psl->tStart) { struct bed3 gBed3 = { NULL, chrom, snc->chromStart, snc->chromStart+1 }; char gAlt[2]; safef(gAlt, sizeof(gAlt), "%c", snc->newBase); if (!sameString(gi->psl->strand, "+")) @@ -411,81 +463,90 @@ runLength++; if (runLength > maxRunLength) { maxRunLength = runLength; maxRunVal = array[i]; } } else runLength = 1; } return (maxRunLength > (arraySize >> 1)) ? maxRunVal : NULL; } static void rTreeToAuspiceJson(struct phyloTree *node, int depth, struct auspiceJsonInfo *aji, char **retUserOrOld, char **retNClade, char **retGClade, - char **retLineage) + char **retLineage, char **retNCladeUsher, char **retLineageUsher) /* Write Augur/Auspice V2 JSON for tree. Enclosing object start and end are written by caller. */ { if (node->priv) { struct singleNucChange *sncList = node->priv; depth += slCount(sncList); } boolean isUserSample = FALSE; if (node->ident->name) isUserSample = slNameInList(aji->subtreeUserSampleIds, node->ident->name); char *name = node->ident->name; struct sampleMetadata *met = name ? metadataForSample(aji->sampleMetadata, name) : NULL; if (name) jsonWriteString(aji->jw, "name", name); else jsonWriteStringf(aji->jw, "name", "NODE%d", aji->nodeNum++); jsonWriteBranchAttrs(aji->jw, node, aji->geneInfoList, aji->gSeqWin); if (node->numEdges > 0) { jsonWriteListStart(aji->jw, "children"); char *kidUserOrOld[node->numEdges]; char *kidNClade[node->numEdges]; char *kidGClade[node->numEdges]; char *kidLineage[node->numEdges]; + char *kidNCladeUsher[node->numEdges]; + char *kidLineageUsher[node->numEdges]; // Step through children in reverse order because nextstrain/Auspice draws upside-down. :) int i; for (i = node->numEdges - 1; i >= 0; i--) { jsonWriteObjectStart(aji->jw, NULL); rTreeToAuspiceJson(node->edges[i], depth, aji, - &kidUserOrOld[i], &kidNClade[i], &kidGClade[i], &kidLineage[i]); + &kidUserOrOld[i], &kidNClade[i], &kidGClade[i], &kidLineage[i], + &kidNCladeUsher[i], &kidLineageUsher[i]); jsonWriteObjectEnd(aji->jw); } jsonWriteListEnd(aji->jw); if (retUserOrOld) *retUserOrOld = majorityMaybe(kidUserOrOld, node->numEdges); if (retNClade) *retNClade = majorityMaybe(kidNClade, node->numEdges); if (retGClade) *retGClade = majorityMaybe(kidGClade, node->numEdges); if (retLineage) *retLineage = majorityMaybe(kidLineage, node->numEdges); + if (retNCladeUsher) + *retNCladeUsher = majorityMaybe(kidNCladeUsher, node->numEdges); + if (retLineageUsher) + *retLineageUsher = majorityMaybe(kidLineageUsher, node->numEdges); } jsonWriteObjectStart(aji->jw, "node_attrs"); jsonWriteDouble(aji->jw, "div", depth); if (node->numEdges == 0) jsonWriteLeafNodeAttributes(aji->jw, name, met, isUserSample, aji->source, aji->sampleUrls, - retUserOrOld, retNClade, retGClade, retLineage); + retUserOrOld, retNClade, retGClade, retLineage, + retNCladeUsher, retLineageUsher); else if (retUserOrOld && retGClade && retLineage) - jsonWriteBranchNodeAttributes(aji->jw, *retUserOrOld, *retNClade, *retGClade, *retLineage); + jsonWriteBranchNodeAttributes(aji->jw, *retUserOrOld, *retNClade, *retGClade, *retLineage, + *retNCladeUsher, *retLineageUsher); jsonWriteObjectEnd(aji->jw); } struct phyloTree *phyloTreeNewNode(char *name) /* Alloc & return a new node with no children. */ { struct phyloTree *node; AllocVar(node); AllocVar(node->ident); node->ident->name = cloneString(name); return node; } struct geneInfo *getGeneInfoList(char *bigGenePredFile, struct dnaSeq *refGenome) /* If config.ra has a source of gene annotations, then return the gene list. */ @@ -538,23 +599,23 @@ fputs("{ \"version\": \"v2\", ", outF); writeAuspiceMeta(outF, sti->subtreeUserSampleIds, source); // The meta part is mostly constant & easier to just write out, but jsonWrite is better for the // nested tree structure. struct jsonWrite *jw = jsonWriteNew(); jsonWriteObjectStart(jw, "tree"); int nodeNum = 10000; // Auspice.us starting node number for newick -> json int depth = 0; // Add an extra root node because otherwise Auspice won't draw branch from big tree root to subtree struct phyloTree *root = phyloTreeNewNode("wrapper"); phyloAddEdge(root, tree); tree = root; struct auspiceJsonInfo aji = { jw, sti->subtreeUserSampleIds, geneInfoList, gSeqWin, sampleMetadata, sampleUrls, nodeNum, source }; -rTreeToAuspiceJson(tree, depth, &aji, NULL, NULL, NULL, NULL); +rTreeToAuspiceJson(tree, depth, &aji, NULL, NULL, NULL, NULL, NULL, NULL); jsonWriteObjectEnd(jw); fputs(jw->dy->string, outF); jsonWriteFree(&jw); fputs("}", outF); carefulClose(&outF); }