9c4b7225d91f5180fcddbecd843b71c4e6503688 angie Tue Mar 30 10:39:22 2021 -0700 Add subtree JSON URLs to single-subtree JSON, using Auspice's new url attribute. Allow up to 1000 VCF samples and extend timeout to 15 minutes. diff --git src/hg/hgPhyloPlace/treeToAuspiceJson.c src/hg/hgPhyloPlace/treeToAuspiceJson.c index 6fdd045..a84e777 100644 --- src/hg/hgPhyloPlace/treeToAuspiceJson.c +++ src/hg/hgPhyloPlace/treeToAuspiceJson.c @@ -23,145 +23,157 @@ fprintf(outF, "\"meta\": { " "\"title\": \"Subtree with %s", subtreeUserSampleIds->name); int sampleCount = slCount(subtreeUserSampleIds); if (sampleCount > 10) fprintf(outF, " and %d other uploaded samples", sampleCount - 1); else { struct slName *sln; for (sln = subtreeUserSampleIds->next; sln != NULL; sln = sln->next) fprintf(outF, ", %s", sln->name); } fputs("\", " "\"panels\": [ \"tree\"] , " "\"colorings\": [ " - " { \"key\": \"pangolin_lineage\", " - " \"title\": \"Pangolin lineage\", \"type\": \"categorical\" }," + " { \"key\": \"pango_lineage\", " + " \"title\": \"Pango lineage\", \"type\": \"categorical\" }," " { \"key\": \"Nextstrain_clade\"," " \"scale\": [ [ \"19B\", \"#EC676D\" ], [ \"19A\", \"#F79E43\" ]," " [ \"20A\", \"#B6D77A\" ], [ \"20C\", \"#8FD4ED\" ]," " [ \"20B\", \"#A692C3\" ], [ \"20D\", \"#8020A0\" ]," " [ \"20E (EU1)\", \"#44CC44\" ], [ \"20F\", \"#8822AA\" ]," " [ \"20G\", \"#8888FF\" ], [ \"20H/501Y.V2\", \"#6666FF\" ]," " [ \"20I/501Y.V1\", \"#CC44EE\" ], [ \"20A.EU2\", \"#448844\" ], " " [ \"uploaded sample\", \"#FF0000\" ] ]," " \"title\": \"Nextstrain Clade\", \"type\": \"categorical\" }," , outF); if (sameString(source, "GISAID")) fputs(" { \"key\": \"GISAID_clade\"," " \"scale\": [ [ \"S\", \"#EC676D\" ], [ \"L\", \"#F79E43\" ], [ \"O\", \"#F9D136\" ]," " [ \"V\", \"#FAEA95\" ], [ \"G\", \"#B6D77A\" ], [ \"GH\", \"#8FD4ED\" ]," " [ \"GR\", \"#A692C3\" ] ]," " \"title\": \"GISAID Clade\", \"type\": \"categorical\" }," , outF); fprintf(outF, " { \"key\": \"userOrOld\", " " \"scale\": [ [ \"uploaded sample\", \"#CC0000\"] , [ \"%s\", \"#000000\"] ]," " \"title\": \"Sample type\", \"type\": \"categorical\" }" , source); fputs(" ] , " //#*** Filters didn't seem to work... maybe something about the new fetch feature, or do I need to spcify in some other way? //#*** "\"filters\": [ \"GISAID_clade\", \"region\", \"country\", \"division\", \"author\" ], " + "\"filters\": [ ], " "\"display_defaults\": { " " \"branch_label\": \"none\", " " \"color_by\": \"userOrOld\" " "}, " , outF); fprintf(outF, "\"description\": \"Dataset generated by [UShER web interface]" "(%shgPhyloPlace) using the " "[usher](https://github.com/yatisht/usher/) program. " //#*** TODO: describe input from which tree was generated: user sample, version of tree, etc. , hLocalHostCgiBinUrl()); fputs("If you have metadata you wish to display, you can now drag on a CSV file and it will be " "added into this view, [see here]("NEXTSTRAIN_DRAG_DROP_DOC") " "for more info.\"} ," , outF); } -static void jsonWriteObjectValue(struct jsonWrite *jw, char *name, char *value) -/* Write an object with one member, "value", set to value, as most Auspice node attributes are - * formatted. */ +static void jsonWriteObjectValueUrl(struct jsonWrite *jw, char *name, char *value, char *url) +/* Write an object with member "value" set to value, and if url is non-empty, "url" set to url. */ { jsonWriteObjectStart(jw, name); jsonWriteString(jw, "value", value); +if (isNotEmpty(url)) + jsonWriteString(jw, "url", url); jsonWriteObjectEnd(jw); } +static void jsonWriteObjectValue(struct jsonWrite *jw, char *name, char *value) +/* Write an object with one member, "value", set to value, as most Auspice node attributes are + * formatted. */ +{ +jsonWriteObjectValueUrl(jw, name, value, NULL); +} + static void jsonWriteLeafNodeAttributes(struct jsonWrite *jw, char *name, struct sampleMetadata *met, boolean isUserSample, - char *source, + char *source, struct hash *sampleUrls, char **retUserOrOld, char **retNClade, char **retGClade, char **retLineage) /* Write elements of node_attrs for a sample which may be preexisting and in our metadata hash, * or may be a new sample from the user. Set rets for color categories so parent branches can * determine their color categories. */ { *retUserOrOld = isUserSample ? "uploaded sample" : source; jsonWriteObjectValue(jw, "userOrOld", *retUserOrOld); if (met && met->date) jsonWriteObjectValue(jw, "date", met->date); if (met && met->author) { jsonWriteObjectValue(jw, "author", met->author); // Note: Nextstrain adds paper_url and title when available; they also add author and use // a uniquified value (e.g. "author": "Wenjie Tan et al" / "value": "Wenjie Tan et al A") } *retNClade = isUserSample ? "uploaded sample" : (met && met->nClade) ? met->nClade : NULL; if (isNotEmpty(*retNClade)) jsonWriteObjectValue(jw, "Nextstrain_clade", *retNClade); *retGClade = isUserSample ? "uploaded sample" : (met && met->gClade) ? met->gClade : NULL; if (isNotEmpty(*retGClade)) jsonWriteObjectValue(jw, "GISAID_clade", *retGClade); *retLineage = isUserSample ? "uploaded sample" : (met && met->lineage) ? met->lineage : NULL; if (isNotEmpty(*retLineage)) - jsonWriteObjectValue(jw, "pangolin_lineage", *retLineage); + jsonWriteObjectValue(jw, "pango_lineage", *retLineage); if (met && met->epiId) jsonWriteObjectValue(jw, "gisaid_epi_isl", met->epiId); if (met && met->gbAcc) jsonWriteObjectValue(jw, "genbank_accession", met->gbAcc); if (met && met->country) jsonWriteObjectValue(jw, "country", met->country); if (met && met->division) jsonWriteObjectValue(jw, "division", met->division); if (met && met->location) jsonWriteObjectValue(jw, "location", met->location); if (met && met->countryExp) jsonWriteObjectValue(jw, "country_exposure", met->countryExp); if (met && met->divExp) jsonWriteObjectValue(jw, "division_exposure", met->divExp); if (met && met->origLab) jsonWriteObjectValue(jw, "originating_lab", met->origLab); if (met && met->subLab) jsonWriteObjectValue(jw, "submitting_lab", met->subLab); if (met && met->region) jsonWriteObjectValue(jw, "region", met->region); +char *sampleUrl = (sampleUrls && name) ? hashFindVal(sampleUrls, name) : NULL; +if (isNotEmpty(sampleUrl)) + jsonWriteObjectValueUrl(jw, "subtree", sampleUrl, sampleUrl); } static void jsonWriteBranchNodeAttributes(struct jsonWrite *jw, char *userOrOld, char *nClade, char *gClade, char *lineage) /* Write elements of node_attrs for a branch. */ { if (userOrOld) jsonWriteObjectValue(jw, "userOrOld", userOrOld); if (nClade) jsonWriteObjectValue(jw, "Nextstrain_clade", nClade); if (gClade) jsonWriteObjectValue(jw, "GISAID_clade", gClade); if (lineage) - jsonWriteObjectValue(jw, "pangolin_lineage", lineage); + jsonWriteObjectValue(jw, "pango_lineage", lineage); } static boolean changesProtein(struct singleNucChange *snc, struct geneInfo *gi, struct seqWindow *gSeqWin, int *retAaStart, char *retOldAa, char *retNewAa) /* If snc changes the coding sequence of gene, return TRUE and set ret values accordingly * (note amino acid values are single-base not strings). */ { boolean isCodingChange = FALSE; if (snc->chromStart < gi->psl->tEnd && snc->chromStart >= gi->psl->tStart) { struct bed3 gBed3 = { NULL, chrom, snc->chromStart, snc->chromStart+1 }; char gAlt[2]; safef(gAlt, sizeof(gAlt), "%c", snc->newBase); if (!sameString(gi->psl->strand, "+")) @@ -277,30 +289,31 @@ jsonWriteListEnd(jw); jsonWriteObjectEnd(jw); // mutations jsonWriteObjectEnd(jw); // branch_attrs } } struct auspiceJsonInfo /* Collection of a bunch of things used when writing out auspice JSON for a subtree, so the * recursive function doesn't need a dozen args. */ { struct jsonWrite *jw; struct slName *subtreeUserSampleIds; // Subtree node names for user samples (not from big tree) struct geneInfo *geneInfoList; // Transcript seq & alignment for predicting AA change struct seqWindow *gSeqWin; // Reference genome seq for predicting AA change struct hash *sampleMetadata; // Sample metadata for decorating tree + struct hash *sampleUrls; // URLs for samples, if applicable int nodeNum; // For generating sequential node ID (in absence of name) char *source; // Source of non-user sequences in tree (GISAID or public) }; static int cmpstringp(const void *p1, const void *p2) /* strcmp on pointers to strings, as in 'man qsort' but tolerate NULLs */ { char *s1 = *(char * const *)p1; char *s2 = *(char * const *)p2; if (s1 && s2) return strcmp(s1, s2); else if (s1 && !s2) return 1; else if (s2 && !s1) return -1; @@ -369,31 +382,31 @@ jsonWriteObjectEnd(aji->jw); } jsonWriteListEnd(aji->jw); if (retUserOrOld) *retUserOrOld = majorityMaybe(kidUserOrOld, node->numEdges); if (retNClade) *retNClade = majorityMaybe(kidNClade, node->numEdges); if (retGClade) *retGClade = majorityMaybe(kidGClade, node->numEdges); if (retLineage) *retLineage = majorityMaybe(kidLineage, node->numEdges); } jsonWriteObjectStart(aji->jw, "node_attrs"); jsonWriteDouble(aji->jw, "div", depth); if (node->numEdges == 0) - jsonWriteLeafNodeAttributes(aji->jw, name, met, isUserSample, aji->source, + jsonWriteLeafNodeAttributes(aji->jw, name, met, isUserSample, aji->source, aji->sampleUrls, retUserOrOld, retNClade, retGClade, retLineage); else if (retUserOrOld && retGClade && retLineage) jsonWriteBranchNodeAttributes(aji->jw, *retUserOrOld, *retNClade, *retGClade, *retLineage); jsonWriteObjectEnd(aji->jw); } struct phyloTree *phyloTreeNewNode(char *name) /* Alloc & return a new node with no children. */ { struct phyloTree *node; AllocVar(node); AllocVar(node->ident); node->ident->name = cloneString(name); return node; } @@ -427,45 +440,45 @@ } struct geneInfo *gi; AllocVar(gi); gi->psl = genePredToPsl((struct genePred *)gp, chromSize, txLen); gi->txSeq = newDnaSeq(seq, txLen, gp->name2); slAddHead(&geneInfoList, gi); } lmCleanup(&lm); bigBedFileClose(&bbi); } slReverse(&geneInfoList); return geneInfoList; } void treeToAuspiceJson(struct subtreeInfo *sti, char *db, struct geneInfo *geneInfoList, - struct seqWindow *gSeqWin, struct hash *sampleMetadata, char *jsonFile, - char *source) + struct seqWindow *gSeqWin, struct hash *sampleMetadata, + struct hash *sampleUrls, char *jsonFile, char *source) /* Write JSON for tree in Nextstrain's Augur/Auspice V2 JSON format * (https://github.com/nextstrain/augur/blob/master/augur/data/schema-export-v2.json). */ { struct phyloTree *tree = sti->subtree; FILE *outF = mustOpen(jsonFile, "w"); fputs("{ \"version\": \"v2\", ", outF); writeAuspiceMeta(outF, sti->subtreeUserSampleIds, source); // The meta part is mostly constant & easier to just write out, but jsonWrite is better for the // nested tree structure. struct jsonWrite *jw = jsonWriteNew(); jsonWriteObjectStart(jw, "tree"); int nodeNum = 10000; // Auspice.us starting node number for newick -> json int depth = 0; // Add an extra root node because otherwise Auspice won't draw branch from big tree root to subtree struct phyloTree *root = phyloTreeNewNode("wrapper"); phyloAddEdge(root, tree); tree = root; struct auspiceJsonInfo aji = { jw, sti->subtreeUserSampleIds, geneInfoList, gSeqWin, - sampleMetadata, nodeNum, source }; + sampleMetadata, sampleUrls, nodeNum, source }; rTreeToAuspiceJson(tree, depth, &aji, NULL, NULL, NULL, NULL); jsonWriteObjectEnd(jw); fputs(jw->dy->string, outF); jsonWriteFree(&jw); fputs("}", outF); carefulClose(&outF); }