7c0d73b5b9eb8303ee04ba236d31a6703552581c angie Wed Dec 2 16:32:15 2020 -0800 hgPhyloPlace metadata: add Nextstrain_clade and genbank_accession columns, refactor in prep for public data. * Move the metadatafile-reading code into phyloPlace.c so metadata file can be used for looking up lineage (no need for separate idToLineage file). * Support looking up metadata by GenBank ID in addition to EPI_ISL ID, in preparation for using protobuf and metadata from public sequences instead of GISAID. diff --git src/hg/hgPhyloPlace/treeToAuspiceJson.c src/hg/hgPhyloPlace/treeToAuspiceJson.c index 34489b5..7aae5eb 100644 --- src/hg/hgPhyloPlace/treeToAuspiceJson.c +++ src/hg/hgPhyloPlace/treeToAuspiceJson.c @@ -19,135 +19,129 @@ static void writeAuspiceMeta(FILE *outF, struct slName *subtreeUserSampleIds) /* Write metadata to configure Auspice display. */ { fprintf(outF, "\"meta\": { " "\"title\": \"Subtree with %s", subtreeUserSampleIds->name); struct slName *sln; for (sln = subtreeUserSampleIds->next; sln != NULL; sln = sln->next) fprintf(outF, ", %s", sln->name); fputs("\", " "\"panels\": [ \"tree\"] , " "\"colorings\": [ " " { \"key\": \"pangolin_lineage\", " " \"title\": \"Pangolin lineage\", \"type\": \"categorical\" }," + " { \"key\": \"Nextstrain_clade\"," + " \"scale\": [ [ \"19B\", \"#EC676D\" ], [ \"19A\", \"#F79E43\" ]," + " [ \"20A\", \"#B6D77A\" ], [ \"20C\", \"#8FD4ED\" ]," + " [ \"20B\", \"#A692C3\" ] ]," + " \"title\": \"Nextstrain Clade\", \"type\": \"categorical\" }," " { \"key\": \"GISAID_clade\"," " \"scale\": [ [ \"S\", \"#EC676D\" ], [ \"L\", \"#F79E43\" ], [ \"O\", \"#F9D136\" ]," " [ \"V\", \"#FAEA95\" ], [ \"G\", \"#B6D77A\" ], [ \"GH\", \"#8FD4ED\" ]," " [ \"GR\", \"#A692C3\" ] ]," " \"title\": \"GISAID Clade\", \"type\": \"categorical\" }," " { \"key\": \"userOrOld\", " " \"scale\": [ [ \"uploaded sample\", \"#CC0000\"] , [ \"GISAID sample\", \"#000000\"] ]," " \"title\": \"Sample type\", \"type\": \"categorical\" }" " ] , " //#*** Filters didn't seem to work... maybe something about the new fetch feature, or do I need to spcify in some other way? //#*** "\"filters\": [ \"GISAID_clade\", \"region\", \"country\", \"division\", \"author\" ], " "\"display_defaults\": { " " \"branch_label\": \"nuc mutations\" " "}, " , outF); fprintf(outF, "\"description\": \"Dataset generated by [UShER web interface]" "(%shgPhyloPlace) using the " "[usher](https://github.com/yatisht/usher/) program. " //#*** TODO: describe input from which tree was generated: user sample, version of tree, etc. , hLocalHostCgiBinUrl()); fputs("If you have metadata you wish to display, you can now drag on a CSV file and it will be " "added into this view, [see here]("NEXTSTRAIN_DRAG_DROP_DOC") " "for more info.\"} ," , outF); } -struct sampleMetadata -/* Information about a virus sample. */ - { - char *strain; // Strain name, usually of the form Country/ArbitraryId/YYYY-MM-DD - char *epiId; // GISAID EPI_ISL_[0-9]+ ID - char *date; // Sample collection date - char *author; // Author(s) to credit - char *gClade; // GISAID clade - char *lineage; // Pangolin lineage - char *country; // Country in which sample was collected - char *division; // Administrative division in which sample was collected (country or state) - char *location; // Location in which sample was collected (city) - char *countryExp; // Country in which host was exposed to virus - char *divExp; // Administrative division in which host was exposed to virus - char *origLab; // Originating lab - char *subLab; // Submitting lab - char *region; // Continent on which sample was collected - }; - static void jsonWriteObjectValue(struct jsonWrite *jw, char *name, char *value) /* Write an object with one member, "value", set to value, as most Auspice node attributes are * formatted. */ { jsonWriteObjectStart(jw, name); jsonWriteString(jw, "value", value); jsonWriteObjectEnd(jw); } -static void jsonWriteLeafNodeAttributes(struct jsonWrite *jw, char *db, char *name, +static void jsonWriteLeafNodeAttributes(struct jsonWrite *jw, char *name, struct sampleMetadata *met, boolean isUserSample, - char **retUserOrOld, char **retGClade, char **retLineage) + char **retUserOrOld, char **retNClade, char **retGClade, + char **retLineage) /* Write elements of node_attrs for a sample which may be preexisting and in our metadata hash, * or may be a new sample from the user. Set rets for color categories so parent branches can * determine their color categories. */ { *retUserOrOld = isUserSample ? "uploaded sample" : "GISAID sample"; jsonWriteObjectValue(jw, "userOrOld", *retUserOrOld); if (met && met->date) jsonWriteObjectValue(jw, "date", met->date); if (met && met->author) { jsonWriteObjectValue(jw, "author", met->author); // Note: Nextstrain adds paper_url and title when available; they also add author and use // a uniquified value (e.g. "author": "Wenjie Tan et al" / "value": "Wenjie Tan et al A") } +*retNClade = isUserSample ? "uploaded sample" : (met && met->nClade) ? met->nClade : NULL; +if (isNotEmpty(*retNClade)) + jsonWriteObjectValue(jw, "Nextstrain_clade", *retNClade); *retGClade = isUserSample ? "uploaded sample" : (met && met->gClade) ? met->gClade : NULL; if (isNotEmpty(*retGClade)) jsonWriteObjectValue(jw, "GISAID_clade", *retGClade); *retLineage = isUserSample ? "uploaded sample" : - (met && met->lineage) ? met->lineage : lineageForSample(db, name); + (met && met->lineage) ? met->lineage : NULL; if (isNotEmpty(*retLineage)) jsonWriteObjectValue(jw, "pangolin_lineage", *retLineage); if (met && met->epiId) jsonWriteObjectValue(jw, "gisaid_epi_isl", met->epiId); +if (met && met->gbAcc) + jsonWriteObjectValue(jw, "genbank_accession", met->gbAcc); if (met && met->country) jsonWriteObjectValue(jw, "country", met->country); if (met && met->division) jsonWriteObjectValue(jw, "division", met->division); if (met && met->location) jsonWriteObjectValue(jw, "location", met->location); if (met && met->countryExp) jsonWriteObjectValue(jw, "country_exposure", met->countryExp); if (met && met->divExp) jsonWriteObjectValue(jw, "division_exposure", met->divExp); if (met && met->origLab) jsonWriteObjectValue(jw, "originating_lab", met->origLab); if (met && met->subLab) jsonWriteObjectValue(jw, "submitting_lab", met->subLab); if (met && met->region) jsonWriteObjectValue(jw, "region", met->region); } -static void jsonWriteBranchNodeAttributes(struct jsonWrite *jw, char *userOrOld, char *gClade, - char *lineage) +static void jsonWriteBranchNodeAttributes(struct jsonWrite *jw, char *userOrOld, + char *nClade, char *gClade, char *lineage) /* Write elements of node_attrs for a branch. */ { if (userOrOld) jsonWriteObjectValue(jw, "userOrOld", userOrOld); +if (nClade) + jsonWriteObjectValue(jw, "Nextstrain_clade", nClade); if (gClade) jsonWriteObjectValue(jw, "GISAID_clade", gClade); if (lineage) jsonWriteObjectValue(jw, "pangolin_lineage", lineage); } struct geneInfo /* Information sufficient to determine whether a genome change causes a coding change. */ { struct geneInfo *next; struct psl *psl; // Alignment of transcript to genome struct dnaSeq *txSeq; // Transcript sequence }; static boolean changesProtein(struct singleNucChange *snc, struct geneInfo *gi, @@ -271,31 +265,30 @@ jsonWriteListStart(jw, "nuc"); struct singleNucChange *snc; for (snc = sncList; snc != NULL; snc = snc->next) jsonWriteStringf(jw, NULL, "%c%d%c", snc->parBase, snc->chromStart+1, snc->newBase); jsonWriteListEnd(jw); jsonWriteObjectEnd(jw); // mutations jsonWriteObjectEnd(jw); // branch_attrs } } struct auspiceJsonInfo /* Collection of a bunch of things used when writing out auspice JSON for a subtree, so the * recursive function doesn't need a dozen args. */ { struct jsonWrite *jw; - char *db; struct slName *subtreeUserSampleIds; // Subtree node names for user samples (not from big tree) struct geneInfo *geneInfoList; // Transcript seq & alignment for predicting AA change struct seqWindow *gSeqWin; // Reference genome seq for predicting AA change struct hash *sampleMetadata; // Sample metadata for decorating tree int nodeNum; // For generating sequential node ID (in absence of name) }; static int cmpstringp(const void *p1, const void *p2) /* strcmp on pointers to strings, as in 'man qsort' but tolerate NULLs */ { char *s1 = *(char * const *)p1; char *s2 = *(char * const *)p2; if (s1 && s2) return strcmp(s1, s2); else if (s1 && !s2) @@ -320,83 +313,84 @@ { runLength++; if (runLength > maxRunLength) { maxRunLength = runLength; maxRunVal = array[i]; } } else runLength = 1; } return (maxRunLength > (arraySize >> 1)) ? maxRunVal : NULL; } static void rTreeToAuspiceJson(struct phyloTree *node, int depth, struct auspiceJsonInfo *aji, - char **retUserOrOld, char **retGClade, char **retLineage) + char **retUserOrOld, char **retNClade, char **retGClade, + char **retLineage) /* Write Augur/Auspice V2 JSON for tree. Enclosing object start and end are written by caller. */ { if (node->priv) { struct singleNucChange *sncList = node->priv; depth += slCount(sncList); } boolean isUserSample = FALSE; if (node->ident->name) isUserSample = slNameInList(aji->subtreeUserSampleIds, node->ident->name); char *name = node->ident->name; -char *epiId = name ? epiIdFromSampleName(name) : NULL; -struct sampleMetadata *met = NULL; -if (epiId && aji->sampleMetadata) - met = hashFindVal(aji->sampleMetadata, epiId); +struct sampleMetadata *met = name ? metadataForSample(aji->sampleMetadata, name) : NULL; if (!isUserSample && met && met->strain) - // Some of Rob's names are outdated; use latest from metadata. + // Some of Rob's tree names are outdated; use latest from metadata. name = met->strain; if (name) jsonWriteString(aji->jw, "name", name); else jsonWriteStringf(aji->jw, "name", "NODE%d", aji->nodeNum++); jsonWriteBranchAttrs(aji->jw, node, aji->geneInfoList, aji->gSeqWin); if (node->numEdges > 0) { jsonWriteListStart(aji->jw, "children"); char *kidUserOrOld[node->numEdges]; + char *kidNClade[node->numEdges]; char *kidGClade[node->numEdges]; char *kidLineage[node->numEdges]; int i; for (i = 0; i < node->numEdges; i++) { jsonWriteObjectStart(aji->jw, NULL); rTreeToAuspiceJson(node->edges[i], depth, aji, - &kidUserOrOld[i], &kidGClade[i], &kidLineage[i]); + &kidUserOrOld[i], &kidNClade[i], &kidGClade[i], &kidLineage[i]); jsonWriteObjectEnd(aji->jw); } jsonWriteListEnd(aji->jw); if (retUserOrOld) *retUserOrOld = majorityMaybe(kidUserOrOld, node->numEdges); + if (retNClade) + *retNClade = majorityMaybe(kidNClade, node->numEdges); if (retGClade) *retGClade = majorityMaybe(kidGClade, node->numEdges); if (retLineage) *retLineage = majorityMaybe(kidLineage, node->numEdges); } jsonWriteObjectStart(aji->jw, "node_attrs"); jsonWriteDouble(aji->jw, "div", depth); if (node->numEdges == 0) - jsonWriteLeafNodeAttributes(aji->jw, aji->db, name, met, isUserSample, - retUserOrOld, retGClade, retLineage); + jsonWriteLeafNodeAttributes(aji->jw, name, met, isUserSample, + retUserOrOld, retNClade, retGClade, retLineage); else if (retUserOrOld && retGClade && retLineage) - jsonWriteBranchNodeAttributes(aji->jw, *retUserOrOld, *retGClade, *retLineage); + jsonWriteBranchNodeAttributes(aji->jw, *retUserOrOld, *retNClade, *retGClade, *retLineage); jsonWriteObjectEnd(aji->jw); } struct phyloTree *phyloTreeNewNode(char *name) /* Alloc & return a new node with no children. */ { struct phyloTree *node; AllocVar(node); AllocVar(node->ident); node->ident->name = cloneString(name); return node; } struct geneInfo *getGeneInfoList(char *bigGenePredFile, struct dnaSeq *refGenome) /* If config.ra has a source of gene annotations, then return the gene list. */ @@ -426,129 +420,48 @@ txOffset += exonLen; } struct geneInfo *gi; AllocVar(gi); gi->psl = genePredToPsl((struct genePred *)gp, chromSize, txLen); gi->txSeq = newDnaSeq(seq, txLen, gp->name2); slAddHead(&geneInfoList, gi); } lmCleanup(&lm); bigBedFileClose(&bbi); } slReverse(&geneInfoList); return geneInfoList; } -static struct hash *getSampleMetadata(char *metadataFile) -/* If config.ra defines a metadataFile, load its contents into a hash indexed by EPI ID and return; - * otherwise return NULL. */ -{ -struct hash *sampleMetadata = NULL; -if (isNotEmpty(metadataFile) && fileExists(metadataFile)) - { - sampleMetadata = hashNew(0); - struct lineFile *lf = lineFileOpen(metadataFile, TRUE); - int headerWordCount = 0; - char **headerWords = NULL; - char *line; - // Check for header line - if (lineFileNext(lf, &line, NULL)) - { - if (startsWithWord("strain", line)) - { - char *headerLine = cloneString(line); - headerWordCount = chopString(headerLine, "\t", NULL, 0); - AllocArray(headerWords, headerWordCount); - chopString(headerLine, "\t", headerWords, headerWordCount); - } - else - errAbort("Missing header line from metadataFile %s", metadataFile); - } - int strainIx = stringArrayIx("strain", headerWords, headerWordCount); - int epiIdIx = stringArrayIx("gisaid_epi_isl", headerWords, headerWordCount); - int dateIx = stringArrayIx("date", headerWords, headerWordCount); - int authorIx = stringArrayIx("authors", headerWords, headerWordCount); - int gCladeIx = stringArrayIx("GISAID_clade", headerWords, headerWordCount); - int lineageIx = stringArrayIx("pangolin_lineage", headerWords, headerWordCount); - int countryIx = stringArrayIx("country", headerWords, headerWordCount); - int divisionIx = stringArrayIx("division", headerWords, headerWordCount); - int locationIx = stringArrayIx("location", headerWords, headerWordCount); - int countryExpIx = stringArrayIx("country_exposure", headerWords, headerWordCount); - int divExpIx = stringArrayIx("division_exposure", headerWords, headerWordCount); - int origLabIx = stringArrayIx("originating_lab", headerWords, headerWordCount); - int subLabIx = stringArrayIx("submitting_lab", headerWords, headerWordCount); - int regionIx = stringArrayIx("region", headerWords, headerWordCount); - while (lineFileNext(lf, &line, NULL)) - { - char *words[headerWordCount]; - int wordCount = chopTabs(line, words); - lineFileExpectWords(lf, headerWordCount, wordCount); - struct sampleMetadata *met; - AllocVar(met); - met->strain = cloneString(words[strainIx]); - if (epiIdIx >= 0) - met->epiId = cloneString(words[epiIdIx]); - if (dateIx >= 0) - met->date = cloneString(words[dateIx]); - if (authorIx >= 0) - met->author = cloneString(words[authorIx]); - if (gCladeIx >= 0) - met->gClade = cloneString(words[gCladeIx]); - if (lineageIx >= 0) - met->lineage = cloneString(words[lineageIx]); - if (countryIx >= 0) - met->country = cloneString(words[countryIx]); - if (divisionIx >= 0) - met->division = cloneString(words[divisionIx]); - if (locationIx >= 0) - met->location = cloneString(words[locationIx]); - if (countryExpIx >= 0) - met->countryExp = cloneString(words[countryExpIx]); - if (divExpIx >= 0) - met->divExp = cloneString(words[divExpIx]); - if (origLabIx >= 0) - met->origLab = cloneString(words[origLabIx]); - if (subLabIx >= 0) - met->subLab = cloneString(words[subLabIx]); - if (regionIx >= 0) - met->region = cloneString(words[regionIx]); - hashAdd(sampleMetadata, words[epiIdIx], met); - } - lineFileClose(&lf); - } -return sampleMetadata; -} - void treeToAuspiceJson(struct subtreeInfo *sti, char *db, struct dnaSeq *ref, - char *bigGenePredFile, char *metadataFile, char *jsonFile) + char *bigGenePredFile, struct hash *sampleMetadata, char *jsonFile) /* Write JSON for tree in Nextstrain's Augur/Auspice V2 JSON format * (https://github.com/nextstrain/augur/blob/master/augur/data/schema-export-v2.json). */ { struct phyloTree *tree = sti->subtree; FILE *outF = mustOpen(jsonFile, "w"); fputs("{ \"version\": \"v2\", ", outF); writeAuspiceMeta(outF, sti->subtreeUserSampleIds); // The meta part is mostly constant & easier to just write out, but jsonWrite is better for the // nested tree structure. struct jsonWrite *jw = jsonWriteNew(); jsonWriteObjectStart(jw, "tree"); int nodeNum = 10000; // Auspice.us starting node number for newick -> json int depth = 0; // Add an extra root node because otherwise Auspice won't draw branch from big tree root to subtree struct phyloTree *root = phyloTreeNewNode("wrapper"); phyloAddEdge(root, tree); tree = root; struct geneInfo *geneInfoList = getGeneInfoList(bigGenePredFile, ref); struct seqWindow *gSeqWin = chromSeqWindowNew(db, chrom, 0, chromSize); -struct hash *sampleMetadata = getSampleMetadata(metadataFile); -struct auspiceJsonInfo aji = { jw, db, sti->subtreeUserSampleIds, geneInfoList, gSeqWin, +struct auspiceJsonInfo aji = { jw, sti->subtreeUserSampleIds, geneInfoList, gSeqWin, sampleMetadata, nodeNum }; -rTreeToAuspiceJson(tree, depth, &aji, NULL, NULL, NULL); +rTreeToAuspiceJson(tree, depth, &aji, NULL, NULL, NULL, NULL); chromSeqWindowFree(&gSeqWin); jsonWriteObjectEnd(jw); fputs(jw->dy->string, outF); jsonWriteFree(&jw); fputs("}", outF); carefulClose(&outF); }