e14c2761ca01d8175578df87bd5311b5c0442bfb angie Thu Nov 19 13:04:01 2020 -0800 New CGI hgPhyloPlace: place SARS-CoV-2 genomes in fasta or VCF on phylogenetic tree using Yatish Turakhia's UShER program; add subtree custom tracks and Nextstrain linkouts to visualize results. refs #25943 diff --git src/hg/hgPhyloPlace/treeToAuspiceJson.c src/hg/hgPhyloPlace/treeToAuspiceJson.c new file mode 100644 index 0000000..9fb559d --- /dev/null +++ src/hg/hgPhyloPlace/treeToAuspiceJson.c @@ -0,0 +1,555 @@ +/* Convert a (sub)tree with condensed nodes to JSON for Nextstrain to display, adding in sample + * mutations, protein changes and metadata. */ + +/* Copyright (C) 2020 The Regents of the University of California */ + +#include "common.h" +#include "hash.h" +#include "hui.h" +#include "jsonWrite.h" +#include "linefile.h" +#include "parsimonyProto.h" +#include "phyloPlace.h" +#include "phyloTree.h" +#include "variantProjector.h" + +// Globals +extern char *chrom; +extern int chromSize; + +static void writeAuspiceMeta(FILE *outF, struct slName *subtreeUserSampleIds) +/* Write metadata to configure Auspice display. */ +{ +fprintf(outF, + "\"meta\": { " + "\"title\": \"Subtree with %s", subtreeUserSampleIds->name); +struct slName *sln; +for (sln = subtreeUserSampleIds->next; sln != NULL; sln = sln->next) + fprintf(outF, ", %s", sln->name); +fputs("\", " + "\"panels\": [ \"tree\"] , " + "\"colorings\": [ " + " { \"key\": \"pangolin_lineage\", " + " \"title\": \"Pangolin lineage\", \"type\": \"categorical\" }," + " { \"key\": \"GISAID_clade\"," + " \"scale\": [ [ \"S\", \"#EC676D\" ], [ \"L\", \"#F79E43\" ], [ \"O\", \"#F9D136\" ]," + " [ \"V\", \"#FAEA95\" ], [ \"G\", \"#B6D77A\" ], [ \"GH\", \"#8FD4ED\" ]," + " [ \"GR\", \"#A692C3\" ] ]," + " \"title\": \"GISAID Clade\", \"type\": \"categorical\" }," + " { \"key\": \"userOrOld\", " + " \"scale\": [ [ \"uploaded sample\", \"#CC0000\"] , [ \"GISAID sample\", \"#000000\"] ]," + " \"title\": \"Sample type\", \"type\": \"categorical\" }" + " ] , " +//#*** Filters didn't seem to work... maybe something about the new fetch feature, or do I need to spcify in some other way? +//#*** "\"filters\": [ \"GISAID_clade\", \"region\", \"country\", \"division\", \"author\" ], " + "\"display_defaults\": { " + " \"branch_label\": \"nuc mutations\" " + "}, " + , outF); +fprintf(outF, + "\"description\": \"Dataset generated by [UShER web interface]" + "(%shgPhyloPlace) using the " + "[usher](https://github.com/yatisht/usher/) program." +//#*** TODO: describe input from which tree was generated: user sample, version of tree, etc. + , hLocalHostCgiBinUrl()); +fputs("If you have metadata you wish to display, you can now drag on a CSV file and it will be " + "added into this view, [see here]" + "(https://nextstrain.github.io/auspice/advanced-functionality/drag-drop-csv-tsv) " + "for more info.\"} ," + , outF); +} + +struct sampleMetadata +/* Information about a virus sample. */ + { + char *strain; // Strain name, usually of the form Country/ArbitraryId/YYYY-MM-DD + char *epiId; // GISAID EPI_ISL_[0-9]+ ID + char *date; // Sample collection date + char *author; // Author(s) to credit + char *gClade; // GISAID clade + char *lineage; // Pangolin lineage + char *country; // Country in which sample was collected + char *division; // Administrative division in which sample was collected (country or state) + char *location; // Location in which sample was collected (city) + char *countryExp; // Country in which host was exposed to virus + char *divExp; // Administrative division in which host was exposed to virus + char *origLab; // Originating lab + char *subLab; // Submitting lab + char *region; // Continent on which sample was collected + }; + +static void jsonWriteObjectValue(struct jsonWrite *jw, char *name, char *value) +/* Write an object with one member, "value", set to value, as most Auspice node attributes are + * formatted. */ +{ +jsonWriteObjectStart(jw, name); +jsonWriteString(jw, "value", value); +jsonWriteObjectEnd(jw); +} + +static void jsonWriteLeafNodeAttributes(struct jsonWrite *jw, char *db, char *name, + struct sampleMetadata *met, boolean isUserSample, + char **retUserOrOld, char **retGClade, char **retLineage) +/* Write elements of node_attrs for a sample which may be preexisting and in our metadata hash, + * or may be a new sample from the user. Set rets for color categories so parent branches can + * determine their color categories. */ +{ +*retUserOrOld = isUserSample ? "uploaded sample" : "GISAID sample"; +jsonWriteObjectValue(jw, "userOrOld", *retUserOrOld); +if (met && met->date) + jsonWriteObjectValue(jw, "date", met->date); +if (met && met->author) + { + jsonWriteObjectValue(jw, "author", met->author); + // Note: Nextstrain adds paper_url and title when available; they also add author and use + // a uniquified value (e.g. "author": "Wenjie Tan et al" / "value": "Wenjie Tan et al A") + } +*retGClade = isUserSample ? "uploaded sample" : (met && met->gClade) ? met->gClade : NULL; +if (isNotEmpty(*retGClade)) + jsonWriteObjectValue(jw, "GISAID_clade", *retGClade); +*retLineage = isUserSample ? "uploaded sample" : + (met && met->lineage) ? met->lineage : lineageForSample(db, name); +if (isNotEmpty(*retLineage)) + jsonWriteObjectValue(jw, "pangolin_lineage", *retLineage); +if (met && met->epiId) + jsonWriteObjectValue(jw, "gisaid_epi_isl", met->epiId); +if (met && met->country) + jsonWriteObjectValue(jw, "country", met->country); +if (met && met->division) + jsonWriteObjectValue(jw, "division", met->division); +if (met && met->location) + jsonWriteObjectValue(jw, "location", met->location); +if (met && met->countryExp) + jsonWriteObjectValue(jw, "country_exposure", met->countryExp); +if (met && met->divExp) + jsonWriteObjectValue(jw, "division_exposure", met->divExp); +if (met && met->origLab) + jsonWriteObjectValue(jw, "originating_lab", met->origLab); +if (met && met->subLab) + jsonWriteObjectValue(jw, "submitting_lab", met->subLab); +if (met && met->region) + jsonWriteObjectValue(jw, "region", met->region); +} + +static void jsonWriteBranchNodeAttributes(struct jsonWrite *jw, char *userOrOld, char *gClade, + char *lineage) +/* Write elements of node_attrs for a branch. */ +{ +if (userOrOld) + jsonWriteObjectValue(jw, "userOrOld", userOrOld); +if (gClade) + jsonWriteObjectValue(jw, "GISAID_clade", gClade); +if (lineage) + jsonWriteObjectValue(jw, "pangolin_lineage", lineage); +} + +struct geneInfo +/* Information sufficient to determine whether a genome change causes a coding change. */ + { + struct geneInfo *next; + struct psl *psl; // Alignment of transcript to genome + struct dnaSeq *txSeq; // Transcript sequence + }; + +static boolean changesProtein(struct singleNucChange *snc, struct geneInfo *gi, + struct seqWindow *gSeqWin, + int *retAaStart, char *retOldAa, char *retNewAa) +/* If snc changes the coding sequence of gene, return TRUE and set ret values accordingly + * (note amino acid values are single-base not strings). */ +{ +boolean isCodingChange = FALSE; +if (snc->chromStart < gi->psl->tEnd && snc->chromStart >= gi->psl->tStart) + { + struct bed3 gBed3 = { NULL, chrom, snc->chromStart, snc->chromStart+1 }; + char gAlt[2]; + safef(gAlt, sizeof(gAlt), "%c", snc->newBase); + if (!sameString(gi->psl->strand, "+")) + errAbort("changesProtein: only worlds for psl->strand='+', but gene '%s' has strand '%s'", + gi->psl->qName, gi->psl->strand); + struct vpTx *vpTx = vpGenomicToTranscript(gSeqWin, &gBed3, gAlt, gi->psl, gi->txSeq); + if (vpTx->start.region == vpExon) + { + int aaStart = vpTx->start.txOffset / 3; + int codonStart = aaStart * 3; + char newCodon[4]; + safencpy(newCodon, sizeof newCodon, gi->txSeq->dna + codonStart, 3); + int codonOffset = vpTx->start.txOffset - codonStart; + assert(codonOffset < sizeof newCodon); + newCodon[codonOffset] = snc->newBase; + char oldAa = lookupCodon(gi->txSeq->dna + codonStart); + char newAa = lookupCodon(newCodon); + if (newAa != oldAa) + { + isCodingChange = TRUE; + *retAaStart = aaStart; + *retOldAa = oldAa; + *retNewAa = newAa; + } + } + vpTxFree(&vpTx); + } +return isCodingChange; +} + +static struct slPair *getAaMutations(struct singleNucChange *sncList, struct geneInfo *geneInfoList, + struct seqWindow *gSeqWin) +/* Given lists of SNVs and genes, return a list of pairs of { gene name, AA change list }. */ +{ +struct slPair *geneChangeList = NULL; +struct geneInfo *gi; +for (gi = geneInfoList; gi != NULL; gi = gi->next) + { + struct slName *aaChangeList = NULL; + struct singleNucChange *snc; + for (snc = sncList; snc != NULL; snc = snc->next) + { + if (snc->chromStart < gi->psl->tEnd && snc->chromStart >= gi->psl->tStart) + { + int aaStart; + char oldAa, newAa; + if (changesProtein(snc, gi, gSeqWin, &aaStart, &oldAa, &newAa)) + { + char aaChange[32]; + safef(aaChange, sizeof aaChange, "%c%d%c", oldAa, aaStart+1, newAa); + slNameAddHead(&aaChangeList, aaChange); + } + } + } + if (aaChangeList != NULL) + { + slReverse(&aaChangeList); + slAddHead(&geneChangeList, slPairNew(gi->psl->qName, aaChangeList)); + } + } +slReverse(&geneChangeList); +return geneChangeList; +} + +static void jsonWriteBranchAttrs(struct jsonWrite *jw, struct phyloTree *node, + struct geneInfo *geneInfoList, struct seqWindow *gSeqWin) +/* Write mutations (if any). If node is not a leaf, write label for mutations at this node. */ +{ +if (node->priv != NULL) + { + struct singleNucChange *sncList = node->priv; + struct slPair *geneAaMutations = getAaMutations(sncList, geneInfoList, gSeqWin); + jsonWriteObjectStart(jw, "branch_attrs"); + if (node->numEdges > 0) + { + jsonWriteObjectStart(jw, "labels"); + struct singleNucChange *snc = sncList; + struct dyString *dy = dyStringCreate("%c%d%c", + snc->parBase, snc->chromStart+1, snc->newBase); + for (snc = sncList->next; snc != NULL; snc = snc->next) + dyStringPrintf(dy, ",%c%d%c", snc->parBase, snc->chromStart+1, snc->newBase); + jsonWriteString(jw, "nuc mutations", dy->string); + dyStringClear(dy); + struct slPair *geneAaMut; + for (geneAaMut = geneAaMutations; geneAaMut != NULL; geneAaMut = geneAaMut->next) + { + struct slName *aaMut; + for (aaMut = geneAaMut->val; aaMut != NULL; aaMut = aaMut->next) + { + dyStringAppendSep(dy, ","); + dyStringPrintf(dy, "%s:%s", geneAaMut->name, aaMut->name); + } + } + if (isNotEmpty(dy->string)) + jsonWriteString(jw, "aa mutations", dy->string); + dyStringFree(&dy); + jsonWriteObjectEnd(jw); + } + jsonWriteObjectStart(jw, "mutations"); + struct slPair *geneAaMut; + for (geneAaMut = geneAaMutations; geneAaMut != NULL; geneAaMut = geneAaMut->next) + { + jsonWriteListStart(jw, geneAaMut->name); + struct slName *aaMut; + for (aaMut = geneAaMut->val; aaMut != NULL; aaMut = aaMut->next) + jsonWriteString(jw, NULL, aaMut->name); + jsonWriteListEnd(jw); + } + jsonWriteListStart(jw, "nuc"); + struct singleNucChange *snc; + for (snc = sncList; snc != NULL; snc = snc->next) + jsonWriteStringf(jw, NULL, "%c%d%c", snc->parBase, snc->chromStart+1, snc->newBase); + jsonWriteListEnd(jw); + jsonWriteObjectEnd(jw); // mutations + jsonWriteObjectEnd(jw); // branch_attrs + } +} + +struct auspiceJsonInfo +/* Collection of a bunch of things used when writing out auspice JSON for a subtree, so the + * recursive function doesn't need a dozen args. */ + { + struct jsonWrite *jw; + char *db; + struct slName *subtreeUserSampleIds; // Subtree node names for user samples (not from big tree) + struct geneInfo *geneInfoList; // Transcript seq & alignment for predicting AA change + struct seqWindow *gSeqWin; // Reference genome seq for predicting AA change + struct hash *sampleMetadata; // Sample metadata for decorating tree + int nodeNum; // For generating sequential node ID (in absence of name) + }; + +static int cmpstringp(const void *p1, const void *p2) +/* strcmp on pointers to strings, as in 'man qsort' but tolerate NULLs */ +{ +char *s1 = *(char * const *)p1; +char *s2 = *(char * const *)p2; +if (s1 && s2) + return strcmp(s1, s2); +else if (s1 && !s2) + return 1; +else if (s2 && !s1) + return -1; +return 0; +} + +static char *majorityMaybe(char *array[], int arraySize) +/* Sort array and if a majority of its values are the same then return that value; otherwise NULL. */ +{ +if (arraySize < 1) + return NULL; +qsort(array, arraySize, sizeof array[0], cmpstringp); +char *maxRunVal = array[0]; +int runLength = 1, maxRunLength = 1; +int i; +for (i = 1; i < arraySize; i++) + { + if (sameOk(array[i], array[i-1])) + { + runLength++; + if (runLength > maxRunLength) + { + maxRunLength = runLength; + maxRunVal = array[i]; + } + } + else + runLength = 1; + } +return (maxRunLength > (arraySize >> 1)) ? maxRunVal : NULL; +} + +static void rTreeToAuspiceJson(struct phyloTree *node, int depth, struct auspiceJsonInfo *aji, + char **retUserOrOld, char **retGClade, char **retLineage) +/* Write Augur/Auspice V2 JSON for tree. Enclosing object start and end are written by caller. */ +{ +if (node->priv) + { + struct singleNucChange *sncList = node->priv; + depth += slCount(sncList); + } +boolean isUserSample = FALSE; +if (node->ident->name) + isUserSample = slNameInList(aji->subtreeUserSampleIds, node->ident->name); +char *name = node->ident->name; +char *epiId = name ? epiIdFromSampleName(name) : NULL; +struct sampleMetadata *met = NULL; +if (epiId && aji->sampleMetadata) + met = hashFindVal(aji->sampleMetadata, epiId); +if (!isUserSample && met && met->strain) + // Some of Rob's names are outdated; use latest from metadata. + name = met->strain; +if (name) + jsonWriteString(aji->jw, "name", name); +else + jsonWriteStringf(aji->jw, "name", "NODE%d", aji->nodeNum++); +jsonWriteBranchAttrs(aji->jw, node, aji->geneInfoList, aji->gSeqWin); +if (node->numEdges > 0) + { + jsonWriteListStart(aji->jw, "children"); + char *kidUserOrOld[node->numEdges]; + char *kidGClade[node->numEdges]; + char *kidLineage[node->numEdges]; + int i; + for (i = 0; i < node->numEdges; i++) + { + jsonWriteObjectStart(aji->jw, NULL); + rTreeToAuspiceJson(node->edges[i], depth, aji, + &kidUserOrOld[i], &kidGClade[i], &kidLineage[i]); + jsonWriteObjectEnd(aji->jw); + } + jsonWriteListEnd(aji->jw); + if (retUserOrOld) + *retUserOrOld = majorityMaybe(kidUserOrOld, node->numEdges); + if (retGClade) + *retGClade = majorityMaybe(kidGClade, node->numEdges); + if (retLineage) + *retLineage = majorityMaybe(kidLineage, node->numEdges); + } +jsonWriteObjectStart(aji->jw, "node_attrs"); +jsonWriteDouble(aji->jw, "div", depth); +if (node->numEdges == 0) + jsonWriteLeafNodeAttributes(aji->jw, aji->db, name, met, isUserSample, + retUserOrOld, retGClade, retLineage); +else if (retUserOrOld && retGClade && retLineage) + jsonWriteBranchNodeAttributes(aji->jw, *retUserOrOld, *retGClade, *retLineage); +jsonWriteObjectEnd(aji->jw); +} + +struct phyloTree *phyloTreeNewNode(char *name) +/* Alloc & return a new node with no children. */ +{ +struct phyloTree *node; +AllocVar(node); +AllocVar(node->ident); +node->ident->name = cloneString(name); +return node; +} + +struct geneInfo *getGeneInfoList(char *bigGenePredFile, struct dnaSeq *refGenome) +/* If config.ra has a source of gene annotations, then return the gene list. */ +{ +struct geneInfo *geneInfoList = NULL; +if (isNotEmpty(bigGenePredFile) && fileExists(bigGenePredFile)) + { + struct bbiFile *bbi = bigBedFileOpen(bigGenePredFile); + struct lm *lm = lmInit(0); + struct bigBedInterval *bb, *bbList = bigBedIntervalQuery(bbi, chrom, 0, chromSize, 0, lm); + for (bb = bbList; bb != NULL; bb = bb->next) + { + struct genePredExt *gp = genePredFromBigGenePred(chrom, bb); + if (gp->strand[0] != '+') + errAbort("getGeneInfoList: strand must be '+' but got '%s' for gene %s", + gp->strand, gp->name); + int txLen = 0; + int ex; + for (ex = 0; ex < gp->exonCount; ex++) + txLen += (gp->exonEnds[ex] - gp->exonStarts[ex]); + char *seq = needMem(txLen); + int txOffset = 0; + for (ex = 0; ex < gp->exonCount; ex++) + { + int exonLen = gp->exonEnds[ex] - gp->exonStarts[ex]; + safencpy(seq+txOffset, txLen+1-txOffset, refGenome->dna+gp->exonStarts[ex], exonLen); + txOffset += exonLen; + } + struct geneInfo *gi; + AllocVar(gi); + gi->psl = genePredToPsl((struct genePred *)gp, chromSize, txLen); + gi->txSeq = newDnaSeq(seq, txLen, gp->name2); + slAddHead(&geneInfoList, gi); + } + lmCleanup(&lm); + bigBedFileClose(&bbi); + } +slReverse(&geneInfoList); +return geneInfoList; +} + +static struct hash *getSampleMetadata(char *metadataFile) +/* If config.ra defines a metadataFile, load its contents into a hash indexed by EPI ID and return; + * otherwise return NULL. */ +{ +struct hash *sampleMetadata = NULL; +if (isNotEmpty(metadataFile) && fileExists(metadataFile)) + { + sampleMetadata = hashNew(0); + struct lineFile *lf = lineFileOpen(metadataFile, TRUE); + int headerWordCount = 0; + char **headerWords = NULL; + char *line; + // Check for header line + if (lineFileNext(lf, &line, NULL)) + { + if (startsWithWord("strain", line)) + { + char *headerLine = cloneString(line); + headerWordCount = chopString(headerLine, "\t", NULL, 0); + AllocArray(headerWords, headerWordCount); + chopString(headerLine, "\t", headerWords, headerWordCount); + } + else + errAbort("Missing header line from metadataFile %s", metadataFile); + } + int strainIx = stringArrayIx("strain", headerWords, headerWordCount); + int epiIdIx = stringArrayIx("gisaid_epi_isl", headerWords, headerWordCount); + int dateIx = stringArrayIx("date", headerWords, headerWordCount); + int authorIx = stringArrayIx("authors", headerWords, headerWordCount); + int gCladeIx = stringArrayIx("GISAID_clade", headerWords, headerWordCount); + int lineageIx = stringArrayIx("pangolin_lineage", headerWords, headerWordCount); + int countryIx = stringArrayIx("country", headerWords, headerWordCount); + int divisionIx = stringArrayIx("division", headerWords, headerWordCount); + int locationIx = stringArrayIx("location", headerWords, headerWordCount); + int countryExpIx = stringArrayIx("country_exposure", headerWords, headerWordCount); + int divExpIx = stringArrayIx("division_exposure", headerWords, headerWordCount); + int origLabIx = stringArrayIx("originating_lab", headerWords, headerWordCount); + int subLabIx = stringArrayIx("submitting_lab", headerWords, headerWordCount); + int regionIx = stringArrayIx("region", headerWords, headerWordCount); + while (lineFileNext(lf, &line, NULL)) + { + char *words[headerWordCount]; + int wordCount = chopTabs(line, words); + lineFileExpectWords(lf, headerWordCount, wordCount); + struct sampleMetadata *met; + AllocVar(met); + met->strain = cloneString(words[strainIx]); + if (epiIdIx >= 0) + met->epiId = cloneString(words[epiIdIx]); + if (dateIx >= 0) + met->date = cloneString(words[dateIx]); + if (authorIx >= 0) + met->author = cloneString(words[authorIx]); + if (gCladeIx >= 0) + met->gClade = cloneString(words[gCladeIx]); + if (lineageIx >= 0) + met->lineage = cloneString(words[lineageIx]); + if (countryIx >= 0) + met->country = cloneString(words[countryIx]); + if (divisionIx >= 0) + met->division = cloneString(words[divisionIx]); + if (locationIx >= 0) + met->location = cloneString(words[locationIx]); + if (countryExpIx >= 0) + met->countryExp = cloneString(words[countryExpIx]); + if (divExpIx >= 0) + met->divExp = cloneString(words[divExpIx]); + if (origLabIx >= 0) + met->origLab = cloneString(words[origLabIx]); + if (subLabIx >= 0) + met->subLab = cloneString(words[subLabIx]); + if (regionIx >= 0) + met->region = cloneString(words[regionIx]); + hashAdd(sampleMetadata, words[epiIdIx], met); + } + lineFileClose(&lf); + } +return sampleMetadata; +} + +void treeToAuspiceJson(struct subtreeInfo *sti, char *db, struct dnaSeq *ref, + char *bigGenePredFile, char *metadataFile, char *jsonFile) +/* Write JSON for tree in Nextstrain's Augur/Auspice V2 JSON format + * (https://github.com/nextstrain/augur/blob/master/augur/data/schema-export-v2.json). */ +{ +struct phyloTree *tree = sti->subtree; +FILE *outF = mustOpen(jsonFile, "w"); +fputs("{ \"version\": \"v2\", ", outF); +writeAuspiceMeta(outF, sti->subtreeUserSampleIds); +// The meta part is mostly constant & easier to just write out, but jsonWrite is better for the +// nested tree structure. +struct jsonWrite *jw = jsonWriteNew(); +jsonWriteObjectStart(jw, "tree"); +int nodeNum = 10000; // Auspice.us starting node number for newick -> json +int depth = 0; + +// Add an extra root node because otherwise Auspice won't draw branch from big tree root to subtree +struct phyloTree *root = phyloTreeNewNode("wrapper"); +phyloAddEdge(root, tree); +tree = root; +struct geneInfo *geneInfoList = getGeneInfoList(bigGenePredFile, ref); +struct seqWindow *gSeqWin = chromSeqWindowNew(db, chrom, 0, chromSize); +struct hash *sampleMetadata = getSampleMetadata(metadataFile); +struct auspiceJsonInfo aji = { jw, db, sti->subtreeUserSampleIds, geneInfoList, gSeqWin, + sampleMetadata, nodeNum }; +rTreeToAuspiceJson(tree, depth, &aji, NULL, NULL, NULL); +chromSeqWindowFree(&gSeqWin); +jsonWriteObjectEnd(jw); +fputs(jw->dy->string, outF); +jsonWriteFree(&jw); +fputs("}", outF); +carefulClose(&outF); +} +