66a1a82f72f9f3295bb4007b1b5681013976a694 angie Wed Mar 20 15:20:27 2024 -0700 Overhaul struct sampleMetadata: instead of one struct member per anticipated column, use an array of column values and a shared array of column names. As we support more species, the available metadata gets more divergent so this needs to be more general. We still need to make aggregation of attributes on branches more generic / config-driven. diff --git src/hg/hgPhyloPlace/treeToAuspiceJson.c src/hg/hgPhyloPlace/treeToAuspiceJson.c index 1b2d661..613fc83 100644 --- src/hg/hgPhyloPlace/treeToAuspiceJson.c +++ src/hg/hgPhyloPlace/treeToAuspiceJson.c @@ -1,928 +1,920 @@ /* Convert a (sub)tree with condensed nodes to JSON for Nextstrain to display, adding in sample * mutations, protein changes and metadata. */ /* Copyright (C) 2020 The Regents of the University of California */ #include "common.h" #include "dnaseq.h" #include "errCatch.h" #include "hash.h" #include "hui.h" #include "jsonWrite.h" #include "linefile.h" #include "obscure.h" #include "parsimonyProto.h" #include "phyloPlace.h" #include "phyloTree.h" #include "variantProjector.h" static void auspiceMetaColoringCategoricalStart(struct jsonWrite *jw, char *key, char *title) /* Write key, title and type of a "categorical" coloring spec, but leave it open in case a * scale list needs to be added. */ { jsonWriteObjectStart(jw, NULL); jsonWriteString(jw, "key", key); jsonWriteString(jw, "title", title); jsonWriteString(jw, "type", "categorical"); } static void auspiceMetaColoringCategoricalEnd(struct jsonWrite *jw) /* Close out a coloring spec that was opened with auspiceMetaColoringCategoricalStart. */ { jsonWriteObjectEnd(jw); } static void auspiceMetaColoringCategorical(struct jsonWrite *jw, char *key, char *title) /* Write a "categorical" coloring spec with no scale component. */ { auspiceMetaColoringCategoricalStart(jw, key, title); auspiceMetaColoringCategoricalEnd(jw); } static void jsonWritePair(struct jsonWrite *jw, char *valA, char *valB) /* Write a list with two string values. */ { jsonWriteListStart(jw, NULL); jsonWriteString(jw, NULL, valA); jsonWriteString(jw, NULL, valB); jsonWriteListEnd(jw); } static void jsonWriteAppendData(struct jsonWrite *jw, char *data) /* Append data to jw->dy, trusting that it will maintain well-formed JSON results. */ { dyStringAppend(jw->dy, data); } static void auspiceMetaColoringSarsCov2Nextclade(struct jsonWrite *jw, char *key, char *title) /* Write a coloring spec for SARS-CoV-2 Nextstrain clades. (Non-VoC clades are omitted and * will be assigned grayscale values by Auspice. */ { auspiceMetaColoringCategoricalStart(jw, key, title); jsonWriteListStart(jw, "scale"); // Color hex values are words from line N of // https://github.com/nextstrain/ncov/blob/master/defaults/color_schemes.tsv // where N = number of clade_membership lines in // https://github.com/nextstrain/ncov/blob/master/defaults/color_ordering.tsv jsonWritePair(jw, "20H (Beta, V2)", "#5E1D9D"); jsonWritePair(jw, "20I (Alpha, V1)", "#492AB5"); jsonWritePair(jw, "20J (Gamma, V3)", "#4042C7"); jsonWritePair(jw, "21A (Delta)", "#3E5DD0"); jsonWritePair(jw, "21I (Delta)", "#4377CD"); jsonWritePair(jw, "21J (Delta)", "#4A8CC2"); jsonWritePair(jw, "21B (Kappa)", "#549DB2"); jsonWritePair(jw, "21C (Epsilon)", "#60AA9E"); jsonWritePair(jw, "21D (Eta)", "#6EB389"); jsonWritePair(jw, "21E (Theta)", "#80B974"); jsonWritePair(jw, "21F (Iota)", "#92BC63"); jsonWritePair(jw, "21G (Lambda)", "#A6BE55"); jsonWritePair(jw, "21H (Mu)", "#B9BC4A"); jsonWritePair(jw, "21K (Omicron)", "#CBB742"); jsonWritePair(jw, "21L (Omicron)", "#D9AD3D"); jsonWritePair(jw, "21M (Omicron)", "#E29D39"); jsonWritePair(jw, "22A (Omicron)", "#E68634"); jsonWritePair(jw, "22B (Omicron)", "#E56A2F"); jsonWritePair(jw, "22C (Omicron)", "#E04929"); jsonWritePair(jw, "22D (Omicron)", "#DB2823"); jsonWritePair(jw, "uploaded sample", "#000000"); jsonWriteListEnd(jw); auspiceMetaColoringCategoricalEnd(jw); } static void writeAuspiceMetaGenomeAnnotations(struct jsonWrite *jw, struct geneInfo *geneInfoList, uint genomeSize) /* Write out auspice genome annotations (protein-coding gene CDS and "nuc"). */ { jsonWriteObjectStart(jw, "genome_annotations"); struct geneInfo *gi; for (gi = geneInfoList; gi != NULL; gi = gi->next) { jsonWriteObjectStart(jw, gi->psl->qName); if (gi->psl->blockCount > 1) { jsonWriteListStart(jw, "segments"); int i; for (i = 0; i < gi->psl->blockCount; i++) { int cStart = gi->psl->tStarts[i]; int cEnd = gi->psl->tStarts[i] + gi->psl->blockSizes[i]; if (cEnd < gi->cdsStart) continue; if (cStart > gi->cdsEnd) break; if (cStart < gi->cdsStart) cStart = gi->cdsStart; if (cEnd > gi->cdsEnd) cEnd = gi->cdsEnd; jsonWriteObjectStart(jw, NULL); jsonWriteNumber(jw, "start", cStart+1); jsonWriteNumber(jw, "end", cEnd); jsonWriteObjectEnd(jw); } jsonWriteListEnd(jw); } else { jsonWriteNumber(jw, "start", gi->cdsStart+1); jsonWriteNumber(jw, "end", gi->cdsEnd); } jsonWriteString(jw, "strand", (pslOrientation(gi->psl) > 0) ? "+" : "-"); jsonWriteString(jw, "type", "CDS"); jsonWriteObjectEnd(jw); } jsonWriteObjectStart(jw, "nuc"); jsonWriteNumber(jw, "start", 1); jsonWriteNumber(jw, "end", genomeSize); jsonWriteString(jw, "strand", "+"); jsonWriteString(jw, "type", "source"); jsonWriteObjectEnd(jw); jsonWriteObjectEnd(jw); } //#*** TODO: in v459 or later, remove the organism-specific code. static struct slName *getColorFields(char *db, boolean isRsv, boolean isFlu) { struct slName *colorFields = NULL; if (sameString(db, "wuhCor1")) { slNameAddHead(&colorFields, "country"); slNameAddHead(&colorFields, "Nextstrain_clade_usher"); slNameAddHead(&colorFields, "pango_lineage_usher"); slNameAddHead(&colorFields, "Nextstrain_clade"); slNameAddHead(&colorFields, "pango_lineage"); } else if (isRsv) { slNameAddHead(&colorFields, "country"); slNameAddHead(&colorFields, "GCC_nextclade"); slNameAddHead(&colorFields, "GCC_usher"); slNameAddHead(&colorFields, "GCC_assigned_2023-11"); slNameAddHead(&colorFields, "goya_nextclade"); slNameAddHead(&colorFields, "goya_usher"); } else if (isFlu) { slNameAddHead(&colorFields, "country"); slNameAddHead(&colorFields, "Nextstrain_clade"); } else { slNameAddHead(&colorFields, "country"); slNameAddHead(&colorFields, "Nextstrain_lineage"); } return colorFields; } //#*** TODO: in v459 or later, remove the organism-specific stuff static char *getDefaultColor(struct slName *colorFields) /* Pick default color from available color fields from metadata. Do not free returned string. */ { char *colorDefault = NULL; if (slNameInList(colorFields, "pango_lineage_usher")) colorDefault = "pango_lineage_usher"; else if (slNameInList(colorFields, "Nextstrain_lineage")) colorDefault = "Nextstrain_lineage"; else if (slNameInList(colorFields, "Nextstrain_clade")) colorDefault = "Nextstrain_clade"; else if (slNameInList(colorFields, "GCC_usher")) colorDefault = "GCC_usher"; else if (colorFields != NULL) colorDefault = colorFields->name; else colorDefault = "userOrOld"; return colorDefault; } //#*** TODO: in v459 or later, remove the organism-specific stuff static void auspiceMetaColorings(struct jsonWrite *jw, char *source, struct slName *colorFields, char *db) /* Write coloring specs for colorFields from metadata, locally added userOrOld, and * Auspice-automatic gt. */ { jsonWriteListStart(jw, "colorings"); auspiceMetaColoringCategoricalStart(jw, "userOrOld", "Sample type"); jsonWriteListStart(jw, "scale"); jsonWritePair(jw, "uploaded sample", "#CC0000"); jsonWritePair(jw, source, "#000000"); jsonWriteListEnd(jw); auspiceMetaColoringCategoricalEnd(jw); auspiceMetaColoringCategorical(jw, "gt", "Genotype"); struct slName *col; for (col = colorFields; col != NULL; col = col->next) { if (sameString(col->name, "Nextstrain_clade")) { if (sameString(db, "wuhCor1")) auspiceMetaColoringSarsCov2Nextclade(jw, col->name, "Nextstrain Clade"); else auspiceMetaColoringCategorical(jw, col->name, "Clade assigned by nextclade"); } else if (sameString(col->name, "Nextstrain_clade_usher")) auspiceMetaColoringSarsCov2Nextclade(jw, col->name, "Nextstrain Clade assigned by UShER"); else if (sameString(col->name, "pango_lineage")) auspiceMetaColoringCategorical(jw, col->name, "Pango lineage"); else if (sameString(col->name, "pango_lineage_usher")) auspiceMetaColoringCategorical(jw, col->name, "Pango lineage assigned by UShER"); else if (sameString(col->name, "Nextstrain_lineage")) auspiceMetaColoringCategorical(jw, col->name, "Nextstrain lineage"); //#*** RSV hacks -- colorings really should come from JSON file in config directory else if (sameString(col->name, "goya_nextclade")) auspiceMetaColoringCategorical(jw, col->name, "Goya 2020 clade assigned by nextclade"); else if (sameString(col->name, "goya_usher")) auspiceMetaColoringCategorical(jw, col->name, "Goya 2020 clade assigned by UShER"); else if (sameString(col->name, "GCC_nextclade")) auspiceMetaColoringCategorical(jw, col->name, "RGCC lineage assigned by nextclade"); else if (sameString(col->name, "GCC_usher")) auspiceMetaColoringCategorical(jw, col->name, "RGCC lineage assigned by UShER"); else if (sameString(col->name, "GCC_assigned_2023-11")) auspiceMetaColoringCategorical(jw, col->name, "RGCC designated lineage"); else if (sameString(col->name, "country")) auspiceMetaColoringCategorical(jw, col->name, "Country"); else auspiceMetaColoringCategorical(jw, col->name, col->name); } jsonWriteListEnd(jw); } static void writeAuspiceMeta(struct jsonWrite *jw, struct slName *subtreeUserSampleIds, char *source, char *db, struct geneInfo *geneInfoList, uint genomeSize, boolean isRsv, boolean isFlu) /* Write metadata to configure Auspice display. */ { jsonWriteObjectStart(jw, "meta"); // Title struct dyString *dy = dyStringCreate("Subtree with %s", subtreeUserSampleIds->name); int sampleCount = slCount(subtreeUserSampleIds); if (sampleCount > 10) dyStringPrintf(dy, " and %d other uploaded samples", sampleCount - 1); else { struct slName *sln; for (sln = subtreeUserSampleIds->next; sln != NULL; sln = sln->next) dyStringPrintf(dy, ", %s", sln->name); } jsonWriteString(jw, "title", dy->string); // Description jsonWriteStringf(jw, "description", "Dataset generated by [UShER web interface]" "(%shgPhyloPlace) using the " "[usher](https://github.com/yatisht/usher/) program. " //#*** TODO: describe input from which tree was generated: user sample, version of tree, etc. "If you have metadata you wish to display, you can now drag on a CSV file and " "it will be added into this view, [see here]("NEXTSTRAIN_DRAG_DROP_DOC") " "for more info." , hLocalHostCgiBinUrl()); // Panels: just the tree and entropy (no map) jsonWriteListStart(jw, "panels"); jsonWriteString(jw, NULL, "tree"); jsonWriteString(jw, NULL, "entropy"); jsonWriteListEnd(jw); char *metaJsonFile = phyloPlaceDbSettingPath(db, "auspiceMeta"); if (isNotEmpty(metaJsonFile) && fileExists(metaJsonFile)) { char *metaJson = NULL; size_t size = 0; readInGulp(metaJsonFile, &metaJson, &size); while (size > 0 && metaJson[size-1] == '\n') metaJson[--size] = '\0'; jsonWriteAppendData(jw, ", "); jsonWriteAppendData(jw, metaJson); freeMem(metaJson); } else { // Default label & color struct slName *colorFields = getColorFields(db, isRsv, isFlu); jsonWriteObjectStart(jw, "display_defaults"); jsonWriteString(jw, "branch_label", "aa mutations"); jsonWriteString(jw, "color_by", getDefaultColor(colorFields)); jsonWriteObjectEnd(jw); // Colorings: userOrOld, gt and whatever we got from metadata auspiceMetaColorings(jw, source, colorFields, db); // Filters jsonWriteListStart(jw, "filters"); jsonWriteString(jw, NULL, "userOrOld"); jsonWriteString(jw, NULL, "country"); //#*** TODO: in v459 or later, remove the organism-specific stuff if (sameString(db, "wuhCor1")) { jsonWriteString(jw, NULL, "pango_lineage_usher"); jsonWriteString(jw, NULL, "pango_lineage"); jsonWriteString(jw, NULL, "Nextstrain_clade_usher"); jsonWriteString(jw, NULL, "Nextstrain_clade"); } else if (isRsv) { jsonWriteString(jw, NULL, "GCC_usher"); jsonWriteString(jw, NULL, "GCC_nextclade"); jsonWriteString(jw, NULL, "GCC_assigned_2023-11"); jsonWriteString(jw, NULL, "goya_usher"); jsonWriteString(jw, NULL, "goya_nextclade"); } else if (isFlu) { jsonWriteString(jw, NULL, "Nextstrain_clade"); } else { jsonWriteString(jw, NULL, "Nextstrain_lineage"); } jsonWriteListEnd(jw); } // Annotations for coloring/filtering by base writeAuspiceMetaGenomeAnnotations(jw, geneInfoList, genomeSize); jsonWriteObjectEnd(jw); } static void jsonWriteObjectValueUrl(struct jsonWrite *jw, char *name, char *value, char *url) /* Write an object with member "value" set to value, and if url is non-empty, "url" set to url. */ { jsonWriteObjectStart(jw, name); jsonWriteString(jw, "value", value); if (isNotEmpty(url)) jsonWriteString(jw, "url", url); jsonWriteObjectEnd(jw); } static void jsonWriteObjectValue(struct jsonWrite *jw, char *name, char *value) /* Write an object with one member, "value", set to value, as most Auspice node attributes are * formatted. */ { jsonWriteObjectValueUrl(jw, name, value, NULL); } static void makeLineageUrl(char *lineage, char *lineageUrl, size_t lineageUrlSize) /* If lineage is not "uploaded sample", make an outbreak.info link to it, otherwise just copy * lineage. */ { if (sameString(lineage, "uploaded sample")) safecpy(lineageUrl, lineageUrlSize, lineage); else safef(lineageUrl, lineageUrlSize, OUTBREAK_INFO_URLBASE "%s", lineage); } static void jsonWriteLeafNodeAttributes(struct jsonWrite *jw, char *name, struct sampleMetadata *met, boolean isUserSample, char *source, struct hash *sampleUrls, struct hash *samplePlacements, boolean isRsv, char **retUserOrOld, char **retNClade, char **retGClade, char **retLineage, char **retNLineage, char **retNCladeUsher, char **retLineageUsher) /* Write elements of node_attrs for a sample which may be preexisting and in our metadata hash, * or may be a new sample from the user. Set rets for color categories so parent branches can * determine their color categories. */ { *retUserOrOld = isUserSample ? "uploaded sample" : source; jsonWriteObjectValue(jw, "userOrOld", *retUserOrOld); -if (met && met->date) - jsonWriteObjectValue(jw, "date", met->date); -if (met && met->author) +*retNClade = *retGClade = *retLineage = *retNLineage = *retNCladeUsher = *retLineageUsher = ""; +if (met != NULL) { - jsonWriteObjectValue(jw, "author", met->author); - // Note: Nextstrain adds paper_url and title when available; they also add author and use - // a uniquified value (e.g. "author": "Wenjie Tan et al" / "value": "Wenjie Tan et al A") - } -struct placementInfo *pi = (isUserSample && name) ? hashFindVal(samplePlacements, name) : NULL; - -*retNClade = (met && met->nClade) ? met->nClade : isUserSample ? "uploaded sample" : NULL; -if (isNotEmpty(*retNClade)) - jsonWriteObjectValue(jw, (isRsv ? "goya_nextclade" : "Nextstrain_clade"), *retNClade); -*retGClade = (met && met->gClade) ? met->gClade : isUserSample ? "uploaded sample" : NULL; -if (isNotEmpty(*retGClade)) - jsonWriteObjectValue(jw, (isRsv ? "GCC_assigned_2023-11" : "GISAID_clade"), *retGClade); -*retLineage = (met && met->lineage) ? met->lineage : isUserSample ? "uploaded sample" : NULL; -if (isNotEmpty(*retLineage)) + int i; + for (i = 0; i < met->columnCount; i++) + { + char *colName = met->columnNames[i]; + if (sameString(colName, "pangolin_lineage")) + { + colName = "pango_lineage"; + if (isNotEmpty(met->columnValues[i])) { char lineageUrl[1024]; - makeLineageUrl(*retLineage, lineageUrl, sizeof lineageUrl); - jsonWriteObjectValueUrl(jw, (isRsv ? "GCC_nextclade" : "pango_lineage"), - *retLineage, lineageUrl); - } -*retNLineage = (met && met->nLineage) ? met->nLineage : isUserSample ? "uploaded sample" : NULL; -if (isNotEmpty(*retNLineage)) - { - jsonWriteObjectValue(jw, "Nextstrain_lineage", *retNLineage); - } -if (met && met->epiId) - jsonWriteObjectValue(jw, "gisaid_epi_isl", met->epiId); -if (met && met->gbAcc) - jsonWriteObjectValue(jw, "genbank_accession", met->gbAcc); -if (met && met->country) - jsonWriteObjectValue(jw, "country", met->country); -if (met && met->division) - jsonWriteObjectValue(jw, "division", met->division); -if (met && met->location) - jsonWriteObjectValue(jw, "location", met->location); -if (met && met->countryExp) - jsonWriteObjectValue(jw, "country_exposure", met->countryExp); -if (met && met->divExp) - jsonWriteObjectValue(jw, "division_exposure", met->divExp); -if (met && met->origLab) - jsonWriteObjectValue(jw, "originating_lab", met->origLab); -if (met && met->subLab) - jsonWriteObjectValue(jw, "submitting_lab", met->subLab); -if (met && met->region) - jsonWriteObjectValue(jw, "region", met->region); -*retNCladeUsher = (pi && pi->nextClade) ? pi->nextClade : - (met && met->nCladeUsher) ? met->nCladeUsher : - isUserSample ? "uploaded sample" : NULL; -if (isNotEmpty(*retNCladeUsher)) + makeLineageUrl(met->columnValues[i], lineageUrl, sizeof lineageUrl); + jsonWriteObjectValueUrl(jw, colName, met->columnValues[i], lineageUrl); + } + else if (isNotEmpty(met->columnValues[i])) + jsonWriteObjectValue(jw, colName, met->columnValues[i]); + } + else + jsonWriteObjectValue(jw, colName, met->columnValues[i]); + // Some columns get passed up for aggregation so we can color internal nodes/branches. + if (sameString(colName, "Nextstrain_clade") || sameString(colName, "goya_nextclade")) + *retNClade = met->columnValues[i]; + else if (sameString(colName, "GISAID_clade") || sameString(colName, "GCC_assigned_2023-11")) + *retGClade = met->columnValues[i]; + else if (sameString(colName, "pango_lineage") || sameString(colName, "GCC_nextclade")) + *retLineage = met->columnValues[i]; + else if (sameString(colName, "Nextstrain_clade_usher") || sameString(colName, "goya_usher")) + *retNCladeUsher = met->columnValues[i]; + else if (sameString(colName, "pango_lineage_usher") || sameString(colName, "GCC_usher")) + *retLineageUsher = met->columnValues[i]; + } + } +else if (isUserSample) + { + struct placementInfo *pi = name ? hashFindVal(samplePlacements, name) : NULL; + //#*** Really need to know what columns are present in the absence of met, so we can avoid + //#*** writing objects that shouldn't be there for this org. + *retNClade = *retGClade = *retLineage = *retNLineage = "uploaded sample"; + jsonWriteObjectValue(jw, isRsv ? "goya_nextclade" : "Nextstrain_clade", "uploaded sample"); + jsonWriteObjectValue(jw, isRsv ? "GCC_assigned_2023-11" : "GISAID_clade", "uploaded sample"); + jsonWriteObjectValue(jw, isRsv ? "GCC_nextclade" : "pango_lineage", "uploaded sample"); + jsonWriteObjectValue(jw, "Nextstrain_lineage", "uploaded sample"); + *retNCladeUsher = (pi && pi->nextClade) ? pi->nextClade : "uploaded sample"; jsonWriteObjectValue(jw, (isRsv ? "goya_usher" : "Nextstrain_clade_usher"), *retNCladeUsher); -*retLineageUsher = (pi && pi->pangoLineage) ? pi->pangoLineage : - (met && met->lineageUsher) ? met->lineageUsher : - isUserSample ? "uploaded sample" : NULL; -if (isNotEmpty(*retLineageUsher)) + *retLineageUsher = (pi && pi->pangoLineage) ? pi->pangoLineage : "uploaded sample"; + if (isRsv) + jsonWriteObjectValue(jw, "GCC_usher", *retLineageUsher); + else { char lineageUrl[1024]; makeLineageUrl(*retLineageUsher, lineageUrl, sizeof lineageUrl); - jsonWriteObjectValueUrl(jw, (isRsv ? "GCC_usher" : "pango_lineage_usher"), - *retLineageUsher, lineageUrl); + jsonWriteObjectValueUrl(jw, "pango_lineage_usher", *retLineageUsher, lineageUrl); + } } char *sampleUrl = (sampleUrls && name) ? hashFindVal(sampleUrls, name) : NULL; if (isNotEmpty(sampleUrl)) { char *p = strstr(sampleUrl, "subtreeAuspice"); char *subtreeNum = p + strlen("subtreeAuspice"); if (p && isdigit(*subtreeNum)) { int num = atoi(subtreeNum); char subtreeLabel[1024]; safef(subtreeLabel, sizeof subtreeLabel, "view subtree %d", num); jsonWriteObjectValueUrl(jw, "subtree", subtreeLabel, sampleUrl); } else jsonWriteObjectValueUrl(jw, "subtree", sampleUrl, sampleUrl); } } static void jsonWriteBranchNodeAttributes(struct jsonWrite *jw, boolean isRsv, char *userOrOld, char *nClade, char *gClade, char *lineage, char *nLineage, char *nCladeUsher, char *lineageUsher) /* Write elements of node_attrs for a branch. */ { if (userOrOld) jsonWriteObjectValue(jw, "userOrOld", userOrOld); if (nClade) jsonWriteObjectValue(jw, (isRsv ? "goya_nextclade" : "Nextstrain_clade"), nClade); if (gClade) jsonWriteObjectValue(jw, (isRsv ? "GCC_assigned_2023-11" : "GISAID_clade"), gClade); if (lineage) jsonWriteObjectValue(jw, (isRsv ? "GCC_nextclade" : "pango_lineage"), lineage); if (nLineage) jsonWriteObjectValue(jw, "Nextstrain_lineage", lineage); if (nCladeUsher) jsonWriteObjectValue(jw, (isRsv ? "goya_usher" : "Nextstrain_clade_usher"), nCladeUsher); if (lineageUsher) jsonWriteObjectValue(jw, (isRsv ? "GCC_usher" : "pango_lineage_usher"), lineageUsher); } INLINE char maybeComplement(char base, struct psl *psl) /* If psl is on '+' strand, return base, otherwise return the complement of base. */ { return (pslOrientation(psl) > 0) ? base : ntCompTable[(int)base]; } static struct slName *codonVpTxToAaChange(struct vpTx *codonVpTxList, struct singleNucChange *ancestorMuts, struct geneInfo *gi) /* Given a list of vpTx from the same codon, combine their changes with inherited mutations * in the same codon to get the amino acid change at this node. * Note: this assumes there is no UTR in transcript, only CDS. True so far for pathogens... */ { struct slName *aaChange = NULL; if (codonVpTxList != NULL) { struct vpTx *vpTx = codonVpTxList; int firstAaStart = (vpTx->start.txOffset - gi->cds->start) / 3; int codonStart = firstAaStart * 3; int codonOffset = vpTx->start.txOffset - gi->cds->start - codonStart; char oldCodon[4]; safencpy(oldCodon, sizeof oldCodon, gi->txSeq->dna + gi->cds->start + codonStart, 3); touppers(oldCodon); boolean isRc = (pslOrientation(gi->psl) < 0); int codonStartG = isRc ? vpTx->start.gOffset + codonOffset : vpTx->start.gOffset - codonOffset; struct singleNucChange *anc; for (anc = ancestorMuts; anc != NULL; anc = anc->next) { int ancCodonOffset = isRc ? codonStartG - (anc->chromStart + 1) : anc->chromStart - codonStartG; if (ancCodonOffset >= 0 && ancCodonOffset < 3) { char parBase = isRc ? ntCompTable[(int)anc->parBase] : anc->parBase; if (parBase != oldCodon[ancCodonOffset]) errAbort("codonVpTxToAaChange: expected parBase for ancestral mutation at %d " "(%s codon %d offset %d) to be '%c' but it's '%c'%s", anc->chromStart+1, gi->psl->qName, firstAaStart, ancCodonOffset, oldCodon[ancCodonOffset], parBase, isRc ? " (rev-comp'd)" : ""); oldCodon[ancCodonOffset] = maybeComplement(anc->newBase, gi->psl); } } char oldAa = lookupCodon(oldCodon); if (oldAa == '\0') oldAa = '*'; char newCodon[4]; safecpy(newCodon, sizeof newCodon, oldCodon); for (vpTx = codonVpTxList; vpTx != NULL; vpTx = vpTx->next) { int aaStart = (vpTx->start.txOffset - gi->cds->start) / 3; if (aaStart != firstAaStart) errAbort("codonVpTxToAaChange: program error: firstAaStart %d != aaStart %d", firstAaStart, aaStart); int codonOffset = vpTx->start.txOffset - gi->cds->start - codonStart; // vpTx->txRef[0] is always the reference base, not like singleNucChange parBase, // so we can't compare it to expected value as we could for ancMuts above. newCodon[codonOffset] = vpTx->txAlt[0]; } char newAa = lookupCodon(newCodon); if (newAa == '\0') newAa = '*'; if (newAa != oldAa) { char aaChangeStr[32]; safef(aaChangeStr, sizeof aaChangeStr, "%c%d%c", oldAa, firstAaStart+1, newAa); aaChange = slNameNew(aaChangeStr); } } return aaChange; } struct slPair *getAaMutations(struct singleNucChange *sncList, struct singleNucChange *ancestorMuts, struct geneInfo *geneInfoList, struct seqWindow *gSeqWin) /* Given lists of SNVs and genes, return a list of pairs of { gene name, AA change list }. */ { struct slPair *geneChangeList = NULL; struct geneInfo *gi; for (gi = geneInfoList; gi != NULL; gi = gi->next) { struct slName *aaChangeList = NULL; int prevAaStart = -1; struct vpTx *codonVpTxList = NULL; struct singleNucChange *snc; for (snc = sncList; snc != NULL; snc = snc->next) { if (snc->chromStart < gi->psl->tEnd && snc->chromStart >= gi->psl->tStart) { struct bed3 gBed3 = { NULL, gSeqWin->seqName, snc->chromStart, snc->chromStart+1 }; char gAlt[2]; safef(gAlt, sizeof(gAlt), "%c", snc->newBase); struct vpTx *vpTx = vpGenomicToTranscript(gSeqWin, &gBed3, gAlt, gi->psl, gi->txSeq); if (vpTx->start.region == vpExon && vpTx->start.txOffset < gi->cds->end && vpTx->end.txOffset > gi->cds->start) { int aaStart = (vpTx->start.txOffset - gi->cds->start) / 3; // Accumulate vpTxs in the same codon if (aaStart == prevAaStart || prevAaStart == -1) { slAddHead(&codonVpTxList, vpTx); } else { // Done with previous codon; find change from accumulated mutations struct slName *aaChange = codonVpTxToAaChange(codonVpTxList, ancestorMuts, gi); if (aaChange) slAddHead(&aaChangeList, aaChange); slFreeListWithFunc(&codonVpTxList, vpTxFree); codonVpTxList = vpTx; } prevAaStart = aaStart; } } } // Find change from final set of accumulated mutations if (codonVpTxList != NULL) { struct slName *aaChange = codonVpTxToAaChange(codonVpTxList, ancestorMuts, gi); if (aaChange) slAddHead(&aaChangeList, aaChange); slFreeListWithFunc(&codonVpTxList, vpTxFree); } if (aaChangeList != NULL) { slReverse(&aaChangeList); slAddHead(&geneChangeList, slPairNew(gi->psl->qName, aaChangeList)); } } slReverse(&geneChangeList); return geneChangeList; } static void jsonWriteBranchAttrs(struct jsonWrite *jw, struct phyloTree *node, struct singleNucChange *ancestorMuts, struct geneInfo *geneInfoList, struct seqWindow *gSeqWin) /* Write mutations (if any). If node is not a leaf, write label for mutations at this node. */ { if (node->priv != NULL) { struct singleNucChange *sncList = node->priv; struct slPair *geneAaMutations = getAaMutations(sncList, ancestorMuts, geneInfoList, gSeqWin); jsonWriteObjectStart(jw, "branch_attrs"); if (node->numEdges > 0) { jsonWriteObjectStart(jw, "labels"); if (node->ident->name) jsonWriteString(jw, "id", node->ident->name); struct singleNucChange *snc = sncList; struct dyString *dy = dyStringCreate("%c%d%c", snc->parBase, snc->chromStart+1, snc->newBase); for (snc = sncList->next; snc != NULL; snc = snc->next) dyStringPrintf(dy, ",%c%d%c", snc->parBase, snc->chromStart+1, snc->newBase); jsonWriteString(jw, "nuc mutations", dy->string); dyStringClear(dy); for (snc = sncList; snc != NULL; snc = snc->next) { char ref[2]; seqWindowCopy(gSeqWin, snc->chromStart, 1, ref, sizeof(ref)); if (snc->newBase == ref[0]) { dyStringAppendSep(dy, ","); dyStringPrintf(dy, "%c%d%c", snc->parBase, snc->chromStart+1, snc->newBase); } } jsonWriteString(jw, "back-mutations", dy->string); dyStringClear(dy); struct dyString *dyS = dyStringNew(0); struct slPair *geneAaMut; for (geneAaMut = geneAaMutations; geneAaMut != NULL; geneAaMut = geneAaMut->next) { struct slName *aaMut; for (aaMut = geneAaMut->val; aaMut != NULL; aaMut = aaMut->next) { dyStringAppendSep(dy, ","); dyStringPrintf(dy, "%s:%s", geneAaMut->name, aaMut->name); } if (sameString("S", geneAaMut->name)) { for (aaMut = geneAaMut->val; aaMut != NULL; aaMut = aaMut->next) { dyStringAppendSep(dyS, ","); dyStringPrintf(dyS, "%s:%s", geneAaMut->name, aaMut->name); } } } if (isNotEmpty(dy->string)) jsonWriteString(jw, "aa mutations", dy->string); if (isNotEmpty(dyS->string)) jsonWriteString(jw, "Spike mutations", dyS->string); dyStringFree(&dy); dyStringFree(&dyS); jsonWriteObjectEnd(jw); } jsonWriteObjectStart(jw, "mutations"); struct slPair *geneAaMut; for (geneAaMut = geneAaMutations; geneAaMut != NULL; geneAaMut = geneAaMut->next) { jsonWriteListStart(jw, geneAaMut->name); struct slName *aaMut; for (aaMut = geneAaMut->val; aaMut != NULL; aaMut = aaMut->next) jsonWriteString(jw, NULL, aaMut->name); jsonWriteListEnd(jw); } jsonWriteListStart(jw, "nuc"); struct singleNucChange *snc; for (snc = sncList; snc != NULL; snc = snc->next) jsonWriteStringf(jw, NULL, "%c%d%c", snc->parBase, snc->chromStart+1, snc->newBase); jsonWriteListEnd(jw); jsonWriteObjectEnd(jw); // mutations jsonWriteObjectEnd(jw); // branch_attrs } } struct auspiceJsonInfo /* Collection of a bunch of things used when writing out auspice JSON for a subtree, so the * recursive function doesn't need a dozen args. */ { struct jsonWrite *jw; struct slName *subtreeUserSampleIds; // Subtree node names for user samples (not from big tree) struct geneInfo *geneInfoList; // Transcript seq & alignment for predicting AA change struct seqWindow *gSeqWin; // Reference genome seq for predicting AA change struct hash *sampleMetadata; // Sample metadata for decorating tree struct hash *sampleUrls; // URLs for samples, if applicable struct hash *samplePlacements; // Sample placement info e.g. clade/lineage from usher int nodeNum; // For generating sequential node ID (in absence of name) char *source; // Source of non-user sequences in tree (GISAID or public) }; static int cmpstringp(const void *p1, const void *p2) /* strcmp on pointers to strings, as in 'man qsort' but tolerate NULLs */ { char *s1 = *(char * const *)p1; char *s2 = *(char * const *)p2; if (s1 && s2) return strcmp(s1, s2); else if (s1 && !s2) return 1; else if (s2 && !s1) return -1; return 0; } static char *majorityMaybe(char *array[], int arraySize) /* Sort array and if a majority of its values are the same then return that value; otherwise NULL. */ { if (arraySize < 1) return NULL; qsort(array, arraySize, sizeof array[0], cmpstringp); char *maxRunVal = array[0]; int runLength = 1, maxRunLength = 1; int i; for (i = 1; i < arraySize; i++) { if (sameOk(array[i], array[i-1])) { runLength++; if (runLength > maxRunLength) { maxRunLength = runLength; maxRunVal = array[i]; } } else runLength = 1; } return (maxRunLength > (arraySize >> 1)) ? maxRunVal : NULL; } static void rTreeToAuspiceJson(struct phyloTree *node, int depth, struct auspiceJsonInfo *aji, struct singleNucChange *ancestorMuts, boolean isRsv, char **retUserOrOld, char **retNClade, char **retGClade, char **retLineage, char **retNLineage, char **retNCladeUsher, char **retLineageUsher) /* Write Augur/Auspice V2 JSON for tree. Enclosing object start and end are written by caller. */ { struct singleNucChange *sncList = node->priv; if (sncList) { depth += slCount(sncList); } boolean isUserSample = FALSE; if (node->ident->name) isUserSample = slNameInList(aji->subtreeUserSampleIds, node->ident->name); char *name = node->ident->name; struct sampleMetadata *met = name ? metadataForSample(aji->sampleMetadata, name) : NULL; if (name) jsonWriteString(aji->jw, "name", name); else jsonWriteStringf(aji->jw, "name", "NODE%d", aji->nodeNum++); jsonWriteBranchAttrs(aji->jw, node, ancestorMuts, aji->geneInfoList, aji->gSeqWin); if (node->numEdges > 0) { struct singleNucChange *allMuts = ancestorMuts; struct singleNucChange *ancLast = slLastEl(ancestorMuts); if (ancLast != NULL) ancLast->next = sncList; else allMuts = sncList; jsonWriteListStart(aji->jw, "children"); char *kidUserOrOld[node->numEdges]; char *kidNClade[node->numEdges]; char *kidGClade[node->numEdges]; char *kidLineage[node->numEdges]; char *kidNCladeUsher[node->numEdges]; char *kidLineageUsher[node->numEdges]; char *kidNLineage[node->numEdges]; // Step through children in reverse order because nextstrain/Auspice draws upside-down. :) int i; for (i = node->numEdges - 1; i >= 0; i--) { jsonWriteObjectStart(aji->jw, NULL); rTreeToAuspiceJson(node->edges[i], depth, aji, allMuts, isRsv, &kidUserOrOld[i], &kidNClade[i], &kidGClade[i], &kidLineage[i], &kidNLineage[i], &kidNCladeUsher[i], &kidLineageUsher[i]); jsonWriteObjectEnd(aji->jw); } jsonWriteListEnd(aji->jw); if (retUserOrOld) *retUserOrOld = majorityMaybe(kidUserOrOld, node->numEdges); if (retNClade) *retNClade = majorityMaybe(kidNClade, node->numEdges); if (retGClade) *retGClade = majorityMaybe(kidGClade, node->numEdges); if (retLineage) *retLineage = majorityMaybe(kidLineage, node->numEdges); if (retNCladeUsher) *retNCladeUsher = majorityMaybe(kidNCladeUsher, node->numEdges); if (retLineageUsher) *retLineageUsher = majorityMaybe(kidLineageUsher, node->numEdges); if (retNLineage) *retNLineage = majorityMaybe(kidNLineage, node->numEdges); if (ancLast) ancLast->next = NULL; } jsonWriteObjectStart(aji->jw, "node_attrs"); jsonWriteDouble(aji->jw, "div", depth); if (node->numEdges == 0) jsonWriteLeafNodeAttributes(aji->jw, name, met, isUserSample, aji->source, aji->sampleUrls, aji->samplePlacements, isRsv, retUserOrOld, retNClade, retGClade, retLineage, retNLineage, retNCladeUsher, retLineageUsher); else if (retUserOrOld && retGClade && retLineage) jsonWriteBranchNodeAttributes(aji->jw, isRsv, *retUserOrOld, *retNClade, *retGClade, *retLineage, *retNLineage, *retNCladeUsher, *retLineageUsher); jsonWriteObjectEnd(aji->jw); } struct phyloTree *phyloTreeNewNode(char *name) /* Alloc & return a new node with no children. */ { struct phyloTree *node; AllocVar(node); AllocVar(node->ident); node->ident->name = cloneString(name); return node; } struct geneInfo *getGeneInfoList(char *bigGenePredFile, struct dnaSeq *refGenome) /* If config.ra has a source of gene annotations, then return the gene list. */ { struct geneInfo *geneInfoList = NULL; struct bbiFile *bbi = NULL; struct errCatch *errCatch = errCatchNew(); if (isNotEmpty(bigGenePredFile)) { if (errCatchStart(errCatch)) { bbi = bigBedFileOpen(bigGenePredFile); } errCatchEnd(errCatch); } if (bbi) { struct lm *lm = lmInit(0); struct bigBedInterval *bb, *bbList = bigBedIntervalQuery(bbi, refGenome->name, 0, refGenome->size, 0, lm); for (bb = bbList; bb != NULL; bb = bb->next) { struct genePredExt *gp = genePredFromBigGenePred(refGenome->name, bb); int txLen = 0; int ex; for (ex = 0; ex < gp->exonCount; ex++) txLen += (gp->exonEnds[ex] - gp->exonStarts[ex]); char *seq = needMem(txLen+1); int txOffset = 0; for (ex = 0; ex < gp->exonCount; ex++) { int exonLen = gp->exonEnds[ex] - gp->exonStarts[ex]; safencpy(seq+txOffset, txLen+1-txOffset, refGenome->dna+gp->exonStarts[ex], exonLen); txOffset += exonLen; } if (sameString(gp->strand, "-")) reverseComplement(seq, txLen); struct geneInfo *gi; AllocVar(gi); gi->psl = genePredToPsl((struct genePred *)gp, refGenome->size, txLen); gi->psl->qName = cloneString(gp->name2); gi->txSeq = newDnaSeq(seq, txLen, gp->name2); gi->cdsStart = gp->cdsStart; gi->cdsEnd = gp->cdsEnd; AllocVar(gi->cds); genePredToCds((struct genePred *)gp, gi->cds); int cdsLen = gi->cds->end - gi->cds->start; // Skip genes with no CDS (like RNA genes) or obviously incomplete/incorrect CDS. if (cdsLen > 0 && (cdsLen % 3) == 0) { slAddHead(&geneInfoList, gi); } } lmCleanup(&lm); bigBedFileClose(&bbi); } slReverse(&geneInfoList); return geneInfoList; } void treeToAuspiceJson(struct subtreeInfo *sti, char *db, struct geneInfo *geneInfoList, struct seqWindow *gSeqWin, struct hash *sampleMetadata, struct hash *sampleUrls, struct hash *samplePlacements, char *jsonFile, char *source) /* Write JSON for tree in Nextstrain's Augur/Auspice V2 JSON format * (https://github.com/nextstrain/augur/blob/master/augur/data/schema-export-v2.json). */ { struct phyloTree *tree = sti->subtree; FILE *outF = mustOpen(jsonFile, "w"); struct jsonWrite *jw = jsonWriteNew(); jsonWriteObjectStart(jw, NULL); jsonWriteString(jw, "version", "v2"); boolean isRsv = (stringIn("GCF_000855545", db) || stringIn("GCF_002815475", db) || startsWith("RGCC", db)); boolean isFlu = (stringIn("GCF_000865085", db) || stringIn("GCF_001343785", db)); writeAuspiceMeta(jw, sti->subtreeUserSampleIds, source, db, geneInfoList, gSeqWin->end, isRsv, isFlu); jsonWriteObjectStart(jw, "tree"); int nodeNum = 10000; // Auspice.us starting node number for newick -> json int depth = 0; // Add an extra root node because otherwise Auspice won't draw branch from big tree root to subtree struct phyloTree *root = phyloTreeNewNode("wrapper"); phyloAddEdge(root, tree); tree = root; struct auspiceJsonInfo aji = { jw, sti->subtreeUserSampleIds, geneInfoList, gSeqWin, sampleMetadata, sampleUrls, samplePlacements, nodeNum, source }; rTreeToAuspiceJson(tree, depth, &aji, NULL, isRsv, NULL, NULL, NULL, NULL, NULL, NULL, NULL); jsonWriteObjectEnd(jw); // tree jsonWriteObjectEnd(jw); // top-level object fputs(jw->dy->string, outF); jsonWriteFree(&jw); carefulClose(&outF); }