2d05d30ed4df1612d72ba84c812d004de935b122 angie Fri May 17 16:08:54 2024 -0700 Add lib module mmHash (memory-mapped hash), util tabToMmHash, and hgPhyloPlace support for using mmHash files instead of tab-separated files for metadata and name lookup. Using mmHash for name lookup saves about 50-55 seconds for SARS-CoV-2 hgPhyloPlace name/ID queries. diff --git src/hg/hgPhyloPlace/treeToAuspiceJson.c src/hg/hgPhyloPlace/treeToAuspiceJson.c index ebe22a0..0fb9e22 100644 --- src/hg/hgPhyloPlace/treeToAuspiceJson.c +++ src/hg/hgPhyloPlace/treeToAuspiceJson.c @@ -1,939 +1,937 @@ /* Convert a (sub)tree with condensed nodes to JSON for Nextstrain to display, adding in sample * mutations, protein changes and metadata. */ /* Copyright (C) 2020-2024 The Regents of the University of California */ #include "common.h" #include "dnaseq.h" #include "errCatch.h" #include "hash.h" #include "hui.h" #include "jsonWrite.h" #include "linefile.h" #include "obscure.h" #include "parsimonyProto.h" #include "phyloPlace.h" #include "phyloTree.h" #include "variantProjector.h" static void auspiceMetaColoringCategoricalStart(struct jsonWrite *jw, char *key, char *title) /* Write key, title and type of a "categorical" coloring spec, but leave it open in case a * scale list needs to be added. */ { jsonWriteObjectStart(jw, NULL); jsonWriteString(jw, "key", key); jsonWriteString(jw, "title", title); jsonWriteString(jw, "type", "categorical"); } static void auspiceMetaColoringCategoricalEnd(struct jsonWrite *jw) /* Close out a coloring spec that was opened with auspiceMetaColoringCategoricalStart. */ { jsonWriteObjectEnd(jw); } static void auspiceMetaColoringCategorical(struct jsonWrite *jw, char *key, char *title) /* Write a "categorical" coloring spec with no scale component. */ { auspiceMetaColoringCategoricalStart(jw, key, title); auspiceMetaColoringCategoricalEnd(jw); } static void jsonWritePair(struct jsonWrite *jw, char *valA, char *valB) /* Write a list with two string values. */ { jsonWriteListStart(jw, NULL); jsonWriteString(jw, NULL, valA); jsonWriteString(jw, NULL, valB); jsonWriteListEnd(jw); } static void jsonWriteAppendData(struct jsonWrite *jw, char *data) /* Append data to jw->dy, trusting that it will maintain well-formed JSON results. */ { dyStringAppend(jw->dy, data); } static void auspiceMetaColoringSarsCov2Nextclade(struct jsonWrite *jw, char *key, char *title) /* Write a coloring spec for SARS-CoV-2 Nextstrain clades. (Non-VoC clades are omitted and * will be assigned grayscale values by Auspice. */ { auspiceMetaColoringCategoricalStart(jw, key, title); jsonWriteListStart(jw, "scale"); // Color hex values are words from line N of // https://github.com/nextstrain/ncov/blob/master/defaults/color_schemes.tsv // where N = number of clade_membership lines in // https://github.com/nextstrain/ncov/blob/master/defaults/color_ordering.tsv jsonWritePair(jw, "20H (Beta, V2)", "#5E1D9D"); jsonWritePair(jw, "20I (Alpha, V1)", "#492AB5"); jsonWritePair(jw, "20J (Gamma, V3)", "#4042C7"); jsonWritePair(jw, "21A (Delta)", "#3E5DD0"); jsonWritePair(jw, "21I (Delta)", "#4377CD"); jsonWritePair(jw, "21J (Delta)", "#4A8CC2"); jsonWritePair(jw, "21B (Kappa)", "#549DB2"); jsonWritePair(jw, "21C (Epsilon)", "#60AA9E"); jsonWritePair(jw, "21D (Eta)", "#6EB389"); jsonWritePair(jw, "21E (Theta)", "#80B974"); jsonWritePair(jw, "21F (Iota)", "#92BC63"); jsonWritePair(jw, "21G (Lambda)", "#A6BE55"); jsonWritePair(jw, "21H (Mu)", "#B9BC4A"); jsonWritePair(jw, "21K (Omicron)", "#CBB742"); jsonWritePair(jw, "21L (Omicron)", "#D9AD3D"); jsonWritePair(jw, "21M (Omicron)", "#E29D39"); jsonWritePair(jw, "22A (Omicron)", "#E68634"); jsonWritePair(jw, "22B (Omicron)", "#E56A2F"); jsonWritePair(jw, "22C (Omicron)", "#E04929"); jsonWritePair(jw, "22D (Omicron)", "#DB2823"); jsonWritePair(jw, "uploaded sample", "#000000"); jsonWriteListEnd(jw); auspiceMetaColoringCategoricalEnd(jw); } static void writeAuspiceMetaGenomeAnnotations(struct jsonWrite *jw, struct geneInfo *geneInfoList, uint genomeSize) /* Write out auspice genome annotations (protein-coding gene CDS and "nuc"). */ { jsonWriteObjectStart(jw, "genome_annotations"); struct geneInfo *gi; for (gi = geneInfoList; gi != NULL; gi = gi->next) { jsonWriteObjectStart(jw, gi->psl->qName); if (gi->psl->blockCount > 1) { jsonWriteListStart(jw, "segments"); int i; for (i = 0; i < gi->psl->blockCount; i++) { int cStart = gi->psl->tStarts[i]; int cEnd = gi->psl->tStarts[i] + gi->psl->blockSizes[i]; if (cEnd < gi->cdsStart) continue; if (cStart > gi->cdsEnd) break; if (cStart < gi->cdsStart) cStart = gi->cdsStart; if (cEnd > gi->cdsEnd) cEnd = gi->cdsEnd; jsonWriteObjectStart(jw, NULL); jsonWriteNumber(jw, "start", cStart+1); jsonWriteNumber(jw, "end", cEnd); jsonWriteObjectEnd(jw); } jsonWriteListEnd(jw); } else { jsonWriteNumber(jw, "start", gi->cdsStart+1); jsonWriteNumber(jw, "end", gi->cdsEnd); } jsonWriteString(jw, "strand", (pslOrientation(gi->psl) > 0) ? "+" : "-"); jsonWriteString(jw, "type", "CDS"); jsonWriteObjectEnd(jw); } jsonWriteObjectStart(jw, "nuc"); jsonWriteNumber(jw, "start", 1); jsonWriteNumber(jw, "end", genomeSize); jsonWriteString(jw, "strand", "+"); jsonWriteString(jw, "type", "source"); jsonWriteObjectEnd(jw); jsonWriteObjectEnd(jw); } //#*** TODO: in v459 or later, remove the organism-specific code. static struct slName *getColorFields(char *db, boolean isRsv, boolean isFlu) { struct slName *colorFields = NULL; if (sameString(db, "wuhCor1")) { slNameAddHead(&colorFields, "country"); slNameAddHead(&colorFields, "Nextstrain_clade_usher"); slNameAddHead(&colorFields, "pango_lineage_usher"); slNameAddHead(&colorFields, "Nextstrain_clade"); slNameAddHead(&colorFields, "pango_lineage"); } else if (isRsv) { slNameAddHead(&colorFields, "country"); slNameAddHead(&colorFields, "GCC_nextclade"); slNameAddHead(&colorFields, "GCC_usher"); slNameAddHead(&colorFields, "GCC_assigned_2023-11"); slNameAddHead(&colorFields, "goya_nextclade"); slNameAddHead(&colorFields, "goya_usher"); } else if (isFlu) { slNameAddHead(&colorFields, "country"); slNameAddHead(&colorFields, "Nextstrain_clade"); } else { slNameAddHead(&colorFields, "country"); slNameAddHead(&colorFields, "Nextstrain_lineage"); } return colorFields; } //#*** TODO: in v459 or later, remove the organism-specific stuff static char *getDefaultColor(struct slName *colorFields) /* Pick default color from available color fields from metadata. Do not free returned string. */ { char *colorDefault = NULL; if (slNameInList(colorFields, "pango_lineage_usher")) colorDefault = "pango_lineage_usher"; else if (slNameInList(colorFields, "Nextstrain_lineage")) colorDefault = "Nextstrain_lineage"; else if (slNameInList(colorFields, "Nextstrain_clade")) colorDefault = "Nextstrain_clade"; else if (slNameInList(colorFields, "GCC_usher")) colorDefault = "GCC_usher"; else if (colorFields != NULL) colorDefault = colorFields->name; else colorDefault = "userOrOld"; return colorDefault; } //#*** TODO: in v459 or later, remove the organism-specific stuff static void auspiceMetaColorings(struct jsonWrite *jw, char *source, struct slName *colorFields, char *db) /* Write coloring specs for colorFields from metadata, locally added userOrOld, and * Auspice-automatic gt. */ { jsonWriteListStart(jw, "colorings"); auspiceMetaColoringCategoricalStart(jw, "userOrOld", "Sample type"); jsonWriteListStart(jw, "scale"); jsonWritePair(jw, "uploaded sample", "#CC0000"); jsonWritePair(jw, source, "#000000"); jsonWriteListEnd(jw); auspiceMetaColoringCategoricalEnd(jw); auspiceMetaColoringCategorical(jw, "gt", "Genotype"); struct slName *col; for (col = colorFields; col != NULL; col = col->next) { if (sameString(col->name, "Nextstrain_clade")) { if (sameString(db, "wuhCor1")) auspiceMetaColoringSarsCov2Nextclade(jw, col->name, "Nextstrain Clade"); else auspiceMetaColoringCategorical(jw, col->name, "Clade assigned by nextclade"); } else if (sameString(col->name, "Nextstrain_clade_usher")) auspiceMetaColoringSarsCov2Nextclade(jw, col->name, "Nextstrain Clade assigned by UShER"); else if (sameString(col->name, "pango_lineage")) auspiceMetaColoringCategorical(jw, col->name, "Pango lineage"); else if (sameString(col->name, "pango_lineage_usher")) auspiceMetaColoringCategorical(jw, col->name, "Pango lineage assigned by UShER"); else if (sameString(col->name, "Nextstrain_lineage")) auspiceMetaColoringCategorical(jw, col->name, "Nextstrain lineage"); //#*** RSV hacks -- colorings really should come from JSON file in config directory else if (sameString(col->name, "goya_nextclade")) auspiceMetaColoringCategorical(jw, col->name, "Goya 2020 clade assigned by nextclade"); else if (sameString(col->name, "goya_usher")) auspiceMetaColoringCategorical(jw, col->name, "Goya 2020 clade assigned by UShER"); else if (sameString(col->name, "GCC_nextclade")) auspiceMetaColoringCategorical(jw, col->name, "RGCC lineage assigned by nextclade"); else if (sameString(col->name, "GCC_usher")) auspiceMetaColoringCategorical(jw, col->name, "RGCC lineage assigned by UShER"); else if (sameString(col->name, "GCC_assigned_2023-11")) auspiceMetaColoringCategorical(jw, col->name, "RGCC designated lineage"); else if (sameString(col->name, "country")) auspiceMetaColoringCategorical(jw, col->name, "Country"); else auspiceMetaColoringCategorical(jw, col->name, col->name); } jsonWriteListEnd(jw); } static void writeAuspiceMeta(struct jsonWrite *jw, struct slName *subtreeUserSampleIds, char *source, char *org, char *db, struct geneInfo *geneInfoList, uint genomeSize, boolean isRsv, boolean isFlu) /* Write metadata to configure Auspice display. */ { jsonWriteObjectStart(jw, "meta"); // Title struct dyString *dy = dyStringCreate("Subtree with %s", subtreeUserSampleIds->name); int sampleCount = slCount(subtreeUserSampleIds); if (sampleCount > 10) dyStringPrintf(dy, " and %d other uploaded samples", sampleCount - 1); else { struct slName *sln; for (sln = subtreeUserSampleIds->next; sln != NULL; sln = sln->next) dyStringPrintf(dy, ", %s", sln->name); } jsonWriteString(jw, "title", dy->string); // Description jsonWriteStringf(jw, "description", "Dataset generated by [UShER web interface]" "(%shgPhyloPlace) using the " "[usher](https://github.com/yatisht/usher/) program. " //#*** TODO: describe input from which tree was generated: user sample, version of tree, etc. "If you have metadata you wish to display, you can now drag on a CSV file and " "it will be added into this view, [see here]("NEXTSTRAIN_DRAG_DROP_DOC") " "for more info." , hLocalHostCgiBinUrl()); // Panels: just the tree and entropy (no map) jsonWriteListStart(jw, "panels"); jsonWriteString(jw, NULL, "tree"); jsonWriteString(jw, NULL, "entropy"); jsonWriteListEnd(jw); char *metaJsonFile = phyloPlaceRefSettingPath(org, db, "auspiceMeta"); if (isNotEmpty(metaJsonFile) && fileExists(metaJsonFile)) { char *metaJson = NULL; size_t size = 0; readInGulp(metaJsonFile, &metaJson, &size); while (size > 0 && metaJson[size-1] == '\n') metaJson[--size] = '\0'; jsonWriteAppendData(jw, ", "); jsonWriteAppendData(jw, metaJson); freeMem(metaJson); } else { // Default label & color struct slName *colorFields = getColorFields(db, isRsv, isFlu); jsonWriteObjectStart(jw, "display_defaults"); jsonWriteString(jw, "branch_label", "aa mutations"); jsonWriteString(jw, "color_by", getDefaultColor(colorFields)); jsonWriteObjectEnd(jw); // Colorings: userOrOld, gt and whatever we got from metadata auspiceMetaColorings(jw, source, colorFields, db); // Filters jsonWriteListStart(jw, "filters"); jsonWriteString(jw, NULL, "userOrOld"); jsonWriteString(jw, NULL, "country"); //#*** TODO: in v459 or later, remove the organism-specific stuff if (sameString(db, "wuhCor1")) { jsonWriteString(jw, NULL, "pango_lineage_usher"); jsonWriteString(jw, NULL, "pango_lineage"); jsonWriteString(jw, NULL, "Nextstrain_clade_usher"); jsonWriteString(jw, NULL, "Nextstrain_clade"); } else if (isRsv) { jsonWriteString(jw, NULL, "GCC_usher"); jsonWriteString(jw, NULL, "GCC_nextclade"); jsonWriteString(jw, NULL, "GCC_assigned_2023-11"); jsonWriteString(jw, NULL, "goya_usher"); jsonWriteString(jw, NULL, "goya_nextclade"); } else if (isFlu) { jsonWriteString(jw, NULL, "Nextstrain_clade"); } else { jsonWriteString(jw, NULL, "Nextstrain_lineage"); } jsonWriteListEnd(jw); } // Annotations for coloring/filtering by base writeAuspiceMetaGenomeAnnotations(jw, geneInfoList, genomeSize); jsonWriteObjectEnd(jw); } static void jsonWriteObjectValueUrl(struct jsonWrite *jw, char *name, char *value, char *url) /* Write an object with member "value" set to value, and if url is non-empty, "url" set to url. */ { jsonWriteObjectStart(jw, name); jsonWriteString(jw, "value", value); if (isNotEmpty(url)) jsonWriteString(jw, "url", url); jsonWriteObjectEnd(jw); } static void jsonWriteObjectValue(struct jsonWrite *jw, char *name, char *value) /* Write an object with one member, "value", set to value, as most Auspice node attributes are * formatted. */ { jsonWriteObjectValueUrl(jw, name, value, NULL); } static void makeLineageUrl(char *lineage, char *lineageUrl, size_t lineageUrlSize) /* If lineage is not "uploaded sample", make an outbreak.info link to it, otherwise just copy * lineage. */ { if (sameString(lineage, "uploaded sample")) safecpy(lineageUrl, lineageUrlSize, lineage); else safef(lineageUrl, lineageUrlSize, OUTBREAK_INFO_URLBASE "%s", lineage); } -static void jsonWriteLeafNodeAttributes(struct jsonWrite *jw, char *name, - struct sampleMetadata *met, boolean isUserSample, - char *source, struct hash *sampleUrls, - struct hash *samplePlacements, boolean isRsv, +struct auspiceJsonInfo +/* Collection of a bunch of things used when writing out auspice JSON for a subtree, so the + * recursive function doesn't need a dozen args. */ + { + struct jsonWrite *jw; + struct slName *subtreeUserSampleIds; // Subtree node names for user samples (not from big tree) + struct geneInfo *geneInfoList; // Transcript seq & alignment for predicting AA change + struct seqWindow *gSeqWin; // Reference genome seq for predicting AA change + struct sampleMetadataStore *sampleMetadata; // Sample metadata for decorating tree + struct hash *sampleUrls; // URLs for samples, if applicable + struct hash *samplePlacements; // Sample placement info e.g. clade/lineage from usher + int nodeNum; // For generating sequential node ID (in absence of name) + char *source; // Source of non-user sequences in tree (GISAID or public) + }; + +static void jsonWriteLeafNodeAttributes(struct auspiceJsonInfo *aji, char *name, + boolean isUserSample, boolean isRsv, int branchAttrCount, char **branchAttrCols, char **branchAttrVals) /* Write elements of node_attrs for a sample which may be preexisting and in our metadata hash, * or may be a new sample from the user. Set rets for color categories so parent branches can * determine their color categories. */ { -char *userOrOld = isUserSample ? "uploaded sample" : source; -jsonWriteObjectValue(jw, "userOrOld", userOrOld); +char *userOrOld = isUserSample ? "uploaded sample" : aji->source; +jsonWriteObjectValue(aji->jw, "userOrOld", userOrOld); int i; for (i = 0; i < branchAttrCount; i++) branchAttrVals[i] = ""; if (branchAttrCount > 0 && sameString(branchAttrCols[0], "userOrOld")) branchAttrVals[0] = userOrOld; +char **met = name ? metadataForSample(aji->sampleMetadata, name) : NULL; if (met != NULL) { int i; - for (i = 0; i < met->columnCount; i++) + for (i = 0; i < aji->sampleMetadata->columnCount; i++) { - char *colName = met->columnNames[i]; + char *colName = aji->sampleMetadata->columnNames[i]; // Tweak old column name if found if (sameString(colName, "pangolin_lineage")) colName = "pango_lineage"; // Link out to outbreak.info for Pango lineages if (startsWith("pango_lineage", colName)) { - if (isNotEmpty(met->columnValues[i])) + if (isNotEmpty(met[i])) { char lineageUrl[1024]; - makeLineageUrl(met->columnValues[i], lineageUrl, sizeof lineageUrl); - jsonWriteObjectValueUrl(jw, colName, met->columnValues[i], lineageUrl); + makeLineageUrl(met[i], lineageUrl, sizeof lineageUrl); + jsonWriteObjectValueUrl(aji->jw, colName, met[i], lineageUrl); } - else if (isNotEmpty(met->columnValues[i])) - jsonWriteObjectValue(jw, colName, met->columnValues[i]); + else + jsonWriteObjectValue(aji->jw, colName, met[i]); } else - jsonWriteObjectValue(jw, colName, met->columnValues[i]); + jsonWriteObjectValue(aji->jw, colName, met[i]); // Some columns get passed upwards for aggregation so we can color internal nodes/branches. int j; for (j = 0; j < branchAttrCount; j++) { if (sameString(colName, branchAttrCols[j])) { - branchAttrVals[j] = met->columnValues[i]; + branchAttrVals[j] = met[i]; break; } } } } else if (isUserSample) { - struct placementInfo *pi = name ? hashFindVal(samplePlacements, name) : NULL; + struct placementInfo *pi = name ? hashFindVal(aji->samplePlacements, name) : NULL; int i; for (i = 0; i < branchAttrCount; i++) { branchAttrVals[i] = "uploaded sample"; // Special cases for using placementInfo of user sample for _usher lineage/clade calls // and outbreak.info link for Pango lineage //#*** TODO: think of a way to make this config-driven boolean wroteLink = FALSE; if (pi) { if (pi->nextClade && (sameString(branchAttrCols[i], "Nextstrain_clade_usher") || sameString(branchAttrCols[i], "goya_usher"))) branchAttrVals[i] = pi->nextClade; else if (pi->pangoLineage) { if (sameString(branchAttrCols[i], "pango_lineage_usher")) { branchAttrVals[i] = pi->pangoLineage; char lineageUrl[1024]; makeLineageUrl(pi->pangoLineage, lineageUrl, sizeof lineageUrl); - jsonWriteObjectValueUrl(jw, branchAttrCols[i], branchAttrVals[i], lineageUrl); + jsonWriteObjectValueUrl(aji->jw, branchAttrCols[i], branchAttrVals[i], + lineageUrl); wroteLink = TRUE; } else if (sameString(branchAttrCols[i], "GCC_usher")) branchAttrVals[i] = pi->pangoLineage; } } if (!wroteLink) - jsonWriteObjectValue(jw, branchAttrCols[i], branchAttrVals[i]); + jsonWriteObjectValue(aji->jw, branchAttrCols[i], branchAttrVals[i]); } } -char *sampleUrl = (sampleUrls && name) ? hashFindVal(sampleUrls, name) : NULL; +char *sampleUrl = (aji->sampleUrls && name) ? hashFindVal(aji->sampleUrls, name) : NULL; if (isNotEmpty(sampleUrl)) { char *p = strstr(sampleUrl, "subtreeAuspice"); char *subtreeNum = p + strlen("subtreeAuspice"); if (p && isdigit(*subtreeNum)) { int num = atoi(subtreeNum); char subtreeLabel[1024]; safef(subtreeLabel, sizeof subtreeLabel, "view subtree %d", num); - jsonWriteObjectValueUrl(jw, "subtree", subtreeLabel, sampleUrl); + jsonWriteObjectValueUrl(aji->jw, "subtree", subtreeLabel, sampleUrl); } else - jsonWriteObjectValueUrl(jw, "subtree", sampleUrl, sampleUrl); + jsonWriteObjectValueUrl(aji->jw, "subtree", sampleUrl, sampleUrl); } } static void jsonWriteBranchNodeAttributes(struct jsonWrite *jw, boolean isRsv, int branchAttrCount, char **branchAttrCols, char **branchAttrVals) /* Write elements of node_attrs for a branch. */ { int i; for (i = 0; i < branchAttrCount; i++) { if (isNotEmpty(branchAttrVals[i])) jsonWriteObjectValue(jw, branchAttrCols[i], branchAttrVals[i]); } } INLINE char maybeComplement(char base, struct psl *psl) /* If psl is on '+' strand, return base, otherwise return the complement of base. */ { return (pslOrientation(psl) > 0) ? base : ntCompTable[(int)base]; } static struct slName *codonVpTxToAaChange(struct vpTx *codonVpTxList, struct singleNucChange *ancestorMuts, struct geneInfo *gi) /* Given a list of vpTx from the same codon, combine their changes with inherited mutations * in the same codon to get the amino acid change at this node. * Note: this assumes there is no UTR in transcript, only CDS. True so far for pathogens... */ { struct slName *aaChange = NULL; if (codonVpTxList != NULL) { struct vpTx *vpTx = codonVpTxList; int firstAaStart = (vpTx->start.txOffset - gi->cds->start) / 3; int codonStart = firstAaStart * 3; int codonOffset = vpTx->start.txOffset - gi->cds->start - codonStart; char oldCodon[4]; safencpy(oldCodon, sizeof oldCodon, gi->txSeq->dna + gi->cds->start + codonStart, 3); touppers(oldCodon); boolean isRc = (pslOrientation(gi->psl) < 0); int codonStartG = isRc ? vpTx->start.gOffset + codonOffset : vpTx->start.gOffset - codonOffset; struct singleNucChange *anc; for (anc = ancestorMuts; anc != NULL; anc = anc->next) { int ancCodonOffset = isRc ? codonStartG - (anc->chromStart + 1) : anc->chromStart - codonStartG; if (ancCodonOffset >= 0 && ancCodonOffset < 3) { char parBase = isRc ? ntCompTable[(int)anc->parBase] : anc->parBase; if (parBase != oldCodon[ancCodonOffset]) errAbort("codonVpTxToAaChange: expected parBase for ancestral mutation at %d " "(%s codon %d offset %d) to be '%c' but it's '%c'%s", anc->chromStart+1, gi->psl->qName, firstAaStart, ancCodonOffset, oldCodon[ancCodonOffset], parBase, isRc ? " (rev-comp'd)" : ""); oldCodon[ancCodonOffset] = maybeComplement(anc->newBase, gi->psl); } } char oldAa = lookupCodon(oldCodon); if (oldAa == '\0') oldAa = '*'; char newCodon[4]; safecpy(newCodon, sizeof newCodon, oldCodon); for (vpTx = codonVpTxList; vpTx != NULL; vpTx = vpTx->next) { int aaStart = (vpTx->start.txOffset - gi->cds->start) / 3; if (aaStart != firstAaStart) errAbort("codonVpTxToAaChange: program error: firstAaStart %d != aaStart %d", firstAaStart, aaStart); int codonOffset = vpTx->start.txOffset - gi->cds->start - codonStart; // vpTx->txRef[0] is always the reference base, not like singleNucChange parBase, // so we can't compare it to expected value as we could for ancMuts above. newCodon[codonOffset] = vpTx->txAlt[0]; } char newAa = lookupCodon(newCodon); if (newAa == '\0') newAa = '*'; if (newAa != oldAa) { char aaChangeStr[32]; safef(aaChangeStr, sizeof aaChangeStr, "%c%d%c", oldAa, firstAaStart+1, newAa); aaChange = slNameNew(aaChangeStr); } } return aaChange; } struct slPair *getAaMutations(struct singleNucChange *sncList, struct singleNucChange *ancestorMuts, struct geneInfo *geneInfoList, struct seqWindow *gSeqWin) /* Given lists of SNVs and genes, return a list of pairs of { gene name, AA change list }. */ { struct slPair *geneChangeList = NULL; struct geneInfo *gi; for (gi = geneInfoList; gi != NULL; gi = gi->next) { struct slName *aaChangeList = NULL; int prevAaStart = -1; struct vpTx *codonVpTxList = NULL; struct singleNucChange *snc; for (snc = sncList; snc != NULL; snc = snc->next) { if (snc->chromStart < gi->psl->tEnd && snc->chromStart >= gi->psl->tStart) { struct bed3 gBed3 = { NULL, gSeqWin->seqName, snc->chromStart, snc->chromStart+1 }; char gAlt[2]; safef(gAlt, sizeof(gAlt), "%c", snc->newBase); struct vpTx *vpTx = vpGenomicToTranscript(gSeqWin, &gBed3, gAlt, gi->psl, gi->txSeq); if (vpTx->start.region == vpExon && vpTx->start.txOffset < gi->cds->end && vpTx->end.txOffset > gi->cds->start) { int aaStart = (vpTx->start.txOffset - gi->cds->start) / 3; // Accumulate vpTxs in the same codon if (aaStart == prevAaStart || prevAaStart == -1) { slAddHead(&codonVpTxList, vpTx); } else { // Done with previous codon; find change from accumulated mutations struct slName *aaChange = codonVpTxToAaChange(codonVpTxList, ancestorMuts, gi); if (aaChange) slAddHead(&aaChangeList, aaChange); slFreeListWithFunc(&codonVpTxList, vpTxFree); codonVpTxList = vpTx; } prevAaStart = aaStart; } } } // Find change from final set of accumulated mutations if (codonVpTxList != NULL) { struct slName *aaChange = codonVpTxToAaChange(codonVpTxList, ancestorMuts, gi); if (aaChange) slAddHead(&aaChangeList, aaChange); slFreeListWithFunc(&codonVpTxList, vpTxFree); } if (aaChangeList != NULL) { slReverse(&aaChangeList); slAddHead(&geneChangeList, slPairNew(gi->psl->qName, aaChangeList)); } } slReverse(&geneChangeList); return geneChangeList; } static void jsonWriteBranchAttrs(struct jsonWrite *jw, struct phyloTree *node, struct singleNucChange *ancestorMuts, struct geneInfo *geneInfoList, struct seqWindow *gSeqWin) /* Write mutations (if any). If node is not a leaf, write label for mutations at this node. */ { if (node->priv != NULL) { struct singleNucChange *sncList = node->priv; struct slPair *geneAaMutations = getAaMutations(sncList, ancestorMuts, geneInfoList, gSeqWin); jsonWriteObjectStart(jw, "branch_attrs"); if (node->numEdges > 0) { jsonWriteObjectStart(jw, "labels"); if (node->ident->name) jsonWriteString(jw, "id", node->ident->name); struct singleNucChange *snc = sncList; struct dyString *dy = dyStringCreate("%c%d%c", snc->parBase, snc->chromStart+1, snc->newBase); for (snc = sncList->next; snc != NULL; snc = snc->next) dyStringPrintf(dy, ",%c%d%c", snc->parBase, snc->chromStart+1, snc->newBase); jsonWriteString(jw, "nuc mutations", dy->string); dyStringClear(dy); for (snc = sncList; snc != NULL; snc = snc->next) { char ref[2]; seqWindowCopy(gSeqWin, snc->chromStart, 1, ref, sizeof(ref)); if (snc->newBase == ref[0]) { dyStringAppendSep(dy, ","); dyStringPrintf(dy, "%c%d%c", snc->parBase, snc->chromStart+1, snc->newBase); } } jsonWriteString(jw, "back-mutations", dy->string); dyStringClear(dy); struct dyString *dyS = dyStringNew(0); struct slPair *geneAaMut; for (geneAaMut = geneAaMutations; geneAaMut != NULL; geneAaMut = geneAaMut->next) { struct slName *aaMut; for (aaMut = geneAaMut->val; aaMut != NULL; aaMut = aaMut->next) { dyStringAppendSep(dy, ","); dyStringPrintf(dy, "%s:%s", geneAaMut->name, aaMut->name); } if (sameString("S", geneAaMut->name)) { for (aaMut = geneAaMut->val; aaMut != NULL; aaMut = aaMut->next) { dyStringAppendSep(dyS, ","); dyStringPrintf(dyS, "%s:%s", geneAaMut->name, aaMut->name); } } } if (isNotEmpty(dy->string)) jsonWriteString(jw, "aa mutations", dy->string); if (isNotEmpty(dyS->string)) jsonWriteString(jw, "Spike mutations", dyS->string); dyStringFree(&dy); dyStringFree(&dyS); jsonWriteObjectEnd(jw); } jsonWriteObjectStart(jw, "mutations"); struct slPair *geneAaMut; for (geneAaMut = geneAaMutations; geneAaMut != NULL; geneAaMut = geneAaMut->next) { jsonWriteListStart(jw, geneAaMut->name); struct slName *aaMut; for (aaMut = geneAaMut->val; aaMut != NULL; aaMut = aaMut->next) jsonWriteString(jw, NULL, aaMut->name); jsonWriteListEnd(jw); } jsonWriteListStart(jw, "nuc"); struct singleNucChange *snc; for (snc = sncList; snc != NULL; snc = snc->next) jsonWriteStringf(jw, NULL, "%c%d%c", snc->parBase, snc->chromStart+1, snc->newBase); jsonWriteListEnd(jw); jsonWriteObjectEnd(jw); // mutations jsonWriteObjectEnd(jw); // branch_attrs } } -struct auspiceJsonInfo -/* Collection of a bunch of things used when writing out auspice JSON for a subtree, so the - * recursive function doesn't need a dozen args. */ - { - struct jsonWrite *jw; - struct slName *subtreeUserSampleIds; // Subtree node names for user samples (not from big tree) - struct geneInfo *geneInfoList; // Transcript seq & alignment for predicting AA change - struct seqWindow *gSeqWin; // Reference genome seq for predicting AA change - struct hash *sampleMetadata; // Sample metadata for decorating tree - struct hash *sampleUrls; // URLs for samples, if applicable - struct hash *samplePlacements; // Sample placement info e.g. clade/lineage from usher - int nodeNum; // For generating sequential node ID (in absence of name) - char *source; // Source of non-user sequences in tree (GISAID or public) - }; - static int cmpstringp(const void *p1, const void *p2) /* strcmp on pointers to strings, as in 'man qsort' but tolerate NULLs */ { char *s1 = *(char * const *)p1; char *s2 = *(char * const *)p2; if (s1 && s2) return strcmp(s1, s2); else if (s1 && !s2) return 1; else if (s2 && !s1) return -1; return 0; } static char *majorityMaybe(char *array[], int arraySize) /* Sort array and if a majority of its values are the same then return that value; otherwise NULL. */ { if (arraySize < 1) return NULL; qsort(array, arraySize, sizeof array[0], cmpstringp); char *maxRunVal = array[0]; int runLength = 1, maxRunLength = 1; int i; for (i = 1; i < arraySize; i++) { if (sameOk(array[i], array[i-1])) { runLength++; if (runLength > maxRunLength) { maxRunLength = runLength; maxRunVal = array[i]; } } else runLength = 1; } return (maxRunLength > (arraySize >> 1)) ? maxRunVal : NULL; } static void rTreeToAuspiceJson(struct phyloTree *node, int depth, struct auspiceJsonInfo *aji, struct singleNucChange *ancestorMuts, boolean isRsv, int branchAttrCount, char **branchAttrCols, char **branchAttrVals) /* Write Augur/Auspice V2 JSON for tree. Enclosing object start and end are written by caller. */ { struct singleNucChange *sncList = node->priv; if (sncList) { depth += slCount(sncList); } boolean isUserSample = FALSE; if (node->ident->name) isUserSample = slNameInList(aji->subtreeUserSampleIds, node->ident->name); char *name = node->ident->name; -struct sampleMetadata *met = name ? metadataForSample(aji->sampleMetadata, name) : NULL; if (name) jsonWriteString(aji->jw, "name", name); else jsonWriteStringf(aji->jw, "name", "NODE%d", aji->nodeNum++); jsonWriteBranchAttrs(aji->jw, node, ancestorMuts, aji->geneInfoList, aji->gSeqWin); if (node->numEdges > 0) { struct singleNucChange *allMuts = ancestorMuts; struct singleNucChange *ancLast = slLastEl(ancestorMuts); if (ancLast != NULL) ancLast->next = sncList; else allMuts = sncList; jsonWriteListStart(aji->jw, "children"); char *kidAttrVals[branchAttrCount][node->numEdges]; // Step through children in reverse order because nextstrain/Auspice draws upside-down. :) int i; for (i = node->numEdges - 1; i >= 0; i--) { char *kidNodeAttrVals[branchAttrCount]; jsonWriteObjectStart(aji->jw, NULL); rTreeToAuspiceJson(node->edges[i], depth, aji, allMuts, isRsv, branchAttrCount, branchAttrCols, kidNodeAttrVals); jsonWriteObjectEnd(aji->jw); int j; for (j = 0; j < branchAttrCount; j++) kidAttrVals[j][i] = kidNodeAttrVals[j]; } jsonWriteListEnd(aji->jw); if (branchAttrVals) { for (i = 0; i < branchAttrCount; i++) branchAttrVals[i] = majorityMaybe(kidAttrVals[i], node->numEdges); } if (ancLast) ancLast->next = NULL; } jsonWriteObjectStart(aji->jw, "node_attrs"); jsonWriteDouble(aji->jw, "div", depth); if (node->numEdges == 0) - jsonWriteLeafNodeAttributes(aji->jw, name, met, isUserSample, aji->source, aji->sampleUrls, - aji->samplePlacements, isRsv, + jsonWriteLeafNodeAttributes(aji, name, isUserSample, isRsv, branchAttrCount, branchAttrCols, branchAttrVals); else if (branchAttrVals) jsonWriteBranchNodeAttributes(aji->jw, isRsv, branchAttrCount, branchAttrCols, branchAttrVals); jsonWriteObjectEnd(aji->jw); } struct phyloTree *phyloTreeNewNode(char *name) /* Alloc & return a new node with no children. */ { struct phyloTree *node; AllocVar(node); AllocVar(node->ident); node->ident->name = cloneString(name); return node; } struct geneInfo *getGeneInfoList(char *bigGenePredFile, struct dnaSeq *refGenome) /* If config.ra has a source of gene annotations, then return the gene list. */ { struct geneInfo *geneInfoList = NULL; struct bbiFile *bbi = NULL; struct errCatch *errCatch = errCatchNew(); if (isNotEmpty(bigGenePredFile)) { if (errCatchStart(errCatch)) { bbi = bigBedFileOpen(bigGenePredFile); } errCatchEnd(errCatch); } if (bbi) { struct lm *lm = lmInit(0); struct bigBedInterval *bb, *bbList = bigBedIntervalQuery(bbi, refGenome->name, 0, refGenome->size, 0, lm); for (bb = bbList; bb != NULL; bb = bb->next) { struct genePredExt *gp = genePredFromBigGenePred(refGenome->name, bb); int txLen = 0; int ex; for (ex = 0; ex < gp->exonCount; ex++) txLen += (gp->exonEnds[ex] - gp->exonStarts[ex]); char *seq = needMem(txLen+1); int txOffset = 0; for (ex = 0; ex < gp->exonCount; ex++) { int exonLen = gp->exonEnds[ex] - gp->exonStarts[ex]; safencpy(seq+txOffset, txLen+1-txOffset, refGenome->dna+gp->exonStarts[ex], exonLen); txOffset += exonLen; } if (sameString(gp->strand, "-")) reverseComplement(seq, txLen); struct geneInfo *gi; AllocVar(gi); gi->psl = genePredToPsl((struct genePred *)gp, refGenome->size, txLen); gi->psl->qName = cloneString(gp->name2); gi->txSeq = newDnaSeq(seq, txLen, gp->name2); gi->cdsStart = gp->cdsStart; gi->cdsEnd = gp->cdsEnd; AllocVar(gi->cds); genePredToCds((struct genePred *)gp, gi->cds); int cdsLen = gi->cds->end - gi->cds->start; // Skip genes with no CDS (like RNA genes) or obviously incomplete/incorrect CDS. if (cdsLen > 0 && (cdsLen % 3) == 0) { slAddHead(&geneInfoList, gi); } } lmCleanup(&lm); bigBedFileClose(&bbi); } slReverse(&geneInfoList); return geneInfoList; } static int getBranchAttrCols(char *org, char *db, char ***retBranchAttrCols) /* Alloc an array of metadata column names to use as branch attributes and return count. * There will always be at least 1 (userOrOld / Sample type); others come from config setting. */ { int branchAttrCount = 1; struct slName *attrList = NULL, *attr; char *branchAttrSetting = phyloPlaceRefSetting(org, db, "branchAttributes"); if (isNotEmpty(branchAttrSetting)) { attrList = slNameListFromComma(branchAttrSetting); branchAttrCount += slCount(attrList); } char **branchAttrCols = NULL; AllocArray(branchAttrCols, branchAttrCount); branchAttrCols[0] = cloneString("userOrOld"); int i; for (i = 1, attr = attrList; i < branchAttrCount && attr != NULL; i++, attr = attr->next) branchAttrCols[i] = cloneString(trimSpaces(attr->name)); *retBranchAttrCols = branchAttrCols; return branchAttrCount; } void treeToAuspiceJson(struct subtreeInfo *sti, char *org, char *db, struct geneInfo *geneInfoList, - struct seqWindow *gSeqWin, struct hash *sampleMetadata, + struct seqWindow *gSeqWin, struct sampleMetadataStore *sampleMetadata, struct hash *sampleUrls, struct hash *samplePlacements, char *jsonFile, char *source) /* Write JSON for tree in Nextstrain's Augur/Auspice V2 JSON format * (https://github.com/nextstrain/augur/blob/master/augur/data/schema-export-v2.json). */ { struct phyloTree *tree = sti->subtree; FILE *outF = mustOpen(jsonFile, "w"); struct jsonWrite *jw = jsonWriteNew(); jsonWriteObjectStart(jw, NULL); jsonWriteString(jw, "version", "v2"); boolean isRsv = (stringIn("GCF_000855545", db) || stringIn("GCF_002815475", db) || startsWith("RGCC", db)); boolean isFlu = (stringIn("GCF_000865085", db) || stringIn("GCF_001343785", db)); writeAuspiceMeta(jw, sti->subtreeUserSampleIds, source, org, db, geneInfoList, gSeqWin->end, isRsv, isFlu); jsonWriteObjectStart(jw, "tree"); int nodeNum = 10000; // Auspice.us starting node number for newick -> json int depth = 0; // Add an extra root node because otherwise Auspice won't draw branch from big tree root to subtree struct phyloTree *root = phyloTreeNewNode("wrapper"); phyloAddEdge(root, tree); tree = root; struct auspiceJsonInfo aji = { jw, sti->subtreeUserSampleIds, geneInfoList, gSeqWin, sampleMetadata, sampleUrls, samplePlacements, nodeNum, source }; char **branchAttrCols = NULL; int branchAttrCount = getBranchAttrCols(org, db, &branchAttrCols); rTreeToAuspiceJson(tree, depth, &aji, NULL, isRsv, branchAttrCount, branchAttrCols, NULL); jsonWriteObjectEnd(jw); // tree jsonWriteObjectEnd(jw); // top-level object fputs(jw->dy->string, outF); jsonWriteFree(&jw); carefulClose(&outF); }