b7c2dcc865207dd23298429f93057c9068acef68
angie
Wed Dec 9 15:48:34 2020 -0800
hgPhyloPlace: Add config option treeChoices: tab-sep file of {protobuf, metadata, source, description} so we can offer the user a choice between GISAID and public sequence trees (possibly different releases for reproducibility).
diff --git src/hg/hgPhyloPlace/treeToAuspiceJson.c src/hg/hgPhyloPlace/treeToAuspiceJson.c
index 8f6030b..78c9355 100644
--- src/hg/hgPhyloPlace/treeToAuspiceJson.c
+++ src/hg/hgPhyloPlace/treeToAuspiceJson.c
@@ -1,464 +1,469 @@
/* Convert a (sub)tree with condensed nodes to JSON for Nextstrain to display, adding in sample
* mutations, protein changes and metadata. */
/* Copyright (C) 2020 The Regents of the University of California */
#include "common.h"
#include "hash.h"
#include "hui.h"
#include "jsonWrite.h"
#include "linefile.h"
#include "parsimonyProto.h"
#include "phyloPlace.h"
#include "phyloTree.h"
#include "variantProjector.h"
// Globals
extern char *chrom;
extern int chromSize;
-static void writeAuspiceMeta(FILE *outF, struct slName *subtreeUserSampleIds)
+static void writeAuspiceMeta(FILE *outF, struct slName *subtreeUserSampleIds, char *source)
/* Write metadata to configure Auspice display. */
{
fprintf(outF,
"\"meta\": { "
"\"title\": \"Subtree with %s", subtreeUserSampleIds->name);
struct slName *sln;
for (sln = subtreeUserSampleIds->next; sln != NULL; sln = sln->next)
fprintf(outF, ", %s", sln->name);
fputs("\", "
"\"panels\": [ \"tree\"] , "
"\"colorings\": [ "
" { \"key\": \"pangolin_lineage\", "
" \"title\": \"Pangolin lineage\", \"type\": \"categorical\" },"
" { \"key\": \"Nextstrain_clade\","
" \"scale\": [ [ \"19B\", \"#EC676D\" ], [ \"19A\", \"#F79E43\" ],"
" [ \"20A\", \"#B6D77A\" ], [ \"20C\", \"#8FD4ED\" ],"
" [ \"20B\", \"#A692C3\" ] ],"
" \"title\": \"Nextstrain Clade\", \"type\": \"categorical\" },"
" { \"key\": \"GISAID_clade\","
" \"scale\": [ [ \"S\", \"#EC676D\" ], [ \"L\", \"#F79E43\" ], [ \"O\", \"#F9D136\" ],"
" [ \"V\", \"#FAEA95\" ], [ \"G\", \"#B6D77A\" ], [ \"GH\", \"#8FD4ED\" ],"
" [ \"GR\", \"#A692C3\" ] ],"
" \"title\": \"GISAID Clade\", \"type\": \"categorical\" },"
- " { \"key\": \"userOrOld\", "
- " \"scale\": [ [ \"uploaded sample\", \"#CC0000\"] , [ \"GISAID sample\", \"#000000\"] ],"
+ , outF);
+fprintf(outF, " { \"key\": \"userOrOld\", "
+ " \"scale\": [ [ \"uploaded sample\", \"#CC0000\"] , [ \"%s\", \"#000000\"] ],"
" \"title\": \"Sample type\", \"type\": \"categorical\" }"
- " ] , "
+ , source);
+fputs(" ] , "
//#*** Filters didn't seem to work... maybe something about the new fetch feature, or do I need to spcify in some other way?
//#*** "\"filters\": [ \"GISAID_clade\", \"region\", \"country\", \"division\", \"author\" ], "
"\"display_defaults\": { "
" \"branch_label\": \"nuc mutations\" "
"}, "
, outF);
fprintf(outF,
"\"description\": \"Dataset generated by [UShER web interface]"
"(%shgPhyloPlace) using the "
"[usher](https://github.com/yatisht/usher/) program. "
//#*** TODO: describe input from which tree was generated: user sample, version of tree, etc.
, hLocalHostCgiBinUrl());
fputs("If you have metadata you wish to display, you can now drag on a CSV file and it will be "
"added into this view, [see here]("NEXTSTRAIN_DRAG_DROP_DOC") "
"for more info.\"} ,"
, outF);
}
static void jsonWriteObjectValue(struct jsonWrite *jw, char *name, char *value)
/* Write an object with one member, "value", set to value, as most Auspice node attributes are
* formatted. */
{
jsonWriteObjectStart(jw, name);
jsonWriteString(jw, "value", value);
jsonWriteObjectEnd(jw);
}
static void jsonWriteLeafNodeAttributes(struct jsonWrite *jw, char *name,
struct sampleMetadata *met, boolean isUserSample,
+ char *source,
char **retUserOrOld, char **retNClade, char **retGClade,
char **retLineage)
/* Write elements of node_attrs for a sample which may be preexisting and in our metadata hash,
* or may be a new sample from the user. Set rets for color categories so parent branches can
* determine their color categories. */
{
-*retUserOrOld = isUserSample ? "uploaded sample" : "GISAID sample";
+*retUserOrOld = isUserSample ? "uploaded sample" : source;
jsonWriteObjectValue(jw, "userOrOld", *retUserOrOld);
if (met && met->date)
jsonWriteObjectValue(jw, "date", met->date);
if (met && met->author)
{
jsonWriteObjectValue(jw, "author", met->author);
// Note: Nextstrain adds paper_url and title when available; they also add author and use
// a uniquified value (e.g. "author": "Wenjie Tan et al" / "value": "Wenjie Tan et al A")
}
*retNClade = isUserSample ? "uploaded sample" : (met && met->nClade) ? met->nClade : NULL;
if (isNotEmpty(*retNClade))
jsonWriteObjectValue(jw, "Nextstrain_clade", *retNClade);
*retGClade = isUserSample ? "uploaded sample" : (met && met->gClade) ? met->gClade : NULL;
if (isNotEmpty(*retGClade))
jsonWriteObjectValue(jw, "GISAID_clade", *retGClade);
*retLineage = isUserSample ? "uploaded sample" :
(met && met->lineage) ? met->lineage : NULL;
if (isNotEmpty(*retLineage))
jsonWriteObjectValue(jw, "pangolin_lineage", *retLineage);
if (met && met->epiId)
jsonWriteObjectValue(jw, "gisaid_epi_isl", met->epiId);
if (met && met->gbAcc)
jsonWriteObjectValue(jw, "genbank_accession", met->gbAcc);
if (met && met->country)
jsonWriteObjectValue(jw, "country", met->country);
if (met && met->division)
jsonWriteObjectValue(jw, "division", met->division);
if (met && met->location)
jsonWriteObjectValue(jw, "location", met->location);
if (met && met->countryExp)
jsonWriteObjectValue(jw, "country_exposure", met->countryExp);
if (met && met->divExp)
jsonWriteObjectValue(jw, "division_exposure", met->divExp);
if (met && met->origLab)
jsonWriteObjectValue(jw, "originating_lab", met->origLab);
if (met && met->subLab)
jsonWriteObjectValue(jw, "submitting_lab", met->subLab);
if (met && met->region)
jsonWriteObjectValue(jw, "region", met->region);
}
static void jsonWriteBranchNodeAttributes(struct jsonWrite *jw, char *userOrOld,
char *nClade, char *gClade, char *lineage)
/* Write elements of node_attrs for a branch. */
{
if (userOrOld)
jsonWriteObjectValue(jw, "userOrOld", userOrOld);
if (nClade)
jsonWriteObjectValue(jw, "Nextstrain_clade", nClade);
if (gClade)
jsonWriteObjectValue(jw, "GISAID_clade", gClade);
if (lineage)
jsonWriteObjectValue(jw, "pangolin_lineage", lineage);
}
struct geneInfo
/* Information sufficient to determine whether a genome change causes a coding change. */
{
struct geneInfo *next;
struct psl *psl; // Alignment of transcript to genome
struct dnaSeq *txSeq; // Transcript sequence
};
static boolean changesProtein(struct singleNucChange *snc, struct geneInfo *gi,
struct seqWindow *gSeqWin,
int *retAaStart, char *retOldAa, char *retNewAa)
/* If snc changes the coding sequence of gene, return TRUE and set ret values accordingly
* (note amino acid values are single-base not strings). */
{
boolean isCodingChange = FALSE;
if (snc->chromStart < gi->psl->tEnd && snc->chromStart >= gi->psl->tStart)
{
struct bed3 gBed3 = { NULL, chrom, snc->chromStart, snc->chromStart+1 };
char gAlt[2];
safef(gAlt, sizeof(gAlt), "%c", snc->newBase);
if (!sameString(gi->psl->strand, "+"))
errAbort("changesProtein: only worlds for psl->strand='+', but gene '%s' has strand '%s'",
gi->psl->qName, gi->psl->strand);
struct vpTx *vpTx = vpGenomicToTranscript(gSeqWin, &gBed3, gAlt, gi->psl, gi->txSeq);
if (vpTx->start.region == vpExon)
{
int aaStart = vpTx->start.txOffset / 3;
int codonStart = aaStart * 3;
char newCodon[4];
safencpy(newCodon, sizeof newCodon, gi->txSeq->dna + codonStart, 3);
int codonOffset = vpTx->start.txOffset - codonStart;
assert(codonOffset < sizeof newCodon);
newCodon[codonOffset] = snc->newBase;
char oldAa = lookupCodon(gi->txSeq->dna + codonStart);
char newAa = lookupCodon(newCodon);
if (newAa != oldAa)
{
isCodingChange = TRUE;
*retAaStart = aaStart;
*retOldAa = oldAa;
*retNewAa = newAa;
}
}
vpTxFree(&vpTx);
}
return isCodingChange;
}
static struct slPair *getAaMutations(struct singleNucChange *sncList, struct geneInfo *geneInfoList,
struct seqWindow *gSeqWin)
/* Given lists of SNVs and genes, return a list of pairs of { gene name, AA change list }. */
{
struct slPair *geneChangeList = NULL;
struct geneInfo *gi;
for (gi = geneInfoList; gi != NULL; gi = gi->next)
{
struct slName *aaChangeList = NULL;
struct singleNucChange *snc;
for (snc = sncList; snc != NULL; snc = snc->next)
{
if (snc->chromStart < gi->psl->tEnd && snc->chromStart >= gi->psl->tStart)
{
int aaStart;
char oldAa, newAa;
if (changesProtein(snc, gi, gSeqWin, &aaStart, &oldAa, &newAa))
{
char aaChange[32];
safef(aaChange, sizeof aaChange, "%c%d%c", oldAa, aaStart+1, newAa);
slNameAddHead(&aaChangeList, aaChange);
}
}
}
if (aaChangeList != NULL)
{
slReverse(&aaChangeList);
slAddHead(&geneChangeList, slPairNew(gi->psl->qName, aaChangeList));
}
}
slReverse(&geneChangeList);
return geneChangeList;
}
static void jsonWriteBranchAttrs(struct jsonWrite *jw, struct phyloTree *node,
struct geneInfo *geneInfoList, struct seqWindow *gSeqWin)
/* Write mutations (if any). If node is not a leaf, write label for mutations at this node. */
{
if (node->priv != NULL)
{
struct singleNucChange *sncList = node->priv;
struct slPair *geneAaMutations = getAaMutations(sncList, geneInfoList, gSeqWin);
jsonWriteObjectStart(jw, "branch_attrs");
if (node->numEdges > 0)
{
jsonWriteObjectStart(jw, "labels");
struct singleNucChange *snc = sncList;
struct dyString *dy = dyStringCreate("%c%d%c",
snc->parBase, snc->chromStart+1, snc->newBase);
for (snc = sncList->next; snc != NULL; snc = snc->next)
dyStringPrintf(dy, ",%c%d%c", snc->parBase, snc->chromStart+1, snc->newBase);
jsonWriteString(jw, "nuc mutations", dy->string);
dyStringClear(dy);
struct slPair *geneAaMut;
for (geneAaMut = geneAaMutations; geneAaMut != NULL; geneAaMut = geneAaMut->next)
{
struct slName *aaMut;
for (aaMut = geneAaMut->val; aaMut != NULL; aaMut = aaMut->next)
{
dyStringAppendSep(dy, ",");
dyStringPrintf(dy, "%s:%s", geneAaMut->name, aaMut->name);
}
}
if (isNotEmpty(dy->string))
jsonWriteString(jw, "aa mutations", dy->string);
dyStringFree(&dy);
jsonWriteObjectEnd(jw);
}
jsonWriteObjectStart(jw, "mutations");
struct slPair *geneAaMut;
for (geneAaMut = geneAaMutations; geneAaMut != NULL; geneAaMut = geneAaMut->next)
{
jsonWriteListStart(jw, geneAaMut->name);
struct slName *aaMut;
for (aaMut = geneAaMut->val; aaMut != NULL; aaMut = aaMut->next)
jsonWriteString(jw, NULL, aaMut->name);
jsonWriteListEnd(jw);
}
jsonWriteListStart(jw, "nuc");
struct singleNucChange *snc;
for (snc = sncList; snc != NULL; snc = snc->next)
jsonWriteStringf(jw, NULL, "%c%d%c", snc->parBase, snc->chromStart+1, snc->newBase);
jsonWriteListEnd(jw);
jsonWriteObjectEnd(jw); // mutations
jsonWriteObjectEnd(jw); // branch_attrs
}
}
struct auspiceJsonInfo
/* Collection of a bunch of things used when writing out auspice JSON for a subtree, so the
* recursive function doesn't need a dozen args. */
{
struct jsonWrite *jw;
struct slName *subtreeUserSampleIds; // Subtree node names for user samples (not from big tree)
struct geneInfo *geneInfoList; // Transcript seq & alignment for predicting AA change
struct seqWindow *gSeqWin; // Reference genome seq for predicting AA change
struct hash *sampleMetadata; // Sample metadata for decorating tree
int nodeNum; // For generating sequential node ID (in absence of name)
+ char *source; // Source of non-user sequences in tree (GISAID or public)
};
static int cmpstringp(const void *p1, const void *p2)
/* strcmp on pointers to strings, as in 'man qsort' but tolerate NULLs */
{
char *s1 = *(char * const *)p1;
char *s2 = *(char * const *)p2;
if (s1 && s2)
return strcmp(s1, s2);
else if (s1 && !s2)
return 1;
else if (s2 && !s1)
return -1;
return 0;
}
static char *majorityMaybe(char *array[], int arraySize)
/* Sort array and if a majority of its values are the same then return that value; otherwise NULL. */
{
if (arraySize < 1)
return NULL;
qsort(array, arraySize, sizeof array[0], cmpstringp);
char *maxRunVal = array[0];
int runLength = 1, maxRunLength = 1;
int i;
for (i = 1; i < arraySize; i++)
{
if (sameOk(array[i], array[i-1]))
{
runLength++;
if (runLength > maxRunLength)
{
maxRunLength = runLength;
maxRunVal = array[i];
}
}
else
runLength = 1;
}
return (maxRunLength > (arraySize >> 1)) ? maxRunVal : NULL;
}
static void rTreeToAuspiceJson(struct phyloTree *node, int depth, struct auspiceJsonInfo *aji,
char **retUserOrOld, char **retNClade, char **retGClade,
char **retLineage)
/* Write Augur/Auspice V2 JSON for tree. Enclosing object start and end are written by caller. */
{
if (node->priv)
{
struct singleNucChange *sncList = node->priv;
depth += slCount(sncList);
}
boolean isUserSample = FALSE;
if (node->ident->name)
isUserSample = slNameInList(aji->subtreeUserSampleIds, node->ident->name);
char *name = node->ident->name;
struct sampleMetadata *met = name ? metadataForSample(aji->sampleMetadata, name) : NULL;
if (name)
jsonWriteString(aji->jw, "name", name);
else
jsonWriteStringf(aji->jw, "name", "NODE%d", aji->nodeNum++);
jsonWriteBranchAttrs(aji->jw, node, aji->geneInfoList, aji->gSeqWin);
if (node->numEdges > 0)
{
jsonWriteListStart(aji->jw, "children");
char *kidUserOrOld[node->numEdges];
char *kidNClade[node->numEdges];
char *kidGClade[node->numEdges];
char *kidLineage[node->numEdges];
int i;
for (i = 0; i < node->numEdges; i++)
{
jsonWriteObjectStart(aji->jw, NULL);
rTreeToAuspiceJson(node->edges[i], depth, aji,
&kidUserOrOld[i], &kidNClade[i], &kidGClade[i], &kidLineage[i]);
jsonWriteObjectEnd(aji->jw);
}
jsonWriteListEnd(aji->jw);
if (retUserOrOld)
*retUserOrOld = majorityMaybe(kidUserOrOld, node->numEdges);
if (retNClade)
*retNClade = majorityMaybe(kidNClade, node->numEdges);
if (retGClade)
*retGClade = majorityMaybe(kidGClade, node->numEdges);
if (retLineage)
*retLineage = majorityMaybe(kidLineage, node->numEdges);
}
jsonWriteObjectStart(aji->jw, "node_attrs");
jsonWriteDouble(aji->jw, "div", depth);
if (node->numEdges == 0)
- jsonWriteLeafNodeAttributes(aji->jw, name, met, isUserSample,
+ jsonWriteLeafNodeAttributes(aji->jw, name, met, isUserSample, aji->source,
retUserOrOld, retNClade, retGClade, retLineage);
else if (retUserOrOld && retGClade && retLineage)
jsonWriteBranchNodeAttributes(aji->jw, *retUserOrOld, *retNClade, *retGClade, *retLineage);
jsonWriteObjectEnd(aji->jw);
}
struct phyloTree *phyloTreeNewNode(char *name)
/* Alloc & return a new node with no children. */
{
struct phyloTree *node;
AllocVar(node);
AllocVar(node->ident);
node->ident->name = cloneString(name);
return node;
}
struct geneInfo *getGeneInfoList(char *bigGenePredFile, struct dnaSeq *refGenome)
/* If config.ra has a source of gene annotations, then return the gene list. */
{
struct geneInfo *geneInfoList = NULL;
if (isNotEmpty(bigGenePredFile) && fileExists(bigGenePredFile))
{
struct bbiFile *bbi = bigBedFileOpen(bigGenePredFile);
struct lm *lm = lmInit(0);
struct bigBedInterval *bb, *bbList = bigBedIntervalQuery(bbi, chrom, 0, chromSize, 0, lm);
for (bb = bbList; bb != NULL; bb = bb->next)
{
struct genePredExt *gp = genePredFromBigGenePred(chrom, bb);
if (gp->strand[0] != '+')
errAbort("getGeneInfoList: strand must be '+' but got '%s' for gene %s",
gp->strand, gp->name);
int txLen = 0;
int ex;
for (ex = 0; ex < gp->exonCount; ex++)
txLen += (gp->exonEnds[ex] - gp->exonStarts[ex]);
char *seq = needMem(txLen);
int txOffset = 0;
for (ex = 0; ex < gp->exonCount; ex++)
{
int exonLen = gp->exonEnds[ex] - gp->exonStarts[ex];
safencpy(seq+txOffset, txLen+1-txOffset, refGenome->dna+gp->exonStarts[ex], exonLen);
txOffset += exonLen;
}
struct geneInfo *gi;
AllocVar(gi);
gi->psl = genePredToPsl((struct genePred *)gp, chromSize, txLen);
gi->txSeq = newDnaSeq(seq, txLen, gp->name2);
slAddHead(&geneInfoList, gi);
}
lmCleanup(&lm);
bigBedFileClose(&bbi);
}
slReverse(&geneInfoList);
return geneInfoList;
}
void treeToAuspiceJson(struct subtreeInfo *sti, char *db, struct dnaSeq *ref,
- char *bigGenePredFile, struct hash *sampleMetadata, char *jsonFile)
+ char *bigGenePredFile, struct hash *sampleMetadata, char *jsonFile,
+ char *source)
/* Write JSON for tree in Nextstrain's Augur/Auspice V2 JSON format
* (https://github.com/nextstrain/augur/blob/master/augur/data/schema-export-v2.json). */
{
struct phyloTree *tree = sti->subtree;
FILE *outF = mustOpen(jsonFile, "w");
fputs("{ \"version\": \"v2\", ", outF);
-writeAuspiceMeta(outF, sti->subtreeUserSampleIds);
+writeAuspiceMeta(outF, sti->subtreeUserSampleIds, source);
// The meta part is mostly constant & easier to just write out, but jsonWrite is better for the
// nested tree structure.
struct jsonWrite *jw = jsonWriteNew();
jsonWriteObjectStart(jw, "tree");
int nodeNum = 10000; // Auspice.us starting node number for newick -> json
int depth = 0;
// Add an extra root node because otherwise Auspice won't draw branch from big tree root to subtree
struct phyloTree *root = phyloTreeNewNode("wrapper");
phyloAddEdge(root, tree);
tree = root;
struct geneInfo *geneInfoList = getGeneInfoList(bigGenePredFile, ref);
struct seqWindow *gSeqWin = chromSeqWindowNew(db, chrom, 0, chromSize);
struct auspiceJsonInfo aji = { jw, sti->subtreeUserSampleIds, geneInfoList, gSeqWin,
- sampleMetadata, nodeNum };
+ sampleMetadata, nodeNum, source };
rTreeToAuspiceJson(tree, depth, &aji, NULL, NULL, NULL, NULL);
chromSeqWindowFree(&gSeqWin);
jsonWriteObjectEnd(jw);
fputs(jw->dy->string, outF);
jsonWriteFree(&jw);
fputs("}", outF);
carefulClose(&outF);
}