afafa0301ea4b14fbe1fbd5aa379c5351ecd640d angie Tue Aug 2 19:41:44 2022 -0700 Add support for non-wuhCor1 genomes (e.g. monkeypox GenArk hub). * Search in hgPhyloPlaceData for config.ra files, taking assembly name (minus hub prefix) from directory name. * Add a menu input to the main page for switching between supported genomes if there are more than one. * Replace hardcoded values or global vars with dnaSeq attributes, assembly metadata queries or new config.ra settings. * Separate out SARS-CoV-2-specific help text like GISAID/CNCB descriptions. * Support metadata columns for GenBank-specific stuff & Nextstrain lineages (for MPXV). * also a little refactoring in runUsher in preparation for supporting usher server mode: parse new placement info file so we don't have to parse that data form usher stderr output. TODO: update Nextstrain/Auspice JSON output to use appropriate metadata columns and support monkeypox genes. diff --git src/hg/hgPhyloPlace/treeToAuspiceJson.c src/hg/hgPhyloPlace/treeToAuspiceJson.c index 02e6bed..f055fd5 100644 --- src/hg/hgPhyloPlace/treeToAuspiceJson.c +++ src/hg/hgPhyloPlace/treeToAuspiceJson.c @@ -1,33 +1,30 @@ /* Convert a (sub)tree with condensed nodes to JSON for Nextstrain to display, adding in sample * mutations, protein changes and metadata. */ /* Copyright (C) 2020 The Regents of the University of California */ #include "common.h" #include "hash.h" #include "hui.h" #include "jsonWrite.h" #include "linefile.h" #include "parsimonyProto.h" #include "phyloPlace.h" #include "phyloTree.h" #include "variantProjector.h" -// Globals -extern char *chrom; -extern int chromSize; static void writeAuspiceMeta(FILE *outF, struct slName *subtreeUserSampleIds, char *source) /* Write metadata to configure Auspice display. */ { fprintf(outF, "\"meta\": { " "\"title\": \"Subtree with %s", subtreeUserSampleIds->name); int sampleCount = slCount(subtreeUserSampleIds); if (sampleCount > 10) fprintf(outF, " and %d other uploaded samples", sampleCount - 1); else { struct slName *sln; for (sln = subtreeUserSampleIds->next; sln != NULL; sln = sln->next) fprintf(outF, ", %s", sln->name); @@ -256,31 +253,31 @@ if (nCladeUsher) jsonWriteObjectValue(jw, "Nextstrain_clade_usher", nCladeUsher); if (lineageUsher) jsonWriteObjectValue(jw, "pango_lineage_usher", lineageUsher); } static boolean changesProtein(struct singleNucChange *snc, struct geneInfo *gi, struct seqWindow *gSeqWin, int *retAaStart, char *retOldAa, char *retNewAa) /* If snc changes the coding sequence of gene, return TRUE and set ret values accordingly * (note amino acid values are single-base not strings). */ { boolean isCodingChange = FALSE; if (snc->chromStart < gi->psl->tEnd && snc->chromStart >= gi->psl->tStart) { - struct bed3 gBed3 = { NULL, chrom, snc->chromStart, snc->chromStart+1 }; + struct bed3 gBed3 = { NULL, gSeqWin->seqName, snc->chromStart, snc->chromStart+1 }; char gAlt[2]; safef(gAlt, sizeof(gAlt), "%c", snc->newBase); if (!sameString(gi->psl->strand, "+")) errAbort("changesProtein: only works for psl->strand='+', but gene '%s' has strand '%s'", gi->psl->qName, gi->psl->strand); struct vpTx *vpTx = vpGenomicToTranscript(gSeqWin, &gBed3, gAlt, gi->psl, gi->txSeq); if (vpTx->start.region == vpExon) { int aaStart = vpTx->start.txOffset / 3; int codonStart = aaStart * 3; char newCodon[4]; safencpy(newCodon, sizeof newCodon, gi->txSeq->dna + codonStart, 3); int codonOffset = vpTx->start.txOffset - codonStart; assert(codonOffset < sizeof newCodon); newCodon[codonOffset] = snc->newBase; @@ -550,52 +547,53 @@ struct phyloTree *node; AllocVar(node); AllocVar(node->ident); node->ident->name = cloneString(name); return node; } struct geneInfo *getGeneInfoList(char *bigGenePredFile, struct dnaSeq *refGenome) /* If config.ra has a source of gene annotations, then return the gene list. */ { struct geneInfo *geneInfoList = NULL; if (isNotEmpty(bigGenePredFile) && fileExists(bigGenePredFile)) { struct bbiFile *bbi = bigBedFileOpen(bigGenePredFile); struct lm *lm = lmInit(0); - struct bigBedInterval *bb, *bbList = bigBedIntervalQuery(bbi, chrom, 0, chromSize, 0, lm); + struct bigBedInterval *bb, *bbList = bigBedIntervalQuery(bbi, refGenome->name, 0, + refGenome->size, 0, lm); for (bb = bbList; bb != NULL; bb = bb->next) { - struct genePredExt *gp = genePredFromBigGenePred(chrom, bb); + struct genePredExt *gp = genePredFromBigGenePred(refGenome->name, bb); if (gp->strand[0] != '+') errAbort("getGeneInfoList: strand must be '+' but got '%s' for gene %s", gp->strand, gp->name); int txLen = 0; int ex; for (ex = 0; ex < gp->exonCount; ex++) txLen += (gp->exonEnds[ex] - gp->exonStarts[ex]); char *seq = needMem(txLen+1); int txOffset = 0; for (ex = 0; ex < gp->exonCount; ex++) { int exonLen = gp->exonEnds[ex] - gp->exonStarts[ex]; safencpy(seq+txOffset, txLen+1-txOffset, refGenome->dna+gp->exonStarts[ex], exonLen); txOffset += exonLen; } struct geneInfo *gi; AllocVar(gi); - gi->psl = genePredToPsl((struct genePred *)gp, chromSize, txLen); + gi->psl = genePredToPsl((struct genePred *)gp, refGenome->size, txLen); gi->txSeq = newDnaSeq(seq, txLen, gp->name2); slAddHead(&geneInfoList, gi); } lmCleanup(&lm); bigBedFileClose(&bbi); } slReverse(&geneInfoList); return geneInfoList; } void treeToAuspiceJson(struct subtreeInfo *sti, char *db, struct geneInfo *geneInfoList, struct seqWindow *gSeqWin, struct hash *sampleMetadata, struct hash *sampleUrls, char *jsonFile, char *source) /* Write JSON for tree in Nextstrain's Augur/Auspice V2 JSON format * (https://github.com/nextstrain/augur/blob/master/augur/data/schema-export-v2.json). */