src/hg/hgPhyloPlace/treeToAuspiceJson.c afafa0301ea4b14fbe1fbd5aa379c5351ecd640d

afafa0301ea4b14fbe1fbd5aa379c5351ecd640d
angie
  Tue Aug 2 19:41:44 2022 -0700
Add support for non-wuhCor1 genomes (e.g. monkeypox GenArk hub).
* Search in hgPhyloPlaceData for config.ra files, taking assembly name (minus hub prefix) from directory name.
* Add a menu input to the main page for switching between supported genomes if there are more than one.
* Replace hardcoded values or global vars with dnaSeq attributes, assembly metadata queries or new config.ra settings.
* Separate out SARS-CoV-2-specific help text like GISAID/CNCB descriptions.
* Support metadata columns for GenBank-specific stuff & Nextstrain lineages (for MPXV).
* also a little refactoring in runUsher in preparation for supporting usher server mode: parse new placement info file so we don't have to parse that data form usher stderr output.

TODO: update Nextstrain/Auspice JSON output to use appropriate metadata columns and support monkeypox genes.

diff --git src/hg/hgPhyloPlace/treeToAuspiceJson.c src/hg/hgPhyloPlace/treeToAuspiceJson.c
index 02e6bed..f055fd5 100644
--- src/hg/hgPhyloPlace/treeToAuspiceJson.c
+++ src/hg/hgPhyloPlace/treeToAuspiceJson.c
@@ -1,33 +1,30 @@
 /* Convert a (sub)tree with condensed nodes to JSON for Nextstrain to display, adding in sample
  * mutations, protein changes and metadata. */
 
 /* Copyright (C) 2020 The Regents of the University of California */
 
 #include "common.h"
 #include "hash.h"
 #include "hui.h"
 #include "jsonWrite.h"
 #include "linefile.h"
 #include "parsimonyProto.h"
 #include "phyloPlace.h"
 #include "phyloTree.h"
 #include "variantProjector.h"
 
-// Globals
-extern char *chrom;
-extern int chromSize;
 
 static void writeAuspiceMeta(FILE *outF, struct slName *subtreeUserSampleIds, char *source)
 /* Write metadata to configure Auspice display. */
 {
 fprintf(outF,
         "\"meta\": { "
         "\"title\": \"Subtree with %s", subtreeUserSampleIds->name);
 int sampleCount = slCount(subtreeUserSampleIds);
 if (sampleCount > 10)
     fprintf(outF, " and %d other uploaded samples", sampleCount - 1);
 else
     {
     struct slName *sln;
     for (sln = subtreeUserSampleIds->next;  sln != NULL;  sln = sln->next)
         fprintf(outF, ", %s", sln->name);
@@ -256,31 +253,31 @@
 if (nCladeUsher)
     jsonWriteObjectValue(jw, "Nextstrain_clade_usher", nCladeUsher);
 if (lineageUsher)
     jsonWriteObjectValue(jw, "pango_lineage_usher", lineageUsher);
 }
 
 static boolean changesProtein(struct singleNucChange *snc, struct geneInfo *gi,
                               struct seqWindow *gSeqWin,
                               int *retAaStart, char *retOldAa, char *retNewAa)
 /* If snc changes the coding sequence of gene, return TRUE and set ret values accordingly
  * (note amino acid values are single-base not strings). */
 {
 boolean isCodingChange = FALSE;
 if (snc->chromStart < gi->psl->tEnd && snc->chromStart >= gi->psl->tStart)
     {
-    struct bed3 gBed3 = { NULL, chrom, snc->chromStart, snc->chromStart+1 };
+    struct bed3 gBed3 = { NULL, gSeqWin->seqName, snc->chromStart, snc->chromStart+1 };
     char gAlt[2];
     safef(gAlt, sizeof(gAlt), "%c", snc->newBase);
     if (!sameString(gi->psl->strand, "+"))
         errAbort("changesProtein: only works for psl->strand='+', but gene '%s' has strand '%s'",
                  gi->psl->qName, gi->psl->strand);
     struct vpTx *vpTx = vpGenomicToTranscript(gSeqWin, &gBed3, gAlt, gi->psl, gi->txSeq);
     if (vpTx->start.region == vpExon)
         {
         int aaStart = vpTx->start.txOffset / 3;
         int codonStart = aaStart * 3;
         char newCodon[4];
         safencpy(newCodon, sizeof newCodon, gi->txSeq->dna + codonStart, 3);
         int codonOffset = vpTx->start.txOffset - codonStart;
         assert(codonOffset < sizeof newCodon);
         newCodon[codonOffset] = snc->newBase;
@@ -550,52 +547,53 @@
 struct phyloTree *node;
 AllocVar(node);
 AllocVar(node->ident);
 node->ident->name = cloneString(name);
 return node;
 }
 
 struct geneInfo *getGeneInfoList(char *bigGenePredFile, struct dnaSeq *refGenome)
 /* If config.ra has a source of gene annotations, then return the gene list. */
 {
 struct geneInfo *geneInfoList = NULL;
 if (isNotEmpty(bigGenePredFile) && fileExists(bigGenePredFile))
     {
     struct bbiFile *bbi = bigBedFileOpen(bigGenePredFile);
     struct lm *lm = lmInit(0);
-    struct bigBedInterval *bb, *bbList = bigBedIntervalQuery(bbi, chrom, 0, chromSize, 0, lm);
+    struct bigBedInterval *bb, *bbList = bigBedIntervalQuery(bbi, refGenome->name, 0,
+                                                             refGenome->size, 0, lm);
     for (bb = bbList;  bb != NULL;  bb = bb->next)
         {
-        struct genePredExt *gp = genePredFromBigGenePred(chrom, bb);
+        struct genePredExt *gp = genePredFromBigGenePred(refGenome->name, bb);
         if (gp->strand[0] != '+')
             errAbort("getGeneInfoList: strand must be '+' but got '%s' for gene %s",
                      gp->strand, gp->name);
         int txLen = 0;
         int ex;
         for (ex = 0;  ex < gp->exonCount;  ex++)
             txLen += (gp->exonEnds[ex] - gp->exonStarts[ex]);
         char *seq = needMem(txLen+1);
         int txOffset = 0;
         for (ex = 0;  ex < gp->exonCount;  ex++)
             {
             int exonLen = gp->exonEnds[ex] - gp->exonStarts[ex];
             safencpy(seq+txOffset, txLen+1-txOffset, refGenome->dna+gp->exonStarts[ex], exonLen);
             txOffset += exonLen;
             }
         struct geneInfo *gi;
         AllocVar(gi);
-        gi->psl = genePredToPsl((struct genePred *)gp, chromSize, txLen);
+        gi->psl = genePredToPsl((struct genePred *)gp, refGenome->size, txLen);
         gi->txSeq = newDnaSeq(seq, txLen, gp->name2);
         slAddHead(&geneInfoList, gi);
         }
     lmCleanup(&lm);
     bigBedFileClose(&bbi);
     }
 slReverse(&geneInfoList);
 return geneInfoList;
 }
 
 void treeToAuspiceJson(struct subtreeInfo *sti, char *db, struct geneInfo *geneInfoList,
                        struct seqWindow *gSeqWin, struct hash *sampleMetadata,
                        struct hash *sampleUrls, char *jsonFile, char *source)
 /* Write JSON for tree in Nextstrain's Augur/Auspice V2 JSON format
  * (https://github.com/nextstrain/augur/blob/master/augur/data/schema-export-v2.json). */