afafa0301ea4b14fbe1fbd5aa379c5351ecd640d
angie
  Tue Aug 2 19:41:44 2022 -0700
Add support for non-wuhCor1 genomes (e.g. monkeypox GenArk hub).
* Search in hgPhyloPlaceData for config.ra files, taking assembly name (minus hub prefix) from directory name.
* Add a menu input to the main page for switching between supported genomes if there are more than one.
* Replace hardcoded values or global vars with dnaSeq attributes, assembly metadata queries or new config.ra settings.
* Separate out SARS-CoV-2-specific help text like GISAID/CNCB descriptions.
* Support metadata columns for GenBank-specific stuff & Nextstrain lineages (for MPXV).
* also a little refactoring in runUsher in preparation for supporting usher server mode: parse new placement info file so we don't have to parse that data form usher stderr output.

TODO: update Nextstrain/Auspice JSON output to use appropriate metadata columns and support monkeypox genes.

diff --git src/hg/hgPhyloPlace/writeCustomTracks.c src/hg/hgPhyloPlace/writeCustomTracks.c
index 08ca4be..1ee0bde 100644
--- src/hg/hgPhyloPlace/writeCustomTracks.c
+++ src/hg/hgPhyloPlace/writeCustomTracks.c
@@ -1,40 +1,34 @@
 /* Write custom tracks: one per subtree and one with user's uploaded samples. */
 
 /* Copyright (C) 2020 The Regents of the University of California */
 
 #include "common.h"
 #include "hash.h"
+#include "hdb.h"
 #include "iupac.h"
 #include "parsimonyProto.h"
 #include "phyloPlace.h"
 #include "phyloTree.h"
 #include "trashDir.h"
 #include "vcf.h"
 
-// wuhCor1-specific:
-char *geneTrack = "ncbiGeneBGP";
-int maxVcfRows = 30000;
-
-// Globals
-extern char *chrom;
-extern int chromSize;
-
 // Parameter constants:
 int minSamplesForOwnTree = 3;  // If user uploads at least this many samples, show tree for them.
 
-static char ***imputedBasesByPosition(struct hash *samplePlacements, struct slName *sampleIds)
+static char ***imputedBasesByPosition(struct hash *samplePlacements, struct slName *sampleIds,
+                                      int chromSize)
 /* Unpack imputedBases into an array of (a few) arrays indexed by 0-based position and sample index,
  * for speedy retrieval while rewriting user VCF. */
 {
 char ***impsByPos = needMem(chromSize * sizeof(char **));
 int sampleCount = slCount(sampleIds);
 struct slName *sample;
 int ix;
 for (ix = 0, sample = sampleIds;  sample != NULL; ix++, sample = sample->next)
     {
     struct placementInfo *info = hashFindVal(samplePlacements, sample->name);
     if (!info)
         errAbort("imputedBasesByPosition: can't find placementInfo for sample '%s'", sample->name);
     struct baseVal *bv;
     for (bv = info->imputedBases;  bv != NULL;  bv = bv->next)
         {
@@ -48,45 +42,45 @@
 
 static boolean anyImputedBases(struct hash *samplePlacements, struct slName *sampleIds)
 /* Return TRUE if any sample has at least one imputed base. */
 {
 struct slName *sample;
 for (sample = sampleIds;  sample != NULL;  sample = sample->next)
     {
     struct placementInfo *info = hashFindVal(samplePlacements, sample->name);
     if (info && info->imputedBases)
         return TRUE;
     }
 return FALSE;
 }
 
 struct tempName *userVcfWithImputedBases(struct tempName *userVcfTn, struct hash *samplePlacements,
-                                         struct slName *sampleIds)
+                                         struct slName *sampleIds, int chromSize)
 /* If usher imputed the values of ambiguous bases in user's VCF, then swap in
  * the imputed values and write a new VCF to use as the custom track.  Otherwise just use
  * the user's VCF for the custom track. */
 {
 struct tempName *ctVcfTn = userVcfTn;
 if (anyImputedBases(samplePlacements, sampleIds))
     {
     AllocVar(ctVcfTn);
     trashDirFile(ctVcfTn, "ct", "imputed", ".vcf");
     FILE *f = mustOpen(ctVcfTn->forCgi, "w");
     struct lineFile *lf = lineFileOpen(userVcfTn->forCgi, TRUE);
     int sampleCount = slCount(sampleIds);
     int colCount = VCF_NUM_COLS_BEFORE_GENOTYPES + sampleCount;
-    char ***impsByPos = imputedBasesByPosition(samplePlacements, sampleIds);
+    char ***impsByPos = imputedBasesByPosition(samplePlacements, sampleIds, chromSize);
     char *line;
     int size;
     while (lineFileNext(lf, &line, &size))
         {
         if (line[0] == '#')
             fputs(line, f);
         else
             {
             char *words[colCount];
             int wordCount = chopTabs(line, words);
             lineFileExpectWords(lf, colCount, wordCount);
             int chromStart = atoi(words[1]) - 1;
             if (impsByPos[chromStart])
                 {
                 verbose(3, "%d ref=%s alt=%s\n", chromStart+1, words[3], words[4]);
@@ -195,76 +189,76 @@
 tree = phyloPruneToIds(tree, sampleIds);
 FILE *f = mustOpen(sampleTreeFile, "w");
 phyloPrintTree(tree, f);
 carefulClose(&f);
 return tree;
 }
 
 static int heightForSampleCount(int fontHeight, int count)
 /* Return a height sufficient for this many labels stacked vertically. */
 {
 return (fontHeight + 1) * count;
 }
 
 static struct phyloTree *addSampleOnlyCustomTrack(FILE *ctF, struct tempName *vcfTn,
                                                   char *bigTreePlusFile, struct slName *sampleIds,
-                                                  int fontHeight, int *pStartTime)
+                                                  char *geneTrack, int fontHeight, int *pStartTime)
 /* Make custom track with uploaded VCF.  If there are enough samples to make a non-trivial tree,
  * then make the tree by pruning down the big tree plus samples to only the uploaded samples. */
 {
 struct phyloTree *sampleTree = NULL;
 fprintf(ctF, "track name=uploadedSamples description='Uploaded sample genotypes' "
         "type=vcf visibility=pack "
         "hapClusterEnabled=on hapClusterHeight=%d ",
         heightForSampleCount(fontHeight, slCount(sampleIds)));
 if (slCount(sampleIds) >= minSamplesForOwnTree)
     {
     struct tempName sampleTreeTn;
     trashDirFile(&sampleTreeTn, "ct", "uploadedSamples", ".nwk");
     sampleTree = uploadedSamplesTree(sampleTreeTn.forCgi, bigTreePlusFile, sampleIds);
     fprintf(ctF, "hapClusterMethod='treeFile %s' ", sampleTreeTn.forCgi);
     }
 else
     fprintf(ctF, "hapClusterMethod=fileOrder ");
 if (isNotEmpty(geneTrack))
     fprintf(ctF, "hapClusterColorBy=function geneTrack=%s", geneTrack);
 fputc('\n', ctF);
 struct lineFile *vcfLf = lineFileOpen(vcfTn->forCgi, TRUE);
 char *line;
 while (lineFileNext(vcfLf, &line, NULL))
     fprintf(ctF, "%s\n",line);
 lineFileClose(&vcfLf);
 reportTiming(pStartTime, "add custom track for uploaded samples");
 return sampleTree;
 }
 
-static struct vcfFile *parseUserVcf(char *userVcfFile, int *pStartTime)
+static struct vcfFile *parseUserVcf(char *userVcfFile, int chromSize, int *pStartTime)
 /* Read in user VCF file and parse genotypes. */
 {
-struct vcfFile *userVcf = vcfFileMayOpen(userVcfFile, chrom, 0, chromSize, 0, maxVcfRows, TRUE);
+struct vcfFile *userVcf = vcfFileMayOpen(userVcfFile, NULL, 0, chromSize, 0, chromSize, TRUE);
 if (! userVcf)
     return NULL;
 reportTiming(pStartTime, "read userVcf");
 struct vcfRecord *rec;
 for (rec = userVcf->records;  rec != NULL;  rec = rec->next)
     vcfParseGenotypesGtOnly(rec);
 reportTiming(pStartTime, "parse userVcf genotypes");
 return userVcf;
 }
 
 static void writeSubtreeTrackLine(FILE *ctF, struct subtreeInfo *ti, int subtreeNum, char *source,
-                                  int fontHeight)
+                                  char *geneTrack, int fontHeight)
 /* Write a custom track line to ctF for one subtree. */
 {
 fprintf(ctF, "track name=subtree%d", subtreeNum);
 // Keep the description from being too long in order to avoid buffer overflow in hgTrackUi
 // (and limited space for track title in hgTracks)
 int descLen = fprintf(ctF, " description='Subtree %d: uploaded sample%s %s",
                       subtreeNum, (slCount(ti->subtreeUserSampleIds) > 1 ? "s" : ""),
                       ti->subtreeUserSampleIds->name);
 struct slName *id;
 for (id = ti->subtreeUserSampleIds->next;  id != NULL;  id = id->next)
     {
     if (descLen > 100)
         {
         fprintf(ctF, " and %d other samples", slCount(id));
         break;
@@ -455,32 +449,32 @@
         genotypes[ix] = '.';
     else
         {
         int altIx;
         for (altIx = 0;  altIx < altCount;  altIx++)
             if (altAlleles[altIx] == sampleAl)
                 break;
         if (altIx == altCount)
             errAbort("altsToGenotypes: sample %d allele '%c' not found in array of %d alts.",
                      ix, sampleAl, altCount);
         genotypes[ix] = '1' + altIx;
         }
     }
 }
 
-static void baseAllelesToVcf(char *refBases, char **sampleBases, int baseCount, int sampleCount,
-                             FILE *f)
+static void baseAllelesToVcf(char *refBases, char **sampleBases, char *chrom, int baseCount,
+                             int sampleCount, FILE *f)
 /* Write VCF rows with per-sample genotypes from refBases and sampleBases. */
 {
 int chromStart;
 for (chromStart = 0;  chromStart < baseCount;  chromStart++)
     {
     char ref = refBases[chromStart];
     if (ref != '\0')
         {
         // This position has a variant.  Tally up alternate alleles and counts and determine
         // allele indices to use when printing out genotypes.
         int maxAlts = 16;
         char altAlleles[maxAlts];
         int altCounts[maxAlts];
         int noCallCount;
         int altCount = tallyAlts(sampleBases[chromStart], ref, sampleCount,
@@ -499,88 +493,96 @@
         for (altIx = 1;  altIx < altCount;  altIx++)
         fprintf(f, ",%c", toupper(altAlleles[altIx]));
         fputs("\t.\t.\t", f);
         fprintf(f, "AC=%d", altCounts[0]);
         for (altIx = 1;  altIx < altCount;  altIx++)
             fprintf(f, ",%d", altCounts[altIx]);
         fprintf(f, ";AN=%d\tGT", sampleCount - noCallCount);
         int gtIx;
         for (gtIx = 0;  gtIx < sampleCount;  gtIx++)
             fprintf(f, "\t%c", genotypes[gtIx]);
         fputc('\n', f);
         }
     }
 }
 
-static void writeSubtreeVcf(FILE *f, struct hash *sampleToIx,
+static void writeSubtreeVcf(FILE *f, struct hash *sampleToIx, char *chrom, int chromSize,
                             struct vcfFile *userVcf, struct mutationAnnotatedTree *bigTree)
 /* Write rows of VCF with genotypes for samples in sampleToIx (with order determined by sampleToIx)
  * with some samples found in userVcf and the rest found in bigTree which has been annotated
  * with single-nucleotide variants. */
 {
 char refBases[chromSize];
 memset(refBases, 0, sizeof refBases);
 char *sampleBases[chromSize];
 memset(sampleBases, 0, sizeof sampleBases);
 treeToBaseAlleles(bigTree->tree, bigTree->condensedNodes, refBases, sampleBases, sampleToIx, NULL);
 vcfToBaseAlleles(userVcf, refBases, sampleBases, sampleToIx);
 int sampleCount = sampleToIx->elCount;
-baseAllelesToVcf(refBases, sampleBases, chromSize, sampleCount, f);
+baseAllelesToVcf(refBases, sampleBases, chrom, chromSize, sampleCount, f);
 int i;
 for (i = 0;  i < chromSize;  i++)
     freeMem(sampleBases[i]);
 }
 
 static void addSubtreeCustomTracks(FILE *ctF, char *userVcfFile, struct subtreeInfo *subtreeInfoList,
-                                   struct hash *samplePlacements,
+                                   struct hash *samplePlacements, char *chrom, int chromSize,
                                    struct mutationAnnotatedTree *bigTree,
-                                   char *source, int fontHeight, int *pStartTime)
+                                   char *source, char *geneTrack, int fontHeight, int *pStartTime)
 /* For each subtree trashfile, write VCF+treeFile custom track text to ctF. */
 {
-struct vcfFile *userVcf = parseUserVcf(userVcfFile, pStartTime);
+struct vcfFile *userVcf = parseUserVcf(userVcfFile, chromSize, pStartTime);
 if (! userVcf)
     {
     warn("Problem parsing VCF file with user variants '%s'; can't make subtree subtracks.",
          userVcfFile);
     return;
     }
 if (!bigTree)
     return;
 struct subtreeInfo *ti;
 int subtreeNum;
 for (subtreeNum = 1, ti = subtreeInfoList;  ti != NULL;  ti = ti->next, subtreeNum++)
     {
-    writeSubtreeTrackLine(ctF, ti, subtreeNum, source, fontHeight);
+    writeSubtreeTrackLine(ctF, ti, subtreeNum, source, geneTrack, fontHeight);
     writeVcfHeader(ctF, ti->subtreeNameList);
-    writeSubtreeVcf(ctF, ti->subtreeIdToIx, userVcf, bigTree);
+    writeSubtreeVcf(ctF, ti->subtreeIdToIx, chrom, chromSize, userVcf, bigTree);
     fputc('\n', ctF);
     }
 vcfFileFree(&userVcf);
 reportTiming(pStartTime, "write subtree custom tracks");
 }
 
-struct tempName *writeCustomTracks(struct tempName *vcfTn, struct usherResults *ur,
+struct tempName *writeCustomTracks(char *db, struct tempName *vcfTn, struct usherResults *ur,
                                    struct slName *sampleIds, struct mutationAnnotatedTree *bigTree,
-                                   char *source, int fontHeight, struct phyloTree **retSampleTree,
-                                   int *pStartTime)
+                                   char *source, int fontHeight,
+                                   struct phyloTree **retSampleTree, int *pStartTime)
 /* Write one custom track per subtree, and one custom track with just the user's uploaded samples. */
 {
-struct tempName *ctVcfTn = userVcfWithImputedBases(vcfTn, ur->samplePlacements, sampleIds);
+char *chrom = hDefaultChrom(db);
+int chromSize = hChromSize(db, chrom);
+char *geneTrack = phyloPlaceDbSetting(db, "geneTrack");
+// Default to SARS-CoV-2 or hMPXV values if setting is missing from config.ra.
+if (isEmpty(geneTrack))
+    geneTrack = sameString(db, "wuhCor1") ? "ncbiGeneBGP" : "ncbiGene";
+struct tempName *ctVcfTn = userVcfWithImputedBases(vcfTn, ur->samplePlacements, sampleIds,
+                                                   chromSize);
 struct tempName *ctTn;
 AllocVar(ctTn);
 trashDirFile(ctTn, "ct", "ct_pp", ".ct");
 FILE *ctF = mustOpen(ctTn->forCgi, "w");
 int subtreeCount = slCount(ur->subtreeInfoList);
 if (subtreeCount <= MAX_SUBTREE_CTS)
-    addSubtreeCustomTracks(ctF, ctVcfTn->forCgi, ur->subtreeInfoList, ur->samplePlacements, bigTree,
-                           source, fontHeight, pStartTime);
+    addSubtreeCustomTracks(ctF, ctVcfTn->forCgi, ur->subtreeInfoList, ur->samplePlacements,
+                           chrom, chromSize, bigTree, source, geneTrack, fontHeight, pStartTime);
 else
     printf("<p>Subtree custom tracks are added when there are at most %d subtrees, "
            "but %d subtrees were found.</p>\n",
            MAX_SUBTREE_CTS, subtreeCount);
 struct phyloTree *sampleTree = addSampleOnlyCustomTrack(ctF, ctVcfTn, ur->bigTreePlusTn->forCgi,
-                                                        sampleIds, fontHeight, pStartTime);
+                                                        sampleIds, geneTrack, fontHeight,
+                                                        pStartTime);
 if (retSampleTree)
     *retSampleTree = sampleTree;
 carefulClose(&ctF);
 return ctTn;
 }