src/hg/hgPhyloPlace/runUsher.c afafa0301ea4b14fbe1fbd5aa379c5351ecd640d

afafa0301ea4b14fbe1fbd5aa379c5351ecd640d
angie
  Tue Aug 2 19:41:44 2022 -0700
Add support for non-wuhCor1 genomes (e.g. monkeypox GenArk hub).
* Search in hgPhyloPlaceData for config.ra files, taking assembly name (minus hub prefix) from directory name.
* Add a menu input to the main page for switching between supported genomes if there are more than one.
* Replace hardcoded values or global vars with dnaSeq attributes, assembly metadata queries or new config.ra settings.
* Separate out SARS-CoV-2-specific help text like GISAID/CNCB descriptions.
* Support metadata columns for GenBank-specific stuff & Nextstrain lineages (for MPXV).
* also a little refactoring in runUsher in preparation for supporting usher server mode: parse new placement info file so we don't have to parse that data form usher stderr output.

TODO: update Nextstrain/Auspice JSON output to use appropriate metadata columns and support monkeypox genes.

diff --git src/hg/hgPhyloPlace/runUsher.c src/hg/hgPhyloPlace/runUsher.c
index 60efed3..3c04bdc 100644
--- src/hg/hgPhyloPlace/runUsher.c
+++ src/hg/hgPhyloPlace/runUsher.c
@@ -59,72 +59,78 @@
 
 static struct singleNucChange *parseSnc(char *sncStr)
 /* If sncStr looks like a <old><pos><new>-style single nucleotide change then parse out those
  * values & return singleNucChange (with parBase and newBase; no refBase), otherwise return NULL.  */
 {
 struct singleNucChange *snc = NULL;
 regmatch_t substrs[4];
 if (regexMatchSubstr(sncStr, "^([ACGT])([0-9]+)([ACGT])$", substrs, ArraySize(substrs)))
     {
     int chromStart = regexSubstringInt(sncStr, substrs[2]) - 1;
     snc = sncNew(chromStart, '\0', sncStr[0], sncStr[substrs[3].rm_so]);
     }
 return snc;
 }
 
-static boolean parseImputedMutations(char **words, struct placementInfo *info)
-/* If words[] looks like it defines imputed mutations of the most recently named sample,
- * then parse out the list and add to info->imputedBases and return TRUE. */
+static struct baseVal *bvListFromSemiColonSep(char *mutStr)
+/* Parse a string of ;-sep'd position:allele & return a list of struct baseVal. */
 {
-// Example line:
-// words[0] = Imputed mutations: 
-// words[1] = 6709:A;23403:G
-boolean matches = FALSE;
-if (stringIn(imputedMutsPrefix, words[0]))
-    {
-    matches = TRUE;
-    char *muts[strlen(words[1]) / 4];
-    int mutCount = chopString(words[1], ";", muts, ArraySize(muts));
+char *muts[strlen(mutStr) / 4];
+int mutCount = chopString(mutStr, ";", muts, ArraySize(muts));
 struct baseVal *bvList = NULL;
 int i;
 for (i = 0;  i < mutCount;  i++)
     {
     boolean problem = FALSE;
     char *colon = strchr(muts[i], ':');
     if (colon)
         {
         int pos = atoi(muts[i]);
         char *val = cloneString(colon+1);
         if (pos < 1)
             problem = TRUE;
         else if (!isAllNt(val, strlen(val)))
             problem = TRUE;
         else
             {
             struct baseVal *bv;
             AllocVar(bv);
             bv->chromStart = pos - 1;
             bv->val = val;
             slAddHead(&bvList, bv);
             }
         }
     if (problem)
         errAbort("Problem parsing stderr output of usher: "
                  "expected imputed mutation to be number:base, but got '%s'", muts[i]);
     }
 slReverse(&bvList);
-    info->imputedBases = bvList;
+return bvList;
+}
+
+static boolean parseImputedMutations(char **words, struct placementInfo *info)
+/* If words[] looks like it defines imputed mutations of the most recently named sample,
+ * then parse out the list and add to info->imputedBases and return TRUE. */
+{
+// Example line:
+// words[0] = Imputed mutations: 
+// words[1] = 6709:A;23403:G
+boolean matches = FALSE;
+if (stringIn(imputedMutsPrefix, words[0]))
+    {
+    matches = TRUE;
+    info->imputedBases = bvListFromSemiColonSep(words[1]);
     }
 return matches;
 }
 
 static void parseStderr(char *amsStderrFile, struct hash *samplePlacements)
 /* The stderr output of usher is where we find important info for each sample:
  * the path of variants on nodes from root to sample leaf, imputed values of ambiguous bases
  * (if any), and parsimony score. */
 {
 struct lineFile *lf = lineFileOpen(amsStderrFile, TRUE);
 char *sampleId = NULL;
 int size;
 char *line;
 while (lineFileNext(lf, &line, &size))
     {
@@ -137,30 +143,60 @@
     else if (wordCount == 2)
         {
         if (! sampleId)
             errAbort("Problem parsing stderr output of usher: "
                      "Got line starting with '%s' that was not preceded by a line that "
                      "defines sample ID.:\n%s", words[0], line);
         struct placementInfo *info = hashFindVal(samplePlacements, sampleId);
         if (!info)
             errAbort("Problem parsing stderr output of usher: "
                      "Can't find placement info for sample '%s'", sampleId);
         parseImputedMutations(words, info);
         }
     }
 }
 
+static void parsePlacements(char *outDirName, char *stderrName, struct hash *samplePlacements)
+/* If usher created the file outdir/placement_stats.tsv then parse its contents into
+ * samplePlacements; otherwise parse the same info out of the stderr output. */
+{
+char placementsFileName[PATH_LEN];
+safef(placementsFileName, sizeof placementsFileName, "%s/placement_stats.tsv", outDirName);
+if (fileExists(placementsFileName))
+    {
+    struct lineFile *lf = lineFileOpen(placementsFileName, TRUE);
+    char *line;
+    while (lineFileNext(lf, &line, NULL))
+        {
+        char *words[5];
+        int wordCount = chopTabs(line, words);
+        lineFileExpectAtLeast(lf, 4, wordCount);
+        char *sampleId = words[0];
+        struct placementInfo *info;
+        AllocVar(info);
+        hashAdd(samplePlacements, sampleId, info);
+        info->sampleId = cloneString(sampleId);
+        info->parsimonyScore = atoi(words[1]);
+        info->bestNodeCount = atoi(words[2]);
+        info->imputedBases = bvListFromSemiColonSep(words[3]);
+        }
+    lineFileClose(&lf);
+    }
+else
+    parseStderr(stderrName, samplePlacements);
+}
+
 static void parseVariantPaths(char *filename, struct hash *samplePlacements)
 /* Parse out space-sep list of {node ID, ':', node-associated ,-sep variant list} into
  * variantPathNode list and associate with sample ID. */
 {
 // Example line (note the back-mutation at 28144T... may want to highlight those):
 // words[0] = MySeq
 // words[1] = 1:C8782T,T28144C 2309:C29095T 2340:T8782C 2342:T29095C 2588:C28144T MySeq:C29867T 
 struct lineFile *lf = lineFileOpen(filename, TRUE);
 char *line;
 while (lineFileNext(lf, &line, NULL))
     {
     char *words[3];
     int wordCount = chopTabs(line, words);
     lineFileExpectWords(lf, 2, wordCount);
     char *sampleId = words[0];
@@ -720,48 +756,48 @@
 return subtreeCount;
 }
 
 #define MAX_SUBTREES 1000
 
 struct usherResults *runUsher(char *usherPath, char *usherAssignmentsPath, char *vcfFile,
                               int subtreeSize, struct slName *userSampleIds,
                               struct hash *condensedNodes, int *pStartTime)
 /* Open a pipe from Yatish Turakhia's usher program, save resulting big trees and
  * subtrees to trash files, return list of slRef to struct tempName for the trash files
  * and parse other results out of stderr output. */
 {
 struct usherResults *results = usherResultsNew();
 char subtreeSizeStr[16];
 safef(subtreeSizeStr, sizeof subtreeSizeStr, "%d", subtreeSize);
-char *numThreadsStr = "16";
 struct tempName tnOutDir;
 trashDirFile(&tnOutDir, "ct", "usher_outdir", ".dir");
 char *cmd[] = { usherPath, "-v", vcfFile, "-i", usherAssignmentsPath, "-d", tnOutDir.forCgi,
-                "-k", subtreeSizeStr, "-K", SINGLE_SUBTREE_SIZE, "-T", numThreadsStr, "-u",
+                "-k", subtreeSizeStr, "-K", SINGLE_SUBTREE_SIZE, "-T", USHER_NUM_THREADS, "-u",
                 NULL };
 char **cmds[] = { cmd, NULL };
 struct tempName tnStderr;
 trashDirFile(&tnStderr, "ct", "usher_stderr", ".txt");
 struct pipeline *pl = pipelineOpen(cmds, pipelineRead, NULL, tnStderr.forCgi, 0);
 pipelineClose(&pl);
 reportTiming(pStartTime, "run usher");
 parseStderr(tnStderr.forCgi, results->samplePlacements);
 
 struct tempName *singleSubtreeTn = NULL, *subtreeTns[MAX_SUBTREES];
 struct variantPathNode *singleSubtreeMuts = NULL, *subtreeMuts[MAX_SUBTREES];
-int subtreeCount = processOutDirFiles(results, tnOutDir.forCgi, &singleSubtreeTn, &singleSubtreeMuts,
-                                      subtreeTns, subtreeMuts, MAX_SUBTREES);
+parsePlacements(tnOutDir.forCgi, tnStderr.forCgi, results->samplePlacements);
+int subtreeCount = processOutDirFiles(results, tnOutDir.forCgi, &singleSubtreeTn,
+                                      &singleSubtreeMuts, subtreeTns, subtreeMuts, MAX_SUBTREES);
 if (singleSubtreeTn == NULL)
     {
     warn("Sorry, there was a problem running usher.  "
          "Please ask genome-www@soe.ucsc.edu to take a look at %s.", tnStderr.forCgi);
     return NULL;
     }
 results->subtreeInfoList = parseSubtrees(subtreeCount, singleSubtreeTn, singleSubtreeMuts,
                                          subtreeTns, subtreeMuts, userSampleIds, condensedNodes);
 results->singleSubtreeInfo = results->subtreeInfoList;
 results->subtreeInfoList = results->subtreeInfoList->next;
 return results;
 }
 
 static void addEmptyPlacements(struct slName *sampleIds, struct hash *samplePlacements)
 /* Parsing an usher-style clades.txt file from matUtils extract requires samplePlacements to