src/hg/hgPhyloPlace/runUsher.c 17cdc50389b482ee9f3f1ce649ed297201269d53

17cdc50389b482ee9f3f1ce649ed297201269d53
angie
  Thu Feb 25 20:41:01 2021 -0800
If usher's output includes clade and lineage assignments, parse them and include them in the summary table.

diff --git src/hg/hgPhyloPlace/runUsher.c src/hg/hgPhyloPlace/runUsher.c
index 4903c14..6987b4a 100644
--- src/hg/hgPhyloPlace/runUsher.c
+++ src/hg/hgPhyloPlace/runUsher.c
@@ -635,30 +635,58 @@
 struct subtreeInfo *subtreeInfoList = NULL;
 int sIx;
 for (sIx = 0;  sIx < subtreeCount;  sIx++)
     {
     struct subtreeInfo *ti = parseOneSubtree(subtreeTns[sIx], subtreeMuts[sIx], userSampleIds,
                                              condensedNodes);
     slAddHead(&subtreeInfoList, ti);
     }
 slReverse(&subtreeInfoList);
 struct subtreeInfo *ti = parseOneSubtree(singleSubtreeTn, singleSubtreeMuts, userSampleIds,
                                          condensedNodes);
 slAddHead(&subtreeInfoList, ti);
 return subtreeInfoList;
 }
 
+static void parseClades(char *filename, struct hash *samplePlacements)
+/* Parse usher's clades.txt, which might have {sample, clade} or {sample, clade, lineage}
+ * (tab-separated), and store the clade and lineage names in the placementInfo that
+ * samplePlacements holds for each sample.  Lines that yield no fields are skipped.
+ * errAbort if a sample is not found in samplePlacements. */
+{
+// The placementInfo fields below are set to values returned by hashStoreName(wordStore, ...),
+// so wordStore is not freed here (presumably those values point into the hash -- confirm).
+struct hash *wordStore = hashNew(0);
+struct lineFile *lf = lineFileOpen(filename, TRUE);
+char *line;
+while (lineFileNext(lf, &line, NULL))
+    {
+    char *words[3];
+    int wordCount = chopTabs(line, words);
+    // When no fields are found (e.g. an empty line), words[0] has presumably not been set by
+    // chopTabs, so skip the line instead of reading an uninitialized pointer.
+    if (wordCount < 1)
+        continue;
+    char *sampleId = words[0];
+    struct placementInfo *info = hashFindVal(samplePlacements, sampleId);
+    if (!info)
+        errAbort("parseClades: can't find placementInfo for sample '%s'", sampleId);
+    if (wordCount > 1)
+        {
+        // Nextstrain's clade "20E (EU1)" has to be tweaked to "20E.EU1" for matUtils to avoid
+        // whitespace trouble; tweak it back.
+        if (sameString(words[1], "20E.EU1"))
+            words[1] = "20E (EU1)";
+        info->nextClade = hashStoreName(wordStore, words[1]);
+        }
+    // The third column (lineage) is optional.
+    if (wordCount > 2)
+        info->pangoLineage = hashStoreName(wordStore, words[2]);
+    }
+lineFileClose(&lf);
+}
+
 static char *dirPlusFile(struct dyString *dy, char *dir, char *file)
 /* Write dir/file into dy and return pointer to dy->string. */
 {
 dyStringClear(dy);
 dyStringPrintf(dy, "%s/%s", dir, file);
 return dy->string;
 }
 
 static int processOutDirFiles(struct usherResults *results, char *outDir,
                               struct tempName **retSingleSubtreeTn,
                               struct variantPathNode **retSingleSubtreeMuts,
                               struct tempName *subtreeTns[], struct variantPathNode *subtreeMuts[],
                               int maxSubtrees)
 /* Get paths to files in outDir; parse them, move files that we'll keep up to trash/ct/,
  * remove outDir. */
@@ -755,30 +783,34 @@
             if (sameString(parts[2], "mutations.txt"))
                 {
                 if (retSingleSubtreeMuts)
                     *retSingleSubtreeMuts = parseSubtreeMutations(path);
                 }
             else if (sameString(parts[2], "expanded.txt"))
                 {
                 // Don't need this, just remove it
                 }
             else
                 warn("Unexpected filename '%s' from usher, ignoring", file->name);
             }
         else
             warn("Unexpected filename '%s' from usher, ignoring", file->name);
         }
+    else if (sameString(file->name, "clades.txt"))
+        {
+        parseClades(path, results->samplePlacements);
+        }
     else if (sameString(file->name, "final-tree.nh"))
         {
         // Don't need this, just remove it.
         }
     else
         warn("Unexpected filename '%s' from usher, ignoring", file->name);
     unlink(path);
     }
 rmdir(outDir);
 // Make sure we got a complete range of subtrees [0..subtreeCount-1]
 int i;
 for (i = 0;  i < subtreeCount;  i++)
     {
     if (subtreeTns[i] == NULL)
         errAbort("Missing file subtree-%d.nh in usher results", i+1);