17cdc50389b482ee9f3f1ce649ed297201269d53 angie Thu Feb 25 20:41:01 2021 -0800 If usher's output includes clade and lineage assignments, parse them and include them in the summary table. diff --git src/hg/hgPhyloPlace/runUsher.c src/hg/hgPhyloPlace/runUsher.c index 4903c14..6987b4a 100644 --- src/hg/hgPhyloPlace/runUsher.c +++ src/hg/hgPhyloPlace/runUsher.c @@ -635,30 +635,58 @@ struct subtreeInfo *subtreeInfoList = NULL; int sIx; for (sIx = 0; sIx < subtreeCount; sIx++) { struct subtreeInfo *ti = parseOneSubtree(subtreeTns[sIx], subtreeMuts[sIx], userSampleIds, condensedNodes); slAddHead(&subtreeInfoList, ti); } slReverse(&subtreeInfoList); struct subtreeInfo *ti = parseOneSubtree(singleSubtreeTn, singleSubtreeMuts, userSampleIds, condensedNodes); slAddHead(&subtreeInfoList, ti); return subtreeInfoList; } +static void parseClades(char *filename, struct hash *samplePlacements) +/* Parse usher's clades.txt, which might have {sample, clade} or {sample, clade, lineage}. */ +{ +struct hash *wordStore = hashNew(0); +struct lineFile *lf = lineFileOpen(filename, TRUE); +char *line; +while (lineFileNext(lf, &line, NULL)) + { + char *words[3]; + int wordCount = chopTabs(line, words); + char *sampleId = words[0]; + struct placementInfo *info = hashFindVal(samplePlacements, sampleId); + if (!info) + errAbort("parseClades: can't find placementInfo for sample '%s'", sampleId); + if (wordCount > 1) + { + // Nextstrain's clade "20E (EU1)" has to be tweaked to "20E.EU1" for matUtils to avoid + // whitespace trouble; tweak it back. + if (sameString(words[1], "20E.EU1")) + words[1] = "20E (EU1)"; + info->nextClade = hashStoreName(wordStore, words[1]); + } + if (wordCount > 2) + info->pangoLineage = hashStoreName(wordStore, words[2]); + } +lineFileClose(&lf); +} + static char *dirPlusFile(struct dyString *dy, char *dir, char *file) /* Write dir/file into dy and return pointer to dy->string. */ { dyStringClear(dy); dyStringPrintf(dy, "%s/%s", dir, file); return dy->string; } static int processOutDirFiles(struct usherResults *results, char *outDir, struct tempName **retSingleSubtreeTn, struct variantPathNode **retSingleSubtreeMuts, struct tempName *subtreeTns[], struct variantPathNode *subtreeMuts[], int maxSubtrees) /* Get paths to files in outDir; parse them, move files that we'll keep up to trash/ct/, * remove outDir. */ @@ -755,30 +783,34 @@ if (sameString(parts[2], "mutations.txt")) { if (retSingleSubtreeMuts) *retSingleSubtreeMuts = parseSubtreeMutations(path); } else if (sameString(parts[2], "expanded.txt")) { // Don't need this, just remove it } else warn("Unexpected filename '%s' from usher, ignoring", file->name); } else warn("Unexpected filename '%s' from usher, ignoring", file->name); } + else if (sameString(file->name, "clades.txt")) + { + parseClades(path, results->samplePlacements); + } else if (sameString(file->name, "final-tree.nh")) { // Don't need this, just remove it. } else warn("Unexpected filename '%s' from usher, ignoring", file->name); unlink(path); } rmdir(outDir); // Make sure we got a complete range of subtrees [0..subtreeCount-1] int i; for (i = 0; i < subtreeCount; i++) { if (subtreeTns[i] == NULL) errAbort("Missing file subtree-%d.nh in usher results", i+1);