afafa0301ea4b14fbe1fbd5aa379c5351ecd640d angie Tue Aug 2 19:41:44 2022 -0700 Add support for non-wuhCor1 genomes (e.g. monkeypox GenArk hub). * Search in hgPhyloPlaceData for config.ra files, taking assembly name (minus hub prefix) from directory name. * Add a menu input to the main page for switching between supported genomes if there are more than one. * Replace hardcoded values or global vars with dnaSeq attributes, assembly metadata queries or new config.ra settings. * Separate out SARS-CoV-2-specific help text like GISAID/CNCB descriptions. * Support metadata columns for GenBank-specific stuff & Nextstrain lineages (for MPXV). * also a little refactoring in runUsher in preparation for supporting usher server mode: parse new placement info file so we don't have to parse that data form usher stderr output. TODO: update Nextstrain/Auspice JSON output to use appropriate metadata columns and support monkeypox genes. diff --git src/hg/hgPhyloPlace/runUsher.c src/hg/hgPhyloPlace/runUsher.c index 60efed3..3c04bdc 100644 --- src/hg/hgPhyloPlace/runUsher.c +++ src/hg/hgPhyloPlace/runUsher.c @@ -59,72 +59,78 @@ static struct singleNucChange *parseSnc(char *sncStr) /* If sncStr looks like a <old><pos><new>-style single nucleotide change then parse out those * values & return singleNucChange (with parBase and newBase; no refBase), otherwise return NULL. */ { struct singleNucChange *snc = NULL; regmatch_t substrs[4]; if (regexMatchSubstr(sncStr, "^([ACGT])([0-9]+)([ACGT])$", substrs, ArraySize(substrs))) { int chromStart = regexSubstringInt(sncStr, substrs[2]) - 1; snc = sncNew(chromStart, '\0', sncStr[0], sncStr[substrs[3].rm_so]); } return snc; } -static boolean parseImputedMutations(char **words, struct placementInfo *info) -/* If words[] looks like it defines imputed mutations of the most recently named sample, - * then parse out the list and add to info->imputedBases and return TRUE. */ +static struct baseVal *bvListFromSemiColonSep(char *mutStr) +/* Parse a string of ;-sep'd position:allele & return a list of struct baseVal. */ { -// Example line: -// words[0] = Imputed mutations: -// words[1] = 6709:A;23403:G -boolean matches = FALSE; -if (stringIn(imputedMutsPrefix, words[0])) - { - matches = TRUE; - char *muts[strlen(words[1]) / 4]; - int mutCount = chopString(words[1], ";", muts, ArraySize(muts)); +char *muts[strlen(mutStr) / 4]; +int mutCount = chopString(mutStr, ";", muts, ArraySize(muts)); struct baseVal *bvList = NULL; int i; for (i = 0; i < mutCount; i++) { boolean problem = FALSE; char *colon = strchr(muts[i], ':'); if (colon) { int pos = atoi(muts[i]); char *val = cloneString(colon+1); if (pos < 1) problem = TRUE; else if (!isAllNt(val, strlen(val))) problem = TRUE; else { struct baseVal *bv; AllocVar(bv); bv->chromStart = pos - 1; bv->val = val; slAddHead(&bvList, bv); } } if (problem) errAbort("Problem parsing stderr output of usher: " "expected imputed mutation to be number:base, but got '%s'", muts[i]); } slReverse(&bvList); - info->imputedBases = bvList; +return bvList; +} + +static boolean parseImputedMutations(char **words, struct placementInfo *info) +/* If words[] looks like it defines imputed mutations of the most recently named sample, + * then parse out the list and add to info->imputedBases and return TRUE. */ +{ +// Example line: +// words[0] = Imputed mutations: +// words[1] = 6709:A;23403:G +boolean matches = FALSE; +if (stringIn(imputedMutsPrefix, words[0])) + { + matches = TRUE; + info->imputedBases = bvListFromSemiColonSep(words[1]); } return matches; } static void parseStderr(char *amsStderrFile, struct hash *samplePlacements) /* The stderr output of usher is where we find important info for each sample: * the path of variants on nodes from root to sample leaf, imputed values of ambiguous bases * (if any), and parsimony score. */ { struct lineFile *lf = lineFileOpen(amsStderrFile, TRUE); char *sampleId = NULL; int size; char *line; while (lineFileNext(lf, &line, &size)) { @@ -137,30 +143,60 @@ else if (wordCount == 2) { if (! sampleId) errAbort("Problem parsing stderr output of usher: " "Got line starting with '%s' that was not preceded by a line that " "defines sample ID.:\n%s", words[0], line); struct placementInfo *info = hashFindVal(samplePlacements, sampleId); if (!info) errAbort("Problem parsing stderr output of usher: " "Can't find placement info for sample '%s'", sampleId); parseImputedMutations(words, info); } } } +static void parsePlacements(char *outDirName, char *stderrName, struct hash *samplePlacements) +/* If usher created the file outdir/placement_stats.tsv then parse its contents into + * samplePlacements; otherwise parse the same info out of the stderr output. */ +{ +char placementsFileName[PATH_LEN]; +safef(placementsFileName, sizeof placementsFileName, "%s/placement_stats.tsv", outDirName); +if (fileExists(placementsFileName)) + { + struct lineFile *lf = lineFileOpen(placementsFileName, TRUE); + char *line; + while (lineFileNext(lf, &line, NULL)) + { + char *words[5]; + int wordCount = chopTabs(line, words); + lineFileExpectAtLeast(lf, 4, wordCount); + char *sampleId = words[0]; + struct placementInfo *info; + AllocVar(info); + hashAdd(samplePlacements, sampleId, info); + info->sampleId = cloneString(sampleId); + info->parsimonyScore = atoi(words[1]); + info->bestNodeCount = atoi(words[2]); + info->imputedBases = bvListFromSemiColonSep(words[3]); + } + lineFileClose(&lf); + } +else + parseStderr(stderrName, samplePlacements); +} + static void parseVariantPaths(char *filename, struct hash *samplePlacements) /* Parse out space-sep list of {node ID, ':', node-associated ,-sep variant list} into * variantPathNode list and associate with sample ID. */ { // Example line (note the back-mutation at 28144T... may want to highlight those): // words[0] = MySeq // words[1] = 1:C8782T,T28144C 2309:C29095T 2340:T8782C 2342:T29095C 2588:C28144T MySeq:C29867T struct lineFile *lf = lineFileOpen(filename, TRUE); char *line; while (lineFileNext(lf, &line, NULL)) { char *words[3]; int wordCount = chopTabs(line, words); lineFileExpectWords(lf, 2, wordCount); char *sampleId = words[0]; @@ -720,48 +756,48 @@ return subtreeCount; } #define MAX_SUBTREES 1000 struct usherResults *runUsher(char *usherPath, char *usherAssignmentsPath, char *vcfFile, int subtreeSize, struct slName *userSampleIds, struct hash *condensedNodes, int *pStartTime) /* Open a pipe from Yatish Turakhia's usher program, save resulting big trees and * subtrees to trash files, return list of slRef to struct tempName for the trash files * and parse other results out of stderr output. */ { struct usherResults *results = usherResultsNew(); char subtreeSizeStr[16]; safef(subtreeSizeStr, sizeof subtreeSizeStr, "%d", subtreeSize); -char *numThreadsStr = "16"; struct tempName tnOutDir; trashDirFile(&tnOutDir, "ct", "usher_outdir", ".dir"); char *cmd[] = { usherPath, "-v", vcfFile, "-i", usherAssignmentsPath, "-d", tnOutDir.forCgi, - "-k", subtreeSizeStr, "-K", SINGLE_SUBTREE_SIZE, "-T", numThreadsStr, "-u", + "-k", subtreeSizeStr, "-K", SINGLE_SUBTREE_SIZE, "-T", USHER_NUM_THREADS, "-u", NULL }; char **cmds[] = { cmd, NULL }; struct tempName tnStderr; trashDirFile(&tnStderr, "ct", "usher_stderr", ".txt"); struct pipeline *pl = pipelineOpen(cmds, pipelineRead, NULL, tnStderr.forCgi, 0); pipelineClose(&pl); reportTiming(pStartTime, "run usher"); parseStderr(tnStderr.forCgi, results->samplePlacements); struct tempName *singleSubtreeTn = NULL, *subtreeTns[MAX_SUBTREES]; struct variantPathNode *singleSubtreeMuts = NULL, *subtreeMuts[MAX_SUBTREES]; -int subtreeCount = processOutDirFiles(results, tnOutDir.forCgi, &singleSubtreeTn, &singleSubtreeMuts, - subtreeTns, subtreeMuts, MAX_SUBTREES); +parsePlacements(tnOutDir.forCgi, tnStderr.forCgi, results->samplePlacements); +int subtreeCount = processOutDirFiles(results, tnOutDir.forCgi, &singleSubtreeTn, + &singleSubtreeMuts, subtreeTns, subtreeMuts, MAX_SUBTREES); if (singleSubtreeTn == NULL) { warn("Sorry, there was a problem running usher. " "Please ask genome-www@soe.ucsc.edu to take a look at %s.", tnStderr.forCgi); return NULL; } results->subtreeInfoList = parseSubtrees(subtreeCount, singleSubtreeTn, singleSubtreeMuts, subtreeTns, subtreeMuts, userSampleIds, condensedNodes); results->singleSubtreeInfo = results->subtreeInfoList; results->subtreeInfoList = results->subtreeInfoList->next; return results; } static void addEmptyPlacements(struct slName *sampleIds, struct hash *samplePlacements) /* Parsing an usher-style clades.txt file from matUtils extract requires samplePlacements to