f22f0b5f1d467e570dad2407813748889408d9bd angie Tue Sep 14 09:38:11 2021 -0700 Don't rely on outputs printed only when usher is compiled with -DDEBUG=1; those are going away in server mode. We already weren't showing the 'Best nodes' output, so just remove that code. We already have sample mutations in seqInfo->sncList, so just reformat that into placementInfo->sampleMuts. diff --git src/hg/hgPhyloPlace/runUsher.c src/hg/hgPhyloPlace/runUsher.c index 9689bd8..605894e 100644 --- src/hg/hgPhyloPlace/runUsher.c +++ src/hg/hgPhyloPlace/runUsher.c @@ -5,33 +5,30 @@ #include "common.h" #include "dnautil.h" #include "hash.h" #include "linefile.h" #include "obscure.h" #include "parsimonyProto.h" #include "phyloPlace.h" #include "regexHelper.h" #include "pipeline.h" #include "trashDir.h" // Keywords in stderr output of usher: #define sampleIdPrefix "Sample name:" #define pScorePrefix "Parsimony score:" #define numPlacementsPrefix "Number of parsimony-optimal placements:" -#define bestNodePrefix "Best node (" -#define mutationsPrefix "Mutations: " -#define sampleMutsPrefix "Sample mutations:" #define imputedMutsPrefix "Imputed mutations:" static void parseSampleIdAndParsimonyScore(char **words, char **retSampleId, struct hash *samplePlacements) /* If words[] seems to contain columns of the line that gives sample ID and parsimony score, * then parse out those values. */ { // Example line: // words[0] = Current tree size (#nodes): 70775 // words[1] = Sample name: MyLabSequence2 // words[2] = Parsimony score: 1 // words[3] = Number of parsimony-optimal placements: 1 char *p = stringIn(sampleIdPrefix, words[1]); if (p) { @@ -62,129 +59,30 @@ static struct singleNucChange *parseSnc(char *sncStr) /* If sncStr looks like a <old><pos><new>-style single nucleotide change then parse out those * values & return singleNucChange (with parBase and newBase; no refBase), otherwise return NULL. */ { struct singleNucChange *snc = NULL; regmatch_t substrs[4]; if (regexMatchSubstr(sncStr, "^([ACGT])([0-9]+)([ACGT])$", substrs, ArraySize(substrs))) { int chromStart = regexSubstringInt(sncStr, substrs[2]) - 1; snc = sncNew(chromStart, '\0', sncStr[0], sncStr[substrs[3].rm_so]); } return snc; } -static struct variantPathNode *parsePipeyPath(char *pipeyPath) -/* Parse something like "|C8782T|T28144C| > |C29095T| > |G2494A|T11083G|C18501T|" - * into a variant path with unknown node names. */ -{ -if (isEmpty(pipeyPath)) - return NULL; -struct variantPathNode *variantPath = NULL; -char *words[strlen(pipeyPath) / 5]; -int wordCount = chopString(pipeyPath, " > ", words, ArraySize(words)); -int i; -for (i = 0; i < wordCount; i++) - { - struct variantPathNode *vpn; - AllocVar(vpn); - vpn->nodeName = cloneString("?"); - char *mutStr = words[i]; - // Trim first and last pipes - if (mutStr[0] == '|') - mutStr++; - if (mutStr[strlen(mutStr)-1] == '|') - mutStr[strlen(mutStr)-1] = '\0'; - // Split by pipe - char *muts[strlen(mutStr) / 4]; - int sncCount = chopString(mutStr, "|", muts, ArraySize(muts)); - int j; - for (j = 0; j < sncCount; j++) - { - struct singleNucChange *snc = parseSnc(muts[j]); - if (!snc) - errAbort("Expected single-nucleotide change but got '%s' when parsing pipe-separated " - "best node path", muts[j]); - slAddHead(&vpn->sncList, snc); - } - slReverse(&vpn->sncList); - slAddHead(&variantPath, vpn); - } -slReverse(&variantPath); -return variantPath; -} - -static boolean parseBestNode(char **words, struct placementInfo *info) -/* If the line starts with "Best node", parse out the name and path of the node - * (or one of the nodes) with the lowest parsimony distance from the sample being placed; - * add to info->bestNodes and return TRUE. */ -{ -// Example line (* means this node was chosen for placement): -// words[0] = Best node (child)*: 1239 -// words[1] = Mutations: |C8782T|T28144C| > |C29095T| > |T8782C| > |T29095C| > |C28144T| -// or -// words[0] = Best node (sibling): SomeLeafName -// words[1] = Mutations: |C8782T|T28144C| > |C29095T| > |T8782C| > |T29095C| > |C28144T| > |G11083T| > |G2494A|T11083G|C18501T| -boolean matches = FALSE; -if (stringIn(bestNodePrefix, words[0])) - { - matches = TRUE; - struct bestNodeInfo *bn; - AllocVar(bn); - char *p = words[0] + strlen(bestNodePrefix); - if (startsWith("sibling", p)) - bn->isSibling = TRUE; - boolean isChosen = (stringIn(")*:", words[0]) != NULL); - p = stringIn(": ", words[0]); - if (p) - bn->name = cloneString(p + strlen(": ")); - else - errAbort("parseBestNode: expected first column to have ': ' followed by name, but got '%s'", - words[0]); - if (startsWith(mutationsPrefix, words[1])) - bn->variantPath = parsePipeyPath(words[1] + strlen(mutationsPrefix)); - else - errAbort("parseBestNode: expected second column to have '" mutationsPrefix"' followed by " - "path, but got '%s'", words[1]); - if (isChosen) - slAddHead(&info->bestNodes, bn); - else - slAddTail(&info->bestNodes, bn); - } -return matches; -} - - -static boolean parseSampleMutations(char **words, struct placementInfo *info) -/* If words[] looks like it defines the sample mutations relative to the reference genome, - * then parse out the list and add to info->sampleMuts and return TRUE. */ -{ -// Example line: -// words[0] = Sample mutations: -// words[1] = |C241T| |C3037T| |C14408T| |A23403G| -boolean matches = FALSE; -if (stringIn(sampleMutsPrefix, words[0])) - { - matches = TRUE; - char *mutStr = words[1]; - stripChar(mutStr, '|'); - info->sampleMuts = slNameListFromString(mutStr, ' '); - } -return matches; -} - static boolean parseImputedMutations(char **words, struct placementInfo *info) /* If words[] looks like it defines imputed mutations of the most recently named sample, * then parse out the list and add to info->imputedBases and return TRUE. */ { // Example line: // words[0] = Imputed mutations: // words[1] = 6709:A;23403:G boolean matches = FALSE; if (stringIn(imputedMutsPrefix, words[0])) { matches = TRUE; char *muts[strlen(words[1]) / 4]; int mutCount = chopString(words[1], ";", muts, ArraySize(muts)); struct baseVal *bvList = NULL; int i; @@ -234,32 +132,30 @@ safencpy(lineCpy, sizeof lineCpy, line, size); char *words[16]; int wordCount = chopTabs(lineCpy, words); if (wordCount == 4) parseSampleIdAndParsimonyScore(words, &sampleId, samplePlacements); else if (wordCount == 2) { if (! sampleId) errAbort("Problem parsing stderr output of usher: " "Got line starting with '%s' that was not preceded by a line that " "defines sample ID.:\n%s", words[0], line); struct placementInfo *info = hashFindVal(samplePlacements, sampleId); if (!info) errAbort("Problem parsing stderr output of usher: " "Can't find placement info for sample '%s'", sampleId); - if (! parseBestNode(words, info) && - ! parseSampleMutations(words, info)) parseImputedMutations(words, info); } } } static void parseVariantPaths(char *filename, struct hash *samplePlacements) /* Parse out space-sep list of {node ID, ':', node-associated ,-sep variant list} into * variantPathNode list and associate with sample ID. */ { // Example line (note the back-mutation at 28144T... may want to highlight those): // words[0] = MySeq // words[1] = 1:C8782T,T28144C 2309:C29095T 2340:T8782C 2342:T29095C 2588:C28144T MySeq:C29867T struct lineFile *lf = lineFileOpen(filename, TRUE); char *line; while (lineFileNext(lf, &line, NULL))