b226cb3af001057dea4635de2d4c6c52f0ebe047 angie Sat Jun 20 14:34:08 2020 -0700 New scripts & vcf module for working with non-Nextstrain VCF and trees, e.g. Rob Lanfear's 40k sample build. Updates to other VCF & tree utils. sorta refs #25278, #25382 diff --git src/hg/utils/otto/nextstrainNcov/newick.py src/hg/utils/otto/nextstrainNcov/newick.py index fde0978..6c58179 100644 --- src/hg/utils/otto/nextstrainNcov/newick.py +++ src/hg/utils/otto/nextstrainNcov/newick.py @@ -1,17 +1,18 @@ # Parse Newick-formatted file into tree of { 'kids': [], 'label': '', 'length': '' } +import logging from utils import die def skipSpaces(treeString, offset): while (offset != len(treeString) and treeString[offset].isspace()): offset += 1 return offset def parseQuoted(treeString, offset): """Read in a quoted, possibly \-escaped string in treeString at offset""" label = '' quoteChar = treeString[offset] offset += 1 labelStart = offset while (offset != len(treeString) and treeString[offset] != quoteChar): if (treeString[offset] == '\\'): @@ -104,30 +105,32 @@ def parseFile(treeFile): """Read Newick file, return tree object""" with open(treeFile, 'r') as treeF: line1 = treeF.readline().strip() if (line1 == ''): return None (tree, offset, internalNode) = parseString(line1) if (offset != len(line1) and line1[offset] != ';'): die("Tree terminated without ';' before '" + line1[offset:offset+100] + "'") treeF.close() return tree def treeToString(node, pretty=False, indent=0): """Return a Newick string encoding node and its descendants, optionally pretty-printing with newlines and indentation. String is not ';'-terminated, caller must do that.""" + if not node: + return '' labelLen = '' if (node['label']): labelLen += node['label'] if (node['length']): labelLen += ':' + node['length'] if (node['kids']): string = '(' kidIndent = indent + 1 kidStrings = [ treeToString(kid, pretty, kidIndent) for kid in node['kids'] ] sep = ',' if (pretty): sep = ',\n' + ' ' * kidIndent string += sep.join(kidStrings) string += ')' string += labelLen @@ -135,15 +138,55 @@ string = labelLen return string; def printTree(tree, pretty=False, indent=0): """Print out Newick text encoding tree, optionally pretty-printing with newlines and indentation.""" print(treeToString(tree, pretty, indent) + ';') def leafNames(node): """Return a list of labels of all leaf descendants of node""" if (node['kids']): return [ leaf for kid in node['kids'] for leaf in leafNames(kid) ] else: return [ node['label'] ] +def treeIntersectIds(node, idLookup, sampleSet, lookupFunc=None): + """For each leaf in node, attempt to look up its label in idLookup; replace if found. + Prune nodes with no matching leaves. Store new leaf labels in sampleSet. + If lookupFunc is given, it is passed two arguments (label, idLookup) and returns a + possible empty list of matches.""" + if (node['kids']): + # Internal node: prune + prunedKids = [] + for kid in (node['kids']): + kidIntersected = treeIntersectIds(kid, idLookup, sampleSet, lookupFunc) + if (kidIntersected): + prunedKids.append(kidIntersected) + if (len(prunedKids) > 1): + node['kids'] = prunedKids + elif (len(prunedKids) == 1): + node = prunedKids[0] + else: + node = None + else: + # Leaf: lookup, prune if not found + label = node['label'] + if (lookupFunc): + matchList = lookupFunc(node['label'], idLookup) + elif label in idLookup: + matchList = idLookup[label] + else: + matchList = [] + if (not matchList): + logging.info("No match for leaf '" + label + "'") + node = None + else: + if (len(matchList) != 1): + logging.warn("Non-unique match for leaf '" + label + "': ['" + + "', '".join(matchList) + "']") + else: + logging.debug(label + ' --> ' + matchList[0]); + node['label'] = matchList[0] + sampleSet.add(matchList[0]) + return node +