2e6ddaec223b5b760cbba134eee52d14a327fda3 angie Fri May 29 15:22:45 2020 -0700 Adding tree manipulation python scripts and modules that I've been working on for David et al. sorta refs #25278, #25382 diff --git src/hg/utils/otto/nextstrainNcov/newick.py src/hg/utils/otto/nextstrainNcov/newick.py new file mode 100644 index 0000000..3ae8281 --- /dev/null +++ src/hg/utils/otto/nextstrainNcov/newick.py @@ -0,0 +1,148 @@ +# Parse Newick-formatted file into tree of { 'kids': [], 'label': '', 'length': '' } + +from utils import die + +def skipSpaces(treeString, offset): + while (offset != len(treeString) and treeString[offset].isspace()): + offset += 1 + return offset + +def parseQuoted(treeString, offset): + """Read in a quoted, possibly \-escaped string in treeString at offset""" + label = '' + quoteChar = treeString[offset] + offset += 1 + labelStart = offset + while (offset != len(treeString) and treeString[offset] != quoteChar): + if (treeString[offset] == '\\'): + offset += 1 + offset += 1 + if (treeString[offset] != quoteChar): + die("Missing end-" + quoteChar + " after '" + treeString + "'") + else: + label = treeString[labelStart:offset] + offset = skipSpaces(treeString, offset + 1) + return (label, offset) + +def terminatesLabel(treeString, offset): + """Return True if treeString+offset is empty or starts w/char that would terminate a label""" + return (offset == len(treeString) or treeString[offset] == ',' or treeString[offset] == ')' or + treeString[offset] == ';' or treeString[offset] == ':') + +def parseLabel(treeString, offset): + """Read in a possibly quoted, possibly \-escaped node label terminated by [,):;]""" + if (offset == len(treeString)): + label = '' + elif (treeString[offset] == "'" or treeString[offset] == '"'): + (label, offset) = parseQuoted(treeString, offset) + else: + labelStart = offset + while (not terminatesLabel(treeString, offset)): + if (treeString[offset] == '\\'): + offset += 1 + offset += 1 + label = treeString[labelStart:offset] + offset = skipSpaces(treeString, offset) + return(label, offset) + +def parseLength(treeString, offset): + """If treeString[offset] is ':', then parse the number that follows; otherwise return 0.0.""" + if (offset != len(treeString) and treeString[offset] == ':'): + # Branch length + offset = skipSpaces(treeString, offset + 1) + if (not treeString[offset].isdigit()): + die("Expected number to follow ':' but instead got '" + + treeString[offset:offset+100] + "'") + lengthStart = offset + while (offset != len(treeString) and + (treeString[offset].isdigit() or treeString[offset] == '.' or + treeString[offset] == 'E' or treeString[offset] == 'e' or + treeString[offset] == '-')): + offset += 1 + lengthStr = treeString[lengthStart:offset] + offset = skipSpaces(treeString, offset) + return (lengthStr, offset) + else: + return ('', offset) + +def parseBranch(treeString, offset): + """Recursively parse Newick branch (x, y, z)[label][:length] from treeString at offset""" + if (treeString[offset] != '('): + die("parseBranch called on treeString that doesn't begin with '(': '" + + treeString + "'") + branchStart = offset + branch = { 'kids': [], 'label': '', 'length': '' } + offset = skipSpaces(treeString, offset + 1) + while (offset != len(treeString) and treeString[offset] != ')' and treeString[offset] != ';'): + (child, offset) = parseString(treeString, offset) + branch['kids'].append(child) + if (treeString[offset] == ','): + offset = skipSpaces(treeString, offset + 1) + if (offset == len(treeString)): + die("Input ended before ')' for '" + treeString[branchStart:branchStart+100] + "'") + if (treeString[offset] == ')'): + offset = skipSpaces(treeString, offset + 1) + else: + die("Can't find ')' matching '" + treeString[branchStart:branchStart+100] + "', " + + "instead got '" + treeString[offset:offset+100] + "'") + (branch['label'], offset) = parseLabel(treeString, offset) + (branch['length'], offset) = parseLength(treeString, offset) + return (branch, offset) + +def parseString(treeString, offset=0): + """Recursively parse Newick tree from treeString""" + offset = skipSpaces(treeString, offset) + if (treeString[offset] == '('): + return parseBranch(treeString, offset) + else: + (label, offset) = parseLabel(treeString, offset) + (length, offset) = parseLength(treeString, offset) + leaf = { 'kids': None, 'label': label, 'length': length } + return (leaf, offset) + +def parseFile(treeFile): + """Read Newick file, return tree object""" + with open(treeFile, 'r') as treeF: + line1 = treeF.readline().strip() + if (line1 == ''): + return None + (tree, offset) = parseString(line1) + if (offset != len(line1) and line1[offset] != ';'): + die("Tree terminated without ';' before '" + line1[offset:offset+100] + "'") + treeF.close() + return tree + +def treeToString(node, pretty=False, indent=0): + """Return a Newick string encoding node and its descendants, optionally pretty-printing with + newlines and indentation. String is not ';'-terminated, caller must do that.""" + labelLen = '' + if (node['label']): + labelLen += node['label'] + if (node['length']): + labelLen += ':' + node['length'] + if (node['kids']): + string = '(' + kidIndent = indent + 1 + kidStrings = [ treeToString(kid, pretty, kidIndent) for kid in node['kids'] ] + sep = ',' + if (pretty): + sep = ',\n' + ' ' * kidIndent + string += sep.join(kidStrings) + string += ')' + string += labelLen + else: + string = labelLen + return string; + +def printTree(tree, pretty=False, indent=0): + """Print out Newick text encoding tree, optionally pretty-printing with + newlines and indentation.""" + print(treeToString(tree, pretty, indent) + ';') + +def leafNames(node): + """Return a list of labels of all leaf descendants of node""" + if (node['kids']): + return [ leaf for kid in node['kids'] for leaf in leafNames(kid) ] + else: + return [ node['label'] ] +