775f8e290daefbb3312fd1d8962344690550c887 angie Mon Jun 8 16:20:24 2020 -0700 Extending and updating tree manipulation & SARS-CoV-2 lineage coloring python scripts and modules that I've been working on for David et al. sorta refs #25278, #25382 diff --git src/hg/utils/otto/nextstrainNcov/newick.py src/hg/utils/otto/nextstrainNcov/newick.py index 3ae8281..fde0978 100644 --- src/hg/utils/otto/nextstrainNcov/newick.py +++ src/hg/utils/otto/nextstrainNcov/newick.py @@ -53,72 +53,73 @@ if (not treeString[offset].isdigit()): die("Expected number to follow ':' but instead got '" + treeString[offset:offset+100] + "'") lengthStart = offset while (offset != len(treeString) and (treeString[offset].isdigit() or treeString[offset] == '.' or treeString[offset] == 'E' or treeString[offset] == 'e' or treeString[offset] == '-')): offset += 1 lengthStr = treeString[lengthStart:offset] offset = skipSpaces(treeString, offset) return (lengthStr, offset) else: return ('', offset) -def parseBranch(treeString, offset): +def parseBranch(treeString, offset, internalNode): """Recursively parse Newick branch (x, y, z)[label][:length] from treeString at offset""" if (treeString[offset] != '('): die("parseBranch called on treeString that doesn't begin with '(': '" + treeString + "'") branchStart = offset - branch = { 'kids': [], 'label': '', 'length': '' } + internalNode += 1 + branch = { 'kids': [], 'label': '', 'length': '', 'inode': internalNode } offset = skipSpaces(treeString, offset + 1) while (offset != len(treeString) and treeString[offset] != ')' and treeString[offset] != ';'): - (child, offset) = parseString(treeString, offset) + (child, offset, internalNode) = parseString(treeString, offset, internalNode) branch['kids'].append(child) if (treeString[offset] == ','): offset = skipSpaces(treeString, offset + 1) if (offset == len(treeString)): die("Input ended before ')' for '" + treeString[branchStart:branchStart+100] + "'") if (treeString[offset] == ')'): offset = skipSpaces(treeString, offset + 1) else: die("Can't find ')' matching '" + treeString[branchStart:branchStart+100] + "', " + "instead got '" + treeString[offset:offset+100] + "'") (branch['label'], offset) = parseLabel(treeString, offset) (branch['length'], offset) = parseLength(treeString, offset) - return (branch, offset) + return (branch, offset, internalNode) -def parseString(treeString, offset=0): +def parseString(treeString, offset=0, internalNode=0): """Recursively parse Newick tree from treeString""" offset = skipSpaces(treeString, offset) if (treeString[offset] == '('): - return parseBranch(treeString, offset) + return parseBranch(treeString, offset, internalNode) else: (label, offset) = parseLabel(treeString, offset) (length, offset) = parseLength(treeString, offset) leaf = { 'kids': None, 'label': label, 'length': length } - return (leaf, offset) + return (leaf, offset, internalNode) def parseFile(treeFile): """Read Newick file, return tree object""" with open(treeFile, 'r') as treeF: line1 = treeF.readline().strip() if (line1 == ''): return None - (tree, offset) = parseString(line1) + (tree, offset, internalNode) = parseString(line1) if (offset != len(line1) and line1[offset] != ';'): die("Tree terminated without ';' before '" + line1[offset:offset+100] + "'") treeF.close() return tree def treeToString(node, pretty=False, indent=0): """Return a Newick string encoding node and its descendants, optionally pretty-printing with newlines and indentation. String is not ';'-terminated, caller must do that.""" labelLen = '' if (node['label']): labelLen += node['label'] if (node['length']): labelLen += ':' + node['length'] if (node['kids']): string = '('