775f8e290daefbb3312fd1d8962344690550c887
angie
  Mon Jun 8 16:20:24 2020 -0700
Extending and updating tree manipulation & SARS-CoV-2 lineage coloring python scripts and modules that I've been working on for David et al.  sorta refs #25278, #25382

diff --git src/hg/utils/otto/nextstrainNcov/newick.py src/hg/utils/otto/nextstrainNcov/newick.py
index 3ae8281..fde0978 100644
--- src/hg/utils/otto/nextstrainNcov/newick.py
+++ src/hg/utils/otto/nextstrainNcov/newick.py
@@ -1,148 +1,149 @@
 # Parse Newick-formatted file into tree of { 'kids': [], 'label': '', 'length': '' }
 
 from utils import die
 
 def skipSpaces(treeString, offset):
     while (offset != len(treeString) and treeString[offset].isspace()):
         offset += 1
     return offset
 
 def parseQuoted(treeString, offset):
     """Read in a quoted, possibly \-escaped string in treeString at offset"""
     label = ''
     quoteChar = treeString[offset]
     offset += 1
     labelStart = offset
     while (offset != len(treeString) and treeString[offset] != quoteChar):
         if (treeString[offset] == '\\'):
             offset += 1
         offset += 1
     if (treeString[offset] != quoteChar):
         die("Missing end-" + quoteChar + " after '" + treeString + "'")
     else:
         label = treeString[labelStart:offset]
         offset = skipSpaces(treeString, offset + 1)
     return (label, offset)
 
 def terminatesLabel(treeString, offset):
     """Return True if treeString+offset is empty or starts w/char that would terminate a label"""
     return (offset == len(treeString) or treeString[offset] == ',' or treeString[offset] == ')' or
             treeString[offset] == ';' or treeString[offset] == ':')
 
 def parseLabel(treeString, offset):
     """Read in a possibly quoted, possibly \-escaped node label terminated by [,):;]"""
     if (offset == len(treeString)):
         label = ''
     elif (treeString[offset] == "'" or treeString[offset] == '"'):
         (label, offset) = parseQuoted(treeString, offset)
     else:
         labelStart = offset
         while (not terminatesLabel(treeString, offset)):
             if (treeString[offset] == '\\'):
                 offset += 1
             offset += 1
         label = treeString[labelStart:offset]
         offset = skipSpaces(treeString, offset)
     return(label, offset)
 
 def parseLength(treeString, offset):
     """If treeString[offset] is ':', then parse the number that follows; otherwise return 0.0."""
     if (offset != len(treeString) and treeString[offset] == ':'):
         # Branch length
         offset = skipSpaces(treeString, offset + 1)
         if (not treeString[offset].isdigit()):
             die("Expected number to follow ':' but instead got '" +
                 treeString[offset:offset+100] + "'")
         lengthStart = offset
         while (offset != len(treeString) and
                (treeString[offset].isdigit() or treeString[offset] == '.' or
                 treeString[offset] == 'E' or treeString[offset] == 'e' or
                 treeString[offset] == '-')):
             offset += 1
         lengthStr = treeString[lengthStart:offset]
         offset = skipSpaces(treeString, offset)
         return (lengthStr, offset)
     else:
         return ('', offset)
 
-def parseBranch(treeString, offset):
+def parseBranch(treeString, offset, internalNode):
     """Recursively parse Newick branch (x, y, z)[label][:length] from treeString at offset"""
     if (treeString[offset] != '('):
         die("parseBranch called on treeString that doesn't begin with '(': '" +
             treeString + "'")
     branchStart = offset
-    branch = { 'kids': [],  'label': '', 'length': '' }
+    internalNode += 1
+    branch = { 'kids': [],  'label': '', 'length': '', 'inode': internalNode }
     offset = skipSpaces(treeString, offset + 1)
     while (offset != len(treeString) and treeString[offset] != ')' and treeString[offset] != ';'):
-        (child, offset) = parseString(treeString, offset)
+        (child, offset, internalNode) = parseString(treeString, offset, internalNode)
         branch['kids'].append(child)
         if (treeString[offset] == ','):
             offset = skipSpaces(treeString, offset + 1)
     if (offset == len(treeString)):
         die("Input ended before ')' for '" + treeString[branchStart:branchStart+100] + "'")
     if (treeString[offset] == ')'):
         offset = skipSpaces(treeString, offset + 1)
     else:
         die("Can't find ')' matching '" + treeString[branchStart:branchStart+100] + "', " +
             "instead got '" + treeString[offset:offset+100] + "'")
     (branch['label'], offset) = parseLabel(treeString, offset)
     (branch['length'], offset) = parseLength(treeString, offset)
-    return (branch, offset)
+    return (branch, offset, internalNode)
 
-def parseString(treeString, offset=0):
+def parseString(treeString, offset=0, internalNode=0):
     """Recursively parse Newick tree from treeString"""
     offset = skipSpaces(treeString, offset)
     if (treeString[offset] == '('):
-        return parseBranch(treeString, offset)
+        return parseBranch(treeString, offset, internalNode)
     else:
         (label, offset) = parseLabel(treeString, offset)
         (length, offset) = parseLength(treeString, offset)
         leaf = { 'kids': None, 'label': label, 'length': length }
-        return (leaf, offset)
+        return (leaf, offset, internalNode)
 
 def parseFile(treeFile):
     """Read Newick file, return tree object"""
     with open(treeFile, 'r') as treeF:
         line1 = treeF.readline().strip()
         if (line1 == ''):
             return None
-        (tree, offset) = parseString(line1)
+        (tree, offset, internalNode) = parseString(line1)
         if (offset != len(line1) and line1[offset] != ';'):
             die("Tree terminated without ';' before '" + line1[offset:offset+100] + "'")
         treeF.close()
     return tree
 
 def treeToString(node, pretty=False, indent=0):
     """Return a Newick string encoding node and its descendants, optionally pretty-printing with
     newlines and indentation.  String is not ';'-terminated, caller must do that."""
     labelLen = ''
     if (node['label']):
         labelLen += node['label']
     if (node['length']):
         labelLen += ':' + node['length']
     if (node['kids']):
         string = '('
         kidIndent = indent + 1
         kidStrings = [ treeToString(kid, pretty, kidIndent) for kid in node['kids'] ]
         sep = ','
         if (pretty):
             sep = ',\n' + ' ' * kidIndent
         string += sep.join(kidStrings)
         string += ')'
         string += labelLen
     else:
         string = labelLen
     return string;
 
 def printTree(tree, pretty=False, indent=0):
     """Print out Newick text encoding tree, optionally pretty-printing with
     newlines and indentation."""
     print(treeToString(tree, pretty, indent) + ';')
 
 def leafNames(node):
     """Return a list of labels of all leaf descendants of node"""
     if (node['kids']):
         return [ leaf for kid in node['kids'] for leaf in leafNames(kid) ]
     else:
         return [ node['label'] ]