72730b6971500837e39e21461e8fce00eebfc008
angie
  Fri May 1 10:21:30 2020 -0700
Add internal node labels to the newick tree files that encode clade and the sequence of mutations leading to the node.  This actually does not affect the track at this point, but helps in comparing the trees from different releases.  refs #25188

diff --git src/hg/utils/otto/nextstrainNcov/nextstrain.py src/hg/utils/otto/nextstrainNcov/nextstrain.py
index 038cf16..bb3e41c 100755
--- src/hg/utils/otto/nextstrainNcov/nextstrain.py
+++ src/hg/utils/otto/nextstrainNcov/nextstrain.py
@@ -417,44 +417,59 @@
                                        numDateToYmdStr(clade['dateConfMax']),
                                        clade['countryInferred'],
                                        clade['countryConf'],
                                        cladeSampleCounts[name],
                                        ', '.join(cladeSampleNames[name]) ])) + '\n')
     outC.close()
 
 # Newick-formatted tree of samples for VCF display
 def cladeRgbFromName(cladeName):
     """Look up the r,g,b string color for clade; convert to int RGB."""
     rgbCommaStr = cladeColorFromName(cladeName)
     r, g, b = [ int(x) for x in rgbCommaStr.split(',') ]
     rgb = (r << 16) | (g << 8) | b
     return rgb
 
-def rNextstrainToNewick(node, parentColor=None):
+def rNextstrainToNewick(node, parentClade=None, parentVarStr=''):
     """Recursively descend ncov.tree and build Newick tree string of samples to file"""
     kids = node.get('children')
     if (kids):
+        # Make a more concise variant path string than the one we make for the clade track,
+        # to embed in internal node labels for Yatish's tree explorations.
+        localVariants = []
+        if (node.get('branch_attrs') and node['branch_attrs'].get('mutations') and
+            node['branch_attrs']['mutations'].get('nuc')):
+            # Nucleotide variants specific to this branch
+            for varName in node['branch_attrs']['mutations']['nuc']:
+                if (snvRe.match(varName)):
+                    localVariants.append(varName)
+        varStr = '+'.join(localVariants)
+        if (len(parentVarStr) and len(varStr)):
+            varStr = ';'.join([parentVarStr, varStr])
+        elif (not len(varStr)):
+            varStr = parentVarStr
         nodeAttrs = node['node_attrs']
         if (nodeAttrs.get('clade_membership')):
             cladeName = nodeAttrs['clade_membership']['value']
-            color = str(cladeRgbFromName(cladeName))
-        elif (parentColor):
-            color = parentColor
+        elif (parentClade):
+            cladeName = parentClade
         else:
-            color = '0'
-        descendants = ','.join([ rNextstrainToNewick(child, color) for child in kids ])
-        treeString = '(' + descendants + ')' + ':' + color
+            cladeName = 'unassigned'
+        color = str(cladeRgbFromName(cladeName))
+        descendants = ','.join([ rNextstrainToNewick(child, cladeName, varStr) for child in kids ])
+        label = '#'.join([cladeName, varStr])
+        treeString = '(' + descendants + ')' + label + ':' + color
     else:
         nodeAttrs = node['node_attrs']
         gId = nodeAttrs['gisaid_epi_isl']['value']
         name = node['name']
         date = numDateToMonthDay(nodeAttrs['num_date']['value'])
         cladeName = nodeAttrs['clade_membership']['value']
         color = str(cladeRgbFromName(cladeName))
         treeString = sampleName({ 'id': gId, 'name': name, 'date': date }) + ':' + color
     return treeString
 
 with open('nextstrain.nh', 'w') as outF:
     outF.write(rNextstrainToNewick(ncov['tree']) + ';\n')
     outF.close()
 
 for cladeName, node in cladeNodes.items():