72730b6971500837e39e21461e8fce00eebfc008
angie
Fri May 1 10:21:30 2020 -0700
Add internal node labels to the newick tree files that encode clade and the sequence of mutations leading to the node. This actually does not affect the track at this point, but helps in comparing the trees from different releases. refs #25188

diff --git src/hg/utils/otto/nextstrainNcov/nextstrain.py src/hg/utils/otto/nextstrainNcov/nextstrain.py
index 038cf16..bb3e41c 100755
--- src/hg/utils/otto/nextstrainNcov/nextstrain.py
+++ src/hg/utils/otto/nextstrainNcov/nextstrain.py
@@ -417,44 +417,59 @@
                                  numDateToYmdStr(clade['dateConfMax']),
                                  clade['countryInferred'],
                                  clade['countryConf'],
                                  cladeSampleCounts[name],
                                  ', '.join(cladeSampleNames[name]) ])) + '\n')
     outC.close()
 
 # Newick-formatted tree of samples for VCF display
 def cladeRgbFromName(cladeName):
     """Look up the r,g,b string color for clade; convert to int RGB."""
     rgbCommaStr = cladeColorFromName(cladeName)
     r, g, b = [ int(x) for x in rgbCommaStr.split(',') ]
     rgb = (r << 16) | (g << 8) | b
     return rgb
 
-def rNextstrainToNewick(node, parentColor=None):
+def rNextstrainToNewick(node, parentClade=None, parentVarStr=''):
     """Recursively descend ncov.tree and build Newick tree string of samples to file"""
     kids = node.get('children')
     if (kids):
+        # Make a more concise variant path string than the one we make for the clade track,
+        # to embed in internal node labels for Yatish's tree explorations.
+        localVariants = []
+        if (node.get('branch_attrs') and node['branch_attrs'].get('mutations') and
+            node['branch_attrs']['mutations'].get('nuc')):
+            # Nucleotide variants specific to this branch
+            for varName in node['branch_attrs']['mutations']['nuc']:
+                if (snvRe.match(varName)):
+                    localVariants.append(varName)
+        varStr = '+'.join(localVariants)
+        if (len(parentVarStr) and len(varStr)):
+            varStr = ';'.join([parentVarStr, varStr])
+        elif (not len(varStr)):
+            varStr = parentVarStr
         nodeAttrs = node['node_attrs']
         if (nodeAttrs.get('clade_membership')):
             cladeName = nodeAttrs['clade_membership']['value']
-            color = str(cladeRgbFromName(cladeName))
-        elif (parentColor):
-            color = parentColor
+        elif (parentClade):
+            cladeName = parentClade
         else:
-            color = '0'
-        descendants = ','.join([ rNextstrainToNewick(child, color) for child in kids ])
-        treeString = '(' + descendants + ')' + ':' + color
+            cladeName = 'unassigned'
+        color = str(cladeRgbFromName(cladeName))
+        descendants = ','.join([ rNextstrainToNewick(child, cladeName, varStr) for child in kids ])
+        label = '#'.join([cladeName, varStr])
+        treeString = '(' + descendants + ')' + label + ':' + color
     else:
         nodeAttrs = node['node_attrs']
         gId = nodeAttrs['gisaid_epi_isl']['value']
         name = node['name']
         date = numDateToMonthDay(nodeAttrs['num_date']['value'])
         cladeName = nodeAttrs['clade_membership']['value']
         color = str(cladeRgbFromName(cladeName))
         treeString = sampleName({ 'id': gId, 'name': name, 'date': date }) + ':' + color
     return treeString
 
 with open('nextstrain.nh', 'w') as outF:
     outF.write(rNextstrainToNewick(ncov['tree']) + ';\n')
     outF.close()
 
 for cladeName, node in cladeNodes.items():