#!/usr/bin/env python3
#
# Provenance: commit 775f8e290daefbb3312fd1d8962344690550c887 by angie,
# Mon Jun 8 16:20:24 2020 -0700, which added this file as
# src/hg/utils/otto/nextstrainNcov/treeLineage.py (new file, mode 100755).
# Commit message: "Extending and updating tree manipulation & SARS-CoV-2
# lineage coloring python scripts and modules that I've been working on for
# David et al.  sorta refs #25278, #25382"
"""Assign a lineage and lineage color to every node of a Newick tree.

Writes three tab-separated columns (sampleOrNode, lineage, lineageColor) to
stdout; see main() for the command-line arguments.
"""

import argparse
import logging
import sys

import lineageColors
import newick
import nextstrainVcf
import utils
import virusNames


def assignColors(node, idLookup, labelToLineage):
    """Set 'lineage' and 'color' on node and its descendants, printing colored nodes.

    node is a dict with at least the keys 'kids' and 'label' (and 'inode' for
    internal nodes that have no label).  Children are processed before their
    parent, so rows for descendants are printed before the row for node.

    Leaf (node['kids'] is empty/false): the label is passed through
    virusNames.maybeLookupSeqName(label, idLookup) and the result is looked up
    in labelToLineage.  A missing or empty lineage is logged as a warning and
    replaced by ''.  The color comes from lineageColors.lineageToColor(lineage),
    formatted as '#' plus six hex digits.  A row is always printed.

    Internal node: 'lineage' is the kids' common lineage, or '' when they
    disagree.  'color' is set only when all kids share one color value, and a
    row is printed only when that shared color is non-empty.

    Returns None; results are stored in the node dicts and written to stdout.
    """
    kids = node['kids']
    if not kids:
        # Leaf: look up lineage by label.
        label = node['label']
        lineage = labelToLineage.get(virusNames.maybeLookupSeqName(label, idLookup))
        if not lineage:
            # logging.warn is a deprecated alias (removed in Python 3.13).
            logging.warning('No lineage for "%s"', label)
            lineage = ''
        node['lineage'] = lineage
        node['color'] = "#%06x" % (lineageColors.lineageToColor(lineage))
        print('\t'.join([label, lineage, node['color']]))
        return

    # Internal node: annotate all descendants first.
    for kid in kids:
        assignColors(kid, idLookup, labelToLineage)

    # Do all kids have the same lineage?
    kidLineages = {kid.get('lineage') for kid in kids}
    node['lineage'] = kidLineages.pop() if len(kidLineages) == 1 else ''

    # Kids whose own kids disagreed never got a 'color' key, hence .get():
    # such kids contribute None to the set.
    kidColors = {kid.get('color') for kid in kids}
    if len(kidColors) != 1:
        return
    color = kidColors.pop()
    node['color'] = color
    if not color:
        # All kids are uniformly uncolored; nothing to report for this node.
        return
    # Unlabeled internal nodes are reported by their 'inode' value instead.
    label = node['label'] or str(node['inode'])
    if label:
        print("\t".join([label, node['lineage'], color]))


def main():
    """Parse the command line, load the inputs and print the lineage/color table."""
    parser = argparse.ArgumentParser(description="""
Read tree from Newick treeFile.
Read sample IDs that are a concatenation of EPI ID, sample name and approximate date,
for resolving treeFile IDs and lineageFile IDs, from a Nextstrain VCF file.
Read lineage assignments from lineageFile.
Figure out what lineage and color (if any) are assigned to each leaf, and then work
back towards root assigning color to each named node whose descendants all have same color.
Write out 3 tab-sep columns:
sampleOrNode, lineage, lineageColor.
"""
    )
    parser.add_argument('treeFile', help='Newick file containing the tree')
    parser.add_argument('vcfFile', help='VCF file derived from Nextstrain data')
    parser.add_argument('lineageFile', help='Two-column tab-sep file mapping sample to lineage')
    args = parser.parse_args()

    tree = newick.parseFile(args.treeFile)
    # The second value returned by readVcfSamples is not needed here.
    (vcfSamples, _) = nextstrainVcf.readVcfSamples(args.vcfFile)
    idLookup = virusNames.makeIdLookup(vcfSamples)
    lineages = utils.dictFromFile(args.lineageFile)
    # Re-key the lineage table with the same name lookup that assignColors
    # applies to leaf labels, so both sides of the lookup agree.
    nsLineages = {virusNames.maybeLookupSeqName(name, idLookup): lin
                  for name, lin in lineages.items()}
    assignColors(tree, idLookup, nsLineages)


if __name__ == '__main__':
    main()