775f8e290daefbb3312fd1d8962344690550c887
angie
  Mon Jun 8 16:20:24 2020 -0700
Extending and updating tree manipulation & SARS-CoV-2 lineage coloring python scripts and modules that I've been working on for David et al.  sorta refs #25278, #25382

diff --git src/hg/utils/otto/nextstrainNcov/treeLineage.py src/hg/utils/otto/nextstrainNcov/treeLineage.py
new file mode 100755
index 0000000..3dd87bc
--- /dev/null
+++ src/hg/utils/otto/nextstrainNcov/treeLineage.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python3
+
+import logging, argparse, sys
+import lineageColors, newick, nextstrainVcf, utils, virusNames
+
+def assignColors(node, idLookup, labelToLineage):
+    """Set node['lineage'] / node['color'] on node and its descendants; print tab-sep rows.
+
+    A leaf gets the lineage that labelToLineage holds for its idLookup-resolved label ('' if
+    none, with a warning) and that lineage's color.  An internal node takes a lineage or color
+    only when all kids agree.  Prints label, lineage, color per leaf and colored internal node."""
+    kids = node['kids']
+    if not kids:
+        # Leaf: look up lineage by label
+        lineage = labelToLineage.get(virusNames.maybeLookupSeqName(node['label'], idLookup))
+        if not lineage:
+            # logging.warn is a deprecated alias; warning() with lazy %-args is the supported form.
+            logging.warning('No lineage for "%s"', node['label'])
+            lineage = ''
+        node['lineage'] = lineage
+        node['color'] = "#%06x" % (lineageColors.lineageToColor(lineage))
+        print('\t'.join([node['label'], lineage, node['color']]))
+        return
+    # Internal node: annotate descendants first, then check whether they all agree.
+    for kid in kids:
+        assignColors(kid, idLookup, labelToLineage)
+    kidLineages = {kid.get('lineage') for kid in kids}
+    node['lineage'] = kidLineages.pop() if len(kidLineages) == 1 else ''
+    kidColors = {kid.get('color') for kid in kids}
+    if len(kidColors) == 1:
+        node['color'] = kidColors.pop()  # may be None when no kid has a color
+        label = (node['label'] or str(node['inode'])) if node['color'] else None
+        if label:  # unlabeled nodes are reported under their 'inode' value
+            print("\t".join([label, node['lineage'], node['color']]))
+
+def main():
+    """Parse the command line, load tree, VCF sample IDs and lineages, then print colors."""
+    parser = argparse.ArgumentParser(description="""
+Read tree from Newick treeFile.
+Read sample IDs that are a concatenation of EPI ID, sample name and approximate date,
+for resolving treeFile IDs and lineageFile IDs, from a Nextstrain VCF file.
+Read lineage assignments from lineageFile.
+Figure out what lineage and color (if any) are assigned to each leaf, and then work
+back towards root assigning color to each named node whose descendants all have same color.
+Write out 3 tab-sep columns:
+sampleOrNode, lineage, lineageColor.
+""")
+    parser.add_argument('treeFile', help='Newick file containing the tree')
+    parser.add_argument('vcfFile', help='VCF file derived from Nextstrain data')
+    parser.add_argument('lineageFile', help='Two-column tab-sep file mapping sample to lineage')
+    args = parser.parse_args()
+    tree = newick.parseFile(args.treeFile)
+    vcfSamples, _ = nextstrainVcf.readVcfSamples(args.vcfFile)  # second return value is unused
+    idLookup = virusNames.makeIdLookup(vcfSamples)
+    lineages = utils.dictFromFile(args.lineageFile)
+    # Re-key lineages by resolved name; assignColors resolves leaf labels the same way.
+    nsLineages = {virusNames.maybeLookupSeqName(name, idLookup): lin
+                  for name, lin in lineages.items()}
+    assignColors(tree, idLookup, nsLineages)
+
+main()  # NOTE: runs unconditionally (no __name__ guard), so importing this file also runs main()