b226cb3af001057dea4635de2d4c6c52f0ebe047
angie
  Sat Jun 20 14:34:08 2020 -0700
New scripts & vcf module for working with non-Nextstrain VCF and trees, e.g. Rob Lanfear's 40k sample build.  Updates to other VCF & tree utils.  sorta refs #25278, #25382

diff --git src/hg/utils/otto/nextstrainNcov/treeBranchWithSamples.py src/hg/utils/otto/nextstrainNcov/treeBranchWithSamples.py
new file mode 100755
index 0000000..9caf2a3
--- /dev/null
+++ src/hg/utils/otto/nextstrainNcov/treeBranchWithSamples.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+
+import logging, argparse, sys
+import newick, utils
+
+def countSampleLeaves(node, samples):
+    """Count the number of leaves descending from this node whose labels are in samples.
+    Add the number as node['sampleCount'] and also return the list of samples found."""
+    samplesFound = set()
+    if (node['kids']):
+        for kid in node['kids']:
+            samplesFound = samplesFound.union(countSampleLeaves(kid, samples))
+    else:
+        if (node['label'] in samples):
+            samplesFound = set([node['label']])
+    node['sampleCount'] = len(samplesFound)
+    return samplesFound
+
+def topBranchWithSampleCount(node, sampleCount):
+    """If a descendant of this node is the topmost branch with sampleCount then return the
+    descendant.  Return self if no descendant is the topmost branch but this node has sampleCount."""
+    if (node['sampleCount'] < sampleCount):
+        return None;
+    if (node['kids']):
+        for kid in node['kids']:
+            topKid = topBranchWithSampleCount(kid, sampleCount)
+            if (topKid):
+                return topKid
+    return node
+
+def samplesMissing(samples, samplesFound):
+    """Return a string with a limited number of samples not found for error reporting."""
+    missing = list(set(samples) - samplesFound)
+    if (len(missing) > 10):
+        return ', '.join(missing[0:10]) + ', ...'
+    else:
+        return ', '.join(missing)
+
+def treeBranchWithSamples(tree, samples):
+    """Return the topmost (farthest from root) branch that has all samples as descendants.
+    Error out if not all samples are found."""
+    sampleSet = set(samples)
+    if (len(samples) > len(sampleSet)):
+        utils.die("List of samples must be unique, but contains duplicate(s)")
+    samplesFound = countSampleLeaves(tree, sampleSet)
+    if (len(samplesFound) != len(sampleSet)):
+        utils.die("Expected to find %d samples, but found %d.  (Not found: %s)" %
+                  (len(sampleSet), len(samplesFound), samplesMissing(sampleSet, samplesFound)))
+    return topBranchWithSampleCount(tree, len(samplesFound))
+
+def main():
+    parser = argparse.ArgumentParser(description="""
+Read in tree, read samples, find branch of tree that has all of the samples as leaves,
+and write out that branch as a new tree.  All samples must exactly match leaf names
+and all must be found.
+"""
+    )
+    parser.add_argument('treeFile', help='Newick file with IDs similar to Nextstrain')
+    parser.add_argument('sampleFile', help='File with one sample ID per line')
+    args = parser.parse_args()
+    # Very large, deeply nested trees can exceed the default recursion limit of 1000.
+    sys.setrecursionlimit(100000)
+    # logging.basicConfig(level=logging.DEBUG, filename='debug.log')
+    tree = newick.parseFile(args.treeFile)
+    samples = utils.listFromFile(args.sampleFile)
+    branch = treeBranchWithSamples(tree, samples)
+    newick.printTree(branch)
+
+main()