src/hg/utils/otto/nextstrainNcov/treeIntersectNextstrain.py 2e6ddaec223b5b760cbba134eee52d14a327fda3

2e6ddaec223b5b760cbba134eee52d14a327fda3
angie
  Fri May 29 15:22:45 2020 -0700
Adding tree manipulation python scripts and modules that I've been working on for David et al.  sorta refs #25278, #25382

diff --git src/hg/utils/otto/nextstrainNcov/treeIntersectNextstrain.py src/hg/utils/otto/nextstrainNcov/treeIntersectNextstrain.py
new file mode 100755
index 0000000..2e3a83a
--- /dev/null
+++ src/hg/utils/otto/nextstrainNcov/treeIntersectNextstrain.py
@@ -0,0 +1,128 @@
+#!/usr/bin/env python3
+
+import logging, argparse, sys
+import random
+from collections import defaultdict
+from utils import die
+import newick, cladeColors, virusNames
+
+def readVcfSamples(vcfFile):
+    """Read VCF sample IDs from the #CHROM line, and parse out clades from the first row GT cols"""
+    samples = []
+    sampleClades = defaultdict()
+    with open(vcfFile, 'r') as vcfF:
+        line = vcfF.readline().strip()
+        while (line):
+            if (line.startswith('#CHROM')):
+                samples = line.split('\t')[9:]
+            elif (not line.startswith('#')):
+                gts = line.split('\t')[9:]
+                if (len(gts) != len(samples)):
+                    die("VCF file '%s' has %d samples but %d genotypes in first row" %
+                        (vcfFile, len(samples), len(gts)));
+                for sample, gt in zip(samples, gts):
+                    gtVal, clade = gt.split(':')
+                    sampleClades[sample] = clade
+                break
+            line = vcfF.readline().strip()
+        vcfF.close()
+    return samples, sampleClades
+
+def treeIntersectIds(node, idLookup, sampleSet):
+    """For each leaf in node, attempt to look up its label in idLookup; replace if found.
+    Prune nodes with no matching leaves.  Store new leaf labels in sampleSet."""
+    if (node['kids']):
+        # Internal node: prune
+        prunedKids = []
+        for kid in (node['kids']):
+            kidIntersected = treeIntersectIds(kid, idLookup, sampleSet)
+            if (kidIntersected):
+                prunedKids.append(kidIntersected)
+        if (len(prunedKids) > 1):
+            node['kids'] = prunedKids
+        elif (len(prunedKids) == 1):
+            node = prunedKids[0]
+        else:
+            node = None
+    else:
+        # Leaf: lookup, prune if not found
+        label = node['label']
+        matchList = virusNames.lookupSeqName(node['label'], idLookup)
+        if (not matchList):
+            logging.info("No match for leaf '" + label + "'")
+            node = None
+        else:
+            if (len(matchList) != 1):
+                logging.warn("Non-unique match for leaf '" + label + "': ['" +
+                             "', '".join(matchList) + "']")
+            else:
+                logging.debug(label + ' --> ' + matchList[0]);
+            node['label'] = matchList[0]
+            sampleSet.add(matchList[0])
+    return node
+
+def vcfSubset(vcfIn, sampleSet, vcfOut):
+    """Copy data in vcfIn to vcfOut, but with only the samples in sampleSet."""
+    with open(vcfIn, 'r') as inF:
+        with open(vcfOut, 'w') as outF:
+            line = inF.readline().rstrip()
+            while (line):
+                if (line.startswith('#CHROM')):
+                    colNamesIn = line.split('\t')
+                    colNamesOut = colNamesIn[0:9]
+                    columnsKept = [True for i in range(0, 9)]
+                    for vcfSample in colNamesIn[9:]:
+                        if (vcfSample in sampleSet):
+                            colNamesOut.append(vcfSample)
+                            columnsKept.append(True)
+                        else:
+                            columnsKept.append(False)
+                    outF.write('\t'.join(colNamesOut) + '\n');
+                elif (line[0] == '#'):
+                    outF.write(line + '\n');
+                else:
+                    columnsIn = line.split('\t')
+                    columnsOut = []
+                    for ix, colVal in enumerate(columnsIn):
+                        if (columnsKept[ix]):
+                            columnsOut.append(colVal)
+                    outF.write('\t'.join(columnsOut) + '\n')
+                line = inF.readline().rstrip()
+            inF.close()
+            outF.close()
+
+def main():
+    parser = argparse.ArgumentParser(description="""
+Read in tree, read in samples from VCF, attempt to match tree's leaf IDs with VCF IDs,
+prune tree to only branches with leaves found in VCF, output pruned tree with VCF IDs.
+"""
+    )
+    parser.add_argument('treeFile', help='Newick file with IDs similar to Nextstrain')
+    parser.add_argument('vcfFile', help='VCF file derived from Nextstrain data')
+    args = parser.parse_args()
+    # Very large, deeply nested trees can exceed the default recursion limit of 1000.
+    sys.setrecursionlimit(100000)
+    # logging.basicConfig(level=logging.DEBUG, filename='intersect.log')
+    tree = newick.parseFile(args.treeFile)
+    (vcfSamples, vcfSampleClades) = readVcfSamples(args.vcfFile)
+    idLookup = virusNames.makeIdLookup(vcfSamples)
+    for key in idLookup:
+        values = idLookup[key]
+        if (len(values) != 1):
+            logging.warn('Duplicate name/component in VCF: ' + key + " -> " + ", ".join(values))
+    sampleSet = set()
+    tree = treeIntersectIds(tree, idLookup, sampleSet)
+    cladeColors.addCladesAsBogusLength(tree, vcfSampleClades)
+    newick.printTree(tree)
+    if (len(sampleSet) < len(vcfSamples)):
+        logging.warn("VCF has %d samples but pruned tree has %d leaves (%d VCF samples not found)" %
+                     (len(vcfSamples), len(sampleSet), len(vcfSamples) - len(sampleSet)))
+        vcfSampleSet = set(vcfSamples)
+        vcfNotTree = vcfSampleSet - sampleSet
+        logging.warn("Example VCF samples not found:\n" +
+                     "\n".join(random.sample(vcfNotTree, 10)))
+        vcfOutName = 'intersected.vcf'
+        logging.warn("Writing VCF to " + vcfOutName)
+        vcfSubset(args.vcfFile, sampleSet, vcfOutName)
+
+main()