775f8e290daefbb3312fd1d8962344690550c887 angie Mon Jun 8 16:20:24 2020 -0700 Extending and updating tree manipulation & SARS-CoV-2 lineage coloring python scripts and modules that I've been working on for David et al. sorta refs #25278, #25382 diff --git src/hg/utils/otto/nextstrainNcov/treeIntersectNextstrain.py src/hg/utils/otto/nextstrainNcov/treeIntersectNextstrain.py index 2e3a83a..b270821 100755 --- src/hg/utils/otto/nextstrainNcov/treeIntersectNextstrain.py +++ src/hg/utils/otto/nextstrainNcov/treeIntersectNextstrain.py @@ -1,44 +1,22 @@ #!/usr/bin/env python3 import logging, argparse, sys import random from collections import defaultdict from utils import die -import newick, cladeColors, virusNames - -def readVcfSamples(vcfFile): - """Read VCF sample IDs from the #CHROM line, and parse out clades from the first row GT cols""" - samples = [] - sampleClades = defaultdict() - with open(vcfFile, 'r') as vcfF: - line = vcfF.readline().strip() - while (line): - if (line.startswith('#CHROM')): - samples = line.split('\t')[9:] - elif (not line.startswith('#')): - gts = line.split('\t')[9:] - if (len(gts) != len(samples)): - die("VCF file '%s' has %d samples but %d genotypes in first row" % - (vcfFile, len(samples), len(gts))); - for sample, gt in zip(samples, gts): - gtVal, clade = gt.split(':') - sampleClades[sample] = clade - break - line = vcfF.readline().strip() - vcfF.close() - return samples, sampleClades +import newick, cladeColors, nextstrainVcf, virusNames def treeIntersectIds(node, idLookup, sampleSet): """For each leaf in node, attempt to look up its label in idLookup; replace if found. Prune nodes with no matching leaves. Store new leaf labels in sampleSet.""" if (node['kids']): # Internal node: prune prunedKids = [] for kid in (node['kids']): kidIntersected = treeIntersectIds(kid, idLookup, sampleSet) if (kidIntersected): prunedKids.append(kidIntersected) if (len(prunedKids) > 1): node['kids'] = prunedKids elif (len(prunedKids) == 1): node = prunedKids[0] @@ -92,31 +70,31 @@ outF.close() def main(): parser = argparse.ArgumentParser(description=""" Read in tree, read in samples from VCF, attempt to match tree's leaf IDs with VCF IDs, prune tree to only branches with leaves found in VCF, output pruned tree with VCF IDs. """ ) parser.add_argument('treeFile', help='Newick file with IDs similar to Nextstrain') parser.add_argument('vcfFile', help='VCF file derived from Nextstrain data') args = parser.parse_args() # Very large, deeply nested trees can exceed the default recursion limit of 1000. sys.setrecursionlimit(100000) # logging.basicConfig(level=logging.DEBUG, filename='intersect.log') tree = newick.parseFile(args.treeFile) - (vcfSamples, vcfSampleClades) = readVcfSamples(args.vcfFile) + (vcfSamples, vcfSampleClades) = nextstrainVcf.readVcfSamples(args.vcfFile) idLookup = virusNames.makeIdLookup(vcfSamples) for key in idLookup: values = idLookup[key] if (len(values) != 1): logging.warn('Duplicate name/component in VCF: ' + key + " -> " + ", ".join(values)) sampleSet = set() tree = treeIntersectIds(tree, idLookup, sampleSet) cladeColors.addCladesAsBogusLength(tree, vcfSampleClades) newick.printTree(tree) if (len(sampleSet) < len(vcfSamples)): logging.warn("VCF has %d samples but pruned tree has %d leaves (%d VCF samples not found)" % (len(vcfSamples), len(sampleSet), len(vcfSamples) - len(sampleSet))) vcfSampleSet = set(vcfSamples) vcfNotTree = vcfSampleSet - sampleSet logging.warn("Example VCF samples not found:\n" +