b226cb3af001057dea4635de2d4c6c52f0ebe047 angie Sat Jun 20 14:34:08 2020 -0700 New scripts & vcf module for working with non-Nextstrain VCF and trees, e.g. Rob Lanfear's 40k sample build. Updates to other VCF & tree utils. sorta refs #25278, #25382 diff --git src/hg/utils/otto/nextstrainNcov/treeIntersectVcf.py src/hg/utils/otto/nextstrainNcov/treeIntersectVcf.py new file mode 100755 index 0000000..92bf796 --- /dev/null +++ src/hg/utils/otto/nextstrainNcov/treeIntersectVcf.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python3 + +import logging, argparse, sys +import random +from utils import die +import newick, vcf, virusNames + +def main(): + parser = argparse.ArgumentParser(description=""" +Read in tree, read in samples from VCF, attempt to match tree's leaf IDs with VCF IDs, +prune tree to only branches with leaves found in VCF, output pruned tree with VCF IDs. +""" + ) + parser.add_argument('treeFile', help='Newick file with IDs similar to Nextstrain') + parser.add_argument('vcfFile', help='VCF file with IDs similar to Nextstrain') + args = parser.parse_args() + # Very large, deeply nested trees can exceed the default recursion limit of 1000. + sys.setrecursionlimit(100000) + # logging.basicConfig(level=logging.DEBUG, filename='intersect.log') + tree = newick.parseFile(args.treeFile) + vcfSamples = vcf.readSamples(args.vcfFile) + idLookup = virusNames.makeIdLookup(vcfSamples) + badKeys = [] + for key in idLookup: + values = idLookup[key] + if (len(values) > 3): + badKeys.append(key) + elif (len(values) != 1): + logging.warn('Duplicate name/component in VCF: ' + key + " -> " + ", ".join(values)) + for key in badKeys: + del idLookup[key] + sampleSet = set() + tree = newick.treeIntersectIds(tree, idLookup, sampleSet, virusNames.lookupSeqName) + newick.printTree(tree) + if (len(sampleSet) < len(vcfSamples)): + logging.warn("VCF has %d samples but pruned tree has %d leaves (%d VCF samples not found)" % + (len(vcfSamples), len(sampleSet), len(vcfSamples) - len(sampleSet))) + vcfSampleSet = set(vcfSamples) + vcfNotTree = vcfSampleSet - sampleSet + logging.warn("Example VCF samples not found:\n" + + "\n".join(random.sample(vcfNotTree, 10))) + vcfOutName = 'intersected.vcf' + logging.warn("Writing VCF to " + vcfOutName) + vcf.pruneToSamples(args.vcfFile, sampleSet, vcfOutName) + +main()