b226cb3af001057dea4635de2d4c6c52f0ebe047 angie Sat Jun 20 14:34:08 2020 -0700 New scripts & vcf module for working with non-Nextstrain VCF and trees, e.g. Rob Lanfear's 40k sample build. Updates to other VCF & tree utils. sorta refs #25278, #25382 diff --git src/hg/utils/otto/nextstrainNcov/vcfPruneToSamples.py src/hg/utils/otto/nextstrainNcov/vcfPruneToSamples.py new file mode 100755 index 0000000..7affe2d --- /dev/null +++ src/hg/utils/otto/nextstrainNcov/vcfPruneToSamples.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python3 + +import logging, argparse, sys +import random +import utils, vcf, virusNames + +def main(): + parser = argparse.ArgumentParser(description=""" +Read in VCF, read in samples from sampleFile, attempt to match VCF IDs with samples, +remove VCF genotype columns for samples not found in sampleFile, output VCF with +updated AC and AN counts. +""" + ) + parser.add_argument('vcfFile', help='VCF file with sample genotype columns') + parser.add_argument('sampleFile', help='File with one sample ID per line') + parser.add_argument('vcfOutFile', help='VCF output file with only samples in sampleFile') + args = parser.parse_args() + samples = utils.listFromFile(args.sampleFile) + idLookup = virusNames.makeIdLookup(samples) + for key in idLookup: + values = idLookup[key] + if (len(values) != 1): + logging.warn('Duplicate name/component in ' + args.sampleFile + ': ' + key + " -> " + + ", ".join(values)) + vcfSamples = vcf.readSamples(args.vcfFile) + foundSampleSet = set([ sample for sample in vcfSamples + if virusNames.lookupSeqName(sample, idLookup) ]) + vcf.pruneToSamples(args.vcfFile, foundSampleSet, args.vcfOutFile) + if (len(foundSampleSet) < len(samples)): + logging.warn("%s has %d samples but pruned VCF has %d samples (%d samples not found)" % + (args.sampleFile, len(samples), len(foundSampleSet), + len(samples) - len(foundSampleSet))) + allSampleSet = set([ virusNames.maybeLookupSeqName(sample, idLookup) for sample in samples ]) + sampleFileNotVcf = allSampleSet - foundSampleSet + logging.warn("Example samples not found:\n" + + "\n".join(random.sample(sampleFileNotVcf, 10))) + +main()