# Utils for dealing with VCF generated by nextstrain.py, with a particular style of ID
# and clades appearing in genotypes...

import sys
from collections import defaultdict


def readVcfSamples(vcfFile):
    """Read VCF sample IDs from the #CHROM line, and parse out clades from the first row GT cols.

    Only the header and the first data row are read; every genotype column in
    that row is expected to look like 'GT:CLADE'.

    Args:
        vcfFile: path of the VCF file to read.

    Returns:
        A tuple (samples, sampleClades): samples is the list of sample IDs in
        #CHROM column order (columns 10 and up); sampleClades maps each sample
        ID to the clade string taken from the first data row.  Both are empty
        if the file has no #CHROM line / no data row.

    Raises:
        SystemExit: if the first data row does not have exactly one genotype
            column per sample (message goes to stderr, exit status 1).
        ValueError: if a genotype column does not split into exactly two
            ':'-separated fields.
    """
    samples = []
    # No default factory is given, so this behaves like a plain dict (missing
    # keys raise KeyError); the type is kept so the return value is unchanged.
    sampleClades = defaultdict()
    with open(vcfFile, 'r') as vcfF:
        for rawLine in vcfF:
            line = rawLine.strip()
            if not line:
                # A blank line is not end-of-file; skip it instead of stopping.
                continue
            if line.startswith('#CHROM'):
                # The first 9 columns are the fixed VCF fields; the rest are sample IDs.
                samples = line.split('\t')[9:]
            elif not line.startswith('#'):
                gts = line.split('\t')[9:]
                if len(gts) != len(samples):
                    # sys.exit with a string prints it to stderr and exits with status 1.
                    sys.exit("VCF file '%s' has %d samples but %d genotypes in first row" %
                             (vcfFile, len(samples), len(gts)))
                for sample, gt in zip(samples, gts):
                    _gtVal, clade = gt.split(':')
                    sampleClades[sample] = clade
                # Clades are taken from the first data row only.
                break
    return samples, sampleClades