# Utils for dealing with VCF generated by nextstrain.py, with a particular style of ID
# and clades appearing in genotypes...

from collections import defaultdict


def readVcfSampleClades(vcfFile):
    """Read VCF sample IDs from the #CHROM line, and parse out clades from the first row GT cols.

    Only the header and the first non-header row are read; every later row is
    ignored.  Each genotype column of that first row must have exactly two
    ':'-separated fields, the second of which is taken as the sample's clade.

    Args:
        vcfFile: path of the VCF file to read.

    Returns:
        A tuple (samples, sampleClades): samples is the list of sample IDs
        (columns 10 onward of the #CHROM line, in file order) and sampleClades
        maps each sample ID to the clade string from its first-row genotype.
        sampleClades is empty when the file has no data rows.

    Raises:
        ValueError: if a first-row genotype does not split into exactly two
            ':'-separated fields.
    """
    samples = []
    sampleClades = {}
    with open(vcfFile, 'r') as vcfF:
        # Iterate the file object rather than looping on readline() until an
        # empty string: a blank line must not be mistaken for end-of-file.
        for rawLine in vcfF:
            line = rawLine.strip()
            if not line:
                continue
            if line.startswith('#CHROM'):
                samples = line.split('\t')[9:]
            elif not line.startswith('#'):
                gts = line.split('\t')[9:]
                if len(gts) != len(samples):
                    # NOTE(review): die() is not defined in the visible part of
                    # this module -- confirm it is defined or imported elsewhere.
                    die("VCF file '%s' has %d samples but %d genotypes in first row" %
                        (vcfFile, len(samples), len(gts)))
                for sample, gt in zip(samples, gts):
                    _gtVal, clade = gt.split(':')
                    sampleClades[sample] = clade
                # Clades are taken from the first data row only.
                break
    return samples, sampleClades