74e378797b2f1b99b00d73f49c260bf5384cf670 angie Thu Apr 2 09:40:56 2020 -0700 David noticed that nextstrainClade samples & counts weren't keeping up with change to VCF subtracks, so that nested clades include all descendants. Fixed. refs #25188 diff --git src/hg/utils/otto/nextstrainNcov/nextstrain.py src/hg/utils/otto/nextstrainNcov/nextstrain.py index 50e5a9e..6a1dca5 100755 --- src/hg/utils/otto/nextstrainNcov/nextstrain.py +++ src/hg/utils/otto/nextstrainNcov/nextstrain.py @@ -198,39 +198,30 @@ else: warn('No country or division for new clade ' + cladeName) cladeNodes[cladeName] = branch kids = branch.get('children') if (kids): for child in kids: rUnpackNextstrainTree(child, branchVariants, branchVarStr); else: for varName in branchVariants: variantCounts[varName] += 1 samples.append({ 'id': nodeAttrs['gisaid_epi_isl']['value'], 'name': branch['name'], 'clade': nodeAttrs['clade_membership']['value'], 'date': numDateToMonthDay(nodeAttrs['num_date']['value']), 'variants': branchVariants }) - if (cladeName and cladeName != 'unassigned'): - if (clades[cladeName].get('sampleCount')): - clades[cladeName]['sampleCount'] += 1 - else: - clades[cladeName]['sampleCount'] = 1 - if (clades[cladeName].get('samples')): - clades[cladeName]['samples'].append(branch['name']) - else: - clades[cladeName]['samples'] = [ branch['name'] ] rUnpackNextstrainTree(ncov['tree'], {}, '') sampleCount = len(samples) sampleNames = [ '|'.join([sample['id'], sample['name'], sample['date']]) for sample in samples ] # Parse variant names like 'G11083T' into pos and alleles; bundle in VCF column order parsedVars = [] for varName in variantCounts.keys(): m = snvRe.match(varName) if (m): ref, pos, alt = m.groups() parsedVars.append([int(pos), varName, ref, alt]) else: warn("Can't match " + varName) @@ -261,89 +252,97 @@ varName = pv[1] info = 'AC=' + str(variantCounts[varName]) + ';AN=' + str(sampleCount) aaChange = variantAaChanges.get(varName) if (aaChange): info += ';AACHANGE=' + aaChange genotypes = [] for sample in samples: gt = boolToStr01(sample['variants'].get(varName)) genotypes.append(gt + ':' + sample['clade']) outC.write('\t'.join([ chrom, '\t'.join(map(str, pv)), '\t'.join(['.', 'PASS', info, 'GT:CLADE']), '\t'.join(genotypes) ]) + '\n') outC.close() -# Per-clade VCF subset +# Assign samples to clades; a sample can appear in multiple clades if they are nested. +cladeSamples = {} +cladeSampleCounts = {} +cladeSampleNames = {} + def sampleIdsFromNode(node, ids): """Fill in a dict of IDs of all samples found under node.""" kids = node.get('children') if (kids): for kid in kids: sampleIdsFromNode(kid, ids) else: sampleId = node['node_attrs']['gisaid_epi_isl']['value'] ids[sampleId] = 1 for cladeName, node in cladeNodes.items(): sampleIds = {} sampleIdsFromNode(node, sampleIds) - cladeSamples = [ sample for sample in samples if sample['id'] in sampleIds ] - cladeSampleCount = len(cladeSamples) - cladeSampleNames = [ '|'.join([ sample['id'], sample['name'], sample['date'] ]) - for sample in cladeSamples ] + cladeSampleList = [ sample for sample in samples if sample['id'] in sampleIds ] + cladeSamples[cladeName] = cladeSampleList + cladeSampleCounts[cladeName] = len(cladeSampleList) + cladeSampleNames[cladeName] = [ '|'.join([ sample['id'], sample['name'], sample['date'] ]) + for sample in cladeSampleList ] + +# Per-clade VCF subset +for cladeName, cladeSampleList in cladeSamples.items(): with open('nextstrainSamples' + cladeName + '.vcf', 'w') as outV: writeVcfHeaderExceptSamples(outV) - outV.write('\t'.join(cladeSampleNames) + '\n') + outV.write('\t'.join(cladeSampleNames[cladeName]) + '\n') for pv in parsedVars: varName = pv[1] genotypes = [] ac=0 - for sample in cladeSamples: + for sample in cladeSampleList: gt = boolToStr01(sample['variants'].get(varName)) genotypes.append(gt) if (sample['variants'].get(varName)): ac += 1 if (ac > 0): - info = 'AC=' + str(ac) + ';AN=' + str(cladeSampleCount) + info = 'AC=' + str(ac) + ';AN=' + str(cladeSampleCounts[cladeName]) aaChange = variantAaChanges.get(varName) if (aaChange): info += ';AACHANGE=' + aaChange outV.write('\t'.join([ chrom, '\t'.join(map(str, pv)), '\t'.join(['.', 'PASS', info, 'GT']), '\t'.join(genotypes) ]) + '\n') outV.close() # BED+ file for clades with open('nextstrainClade.bed', 'w') as outC: for name, clade in clades.items(): if (clade.get('thickStart')): outC.write('\t'.join(map(str, [ chrom, 0, 29903, name, 0, '.', clade['thickStart'], clade['thickEnd'], clade['color'], len(clade['varSizes']) + 2, '1,' + ','.join(map(str, clade['varSizes'])) + ',1,', '0,' + ','.join(map(str, clade['varStarts'])) + ',29902,', clade['varNames'], numDateToYmdStr(clade['dateInferred']), numDateToYmdStr(clade['dateConfMin']), numDateToYmdStr(clade['dateConfMax']), clade['countryInferred'], clade['countryConf'], - clade['sampleCount'], - ', '.join(clade['samples']) ])) + '\n') + cladeSampleCounts[name], + ', '.join(cladeSampleNames[name]) ])) + '\n') outC.close() # Newick-formatted tree of samples for VCF display def rNextstrainToNewick(node): """Recursively descend ncov.tree and build Newick tree string of samples to file""" kids = node.get('children') if (kids): treeString = '(' + ','.join([ rNextstrainToNewick(child) for child in kids ]) + ')' else: nodeAttrs = node['node_attrs'] gId = nodeAttrs['gisaid_epi_isl']['value'] name = node['name'] date = numDateToMonthDay(nodeAttrs['num_date']['value']) treeString = '|'.join([ gId, name, date ]) return treeString