475424b5d722b4ff761681f1d3bd91ad666fb2d6 angie Wed Apr 1 10:17:03 2020 -0700 For nested clades, add all clade samples to per-clade VCF. Also generate Newick (.nh) tree file and tweak sample name separator, in preparation for drawing the nextstrain tree in the VCF left label area. refs #25188 diff --git src/hg/utils/otto/nextstrainNcov/nextstrain.py src/hg/utils/otto/nextstrainNcov/nextstrain.py index 4c3971e..50e5a9e 100755 --- src/hg/utils/otto/nextstrainNcov/nextstrain.py +++ src/hg/utils/otto/nextstrainNcov/nextstrain.py @@ -18,30 +18,31 @@ geneBed.append([chrom, anno['start']-1, anno['end'], gene]) def bedStart(bed): return bed[1] geneBed.sort(key=bedStart) with open('nextstrainGene.bed', 'w') as outG: for bed in geneBed: outG.write('\t'.join(map(str, bed)) + '\n') outG.close() # Variants and "clades" snvRe = re.compile('^([ACGT])([0-9]+)([ACGT])$') snvAaRe = re.compile('^([A-Z*])([0-9]+)([A-Z*])$') clades = {} +cladeNodes = {} variantCounts = {} variantAaChanges = {} samples = [] cladeColors = { 'A1a': '73,75,225', 'A2': '75,131,233', 'A2a': '92,173,207', 'A3': '119,199,164', 'A6': '154,212,122', 'A7': '173,189,81', 'B': '233,205,74', 'B1': '255,176,65', 'B2': '255,122,53', 'B4': '249,53,41' } def cladeColorFromName(cladeName): color = cladeColors.get(cladeName); if (not color): color = 'purple' return color @@ -175,65 +176,66 @@ # Make an ordered variant string as David requested: semicolons between nodes, # comma-separated within a node: branchVarStr = '' for varName in localVariants: if (len(branchVarStr)): branchVarStr += ', ' branchVarStr += varName aaVar = variantAaChanges.get(varName) if (aaVar): branchVarStr += ' (' + aaVar + ')' if (len(parentVarStr) and len(branchVarStr)): branchVarStr = parentVarStr + '; ' + branchVarStr nodeAttrs = branch['node_attrs'] if (nodeAttrs.get('clade_membership')): cladeName = nodeAttrs['clade_membership']['value'] - if (not cladeName in clades): + if (cladeName != 'unassigned' and not cladeName in clades): clades[cladeName] = cladeFromVariants(cladeName, branchVariants, branchVarStr) addDatesToClade(clades[cladeName], nodeAttrs['num_date']) if (nodeAttrs.get('country')): addCountryToClade(clades[cladeName], nodeAttrs['country']) elif (nodeAttrs.get('division')): addCountryToClade(clades[cladeName], nodeAttrs['division']) else: warn('No country or division for new clade ' + cladeName) + cladeNodes[cladeName] = branch kids = branch.get('children') if (kids): for child in kids: rUnpackNextstrainTree(child, branchVariants, branchVarStr); else: for varName in branchVariants: variantCounts[varName] += 1 samples.append({ 'id': nodeAttrs['gisaid_epi_isl']['value'], 'name': branch['name'], 'clade': nodeAttrs['clade_membership']['value'], 'date': numDateToMonthDay(nodeAttrs['num_date']['value']), 'variants': branchVariants }) - if (cladeName): + if (cladeName and cladeName != 'unassigned'): if (clades[cladeName].get('sampleCount')): clades[cladeName]['sampleCount'] += 1 else: clades[cladeName]['sampleCount'] = 1 if (clades[cladeName].get('samples')): clades[cladeName]['samples'].append(branch['name']) else: clades[cladeName]['samples'] = [ branch['name'] ] rUnpackNextstrainTree(ncov['tree'], {}, '') sampleCount = len(samples) -sampleNames = [ ':'.join([sample['id'], sample['name'], sample['date']]) for sample in samples ] +sampleNames = [ '|'.join([sample['id'], sample['name'], sample['date']]) for sample in samples ] # Parse variant names like 'G11083T' into pos and alleles; bundle in VCF column order parsedVars = [] for varName in variantCounts.keys(): m = snvRe.match(varName) if (m): ref, pos, alt = m.groups() parsedVars.append([int(pos), varName, ref, alt]) else: warn("Can't match " + varName) # Sort by position def parsedVarPos(pv): return pv[0] parsedVars.sort(key=parsedVarPos) @@ -260,35 +262,47 @@ info = 'AC=' + str(variantCounts[varName]) + ';AN=' + str(sampleCount) aaChange = variantAaChanges.get(varName) if (aaChange): info += ';AACHANGE=' + aaChange genotypes = [] for sample in samples: gt = boolToStr01(sample['variants'].get(varName)) genotypes.append(gt + ':' + sample['clade']) outC.write('\t'.join([ chrom, '\t'.join(map(str, pv)), '\t'.join(['.', 'PASS', info, 'GT:CLADE']), '\t'.join(genotypes) ]) + '\n') outC.close() # Per-clade VCF subset -for cladeName in clades: - if cladeName != 'unassigned': - cladeSamples = [ sample for sample in samples if (sample['clade'] == cladeName) ] +def sampleIdsFromNode(node, ids): + """Fill in a dict of IDs of all samples found under node.""" + kids = node.get('children') + if (kids): + for kid in kids: + sampleIdsFromNode(kid, ids) + else: + sampleId = node['node_attrs']['gisaid_epi_isl']['value'] + ids[sampleId] = 1 + +for cladeName, node in cladeNodes.items(): + sampleIds = {} + sampleIdsFromNode(node, sampleIds) + cladeSamples = [ sample for sample in samples if sample['id'] in sampleIds ] cladeSampleCount = len(cladeSamples) - cladeSampleNames = [ sample['name'] for sample in cladeSamples ] + cladeSampleNames = [ '|'.join([ sample['id'], sample['name'], sample['date'] ]) + for sample in cladeSamples ] with open('nextstrainSamples' + cladeName + '.vcf', 'w') as outV: writeVcfHeaderExceptSamples(outV) outV.write('\t'.join(cladeSampleNames) + '\n') for pv in parsedVars: varName = pv[1] genotypes = [] ac=0 for sample in cladeSamples: gt = boolToStr01(sample['variants'].get(varName)) genotypes.append(gt) if (sample['variants'].get(varName)): ac += 1 if (ac > 0): info = 'AC=' + str(ac) + ';AN=' + str(cladeSampleCount) aaChange = variantAaChanges.get(varName) @@ -307,15 +321,39 @@ outC.write('\t'.join(map(str, [ chrom, 0, 29903, name, 0, '.', clade['thickStart'], clade['thickEnd'], clade['color'], len(clade['varSizes']) + 2, '1,' + ','.join(map(str, clade['varSizes'])) + ',1,', '0,' + ','.join(map(str, clade['varStarts'])) + ',29902,', clade['varNames'], numDateToYmdStr(clade['dateInferred']), numDateToYmdStr(clade['dateConfMin']), numDateToYmdStr(clade['dateConfMax']), clade['countryInferred'], clade['countryConf'], clade['sampleCount'], ', '.join(clade['samples']) ])) + '\n') outC.close() + +# Newick-formatted tree of samples for VCF display +def rNextstrainToNewick(node): + """Recursively descend ncov.tree and build Newick tree string of samples to file""" + kids = node.get('children') + if (kids): + treeString = '(' + ','.join([ rNextstrainToNewick(child) for child in kids ]) + ')' + else: + nodeAttrs = node['node_attrs'] + gId = nodeAttrs['gisaid_epi_isl']['value'] + name = node['name'] + date = numDateToMonthDay(nodeAttrs['num_date']['value']) + treeString = '|'.join([ gId, name, date ]) + return treeString + +with open('nextstrain.nh', 'w') as outF: + outF.write(rNextstrainToNewick(ncov['tree']) + ';\n') + outF.close + +for cladeName, node in cladeNodes.items(): + filename = 'nextstrain' + cladeName + '.nh' + with open(filename, 'w') as outF: + outF.write(rNextstrainToNewick(node) + ';\n') + outF.close