fd95861dbf3dd970c833b11076daa0da8738b1e8 angie Fri Mar 20 16:43:31 2020 -0700 Adding 3 tracks derived from nextstrain.org: Nextstrain {Genes, Clades, Variants}. refs #25188 diff --git src/hg/utils/otto/nextstrainNcov/nextstrain.py src/hg/utils/otto/nextstrainNcov/nextstrain.py new file mode 100755 index 0000000..ceaad22 --- /dev/null +++ src/hg/utils/otto/nextstrainNcov/nextstrain.py @@ -0,0 +1,193 @@ +#!/usr/bin/env python3 + +import json +import re +from warnings import warn + +with open('ncov.json') as f: + ncov = json.load(f) + +chrom = 'NC_045512v2' + +# Genes (to which AA mutations are relative): +genePos = {} +geneBed = [] +for gene,anno in ncov['meta']['genome_annotations'].items(): + if (gene != 'nuc'): + genePos[gene] = anno['start'] - 1 + geneBed.append([chrom, anno['start']-1, anno['end'], gene]) +def bedStart(bed): + return bed[1] +geneBed.sort(key=bedStart) +with open('nextstrainGene.bed', 'w') as outG: + for bed in geneBed: + outG.write('\t'.join(map(str, bed)) + '\n') + outG.close() + +# Variants and "clades" + +snvRe = re.compile('^([ACGT])([0-9]+)([ACGT])$') +snvAaRe = re.compile('^([A-Z*])([0-9]+)([A-Z*])$') + +clades = {} +variantCounts = {} +variantAaChanges = {} +samples = [] + +cladeColors = { 'A1a': '73,75,225', 'A2': '75,131,233', 'A2a': '92,173,207', + 'A3': '119,199,164', 'A6': '154,212,122', 'A7': '173,189,81', + 'B': '233,205,74', 'B1': '255,176,65', 'B2': '255,122,53', + 'B4': '249,53,41' } + +def cladeColorFromName(cladeName): + color = cladeColors.get(cladeName); + if (not color): + color = 'purple' + return color + +def subtractStart(coord, start): + return coord - start + +def cladeFromVariants(name, variants): + """Extract bed12 info from an object whose keys are SNV variant names""" + clade = {} + snvEnds = [] + varNames = [] + for varName in variants: + m = snvRe.match(varName) + if (m): + snvEnds.append(int(m.group(2))) + varNames.append(varName) + if snvEnds: + snvEnds.sort() + snvStarts = list(map(lambda x: x-1, snvEnds)) + snvSizes = list(map(lambda x: 1, snvEnds)) + clade['thickStart'] = min(snvStarts) + clade['thickEnd'] = max(snvEnds) + clade['name'] = name + clade['color'] = cladeColorFromName(name) + clade['varSizes'] = snvSizes + clade['varStarts'] = snvStarts + clade['varNames'] = varNames + return clade + +def rUnpackNextstrainTree(branch, parentVariants): + """Recursively descend ncov.tree and build data structures for genome browser tracks""" + # Inherit parent variants + branchVariants = parentVariants.copy() + # Add variants specific to this branch (if any) + try: + # Nucleotide variants specific to this branch + for varName in branch['branch_attrs']['mutations']['nuc']: + if (snvRe.match(varName)): + branchVariants[varName] = 1 + if (not variantCounts.get(varName)): + variantCounts[varName] = 0; + # Amino acid variants specific to this branch + for geneName in branch['branch_attrs']['mutations'].keys(): + if (geneName != 'nuc'): + for change in branch['branch_attrs']['mutations'][geneName]: + # Get nucleotide coords & figure out which nuc var this aa change corresponds to + aaM = snvAaRe.match(change) + if (aaM): + aaRef, aaPos, aaAlt = aaM.groups() + varStartMin = (int(aaPos) - 1) * 3 + if (genePos.get(geneName)): + cdsStart = genePos.get(geneName) + varStartMin += cdsStart + varStartMax = varStartMin + 2 + for varName in branchVariants.keys(): + ref, pos, alt = snvRe.match(varName).groups() + pos = int(pos) + if (pos >= varStartMin and pos <= varStartMax): + variantAaChanges[varName] = geneName + ':' + change + else: + warn("Can't find start for gene " + geneName) + else: + warn("Can't match amino acid change" + change) + except KeyError: + pass + if (branch['node_attrs'].get('clade_membership')): + cladeName = branch['node_attrs']['clade_membership']['value'] + if (not cladeName in clades): + clades[cladeName] = cladeFromVariants(cladeName, branchVariants) + kids = branch.get('children') + if (kids): + for child in kids: + rUnpackNextstrainTree(child, branchVariants); + else: + for varName in branchVariants: + variantCounts[varName] += 1 + samples.append({ 'id': branch['node_attrs']['gisaid_epi_isl']['value'], + 'name': branch['name'], + 'clade': branch['node_attrs']['clade_membership']['value'], + 'variants': branchVariants }) + if (cladeName): + if (clades[cladeName].get('sampleCount')): + clades[cladeName]['sampleCount'] += 1 + else: + clades[cladeName]['sampleCount'] = 1 + if (clades[cladeName].get('samples')): + clades[cladeName]['samples'].append(branch['name']) + else: + clades[cladeName]['samples'] = [ branch['name'] ] + +rUnpackNextstrainTree(ncov['tree'], {}) + +sampleCount = len(samples) +sampleNames = [ sample['id'] + ':' + sample['name'] for sample in samples ] + +# Parse variant names like 'G11083T' into pos and alleles; bundle in VCF column order +parsedVars = [] +for varName in variantCounts.keys(): + m = snvRe.match(varName) + if (m): + ref, pos, alt = m.groups() + parsedVars.append([int(pos), varName, ref, alt]) + else: + warn("Can't match " + varName) +# Sort by position +def parsedVarPos(pv): + return pv[0] +parsedVars.sort(key=parsedVarPos) + +def boolToStr01(bool): + if (bool): + return '1' + else: + return '0' + +with open('nextstrainSamples.vcf', 'w') as outC: + outC.write('##fileformat=VCFv4.3\n') + outC.write('##source=nextstrain.org\n') + outC.write('\t'.join(['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT']) + + '\t' + '\t'.join(sampleNames) + '\n') + for pv in parsedVars: + varName = pv[1] + info = 'AC=' + str(variantCounts[varName]) + ';AN=' + str(sampleCount) + aaChange = variantAaChanges.get(varName) + if (aaChange): + info += ';AACHANGE=' + aaChange + genotypes = [] + for sample in samples: + gt = boolToStr01(sample['variants'].get(varName)) + genotypes.append(gt + ':' + sample['clade']) + outC.write('\t'.join([ chrom, + '\t'.join(map(str, pv)), + '\t'.join(['.', 'PASS', info, 'GT:CLADE']), + '\t'.join(genotypes) ]) + '\n') + outC.close() + +with open('nextstrainClade.bed', 'w') as outC: + for name, clade in clades.items(): + if (clade.get('thickStart')): + outC.write('\t'.join(map(str, + [ chrom, 0, 29903, name, 0, '.', + clade['thickStart'], clade['thickEnd'], clade['color'], + len(clade['varSizes']) + 2, + '1,' + ','.join(map(str, clade['varSizes'])) + ',1,', + '0,' + ','.join(map(str, clade['varStarts'])) + ',29902,', + ', '.join(clade['varNames']), + clade['sampleCount'], + ', '.join(clade['samples']) ])) + '\n') + outC.close()