0b18ad3971852efb98daabd60d55ec1cc1711612 angie Thu May 14 13:46:59 2020 -0700 Python learning: the whole point of 'with open(...) as f' is that you don't have to remember to close f. diff --git src/hg/utils/otto/nextstrainNcov/nextstrain.py src/hg/utils/otto/nextstrainNcov/nextstrain.py index 3012d76..775fad5 100755 --- src/hg/utils/otto/nextstrainNcov/nextstrain.py +++ src/hg/utils/otto/nextstrainNcov/nextstrain.py @@ -10,31 +10,30 @@ chrom = 'NC_045512v2' # Genes (to which AA mutations are relative): genePos = {} geneBed = [] for gene,anno in ncov['meta']['genome_annotations'].items(): if (gene != 'nuc'): genePos[gene] = anno['start'] - 1 geneBed.append([chrom, anno['start']-1, anno['end'], gene]) def bedStart(bed): return bed[1] geneBed.sort(key=bedStart) with open('nextstrainGene.bed', 'w') as outG: for bed in geneBed: outG.write('\t'.join(map(str, bed)) + '\n') - outG.close() # Variants and "clades" snvRe = re.compile('^([ACGT])([0-9]+)([ACGT])$') snvAaRe = re.compile('^([A-Z*])([0-9]+)([A-Z*])$') clades = {} cladeNodes = {} variantCounts = {} variantAaChanges = {} samples = [] cladeColors = { 'A1a': '73,75,225', 'A2': '75,131,233', 'A2a': '92,173,207', 'A3': '119,199,164', 'A6': '154,212,122', 'A7': '173,189,81', 'B': '233,205,74', 'B1': '255,176,65', 'B2': '255,122,53', @@ -328,31 +327,30 @@ info = 'AC=' + ','.join(map(str, altCounts)) + ';AN=' + str(sampleCount) varNameMerged = pv[1] aaChange = tallyAaChanges(varNameMerged) if (len(aaChange)): info += ';AACHANGE=' + aaChange if (len(backMutSamples)): info += ';BACKMUTS=' + ','.join(backMutSamples) genotypes = [] for sample, alIx in zip(samples, sampleAlleles): gt = str(alIx) genotypes.append(gt + ':' + sample['clade']) outC.write('\t'.join([ chrom, '\t'.join(map(str, pv)), '\t'.join(['.', 'PASS', info, 'GT:CLADE']), '\t'.join(genotypes) ]) + '\n') - outC.close() # Assign samples to clades; a sample can appear in multiple clades if they are nested. cladeSamples = {} cladeSampleCounts = {} cladeSampleNames = {} def sampleIdsFromNode(node, ids): """Fill in a dict of IDs of all samples found under node.""" kids = node.get('children') if (kids): for kid in kids: sampleIdsFromNode(kid, ids) else: sampleId = node['node_attrs']['gisaid_epi_isl']['value'] ids[sampleId] = 1 @@ -385,51 +383,49 @@ acTotal += 1 if (acTotal > 0): info = 'AC=' + ','.join(map(str, altCounts)) info += ';AN=' + str(cladeSampleCounts[cladeName]) aaChange = tallyAaChanges(varNameMerged) if (len(aaChange)): info += ';AACHANGE=' + aaChange cladeBackMuts = [ sampleName for sampleName in backMutSamples if sampleName in cladeSampleNames[cladeName] ] if (len(cladeBackMuts)): info += ';BACKMUTS=' + ','.join(cladeBackMuts) outV.write('\t'.join([ chrom, '\t'.join(map(str, pv)), '\t'.join(['.', 'PASS', info, 'GT']), '\t'.join(genotypes) ]) + '\n') - outV.close() # BED+ file for clades with open('nextstrainClade.bed', 'w') as outC: for name, clade in clades.items(): if (clade.get('thickStart')): outC.write('\t'.join(map(str, [ chrom, 0, 29903, name, 0, '.', clade['thickStart'], clade['thickEnd'], clade['color'], len(clade['varSizes']) + 2, '1,' + ','.join(map(str, clade['varSizes'])) + ',1,', '0,' + ','.join(map(str, clade['varStarts'])) + ',29902,', clade['varNames'], numDateToYmdStr(clade['dateInferred']), numDateToYmdStr(clade['dateConfMin']), numDateToYmdStr(clade['dateConfMax']), clade['countryInferred'], clade['countryConf'], cladeSampleCounts[name], ', '.join(cladeSampleNames[name]) ])) + '\n') - outC.close() # Newick-formatted tree of samples for VCF display def cladeRgbFromName(cladeName): """Look up the r,g,b string color for clade; convert to int RGB.""" rgbCommaStr = cladeColorFromName(cladeName) r, g, b = [ int(x) for x in rgbCommaStr.split(',') ] rgb = (r << 16) | (g << 8) | b return rgb def rNextstrainToNewick(node, parentClade=None, parentVarStr=''): """Recursively descend ncov.tree and build Newick tree string of samples to file""" kids = node.get('children') if (kids): # Make a more concise variant path string than the one we make for the clade track, # to embed in internal node labels for Yatish's tree explorations. @@ -456,97 +452,91 @@ descendants = ','.join([ rNextstrainToNewick(child, cladeName, varStr) for child in kids ]) label = '#'.join([cladeName, varStr]) treeString = '(' + descendants + ')' + label + ':' + color else: nodeAttrs = node['node_attrs'] gId = nodeAttrs['gisaid_epi_isl']['value'] name = node['name'] date = numDateToMonthDay(nodeAttrs['num_date']['value']) cladeName = nodeAttrs['clade_membership']['value'] color = str(cladeRgbFromName(cladeName)) treeString = sampleName({ 'id': gId, 'name': name, 'date': date }) + ':' + color return treeString with open('nextstrain.nh', 'w') as outF: outF.write(rNextstrainToNewick(ncov['tree']) + ';\n') - outF.close() for cladeName, node in cladeNodes.items(): filename = 'nextstrain' + cladeName + '.nh' with open(filename, 'w') as outF: outF.write(rNextstrainToNewick(node) + ';\n') - outF.close() # File with samples and their clades, labs and variant paths apostropheSRe = re.compile("'s"); firstLetterRe = re.compile('(\w)\w+'); spacePunctRe = re.compile('\W'); def abbreviateLab(lab): """Lab names are very long and sometimes differ by punctuation or typos. Abbreviate for easier comparison.""" labAbbrev = apostropheSRe.sub('', lab) labAbbrev = firstLetterRe.sub(r'\1', labAbbrev, count=0) labAbbrev = spacePunctRe.sub('', labAbbrev, count=0) return labAbbrev with open('nextstrainSamples.varPaths', 'w') as outF: for sample in samples: lab = sample['lab'] labAbbrev = abbreviateLab(lab) outF.write('\t'.join([sampleName(sample), sample['clade'], labAbbrev, lab, sample['varStr']]) + '\n'); - outF.close() # Narrow down variants to "informative" set (bi-allelic, each allele supported by # sufficient number of samples): minSamples = 2 discardedAlleles = [] blacklist = [] informativeVariants = [] for mv in mergedVars: pv, alts, altCounts, sampleAlleles, backMutSamples = mv pos, varNameMerged, ref, altStr = pv recurrentAlts = [] for alt, altCount in zip(alts, altCounts): if (altCount < minSamples): discardedAlleles.append([ chrom, pos-1, pos, ref + str(pos) + alt ]) else: recurrentAlts.append(alt) if (len(recurrentAlts) > 1): multiRecurrentName = ','.join([ ref + str(pos) + alt for alt in recurrentAlts ]) blacklist.append([ chrom, pos-1, pos, multiRecurrentName ]) elif (len(recurrentAlts) == 1): informativeVariants.append([ pos, ref, recurrentAlts[0] ]) # Dump out BED files for the categories: with open('nextstrainDiscarded.bed', 'w') as outF: for da in discardedAlleles: outF.write('\t'.join(map(str, da)) + '\n'); - outF.close() with open('nextstrainBlacklisted.bed', 'w') as outF: for bl in blacklist: outF.write('\t'.join(map(str, bl)) + '\n'); - outF.close() with open('nextstrainInformative.bed', 'w') as outF: for iv in informativeVariants: pos, ref, alt = iv outF.write('\t'.join(map(str, [ chrom, pos-1, pos, ref + str(pos) + alt ])) + '\n') - outF.close() # Compute parsimony score for tree at each informative variant. anyDiscordantVariants = [] def refAltCosts(node, pos, ref, alt, parentValue): nodeValue = parentValue mutCount = 0 # If there are mutations on this branch, and one of them is at pos and is ref or alt # (i.e. not a discarded allele that we're ignoring), then change nodeValue. if (node.get('branch_attrs') and node['branch_attrs'].get('mutations') and node['branch_attrs']['mutations'].get('nuc')): for varName in node['branch_attrs']['mutations']['nuc']: m = snvRe.match(varName) if (m): @@ -578,49 +568,45 @@ for iv in informativeVariants: pos, ref, alt = iv varName = ref + str(pos) + alt rootRefCost, rootAltCost, mutCount = refAltCosts(ncov['tree'], pos, ref, alt, ref) parsimonyScores.append([pos, min(rootRefCost, rootAltCost)]) mutationCounts.append([pos, mutCount]) rootLabel = 0 if (rootRefCost <= rootAltCost) else 1 if (rootLabel != 0): # Note: so far this has not happened. Seems like it would be pretty lame if it did. rootDiscordantVariants.append([chrom, pos-1, pos, varName]) # Write out files with discordant nodes, parsimony/mutation counts (identical!): with open('nextstrainRootDiscordant.bed', 'w') as outF: for dv in rootDiscordantVariants: outF.write('\t'.join(dv) + '\n'); - outF.close(); with open('nextstrainAnyDiscordant.txt', 'w') as outF: for varName in anyDiscordantVariants: outF.write(varName + '\n'); - outF.close() # bedGraph for parsimony scores and mutation counts with open('nextstrainParsimony.bedGraph', 'w') as outF: for ps in parsimonyScores: pos, score = ps outF.write('\t'.join(map(str, [ chrom, pos-1, pos, score ])) + '\n') - outF.close() with open('nextstrainMutCounts.bedGraph', 'w') as outF: for ps in mutationCounts: pos, score = ps outF.write('\t'.join(map(str, [ chrom, pos-1, pos, score ])) + '\n') - outF.close() # Informative-only VCF with open('nextstrainRecurrentBiallelic.vcf', 'w') as outF: writeVcfHeaderExceptSamples(outF) outF.write('\t'.join(sampleNames) + '\n') for iv in informativeVariants: pos, ref, alt = iv varName = ref + str(pos) + alt # Ignore any discarded variants at this position, but handle back-mutations if any, # by calling mergeVariants on this and its back-mutation backMutVarName = alt + str(pos) + ref variants = [ [ pos, varName, ref, alt ] ] if (variantCounts.get(backMutVarName)): variants.append([ pos, backMutVarName, alt, ref ]) pv, alts, altCounts, sampleAlleles, backMutSamples = mergeVariants(variants) @@ -629,16 +615,15 @@ ', but got [' + ', '.join(alts) + ']') info = 'AC=' + str(altCounts[0]) info += ';AN=' + str(sampleCount) aaChange = tallyAaChanges(varName) if (len(aaChange)): info += ';AACHANGE=' + aaChange if (len(backMutSamples)): info += ';BACKMUTS=' + ','.join(backMutSamples) genotypes = [] for sample, alIx in zip(samples, sampleAlleles): gt = str(alIx) genotypes.append(gt + ':' + sample['clade']) outF.write('\t'.join([ '\t'.join([ chrom, str(pos), varName, ref, alt, '.', 'PASS', info, 'GT']), '\t'.join(genotypes) ]) + '\n') - outF.close()