src/hg/utils/otto/nextstrainNcov/nextstrain.py 0b18ad3971852efb98daabd60d55ec1cc1711612

0b18ad3971852efb98daabd60d55ec1cc1711612
angie
  Thu May 14 13:46:59 2020 -0700
Python learning: the whole point of 'with open(...) as f' is that you don't have to remember to close f.

diff --git src/hg/utils/otto/nextstrainNcov/nextstrain.py src/hg/utils/otto/nextstrainNcov/nextstrain.py
index 3012d76..775fad5 100755
--- src/hg/utils/otto/nextstrainNcov/nextstrain.py
+++ src/hg/utils/otto/nextstrainNcov/nextstrain.py
@@ -10,31 +10,30 @@
 chrom = 'NC_045512v2'
 
 # Genes (to which AA mutations are relative):
 genePos = {}
 geneBed = []
 for gene,anno in ncov['meta']['genome_annotations'].items():
     if (gene != 'nuc'):
         genePos[gene] = anno['start'] - 1
         geneBed.append([chrom, anno['start']-1, anno['end'], gene])
 def bedStart(bed):
     return bed[1]
 geneBed.sort(key=bedStart)
 with open('nextstrainGene.bed', 'w') as outG:
     for bed in geneBed:
         outG.write('\t'.join(map(str, bed)) + '\n')
-    outG.close()
 
 # Variants and "clades"
 
 snvRe = re.compile('^([ACGT])([0-9]+)([ACGT])$')
 snvAaRe = re.compile('^([A-Z*])([0-9]+)([A-Z*])$')
 
 clades = {}
 cladeNodes = {}
 variantCounts = {}
 variantAaChanges = {}
 samples = []
 
 cladeColors = { 'A1a': '73,75,225', 'A2': '75,131,233', 'A2a': '92,173,207',
                 'A3': '119,199,164', 'A6': '154,212,122', 'A7': '173,189,81',
                 'B': '233,205,74', 'B1': '255,176,65', 'B2': '255,122,53',
@@ -328,31 +327,30 @@
         info = 'AC=' + ','.join(map(str, altCounts)) + ';AN=' + str(sampleCount)
         varNameMerged = pv[1]
         aaChange = tallyAaChanges(varNameMerged)
         if (len(aaChange)):
             info += ';AACHANGE=' + aaChange
         if (len(backMutSamples)):
             info += ';BACKMUTS=' + ','.join(backMutSamples)
         genotypes = []
         for sample, alIx in zip(samples, sampleAlleles):
             gt = str(alIx)
             genotypes.append(gt + ':' + sample['clade'])
         outC.write('\t'.join([ chrom,
                                '\t'.join(map(str, pv)),
                                '\t'.join(['.', 'PASS', info, 'GT:CLADE']),
                                '\t'.join(genotypes) ]) + '\n')
-    outC.close()
 
 # Assign samples to clades; a sample can appear in multiple clades if they are nested.
 cladeSamples = {}
 cladeSampleCounts = {}
 cladeSampleNames = {}
 
 def sampleIdsFromNode(node, ids):
     """Fill in a dict of IDs of all samples found under node."""
     kids = node.get('children')
     if (kids):
         for kid in kids:
             sampleIdsFromNode(kid, ids)
     else:
         sampleId = node['node_attrs']['gisaid_epi_isl']['value']
         ids[sampleId] = 1
@@ -385,51 +383,49 @@
                         acTotal += 1
             if (acTotal > 0):
                 info = 'AC=' + ','.join(map(str, altCounts))
                 info += ';AN=' + str(cladeSampleCounts[cladeName])
                 aaChange = tallyAaChanges(varNameMerged)
                 if (len(aaChange)):
                     info += ';AACHANGE=' + aaChange
                 cladeBackMuts = [ sampleName for sampleName in backMutSamples
                                   if sampleName in cladeSampleNames[cladeName] ]
                 if (len(cladeBackMuts)):
                     info += ';BACKMUTS=' + ','.join(cladeBackMuts)
                 outV.write('\t'.join([ chrom,
                                        '\t'.join(map(str, pv)),
                                        '\t'.join(['.', 'PASS', info, 'GT']),
                                        '\t'.join(genotypes) ]) + '\n')
-        outV.close()
 
 # BED+ file for clades
 with open('nextstrainClade.bed', 'w') as outC:
     for name, clade in clades.items():
         if (clade.get('thickStart')):
             outC.write('\t'.join(map(str,
                                      [ chrom, 0, 29903, name, 0, '.',
                                        clade['thickStart'], clade['thickEnd'], clade['color'],
                                        len(clade['varSizes']) + 2,
                                        '1,' + ','.join(map(str, clade['varSizes'])) + ',1,',
                                        '0,' + ','.join(map(str, clade['varStarts'])) + ',29902,',
                                        clade['varNames'],
                                        numDateToYmdStr(clade['dateInferred']),
                                        numDateToYmdStr(clade['dateConfMin']),
                                        numDateToYmdStr(clade['dateConfMax']),
                                        clade['countryInferred'],
                                        clade['countryConf'],
                                        cladeSampleCounts[name],
                                        ', '.join(cladeSampleNames[name]) ])) + '\n')
-    outC.close()
 
 # Newick-formatted tree of samples for VCF display
 def cladeRgbFromName(cladeName):
     """Look up the r,g,b string color for clade; convert to int RGB."""
     rgbCommaStr = cladeColorFromName(cladeName)
     r, g, b = [ int(x) for x in rgbCommaStr.split(',') ]
     rgb = (r << 16) | (g << 8) | b
     return rgb
 
 def rNextstrainToNewick(node, parentClade=None, parentVarStr=''):
     """Recursively descend ncov.tree and build Newick tree string of samples to file"""
     kids = node.get('children')
     if (kids):
         # Make a more concise variant path string than the one we make for the clade track,
         # to embed in internal node labels for Yatish's tree explorations.
@@ -456,97 +452,91 @@
         descendants = ','.join([ rNextstrainToNewick(child, cladeName, varStr) for child in kids ])
         label = '#'.join([cladeName, varStr])
         treeString = '(' + descendants + ')' + label + ':' + color
     else:
         nodeAttrs = node['node_attrs']
         gId = nodeAttrs['gisaid_epi_isl']['value']
         name = node['name']
         date = numDateToMonthDay(nodeAttrs['num_date']['value'])
         cladeName = nodeAttrs['clade_membership']['value']
         color = str(cladeRgbFromName(cladeName))
         treeString = sampleName({ 'id': gId, 'name': name, 'date': date }) + ':' + color
     return treeString
 
 with open('nextstrain.nh', 'w') as outF:
     outF.write(rNextstrainToNewick(ncov['tree']) + ';\n')
-    outF.close()
 
 for cladeName, node in cladeNodes.items():
     filename = 'nextstrain' + cladeName + '.nh'
     with open(filename, 'w') as outF:
         outF.write(rNextstrainToNewick(node) + ';\n')
-        outF.close()
 
 # File with samples and their clades, labs and variant paths
 
 apostropheSRe = re.compile("'s");
 firstLetterRe = re.compile('(\w)\w+');
 spacePunctRe = re.compile('\W');
 
 def abbreviateLab(lab):
     """Lab names are very long and sometimes differ by punctuation or typos.  Abbreviate for easier comparison."""
     labAbbrev = apostropheSRe.sub('', lab)
     labAbbrev = firstLetterRe.sub(r'\1', labAbbrev, count=0)
     labAbbrev = spacePunctRe.sub('', labAbbrev, count=0)
     return labAbbrev
 
 with open('nextstrainSamples.varPaths', 'w') as outF:
     for sample in samples:
         lab = sample['lab']
         labAbbrev = abbreviateLab(lab)
         outF.write('\t'.join([sampleName(sample), sample['clade'], labAbbrev, lab,
                               sample['varStr']]) + '\n');
-    outF.close()
 
 # Narrow down variants to "informative" set (bi-allelic, each allele supported by
 # sufficient number of samples):
 minSamples = 2
 discardedAlleles = []
 blacklist = []
 informativeVariants = []
 
 for mv in mergedVars:
     pv, alts, altCounts, sampleAlleles, backMutSamples = mv
     pos, varNameMerged, ref, altStr = pv
     recurrentAlts = []
     for alt, altCount in zip(alts, altCounts):
         if (altCount < minSamples):
             discardedAlleles.append([ chrom, pos-1, pos, ref + str(pos) + alt ])
         else:
             recurrentAlts.append(alt)
     if (len(recurrentAlts) > 1):
         multiRecurrentName = ','.join([ ref + str(pos) + alt for alt in recurrentAlts ])
         blacklist.append([ chrom, pos-1, pos, multiRecurrentName ])
     elif (len(recurrentAlts) == 1):
         informativeVariants.append([ pos, ref, recurrentAlts[0] ])
 
 # Dump out BED files for the categories:
 with open('nextstrainDiscarded.bed', 'w') as outF:
     for da in discardedAlleles:
         outF.write('\t'.join(map(str, da)) + '\n');
-    outF.close()
 
 with open('nextstrainBlacklisted.bed', 'w') as outF:
     for bl in blacklist:
         outF.write('\t'.join(map(str, bl)) + '\n');
-    outF.close()
 
 with open('nextstrainInformative.bed', 'w') as outF:
     for iv in informativeVariants:
         pos, ref, alt = iv
         outF.write('\t'.join(map(str, [ chrom, pos-1, pos, ref + str(pos) + alt ])) + '\n')
-    outF.close()
 
 # Compute parsimony score for tree at each informative variant.
 
 anyDiscordantVariants = []
 
 def refAltCosts(node, pos, ref, alt, parentValue):
     nodeValue = parentValue
     mutCount = 0
     # If there are mutations on this branch, and one of them is at pos and is ref or alt
     # (i.e. not a discarded allele that we're ignoring), then change nodeValue.
     if (node.get('branch_attrs') and node['branch_attrs'].get('mutations') and
         node['branch_attrs']['mutations'].get('nuc')):
         for varName in node['branch_attrs']['mutations']['nuc']:
             m = snvRe.match(varName)
             if (m):
@@ -578,49 +568,45 @@
 for iv in informativeVariants:
     pos, ref, alt = iv
     varName = ref + str(pos) + alt
     rootRefCost, rootAltCost, mutCount = refAltCosts(ncov['tree'], pos, ref, alt, ref)
     parsimonyScores.append([pos, min(rootRefCost, rootAltCost)])
     mutationCounts.append([pos, mutCount])
     rootLabel = 0 if (rootRefCost <= rootAltCost) else 1
     if (rootLabel != 0):
         # Note: so far this has not happened.  Seems like it would be pretty lame if it did.
         rootDiscordantVariants.append([chrom, pos-1, pos, varName])
 
 # Write out files with discordant nodes, parsimony/mutation counts (identical!):
 with open('nextstrainRootDiscordant.bed', 'w') as outF:
     for dv in rootDiscordantVariants:
         outF.write('\t'.join(dv) + '\n');
-    outF.close();
 
 with open('nextstrainAnyDiscordant.txt', 'w') as outF:
     for varName in anyDiscordantVariants:
         outF.write(varName + '\n');
-    outF.close()
 
 # bedGraph for parsimony scores and mutation counts
 with open('nextstrainParsimony.bedGraph', 'w') as outF:
     for ps in parsimonyScores:
         pos, score = ps
         outF.write('\t'.join(map(str, [ chrom, pos-1, pos, score ])) + '\n')
-    outF.close()
 
 with open('nextstrainMutCounts.bedGraph', 'w') as outF:
     for ps in mutationCounts:
         pos, score = ps
         outF.write('\t'.join(map(str, [ chrom, pos-1, pos, score ])) + '\n')
-    outF.close()
 
 # Informative-only VCF
 with open('nextstrainRecurrentBiallelic.vcf', 'w') as outF:
     writeVcfHeaderExceptSamples(outF)
     outF.write('\t'.join(sampleNames) + '\n')
     for iv in informativeVariants:
         pos, ref, alt = iv
         varName = ref + str(pos) + alt
         # Ignore any discarded variants at this position, but handle back-mutations if any,
         # by calling mergeVariants on this and its back-mutation
         backMutVarName = alt + str(pos) + ref
         variants = [ [ pos, varName, ref, alt ] ]
         if (variantCounts.get(backMutVarName)):
             variants.append([ pos, backMutVarName, alt, ref ])
         pv, alts, altCounts, sampleAlleles, backMutSamples = mergeVariants(variants)
@@ -629,16 +615,15 @@
                  ', but got [' + ', '.join(alts) + ']')
         info = 'AC=' + str(altCounts[0])
         info += ';AN=' + str(sampleCount)
         aaChange = tallyAaChanges(varName)
         if (len(aaChange)):
             info += ';AACHANGE=' + aaChange
         if (len(backMutSamples)):
             info += ';BACKMUTS=' + ','.join(backMutSamples)
         genotypes = []
         for sample, alIx in zip(samples, sampleAlleles):
             gt = str(alIx)
             genotypes.append(gt + ':' + sample['clade'])
         outF.write('\t'.join([ '\t'.join([ chrom, str(pos), varName, ref, alt,
                                            '.', 'PASS', info, 'GT']),
                                '\t'.join(genotypes) ]) + '\n')
-    outF.close()