src/hg/utils/otto/nextstrainNcov/nextstrain.py f070f343168fcb94a21b832d1cb20104b0670186

f070f343168fcb94a21b832d1cb20104b0670186
angie
  Fri Jun 30 15:08:56 2023 -0700
There really should never be a clade-defining mutation on the first or last base of the genome (low/no coverage in most genomes), but this morning it happened, so at least prevent illegal overlapping blocks in nextstrainClade bed output.

diff --git src/hg/utils/otto/nextstrainNcov/nextstrain.py src/hg/utils/otto/nextstrainNcov/nextstrain.py
index de8b47d..f941e52 100755
--- src/hg/utils/otto/nextstrainNcov/nextstrain.py
+++ src/hg/utils/otto/nextstrainNcov/nextstrain.py
@@ -1,785 +1,791 @@
 #!/usr/bin/env python3
 
 import json
 import re
 from warnings import warn
 from collections import defaultdict
 
 with open('ncov.json') as f:
     ncov = json.load(f)
 
 chrom = 'NC_045512v2'
 
 # Genes (to which AA mutations are relative):
 genePos = {}
 geneBed = []
 for gene,anno in ncov['meta']['genome_annotations'].items():
     if (gene != 'nuc'):
         genePos[gene] = anno['start'] - 1
         geneBed.append([chrom, anno['start']-1, anno['end'], gene])
 def bedStart(bed):
     return bed[1]
 geneBed.sort(key=bedStart)
 with open('nextstrainGene.bed', 'w') as outG:
     for bed in geneBed:
         outG.write('\t'.join(map(str, bed)) + '\n')
 
 # Variants and clades
 
 snvRe = re.compile('^([ACGT-])([0-9]+)([ACGT-])$')
 snvAaRe = re.compile('^([A-Z*-])([0-9]+)([A-Z*-])$')
 
 newClades = {}
 variantCounts = {}
 variantAaChanges = {}
 samples = []
 
 # Clades from Feb. 2023:
 # nColors=23
 # head -$nColors ~/github/ncov/defaults/color_schemes.tsv | tail -1 \
 # | perl -wne 'chomp; @cols = grep { s/^#//; } split;
 #              foreach $hexC (@cols) {
 #                if ($hexC =~ m/^(..)(..)(..)$/) {
 #                  print sprintf("%d,%d,%d\n", hex($1), hex($2), hex($3));
 #                } else { die $hexC; } }'
 newCladeColors = { '20H (Beta, V2)':  '94,29,157',
                    '20I (Alpha, V1)': '74,39,178',
                    '20J (Gamma, V3)': '64,58,196',
                    '21A (Delta)' :    '63,82,205',
                    '21I (Delta)':     '65,105,207',
                    '21J (Delta)':     '69,126,203',
                    '21B (Kappa)':     '76,144,192',
                    '21C (Epsilon)':   '85,158,177',
                    '21D (Eta)':       '95,169,160',
                    '21E (Theta)':     '107,177,142',
                    '21F (Iota)':      '121,183,124',
                    '21G (Lambda)':    '137,187,107',
                    '21H (Mu)':        '153,189,93',
                    '21M (Omicron)':   '170,189,82', # B.1.1.529
                    '21K (Omicron)':   '187,188,73', # BA.1
                    '21L (Omicron)':   '203,184,66', # BA.2
                    '22A (Omicron)':   '215,175,62', # BA.4
                    '22B (Omicron)':   '224,162,58', # BA.5
                    '22C (Omicron)':   '230,146,55', # BA.2.12.1
                    '22D (Omicron)':   '230,123,50', # BA.2.75
                    '22E (Omicron)':   '227,97,45',  # BQ.1
                    '22F (Omicron)':   '223,69,40',  # XBB
                    '23A (Omicron)':   '219,40,35',  # XBB.1.5
                    # Grayscale for pre-VoC lineages
                    '19A':       '216,216,216',
                    '19B':       '209,209,209',
                    '20A':       '202,202,202',
                    '20B':       '195,195,195',
                    '20C':       '188,188,188',
                    '20D':       '181,181,181',
                    '20E (EU1)': '174,174,174',
                    '20F':       '167,167,167',
                    '20G':       '160,160,160',
          }
 
 def cladeColorFromName(cladeName, cladeColors):
     color = cladeColors.get(cladeName);
     if (not color):
         color = '0,0,0'
     return color
 
 def cladeFromVariants(name, variants, varStr):
     """Extract bed12 info from an object whose keys are SNV variant names"""
     clade = {}
     snvEnds = []
     # Watch out for back-mutations which make invalid BED because then we have multiple "blocks"
     # at the same position.  Instead, make a back-mutation cancel out the mutation because the
     # mutation is not found at this node.
     changesByPos = defaultdict(list)
     ixsToRemove = []
     for varName in variants:
         m = snvRe.match(varName)
         if (m):
             ref, pos, alt = m.groups()
             prevMut = changesByPos[pos]
             if (prevMut):
                 # If multi-allelic, leave the position in the list; but if back-mutation,
                 # remove the original mutation.  In either case, don't add this pos again.
                 prevIx, prevRef, prevAlt = prevMut
                 if (prevAlt == ref and prevRef == alt):
                     ixsToRemove.append(prevIx)
                     changesByPos[pos] = []
             else:
                 ix = len(snvEnds)
                 changesByPos[pos] = (ix, ref, alt)
                 snvEnds.append(int(pos))
         else:
             warn("cladeFromVariants: no match for SNV '%s'" % (varName))
     if ixsToRemove:
         ixsToRemove.sort(reverse=True)
         for ix in ixsToRemove:
             del snvEnds[ix]
     if snvEnds:
         snvEnds.sort()
         snvStarts = [ e-1 for e in snvEnds ]
         snvSizes = [ 1 for e in snvEnds ]
         clade['thickStart'] = min(snvStarts)
         clade['thickEnd'] = max(snvEnds)
         clade['name'] = name
         clade['varSizes'] = snvSizes
         clade['varStarts'] = snvStarts
         clade['varNames'] = varStr
     return clade
 
 def addDatesToClade(clade, numDateAttrs):
     """Add the numeric dates from ncov.json node_attrs.num_date to clade record"""
     if (numDateAttrs):
         clade['dateInferred'] = numDateAttrs['value']
         clade['dateConfMin'] = numDateAttrs['confidence'][0]
         clade['dateConfMax'] = numDateAttrs['confidence'][1]
     else:
         clade['dateInferred'] = clade['dateConfMin'] = clade['dateConfMax'] = ''
 
 def addCountryToClade(clade, countryAttrs):
     """Add country data from ncov.json node_attrs.country to clade"""
     clade['countryInferred'] = countryAttrs['value']
     conf = countryAttrs.get('confidence')
     clade['countryConf'] = ', '.join([ "%s: %0.5f" % (country, conf)
                                        for country, conf in conf.items()]) if conf else ''
 
 def processClade(branch, tag, branchVariants, branchVarStr, clades):
     """If this is the first time we've seen clade, add it to clades"""
     nodeAttrs = branch['node_attrs']
     if (nodeAttrs.get(tag)):
         cladeName = nodeAttrs[tag]['value']
         if (cladeName != 'unassigned' and not cladeName in clades):
             clades[cladeName] = cladeFromVariants(cladeName, branchVariants, branchVarStr)
             addDatesToClade(clades[cladeName], nodeAttrs.get('num_date'))
             if (nodeAttrs.get('country')):
                 addCountryToClade(clades[cladeName], nodeAttrs['country'])
             elif (nodeAttrs.get('division')):
                 addCountryToClade(clades[cladeName], nodeAttrs['division'])
             clades[cladeName]['topNode'] = branch
 
 def numDateToYmd(numDate):
     """Convert numeric date (decimal year) to integer year, month, day"""
     year = int(numDate)
     isLeapYear = 1 if (year % 4 == 0) else 0
     # Get rid of the year
     numDate -= year
     # Convert to Julian day
     daysInYear = 366 if isLeapYear else 365
     jDay = int(numDate * daysInYear) + 1
     if (jDay > 334 + isLeapYear):
         month, day = 11, (jDay - 334 - isLeapYear)
     elif (jDay > 304 + isLeapYear):
         month, day = 10, (jDay - 304 - isLeapYear)
     elif (jDay > 273 + isLeapYear):
         month, day = 9, (jDay - 273 - isLeapYear)
     elif (jDay > 243 + isLeapYear):
         month, day = 8, (jDay - 243 - isLeapYear)
     elif (jDay > 212 + isLeapYear):
         month, day = 7, (jDay - 212 - isLeapYear)
     elif (jDay > 181 + isLeapYear):
         month, day = 6, (jDay - 181 - isLeapYear)
     elif (jDay > 151 + isLeapYear):
         month, day = 5, (jDay - 151 - isLeapYear)
     elif (jDay > 120 + isLeapYear):
         month, day = 4, (jDay - 120 - isLeapYear)
     elif (jDay > 90 + isLeapYear):
         month, day = 3, (jDay - 90 - isLeapYear)
     elif (jDay > 59 + isLeapYear):
         month, day = 2, (jDay - 59 - isLeapYear)
     elif (jDay > 31):
         month, day = 1, (jDay - 31)
     else:
         month, day = 0, jDay
     return year, month, day
 
 months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'];
 
 def numDateToMonthDay(numDate):
     """Transform decimal year timestamp to string with only month and day"""
     if not numDate:
         return '?'
     year, month, day = numDateToYmd(numDate)
     return months[month] + str(day)
 
 def numDateToYmdStr(numDate):
     """Convert decimal year to YY-MM-DD string"""
     if numDate:
         year, month, day = numDateToYmd(numDate)
         return "%02d-%02d-%02d" % (year, month+1, day)
     else:
         return ''
 
 def rUnpackNextstrainTree(branch, parentVariants, parentVarStr):
     """Recursively descend ncov.tree and build data structures for genome browser tracks"""
     # Gather variants specific to this node/branch (if any)
     localVariants = []
     if (branch.get('branch_attrs') and branch['branch_attrs'].get('mutations') and
         branch['branch_attrs']['mutations'].get('nuc')):
         # Nucleotide variants specific to this branch
         for varName in branch['branch_attrs']['mutations']['nuc']:
             if (snvRe.match(varName)):
                 localVariants.append(varName)
                 if (not variantCounts.get(varName)):
                     variantCounts[varName] = 0;
             else:
                 warn("rUnpackNextstrainTree: no snvRe match for '%s'" % (varName))
         # Amino acid variants: figure out which nucleotide variant goes with each
         for geneName in branch['branch_attrs']['mutations'].keys():
             if (geneName != 'nuc'):
                 for change in branch['branch_attrs']['mutations'][geneName]:
                     # Get nucleotide coords & figure out which nuc var this aa change corresponds to
                     aaM = snvAaRe.match(change)
                     if (aaM):
                         aaRef, aaPos, aaAlt = aaM.groups()
                         varStartMin = (int(aaPos) - 1) * 3
                         if (genePos.get(geneName)):
                             cdsStart = genePos.get(geneName)
                             varStartMin += cdsStart
                             varStartMax = varStartMin + 2
                             for varName in localVariants:
                                 ref, pos, alt = snvRe.match(varName).groups()
                                 pos = int(pos) - 1
                                 if (pos >= varStartMin and pos <= varStartMax):
                                     variantAaChanges[varName] = geneName + ':' + change
                         else:
                             warn("Can't find start for gene " + geneName)
     # Inherit parent variants
     branchVariants = parentVariants.copy()
     # Add variants specific to this branch (if any)
     for varName in localVariants:
         branchVariants[varName] = 1
     # Make an ordered variant string as David requested: semicolons between nodes,
     # comma-separated within a node:
     branchVarStr = ''
     for varName in localVariants:
         if (len(branchVarStr)):
             branchVarStr += ', '
         branchVarStr += varName
         aaVar = variantAaChanges.get(varName)
         if (aaVar):
             branchVarStr += ' (' + aaVar + ')'
     if (len(parentVarStr) and len(branchVarStr)):
         branchVarStr = parentVarStr + '; ' + branchVarStr
     elif (not len(branchVarStr)):
         branchVarStr = parentVarStr
     processClade(branch, 'clade_membership', branchVariants, branchVarStr, newClades)
     kids = branch.get('children')
     if (kids):
         for child in kids:
             rUnpackNextstrainTree(child, branchVariants, branchVarStr);
     else:
         for varName in branchVariants:
             variantCounts[varName] += 1
         nodeAttrs = branch['node_attrs']
         if (nodeAttrs.get('submitting_lab')):
             lab = nodeAttrs['submitting_lab']['value']
         else:
             lab = ''
         epiNode = nodeAttrs.get('gisaid_epi_isl')
         epiId = epiNode['value'] if epiNode else branch['name']
         numDateNode = nodeAttrs.get('num_date')
         numDate = numDateNode['value'] if numDateNode else ''
         samples.append({ 'id': epiId,
                          'name': branch['name'],
                          'clade': nodeAttrs['clade_membership']['value'],
                          'date': numDateToMonthDay(numDate),
                          'lab': lab,
                          'variants': branchVariants,
                          'varStr': branchVarStr })
 
 rUnpackNextstrainTree(ncov['tree'], {}, '')
 
 def sampleName(sample):
     if sample['id'] != sample['name']:
         return '|'.join([sample['id'], sample['name'], sample['date']])
     else:
         return '|'.join([sample['name'], sample['date']])
 
 sampleCount = len(samples)
 sampleNames = [ sampleName(sample)  for sample in samples ]
 
 # Parse variant names like 'G11083T' into pos and alleles; bundle in VCF column order
 parsedVars = []
 for varName in variantCounts.keys():
     m = snvRe.match(varName)
     if (m):
         ref, pos, alt = m.groups()
         parsedVars.append([int(pos), varName, ref, alt])
     else:
         warn("Can't match " + varName)
 # Sort by position
 def parsedVarPos(pv):
     return pv[0]
 parsedVars.sort(key=parsedVarPos)
 
 def parsedVarAlleleCount(pv):
     if not variantCounts.get(pv[1]):
         print(f'No variantCount for pv {pv}')
         exit(1)
     return variantCounts[pv[1]]
 
 def mergeVariants(pvList):
     """Given a list of parsedVars [pos, varName, ref, alt] at the same pos, resolve the actual
     allele at each sample, handling back-mutations and serial mutations."""
     # Sort by descending allele count, assuming that the ref allele of the most frequently
     # observed variant is the true ref allele.  For back-muts, ref and alt are swapped;
     # for serial muts, alt of the first is ref of the second.
     pvList.sort(key=parsedVarAlleleCount)
     pvList.reverse()
     pos, varName, trueRef, firstAlt = pvList[0]
     alts = []
     sampleAlleles = [ 0 for sample in samples ]
     backMutSamples = []
     for pv in pvList:
         thisPos, thisName, thisRef, thisAlt = pv
         count = variantCounts[thisName]
         mergedName = trueRef + str(pos) + thisAlt
         if (thisPos != pos):
             warn("mergeVariants: inconsistent pos " + pos + " and " + thisPos)
         if (thisAlt == trueRef):
             # Back-mutation, not a true alt
             alIx = 0
         elif (thisAlt == '-'):
             # Deletion but we'll pretend it's no-call for VCF
             alIx = -1
         else:
             # Add to list of alts - unless it's an alt we've already seen, but from a different
             # serial mutation.  For example, there might be T>A but also T>G+G>A; don't add A twice.
             if (not thisAlt in alts):
                 alts.append(thisAlt)
                 if (thisName != varName):
                     if varName == trueRef + str(pos) + '-':
                         varName = mergedName
                     else:
                         varName += "," + mergedName
             alIx = alts.index(thisAlt) + 1
         for ix, sample in enumerate(samples):
             if (sample['variants'].get(thisName)):
                 sampleAlleles[ix] = alIx
                 if (alIx == 0 and thisRef != '-'):
                     backMutSamples.append(sampleName(sample))
         if thisRef != trueRef:
             # If this is a new merged mutation, add it to variantCounts.  If merging makes an
             # already-seen mutation then add to the existing count.
             if variantCounts.get(mergedName):
                 variantCounts[mergedName] += count
             else:
                 variantCounts[mergedName] = count
     # After handling back- and serial mutations, figure out true counts of each alt allele:
     altCounts = [ 0 for alt in alts ]
     for alIx in sampleAlleles:
         if (alIx > 0):
             altCounts[alIx - 1] += 1
     return [ [pos, varName, trueRef, ','.join(alts)],
              alts, altCounts, sampleAlleles, backMutSamples ]
 
 mergedVars = []
 
 variantsAtPos = []
 for pv in parsedVars:
     pos = pv[0]
     if (len(variantsAtPos) == 0 or pos == variantsAtPos[0][0]):
         variantsAtPos.append(pv)
     else:
         mv = mergeVariants(variantsAtPos)
         if (mv[1]):
             mergedVars.append(mv)
         variantsAtPos = [pv]
 mv = mergeVariants(variantsAtPos)
 if (mv[1]):
     mergedVars.append(mv)
 
 def writeVcfHeaderExceptSamples(outF):
     """Write VCF header lines -- except for sample names (this ends with a \t not a \n)."""
     outF.write('##fileformat=VCFv4.3\n')
     outF.write('##source=nextstrain.org\n')
     outF.write('\t'.join(['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT']) +
                '\t');
 
 def tallyAaChanges(varNameMerged):
     """If any of the merged variants cause a coding change, then produce a comma-sep string in same order as variants with corresponding change(s) or '-' if a variant does not cause a coding change.  If none of the variants cause a coding change, return the empty string."""
     varNames = varNameMerged.split(',')
     gotAaChange = 0
     aaChanges = []
     for varName in varNames:
         aaChange = variantAaChanges.get(varName)
         if (aaChange):
             gotAaChange = 1
             aaChanges.append(aaChange)
         else:
             aaChanges.append('-');
     if (gotAaChange):
         aaChangeStr = ','.join(aaChanges)
     else:
         aaChangeStr = ''
     return aaChangeStr
 
 # VCF
 def vcfForAll(fileName, cladeTag):
     with open(fileName, 'w') as outC:
         writeVcfHeaderExceptSamples(outC)
         outC.write('\t'.join(sampleNames) + '\n')
         for mv in mergedVars:
             pv, alts, altCounts, sampleAlleles, backMutSamples = mv
             info = 'AC=' + ','.join(map(str, altCounts)) + ';AN=' + str(sampleCount)
             varNameMerged = pv[1]
             aaChange = tallyAaChanges(varNameMerged)
             if (len(aaChange)):
                 info += ';AACHANGE=' + aaChange
             if (len(backMutSamples)):
                 info += ';BACKMUTS=' + ','.join(backMutSamples)
             genotypes = []
             for sample, alIx in zip(samples, sampleAlleles):
                 gt = '.' if alIx < 0 else str(alIx)
                 genotypes.append(gt + ':' + sample[cladeTag])
             outC.write('\t'.join([ chrom,
                                    '\t'.join(map(str, pv)),
                                    '\t'.join(['.', 'PASS', info, 'GT:CLADE']),
                                    '\t'.join(genotypes) ]) + '\n')
 
 vcfForAll('nextstrainSamples.vcf', 'clade')
 # I'll skip writing an enormous file with wasteful clades-in-genotypes... really need
 # a sample metadata file!
 
 # Assign samples to clades; a sample can appear in multiple clades if they are nested.
 
 def sampleIdsFromNode(node, cladeTops=()):
     """Return a list of IDs of all samples found under node."""
     kids = node.get('children')
     if (kids):
         sampleIds = []
         for kid in kids:
             if (kid not in cladeTops):
                 sampleIds += sampleIdsFromNode(kid, cladeTops)
     else:
         epiNode = node['node_attrs'].get('gisaid_epi_isl')
         sampleId = epiNode['value'] if epiNode else node['name']
         sampleIds = [sampleId]
     return sampleIds
 
 def sanitizeFileName(filename):
     """Remove or replace characters that cause trouble in filenames"""
     filename = filename.replace('/', '_').replace(' ', '_')
     filename = filename.replace('(', '').replace(')', '')
     filename = filename.replace(',', '');
     return filename
 
 cladeSampleCounts = {}
 cladeSampleNames = {}
 
 def vcfForClades(clades, cladeTops=()):
     """Given a set of clades, dump out VCF for each clade.
     Stop at nodes in cladeTops (for Nextstrain's new clade scheme where 19A is root, 20A fully
     contains 20B and 20C, etc."""
     for cladeName in clades:
         node = clades[cladeName]['topNode']
         cladeSampleIds = set(sampleIdsFromNode(node, cladeTops))
         cladeSampleList = [ sample for sample in samples if sample['id'] in cladeSampleIds ]
         cladeSampleCounts[cladeName] = len(cladeSampleList)
         cladeSampleNames[cladeName] = [ sampleName(sample) for sample in cladeSampleList ]
         sanitizedCladeName = sanitizeFileName(cladeName)
         with open('nextstrainSamples' + sanitizedCladeName + '.vcf', 'w') as outV:
             writeVcfHeaderExceptSamples(outV)
             outV.write('\t'.join(cladeSampleNames[cladeName]) + '\n')
             for mv in mergedVars:
                 pv, alts, overallAltCounts, sampleAlleles, backMutSamples = mv
                 varNameMerged = pv[1]
                 genotypes = []
                 altCounts = [ 0 for alt in alts ]
                 acTotal=0
                 for sample, alIx in zip(samples, sampleAlleles):
                     if (sample['id'] in cladeSampleIds):
                         gt = '.' if (alIx < 0) else str(alIx)
                         genotypes.append(gt + ':' + cladeName)
                         if (alIx > 0):
                             altCounts[alIx - 1] += 1
                             acTotal += 1
                 if (acTotal > 0):
                     info = 'AC=' + ','.join(map(str, altCounts))
                     info += ';AN=' + str(cladeSampleCounts[cladeName])
                     aaChange = tallyAaChanges(varNameMerged)
                     if (len(aaChange)):
                         info += ';AACHANGE=' + aaChange
                     cladeBackMuts = [ sampleName for sampleName in backMutSamples
                                       if sampleName in cladeSampleNames[cladeName] ]
                     if (len(cladeBackMuts)):
                         info += ';BACKMUTS=' + ','.join(cladeBackMuts)
                     outV.write('\t'.join([ chrom,
                                            '\t'.join(map(str, pv)),
                                            '\t'.join(['.', 'PASS', info, 'GT:CLADE']),
                                            '\t'.join(genotypes) ]) + '\n')
 
 def bedForClades(fileName, clades, cladeColors):
     """Make a BED file summarizing each clade"""
     with open(fileName, 'w') as outC:
         for name, clade in clades.items():
             if (not clade.get('thickStart')):
                 # "Clade" 19A encompasses the entire tree (minus the parts assigned to
                 # other "clades").  It has no identifying variants, and (as of June 7)
                 # no dates assigned.
                 clade['thickStart'] = clade['thickEnd'] = 0
                 clade['varStarts'] = clade['varSizes'] = []
                 clade['varNames'] = ''
                 clade['dateInferred'] = clade['dateConfMin'] = clade['dateConfMax'] = 0
             countryConf = clade.get('countryConf')
             if (not countryConf):
                 countryConf = ''
             countryInferred = clade.get('countryInferred')
             if (not countryInferred):
                 countryInferred = ''
+            # Add placeholder blocks at first and last base of genome, but don't duplicate
+            varStarts = clade['varStarts']
+            if len(varStarts) == 0 or varStarts[0] != 0:
+                varStarts = [0] + varStarts
+            if varStarts[-1] != 29902:
+                varStarts = varStarts + [29902]
             outC.write('\t'.join(map(str,
                                      [ chrom, 0, 29903, name, 0, '.',
                                        clade['thickStart'], clade['thickEnd'],
                                        cladeColorFromName(name, cladeColors),
-                                       len(clade['varSizes']) + 2,
-                                       ','.join(map(str, ([1] + clade['varSizes']) + [1])),
-                                       ','.join(map(str, ([0] + clade['varStarts']) + [29902])),
+                                       len(varStarts),
+                                       ','.join(map(str, [1 for x in varStarts])),
+                                       ','.join(map(str, varStarts)),
                                        clade['varNames'],
                                        numDateToYmdStr(clade['dateInferred']),
                                        numDateToYmdStr(clade['dateConfMin']),
                                        numDateToYmdStr(clade['dateConfMax']),
                                        countryInferred,
                                        countryConf,
                                        cladeSampleCounts[name],
                                        ', '.join(cladeSampleNames[name]) ])) + '\n')
 
 newCladeTops = [ newClades[cladeName]['topNode'] for cladeName in newClades ]
 vcfForClades(newClades, newCladeTops)
 bedForClades('nextstrainClade.bed', newClades, newCladeColors)
 
 # Newick-formatted tree of samples for VCF display
 def cladeRgbFromName(cladeName, cladeColors):
     """Look up the r,g,b string color for clade; convert to int RGB."""
     rgbCommaStr = cladeColorFromName(cladeName, cladeColors)
     r, g, b = [ int(x) for x in rgbCommaStr.split(',') ]
     rgb = (r << 16) | (g << 8) | b
     return rgb
 
 def rNextstrainToNewick(node, cladeColors, cladeTops=(), parentClade=None, parentVarStr=''):
     """Recursively descend ncov.tree and build Newick tree string of samples to file.
     Exclude nodes in cladeTops."""
     kids = node.get('children')
     if (kids):
         # Make a more concise variant path string than the one we make for the clade track,
         # to embed in internal node labels for Yatish's tree explorations.
         localVariants = []
         if (node.get('branch_attrs') and node['branch_attrs'].get('mutations') and
             node['branch_attrs']['mutations'].get('nuc')):
             # Nucleotide variants specific to this branch
             for varName in node['branch_attrs']['mutations']['nuc']:
                 if (snvRe.match(varName)):
                     localVariants.append(varName)
         varStr = '+'.join(localVariants)
         if (len(parentVarStr) and len(varStr)):
             varStr = '$'.join([parentVarStr, varStr])
         elif (not len(varStr)):
             varStr = parentVarStr
         nodeAttrs = node['node_attrs']
         if (nodeAttrs.get('clade_membership')):
             cladeName = nodeAttrs['clade_membership']['value']
         elif (parentClade):
             cladeName = parentClade
         else:
             cladeName = 'unassigned'
         color = str(cladeRgbFromName(cladeName, cladeColors))
         cladeShortened = cladeName.split(' ', 1)[0]
         cladeShortened = cladeShortened.split('/', 1)[0]
         descendants = ','.join([ rNextstrainToNewick(child, cladeColors, cladeTops, cladeName,
                                                      varStr)
                                  for child in kids if child not in cladeTops ])
         treeString = '(' + descendants + '):' + color
     else:
         nodeAttrs = node['node_attrs']
         epiNode = nodeAttrs.get('gisaid_epi_isl')
         gId = epiNode['value'] if epiNode else node['name']
         name = node['name']
         numDateNode = nodeAttrs.get('num_date')
         date = numDateToMonthDay(numDateNode['value'] if numDateNode else '')
         cladeName = nodeAttrs['clade_membership']['value']
         color = str(cladeRgbFromName(cladeName, cladeColors))
         treeString = sampleName({ 'id': gId, 'name': name, 'date': date }) + ':' + color
     return treeString
 
 with open('nextstrain.nh', 'w') as outF:
     outF.write(rNextstrainToNewick(ncov['tree'], newCladeColors) + ';\n')
 
 
 def newickForClades(clades, cladeColors, cladeTops=()):
     for cladeName in clades:
         filename = 'nextstrain' + sanitizeFileName(cladeName) + '.nh'
         node = clades[cladeName]['topNode']
         with open(filename, 'w') as outF:
             outF.write(rNextstrainToNewick(node, cladeColors, cladeTops) + ';\n')
 
 newickForClades(newClades, newCladeColors, newCladeTops)
 
 # File with samples and their clades, labs and variant paths
 
 apostropheSRe = re.compile("'s");
 firstLetterRe = re.compile('(\w)\w+');
 spacePunctRe = re.compile('\W');
 
 def abbreviateLab(lab):
     """Lab names are very long and sometimes differ by punctuation or typos.  Abbreviate for easier comparison."""
     labAbbrev = apostropheSRe.sub('', lab)
     labAbbrev = firstLetterRe.sub(r'\1', labAbbrev, count=0)
     labAbbrev = spacePunctRe.sub('', labAbbrev, count=0)
     return labAbbrev
 
 with open('nextstrainSamples.varPaths', 'w') as outF:
     errCount = 0
     for sample in samples:
         lab = sample['lab']
         labAbbrev = abbreviateLab(lab)
         try:
             outF.write('\t'.join([sampleName(sample), sample['clade'], labAbbrev, lab,
                                   sample['varStr']]) + '\n');
         except:
             if (errCount == 0):
                 print("Problem writing varPaths for sample '", sampleName(sample), "', varStr '",
                       sample['varStr'], "'")
             errCount += 1
 
 # Narrow down variants to "informative" set (bi-allelic, each allele supported by
 # sufficient number of samples):
 minSamples = 2
 discardedAlleles = []
 blacklist = []
 informativeVariants = []
 
 for mv in mergedVars:
     pv, alts, altCounts, sampleAlleles, backMutSamples = mv
     pos, varNameMerged, ref, altStr = pv
     recurrentAlts = []
     recurrentAltCounts = []
     for alt, altCount in zip(alts, altCounts):
         if (altCount < minSamples):
             discardedAlleles.append([ chrom, pos-1, pos, ref + str(pos) + alt ])
         else:
             recurrentAlts.append(alt)
             recurrentAltCounts.append(altCount)
     if (len(recurrentAlts) > 1):
         multiRecurrentName = ','.join([ ref + str(pos) + alt for alt in recurrentAlts ])
         blacklist.append([ chrom, pos-1, pos, multiRecurrentName ])
     elif (len(recurrentAlts) == 1):
         informativeVariants.append([ pos, ref, recurrentAlts[0] ])
 
 # Dump out BED files for the categories:
 with open('nextstrainDiscarded.bed', 'w') as outF:
     for da in discardedAlleles:
         outF.write('\t'.join(map(str, da)) + '\n');
 
 with open('nextstrainBlacklisted.bed', 'w') as outF:
     for bl in blacklist:
         outF.write('\t'.join(map(str, bl)) + '\n');
 
 with open('nextstrainInformative.bed', 'w') as outF:
     for iv in informativeVariants:
         pos, ref, alt = iv
         outF.write('\t'.join(map(str, [ chrom, pos-1, pos, ref + str(pos) + alt ])) + '\n')
 
 # Compute parsimony score for tree at each informative variant.
 
 anyDiscordantVariants = []
 
 def refAltCosts(node, pos, ref, alt, parentValue):
     nodeValue = parentValue
     mutCount = 0
     # If there are mutations on this branch, and one of them is at pos and is ref or alt
     # (i.e. not a discarded allele that we're ignoring), then change nodeValue.
     if (node.get('branch_attrs') and node['branch_attrs'].get('mutations') and
         node['branch_attrs']['mutations'].get('nuc')):
         for varName in node['branch_attrs']['mutations']['nuc']:
             m = snvRe.match(varName)
             if (m):
                 mRef, mPos, mAlt = m.groups()
                 mPos = int(mPos)
                 if (mPos == pos and (mAlt == ref or mAlt == alt)):
                     nodeValue = mAlt
                     mutCount += 1
             else:
                 warn("refAltCosts: no snvRe match for '%s'" % (varName))
     kids = node.get('children')
     if (kids):
         refCost, altCost = [0, 0]
         for kid in kids:
             kidRefCost, kidAltCost, kidMutCount = refAltCosts(kid, pos, ref, alt, nodeValue)
             refCost += min(kidRefCost, kidAltCost+1)
             altCost += min(kidRefCost+1, kidAltCost)
             mutCount += kidMutCount
     else:
         refCost = 0 if (nodeValue == ref) else 1
         altCost = 1 if (nodeValue == ref) else 0
     if ((refCost < altCost and nodeValue == alt) or
         (altCost < refCost and nodeValue == ref)):
         anyDiscordantVariants.append('\t'.join([ref + str(pos) + alt, node['name']]))
     return refCost, altCost, mutCount
 
 parsimonyScores = []
 mutationCounts = []
 rootDiscordantVariants = []
 
 for iv in informativeVariants:
     pos, ref, alt = iv
     varName = ref + str(pos) + alt
     rootRefCost, rootAltCost, mutCount = refAltCosts(ncov['tree'], pos, ref, alt, ref)
     parsimonyScores.append([pos, min(rootRefCost, rootAltCost)])
     mutationCounts.append([pos, mutCount])
     rootLabel = 0 if (rootRefCost <= rootAltCost) else 1
     if (rootLabel != 0):
         # Note: so far this has not happened.  Seems like it would be pretty lame if it did.
         rootDiscordantVariants.append([chrom, pos-1, pos, varName])
 
 # Write out files with discordant nodes, parsimony/mutation counts (identical!):
 with open('nextstrainRootDiscordant.bed', 'w') as outF:
     for dv in rootDiscordantVariants:
         outF.write('\t'.join(dv) + '\n');
 
 with open('nextstrainAnyDiscordant.txt', 'w') as outF:
     for varName in anyDiscordantVariants:
         outF.write(varName + '\n');
 
 # bedGraph for parsimony scores and mutation counts
 with open('nextstrainParsimony.bedGraph', 'w') as outF:
     for ps in parsimonyScores:
         pos, score = ps
         outF.write('\t'.join(map(str, [ chrom, pos-1, pos, score ])) + '\n')
 
 with open('nextstrainMutCounts.bedGraph', 'w') as outF:
     for ps in mutationCounts:
         pos, score = ps
         outF.write('\t'.join(map(str, [ chrom, pos-1, pos, score ])) + '\n')
 
 # Informative-only VCF
 with open('nextstrainRecurrentBiallelic.vcf', 'w') as outF:
     writeVcfHeaderExceptSamples(outF)
     outF.write('\t'.join(sampleNames) + '\n')
     for iv in informativeVariants:
         pos, ref, alt = iv
         varName = ref + str(pos) + alt
         # Ignore any discarded variants at this position, but handle back-mutations if any,
         # by calling mergeVariants on this and its back-mutation
         #*** This doesn't work when iv has already been merged -- samples have the original pv,
         #*** not the merged iv, so altCounts end up being 0 for those.  But nobody uses this anyway,
         #*** should probably get rid of it...
         backMutVarName = alt + str(pos) + ref
         variants = [ [ pos, varName, ref, alt ] ]
         if (variantCounts.get(backMutVarName)):
             variants.append([ pos, backMutVarName, alt, ref ])
         pv, alts, altCounts, sampleAlleles, backMutSamples = mergeVariants(variants)
         if (len(alts) != 1):
             warn('Expected exactly one alt from merging ' + varName + ' and ' + backMutVarName +
                  ', but got [' + ', '.join(alts) + ']')
         info = 'AC=' + str(altCounts[0])
         info += ';AN=' + str(sampleCount)
         aaChange = tallyAaChanges(varName)
         if (len(aaChange)):
             info += ';AACHANGE=' + aaChange
         if (len(backMutSamples)):
             info += ';BACKMUTS=' + ','.join(backMutSamples)
         genotypes = []
         for sample, alIx in zip(samples, sampleAlleles):
             gt = '.' if (alIx < 0) else str(alIx)
             genotypes.append(gt + ':' + sample['clade'])
         outF.write('\t'.join([ '\t'.join([ chrom, str(pos), varName, ref, alt,
                                            '.', 'PASS', info, 'GT:CLADE']),
                                '\t'.join(genotypes) ]) + '\n')