src/hg/utils/otto/nextstrainNcov/nextstrain.py 1af2fddea7e6ede49cc2b334a6c8e78df998c9fa

1af2fddea7e6ede49cc2b334a6c8e78df998c9fa
angie
  Mon Jun 8 16:15:28 2020 -0700
Nextstrain changed to a new set of clade definitions, but they still keep the old clade assignments around in a different tag.  Make subtracks for both old and new sets of clades.  refs #25188

diff --git src/hg/utils/otto/nextstrainNcov/nextstrain.py src/hg/utils/otto/nextstrainNcov/nextstrain.py
index 37b0be1..710773c 100755
--- src/hg/utils/otto/nextstrainNcov/nextstrain.py
+++ src/hg/utils/otto/nextstrainNcov/nextstrain.py
@@ -1,646 +1,711 @@
 #!/usr/bin/env python3
 
 import json
 import re
 from warnings import warn
 from collections import defaultdict
 
 with open('ncov.json') as f:
     ncov = json.load(f)
 
 chrom = 'NC_045512v2'
 
 # Genes (to which AA mutations are relative):
 genePos = {}
 geneBed = []
 for gene,anno in ncov['meta']['genome_annotations'].items():
     if (gene != 'nuc'):
         genePos[gene] = anno['start'] - 1
         geneBed.append([chrom, anno['start']-1, anno['end'], gene])
 def bedStart(bed):
     return bed[1]
 geneBed.sort(key=bedStart)
 with open('nextstrainGene.bed', 'w') as outG:
     for bed in geneBed:
         outG.write('\t'.join(map(str, bed)) + '\n')
 
 # Variants and "clades"
 
 snvRe = re.compile('^([ACGT])([0-9]+)([ACGT])$')
 snvAaRe = re.compile('^([A-Z*])([0-9]+)([A-Z*])$')
 
-clades = {}
-cladeNodes = {}
+newClades = {}
+oldClades = {}
 variantCounts = {}
 variantAaChanges = {}
 samples = []
 
-cladeColors = { 'A1a': '73,75,225', 'A2': '75,131,233', 'A2a': '92,173,207',
+# Clades from March 15th, 2020 to early morning of June 2nd, 2020:
+oldCladeColors = { 'A1a': '73,75,225', 'A2': '75,131,233', 'A2a': '92,173,207',
                    'A3': '119,199,164', 'A6': '154,212,122', 'A7': '173,189,81',
                    'B': '233,205,74', 'B1': '255,176,65', 'B2': '255,122,53',
                    'B4': '249,53,41' }
 
-def cladeColorFromName(cladeName):
+# Clades from late morning of June 2nd, 2020:
+newCladeColors = { '19A': '76,135,232',
+                   '19B': '110,194,178',
+                   '20A': '168,214,110',
+                   '20B': '232,206,75',
+                   '20C': '255,146,58' }
+
+def cladeColorFromName(cladeName, cladeColors):
     color = cladeColors.get(cladeName);
     if (not color):
         color = '0,0,0'
     return color
 
 def cladeFromVariants(name, variants, varStr):
     """Extract bed12 info from an object whose keys are SNV variant names"""
     clade = {}
     snvEnds = []
     # Watch out for back-mutations which make invalid BED because then we have multiple "blocks"
     # at the same position.  Instead, make a back-mutation cancel out the mutation because the
     # mutation is not found at this node.
     changesByPos = defaultdict(list)
     ixsToRemove = []
     for varName in variants:
         m = snvRe.match(varName)
         if (m):
             ref, pos, alt = m.groups()
             prevMut = changesByPos[pos]
             if (prevMut):
                 # If multi-allelic, leave the position in the list; but if back-mutation,
                 # remove the original mutation.  In either case, don't add this pos again.
                 prevIx, prevRef, prevAlt = prevMut
                 if (prevAlt == ref and prevRef == alt):
                     ixsToRemove.append(prevIx)
                     changesByPos[pos] = []
             else:
                 ix = len(snvEnds)
                 changesByPos[pos] = (ix, ref, alt)
                 snvEnds.append(int(pos))
     if ixsToRemove:
         ixsToRemove.sort(reverse=True)
         for ix in ixsToRemove:
             del snvEnds[ix]
     if snvEnds:
         snvEnds.sort()
         snvStarts = [ e-1 for e in snvEnds ]
         snvSizes = [ 1 for e in snvEnds ]
         clade['thickStart'] = min(snvStarts)
         clade['thickEnd'] = max(snvEnds)
         clade['name'] = name
-        clade['color'] = cladeColorFromName(name)
         clade['varSizes'] = snvSizes
         clade['varStarts'] = snvStarts
         clade['varNames'] = varStr
     return clade
 
 def addDatesToClade(clade, numDateAttrs):
     """Add the numeric dates from ncov.json node_attrs.num_date to clade record"""
     clade['dateInferred'] = numDateAttrs['value']
     clade['dateConfMin'] = numDateAttrs['confidence'][0]
     clade['dateConfMax'] = numDateAttrs['confidence'][1]
 
 def addCountryToClade(clade, countryAttrs):
     """Add country data from ncov.json node_attrs.country to clade"""
     clade['countryInferred'] = countryAttrs['value']
-    confString = ''
-    for country, conf in countryAttrs['confidence'].items():
-        if (len(confString)):
-            confString += ', '
-        confString += "%s: %0.5f" % (country, conf)
-    clade['countryConf'] = confString
+    conf = countryAttrs.get('confidence')
+    clade['countryConf'] = ', '.join([ "%s: %0.5f" % (country, conf)
+                                       for country, conf in conf.items()]) if conf else ''
+
+def processClade(branch, tag, branchVariants, branchVarStr, clades):
+    """If this is the first time we've seen (old or new) clade, add it to clades"""
+    nodeAttrs = branch['node_attrs']
+    if (nodeAttrs.get(tag)):
+        cladeName = nodeAttrs[tag]['value']
+        if (cladeName != 'unassigned' and not cladeName in clades):
+            clades[cladeName] = cladeFromVariants(cladeName, branchVariants, branchVarStr)
+            addDatesToClade(clades[cladeName], nodeAttrs['num_date'])
+            if (nodeAttrs.get('country')):
+                addCountryToClade(clades[cladeName], nodeAttrs['country'])
+            elif (nodeAttrs.get('division')):
+                addCountryToClade(clades[cladeName], nodeAttrs['division'])
+            clades[cladeName]['topNode'] = branch
 
 def numDateToYmd(numDate):
     """Convert numeric date (decimal year) to integer year, month, day"""
     year = int(numDate)
     isLeapYear = 1 if (year % 4 == 0) else 0
     # Get rid of the year
     numDate -= year
     # Convert to Julian day
     daysInYear = 366 if isLeapYear else 365
     jDay = int(numDate * daysInYear) + 1
     if (jDay > 334 + isLeapYear):
         month, day = 11, (jDay - 334 - isLeapYear)
     elif (jDay > 304 + isLeapYear):
         month, day = 10, (jDay - 304 - isLeapYear)
     elif (jDay > 273 + isLeapYear):
         month, day = 9, (jDay - 273 - isLeapYear)
     elif (jDay > 243 + isLeapYear):
         month, day = 8, (jDay - 243 - isLeapYear)
     elif (jDay > 212 + isLeapYear):
         month, day = 7, (jDay - 212 - isLeapYear)
     elif (jDay > 181 + isLeapYear):
         month, day = 6, (jDay - 181 - isLeapYear)
     elif (jDay > 151 + isLeapYear):
         month, day = 5, (jDay - 151 - isLeapYear)
     elif (jDay > 120 + isLeapYear):
         month, day = 4, (jDay - 120 - isLeapYear)
     elif (jDay > 90 + isLeapYear):
         month, day = 3, (jDay - 90 - isLeapYear)
     elif (jDay > 59 + isLeapYear):
         month, day = 2, (jDay - 59 - isLeapYear)
     elif (jDay > 31):
         month, day = 1, (jDay - 31)
     else:
         month, day = 0, jDay
     return year, month, day
 
 months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'];
 
 def numDateToMonthDay(numDate):
     """Transform decimal year timestamp to string with only month and day"""
     year, month, day = numDateToYmd(numDate)
     return months[month] + str(day)
 
 def numDateToYmdStr(numDate):
     """Convert decimal year to YY-MM-DD string"""
+    if numDate:
         year, month, day = numDateToYmd(numDate)
         return "%02d-%02d-%02d" % (year, month+1, day)
+    else:
+        return ''
 
 def rUnpackNextstrainTree(branch, parentVariants, parentVarStr):
     """Recursively descend ncov.tree and build data structures for genome browser tracks"""
     # Gather variants specific to this node/branch (if any)
     localVariants = []
     if (branch.get('branch_attrs') and branch['branch_attrs'].get('mutations') and
         branch['branch_attrs']['mutations'].get('nuc')):
         # Nucleotide variants specific to this branch
         for varName in branch['branch_attrs']['mutations']['nuc']:
             if (snvRe.match(varName)):
                 localVariants.append(varName)
                 if (not variantCounts.get(varName)):
                     variantCounts[varName] = 0;
         # Amino acid variants: figure out which nucleotide variant goes with each
         for geneName in branch['branch_attrs']['mutations'].keys():
             if (geneName != 'nuc'):
                 for change in branch['branch_attrs']['mutations'][geneName]:
                     # Get nucleotide coords & figure out which nuc var this aa change corresponds to
                     aaM = snvAaRe.match(change)
                     if (aaM):
                         aaRef, aaPos, aaAlt = aaM.groups()
                         varStartMin = (int(aaPos) - 1) * 3
                         if (genePos.get(geneName)):
                             cdsStart = genePos.get(geneName)
                             varStartMin += cdsStart
                             varStartMax = varStartMin + 2
                             for varName in localVariants:
                                 ref, pos, alt = snvRe.match(varName).groups()
                                 pos = int(pos) - 1
                                 if (pos >= varStartMin and pos <= varStartMax):
                                     variantAaChanges[varName] = geneName + ':' + change
                         else:
                             warn("Can't find start for gene " + geneName)
     # Inherit parent variants
     branchVariants = parentVariants.copy()
     # Add variants specific to this branch (if any)
     for varName in localVariants:
         branchVariants[varName] = 1
     # Make an ordered variant string as David requested: semicolons between nodes,
     # comma-separated within a node:
     branchVarStr = ''
     for varName in localVariants:
         if (len(branchVarStr)):
             branchVarStr += ', '
         branchVarStr += varName
         aaVar = variantAaChanges.get(varName)
         if (aaVar):
             branchVarStr += ' (' + aaVar + ')'
     if (len(parentVarStr) and len(branchVarStr)):
         branchVarStr = parentVarStr + '; ' + branchVarStr
     elif (not len(branchVarStr)):
         branchVarStr = parentVarStr
-    nodeAttrs = branch['node_attrs']
-    if (nodeAttrs.get('clade_membership')):
-        cladeName = nodeAttrs['clade_membership']['value']
-        if (cladeName != 'unassigned' and not cladeName in clades):
-            clades[cladeName] = cladeFromVariants(cladeName, branchVariants, branchVarStr)
-            addDatesToClade(clades[cladeName], nodeAttrs['num_date'])
-            if (nodeAttrs.get('country')):
-                addCountryToClade(clades[cladeName], nodeAttrs['country'])
-            elif (nodeAttrs.get('division')):
-                addCountryToClade(clades[cladeName], nodeAttrs['division'])
-            else:
-                warn('No country or division for new clade ' + cladeName)
-            cladeNodes[cladeName] = branch
+    processClade(branch, 'clade_membership', branchVariants, branchVarStr, newClades)
+    processClade(branch, 'legacy_clade_membership', branchVariants, branchVarStr, oldClades)
     kids = branch.get('children')
     if (kids):
         for child in kids:
             rUnpackNextstrainTree(child, branchVariants, branchVarStr);
     else:
         for varName in branchVariants:
             variantCounts[varName] += 1
+        nodeAttrs = branch['node_attrs']
         if (nodeAttrs.get('submitting_lab')):
             lab = nodeAttrs['submitting_lab']['value']
         else:
             lab = ''
+        if (nodeAttrs.get('legacy_clade_membership')):
+            oldClade = nodeAttrs['legacy_clade_membership']['value']
+        else:
+            oldClade = ''
         samples.append({ 'id': nodeAttrs['gisaid_epi_isl']['value'],
                          'name': branch['name'],
                          'clade': nodeAttrs['clade_membership']['value'],
+                         'oldClade': oldClade,
                          'date': numDateToMonthDay(nodeAttrs['num_date']['value']),
                          'lab': lab,
                          'variants': branchVariants,
                          'varStr': branchVarStr })
 
 rUnpackNextstrainTree(ncov['tree'], {}, '')
 
 def sampleName(sample):
     return '|'.join([sample['id'], sample['name'], sample['date']])
 
 sampleCount = len(samples)
 sampleNames = [ sampleName(sample)  for sample in samples ]
 
 # Parse variant names like 'G11083T' into pos and alleles; bundle in VCF column order
 parsedVars = []
 for varName in variantCounts.keys():
     m = snvRe.match(varName)
     if (m):
         ref, pos, alt = m.groups()
         parsedVars.append([int(pos), varName, ref, alt])
     else:
         warn("Can't match " + varName)
 # Sort by position
 def parsedVarPos(pv):
     return pv[0]
 parsedVars.sort(key=parsedVarPos)
 
 def parsedVarAlleleCount(pv):
     return variantCounts[pv[1]]
 
 def mergeVariants(pvList):
     """Given a list of parsedVars [pos, varName, ref, alt] at the same pos, resolve the actual allele at each sample, handling back-mutations and serial mutations."""
     # Sort by descending allele count, assuming that the ref allele of the most frequently
     # observed variant is the true ref allele.  For back-muts, ref and alt are swapped;
     # for serial muts, alt of the first is ref of the second.
     pvList.sort(key=parsedVarAlleleCount)
     pvList.reverse()
     pos, varName, trueRef, firstAlt = pvList[0]
     alts = []
     sampleAlleles = [ 0 for sample in samples ]
     backMutSamples = []
     for pv in pvList:
         thisPos, thisName, thisRef, thisAlt = pv
         if (thisPos != pos):
             warn("mergeVariants: inconsistent pos " + pos + " and " + thisPos)
         if (thisAlt == trueRef):
             # Back-mutation, not a true alt
             alIx = 0
         else:
             # Add to list of alts - unless it's an alt we've already seen, but from a different
             # serial mutation.  For example, there might be T>A but also T>G+G>A; don't add A twice.
             if (not thisAlt in alts):
                 alts.append(thisAlt)
                 if (thisName != varName):
                     varName += "," + thisName
             alIx = alts.index(thisAlt) + 1
         for ix, sample in enumerate(samples):
             if (sample['variants'].get(thisName)):
                 sampleAlleles[ix] = alIx
                 if (alIx == 0):
                     backMutSamples.append(sampleName(sample))
     # After handling back- and serial mutations, figure out true counts of each alt allele:
     altCounts = [ 0 for alt in alts ]
     for alIx in sampleAlleles:
         if (alIx > 0):
             altCounts[alIx - 1] += 1
     return [ [pos, varName, trueRef, ','.join(alts)],
              alts, altCounts, sampleAlleles, backMutSamples ]
 
 mergedVars = []
 
 variantsAtPos = []
 for pv in parsedVars:
     pos = pv[0]
     if (len(variantsAtPos) == 0 or pos == variantsAtPos[0][0]):
         variantsAtPos.append(pv)
     else:
         mergedVars.append(mergeVariants(variantsAtPos))
         variantsAtPos = [pv]
 mergedVars.append(mergeVariants(variantsAtPos))
 
 def writeVcfHeaderExceptSamples(outF):
     """Write VCF header lines -- except for sample names (this ends with a \t not a \n)."""
     outF.write('##fileformat=VCFv4.3\n')
     outF.write('##source=nextstrain.org\n')
     outF.write('\t'.join(['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT']) +
                '\t');
 
 def tallyAaChanges(varNameMerged):
     """If any of the merged variants cause a coding change, then produce a comma-sep string in same order as variants with corresponding change(s) or '-' if a variant does not cause a coding change.  If none of the variants cause a coding change, return the empty string."""
     varNames = varNameMerged.split(',')
     gotAaChange = 0
     aaChanges = []
     for varName in varNames:
         aaChange = variantAaChanges.get(varName)
         if (aaChange):
             gotAaChange = 1
             aaChanges.append(aaChange)
         else:
             aaChanges.append('-');
     if (gotAaChange):
         aaChangeStr = ','.join(aaChanges)
     else:
         aaChangeStr = ''
     return aaChangeStr
 
 # VCF
-with open('nextstrainSamples.vcf', 'w') as outC:
+def vcfForAll(fileName, cladeTag):
+    with open(fileName, 'w') as outC:
         writeVcfHeaderExceptSamples(outC)
         outC.write('\t'.join(sampleNames) + '\n')
         for mv in mergedVars:
             pv, alts, altCounts, sampleAlleles, backMutSamples = mv
             info = 'AC=' + ','.join(map(str, altCounts)) + ';AN=' + str(sampleCount)
             varNameMerged = pv[1]
             aaChange = tallyAaChanges(varNameMerged)
             if (len(aaChange)):
                 info += ';AACHANGE=' + aaChange
             if (len(backMutSamples)):
                 info += ';BACKMUTS=' + ','.join(backMutSamples)
             genotypes = []
             for sample, alIx in zip(samples, sampleAlleles):
                 gt = str(alIx)
-            genotypes.append(gt + ':' + sample['clade'])
+                genotypes.append(gt + ':' + sample[cladeTag])
             outC.write('\t'.join([ chrom,
                                    '\t'.join(map(str, pv)),
                                    '\t'.join(['.', 'PASS', info, 'GT:CLADE']),
                                    '\t'.join(genotypes) ]) + '\n')
 
+vcfForAll('nextstrainSamples.vcf', 'clade')
+# I'll skip writing an enormous file with wasteful clades-in-genotypes... really need
+# a sample metadata file!
+
 # Assign samples to clades; a sample can appear in multiple clades if they are nested.
-cladeSamples = {}
-cladeSampleCounts = {}
-cladeSampleNames = {}
 
-def sampleIdsFromNode(node, ids):
-    """Fill in a dict of IDs of all samples found under node."""
+def sampleIdsFromNode(node, cladeTops=()):
+    """Return a list of IDs of all samples found under node."""
     kids = node.get('children')
     if (kids):
+        sampleIds = []
         for kid in kids:
-            sampleIdsFromNode(kid, ids)
+            if (kid not in cladeTops):
+                sampleIds += sampleIdsFromNode(kid, cladeTops)
     else:
         sampleId = node['node_attrs']['gisaid_epi_isl']['value']
-        ids[sampleId] = 1
+        sampleIds = [sampleId]
+    return sampleIds
 
-for cladeName, node in cladeNodes.items():
-    sampleIds = {}
-    sampleIdsFromNode(node, sampleIds)
-    cladeSampleList = [ sample for sample in samples if sample['id'] in sampleIds ]
-    cladeSamples[cladeName] = sampleIds
+cladeSampleCounts = {}
+cladeSampleNames = {}
+
+def vcfForClades(clades, cladeTops=()):
+    """Given a set of clades (old or new), dump out VCF for each clade.
+    Stop at nodes in cladeTops (for Nextstrain's new clade scheme where 19A is root, 20A fully
+    contains 20B and 20C, etc."""
+    for cladeName in clades:
+        node = clades[cladeName]['topNode']
+        cladeSampleIds = set(sampleIdsFromNode(node, cladeTops))
+        cladeSampleList = [ sample for sample in samples if sample['id'] in cladeSampleIds ]
         cladeSampleCounts[cladeName] = len(cladeSampleList)
         cladeSampleNames[cladeName] = [ sampleName(sample) for sample in cladeSampleList ]
-
-# Per-clade VCF subset
-for cladeName, cladeSampleIds in cladeSamples.items():
         with open('nextstrainSamples' + cladeName + '.vcf', 'w') as outV:
             writeVcfHeaderExceptSamples(outV)
             outV.write('\t'.join(cladeSampleNames[cladeName]) + '\n')
             for mv in mergedVars:
                 pv, alts, overallAltCounts, sampleAlleles, backMutSamples = mv
                 varNameMerged = pv[1]
                 genotypes = []
                 altCounts = [ 0 for alt in alts ]
                 acTotal=0
                 for sample, alIx in zip(samples, sampleAlleles):
                     if (sample['id'] in cladeSampleIds):
                         gt = str(alIx)
                         genotypes.append(gt + ':' + cladeName)
                         if (alIx > 0):
                             altCounts[alIx - 1] += 1
                             acTotal += 1
                 if (acTotal > 0):
                     info = 'AC=' + ','.join(map(str, altCounts))
                     info += ';AN=' + str(cladeSampleCounts[cladeName])
                     aaChange = tallyAaChanges(varNameMerged)
                     if (len(aaChange)):
                         info += ';AACHANGE=' + aaChange
                     cladeBackMuts = [ sampleName for sampleName in backMutSamples
                                       if sampleName in cladeSampleNames[cladeName] ]
                     if (len(cladeBackMuts)):
                         info += ';BACKMUTS=' + ','.join(cladeBackMuts)
                     outV.write('\t'.join([ chrom,
                                            '\t'.join(map(str, pv)),
                                            '\t'.join(['.', 'PASS', info, 'GT:CLADE']),
                                            '\t'.join(genotypes) ]) + '\n')
 
-# BED+ file for clades
-with open('nextstrainClade.bed', 'w') as outC:
+def bedForClades(fileName, clades, cladeColors):
+    """Make a BED file summarizing each clade"""
+    with open(fileName, 'w') as outC:
         for name, clade in clades.items():
-        if (clade.get('thickStart')):
+            if (not clade.get('thickStart')):
+                # "Clade" 19A encompasses the entire tree (minus the parts assigned to
+                # other "clades").  It has no identifying variants, and (as of June 7)
+                # no dates assigned.
+                clade['thickStart'] = clade['thickEnd'] = 0
+                clade['varStarts'] = clade['varSizes'] = []
+                clade['varNames'] = ''
+                clade['dateInferred'] = clade['dateConfMin'] = clade['dateConfMax'] = 0
+            countryConf = clade.get('countryConf')
+            if (not countryConf):
+                countryConf = ''
+            countryInferred = clade.get('countryInferred')
+            if (not countryInferred):
+                countryInferred = ''
             outC.write('\t'.join(map(str,
                                      [ chrom, 0, 29903, name, 0, '.',
-                                       clade['thickStart'], clade['thickEnd'], clade['color'],
+                                       clade['thickStart'], clade['thickEnd'],
+                                       cladeColorFromName(name, cladeColors),
                                        len(clade['varSizes']) + 2,
-                                       '1,' + ','.join(map(str, clade['varSizes'])) + ',1,',
-                                       '0,' + ','.join(map(str, clade['varStarts'])) + ',29902,',
+                                       ','.join(map(str, ([1] + clade['varSizes']) + [1])),
+                                       ','.join(map(str, ([0] + clade['varStarts']) + [29902])),
                                        clade['varNames'],
                                        numDateToYmdStr(clade['dateInferred']),
                                        numDateToYmdStr(clade['dateConfMin']),
                                        numDateToYmdStr(clade['dateConfMax']),
-                                       clade['countryInferred'],
-                                       clade['countryConf'],
+                                       countryInferred,
+                                       countryConf,
                                        cladeSampleCounts[name],
                                        ', '.join(cladeSampleNames[name]) ])) + '\n')
 
+if (len(oldClades) == 0):
+    # This ncov.json must be from when old clades were the new clades, and the new clades
+    # had not yet arrived.  Revert to old colors and don't exclude subclades.
+    newCladeColors = oldCladeColors
+    newCladeTops = ()
+else:
+    newCladeTops = [ newClades[cladeName]['topNode'] for cladeName in newClades ]
+vcfForClades(newClades, newCladeTops)
+bedForClades('nextstrainClade.bed', newClades, newCladeColors)
+if (len(oldClades)):
+    vcfForClades(oldClades)
+    bedForClades('nextstrainOldClade.bed', oldClades, oldCladeColors)
+
 # Newick-formatted tree of samples for VCF display
-def cladeRgbFromName(cladeName):
+def cladeRgbFromName(cladeName, cladeColors):
     """Look up the r,g,b string color for clade; convert to int RGB."""
-    rgbCommaStr = cladeColorFromName(cladeName)
+    rgbCommaStr = cladeColorFromName(cladeName, cladeColors)
     r, g, b = [ int(x) for x in rgbCommaStr.split(',') ]
     rgb = (r << 16) | (g << 8) | b
     return rgb
 
-def rNextstrainToNewick(node, parentClade=None, parentVarStr=''):
-    """Recursively descend ncov.tree and build Newick tree string of samples to file"""
+def rNextstrainToNewick(node, cladeColors, cladeTops=(), parentClade=None, parentVarStr=''):
+    """Recursively descend ncov.tree and build Newick tree string of samples to file.
+    Exclude nodes in cladeTops."""
     kids = node.get('children')
     if (kids):
         # Make a more concise variant path string than the one we make for the clade track,
         # to embed in internal node labels for Yatish's tree explorations.
         localVariants = []
         if (node.get('branch_attrs') and node['branch_attrs'].get('mutations') and
             node['branch_attrs']['mutations'].get('nuc')):
             # Nucleotide variants specific to this branch
             for varName in node['branch_attrs']['mutations']['nuc']:
                 if (snvRe.match(varName)):
                     localVariants.append(varName)
         varStr = '+'.join(localVariants)
         if (len(parentVarStr) and len(varStr)):
             varStr = '$'.join([parentVarStr, varStr])
         elif (not len(varStr)):
             varStr = parentVarStr
         nodeAttrs = node['node_attrs']
         if (nodeAttrs.get('clade_membership')):
             cladeName = nodeAttrs['clade_membership']['value']
         elif (parentClade):
             cladeName = parentClade
         else:
             cladeName = 'unassigned'
-        color = str(cladeRgbFromName(cladeName))
-        descendants = ','.join([ rNextstrainToNewick(child, cladeName, varStr) for child in kids ])
+        color = str(cladeRgbFromName(cladeName, cladeColors))
+        descendants = ','.join([ rNextstrainToNewick(child, cladeColors, cladeTops, cladeName,
+                                                     varStr)
+                                 for child in kids if child not in cladeTops ])
         label = '#'.join([cladeName, varStr])
         treeString = '(' + descendants + ')' + label + ':' + color
     else:
         nodeAttrs = node['node_attrs']
         gId = nodeAttrs['gisaid_epi_isl']['value']
         name = node['name']
         date = numDateToMonthDay(nodeAttrs['num_date']['value'])
         cladeName = nodeAttrs['clade_membership']['value']
-        color = str(cladeRgbFromName(cladeName))
+        color = str(cladeRgbFromName(cladeName, cladeColors))
         treeString = sampleName({ 'id': gId, 'name': name, 'date': date }) + ':' + color
     return treeString
 
 with open('nextstrain.nh', 'w') as outF:
-    outF.write(rNextstrainToNewick(ncov['tree']) + ';\n')
+    outF.write(rNextstrainToNewick(ncov['tree'], newCladeColors) + ';\n')
 
-for cladeName, node in cladeNodes.items():
+if (len(oldClades)):
+    with open('nextstrainOldCladeColors.nh', 'w') as outF:
+        outF.write(rNextstrainToNewick(ncov['tree'], oldCladeColors) + ';\n')
+
+def newickForClades(clades, cladeColors, cladeTops=()):
+    for cladeName in clades:
         filename = 'nextstrain' + cladeName + '.nh'
+        node = clades[cladeName]['topNode']
         with open(filename, 'w') as outF:
-        outF.write(rNextstrainToNewick(node) + ';\n')
+            outF.write(rNextstrainToNewick(node, cladeColors, cladeTops) + ';\n')
+
+newickForClades(newClades, newCladeColors, newCladeTops)
+if (len(oldClades)):
+    newickForClades(oldClades, oldCladeColors)
 
 # File with samples and their clades, labs and variant paths
 
 apostropheSRe = re.compile("'s");
 firstLetterRe = re.compile('(\w)\w+');
 spacePunctRe = re.compile('\W');
 
 def abbreviateLab(lab):
     """Lab names are very long and sometimes differ by punctuation or typos.  Abbreviate for easier comparison."""
     labAbbrev = apostropheSRe.sub('', lab)
     labAbbrev = firstLetterRe.sub(r'\1', labAbbrev, count=0)
     labAbbrev = spacePunctRe.sub('', labAbbrev, count=0)
     return labAbbrev
 
 with open('nextstrainSamples.varPaths', 'w') as outF:
     for sample in samples:
         lab = sample['lab']
         labAbbrev = abbreviateLab(lab)
         outF.write('\t'.join([sampleName(sample), sample['clade'], labAbbrev, lab,
                               sample['varStr']]) + '\n');
 
 # Narrow down variants to "informative" set (bi-allelic, each allele supported by
 # sufficient number of samples):
 minSamples = 2
 discardedAlleles = []
 blacklist = []
 informativeVariants = []
 
 for mv in mergedVars:
     pv, alts, altCounts, sampleAlleles, backMutSamples = mv
     pos, varNameMerged, ref, altStr = pv
     recurrentAlts = []
     for alt, altCount in zip(alts, altCounts):
         if (altCount < minSamples):
             discardedAlleles.append([ chrom, pos-1, pos, ref + str(pos) + alt ])
         else:
             recurrentAlts.append(alt)
     if (len(recurrentAlts) > 1):
         multiRecurrentName = ','.join([ ref + str(pos) + alt for alt in recurrentAlts ])
         blacklist.append([ chrom, pos-1, pos, multiRecurrentName ])
     elif (len(recurrentAlts) == 1):
         informativeVariants.append([ pos, ref, recurrentAlts[0] ])
 
 # Dump out BED files for the categories:
 with open('nextstrainDiscarded.bed', 'w') as outF:
     for da in discardedAlleles:
         outF.write('\t'.join(map(str, da)) + '\n');
 
 with open('nextstrainBlacklisted.bed', 'w') as outF:
     for bl in blacklist:
         outF.write('\t'.join(map(str, bl)) + '\n');
 
 with open('nextstrainInformative.bed', 'w') as outF:
     for iv in informativeVariants:
         pos, ref, alt = iv
         outF.write('\t'.join(map(str, [ chrom, pos-1, pos, ref + str(pos) + alt ])) + '\n')
 
 # Compute parsimony score for tree at each informative variant.
 
 anyDiscordantVariants = []
 
 def refAltCosts(node, pos, ref, alt, parentValue):
     nodeValue = parentValue
     mutCount = 0
     # If there are mutations on this branch, and one of them is at pos and is ref or alt
     # (i.e. not a discarded allele that we're ignoring), then change nodeValue.
     if (node.get('branch_attrs') and node['branch_attrs'].get('mutations') and
         node['branch_attrs']['mutations'].get('nuc')):
         for varName in node['branch_attrs']['mutations']['nuc']:
             m = snvRe.match(varName)
             if (m):
                 mRef, mPos, mAlt = m.groups()
                 mPos = int(mPos)
                 if (mPos == pos and (mAlt == ref or mAlt == alt)):
                     nodeValue = mAlt
                     mutCount += 1
     kids = node.get('children')
     if (kids):
         refCost, altCost = [0, 0]
         for kid in kids:
             kidRefCost, kidAltCost, kidMutCount = refAltCosts(kid, pos, ref, alt, nodeValue)
             refCost += min(kidRefCost, kidAltCost+1)
             altCost += min(kidRefCost+1, kidAltCost)
             mutCount += kidMutCount
     else:
         refCost = 0 if (nodeValue == ref) else 1
         altCost = 1 if (nodeValue == ref) else 0
     if ((refCost < altCost and nodeValue == alt) or
         (altCost < refCost and nodeValue == ref)):
         anyDiscordantVariants.append('\t'.join([ref + str(pos) + alt, node['name']]))
     return refCost, altCost, mutCount
 
 parsimonyScores = []
 mutationCounts = []
 rootDiscordantVariants = []
 
 for iv in informativeVariants:
     pos, ref, alt = iv
     varName = ref + str(pos) + alt
     rootRefCost, rootAltCost, mutCount = refAltCosts(ncov['tree'], pos, ref, alt, ref)
     parsimonyScores.append([pos, min(rootRefCost, rootAltCost)])
     mutationCounts.append([pos, mutCount])
     rootLabel = 0 if (rootRefCost <= rootAltCost) else 1
     if (rootLabel != 0):
         # Note: so far this has not happened.  Seems like it would be pretty lame if it did.
         rootDiscordantVariants.append([chrom, pos-1, pos, varName])
 
 # Write out files with discordant nodes, parsimony/mutation counts (identical!):
 with open('nextstrainRootDiscordant.bed', 'w') as outF:
     for dv in rootDiscordantVariants:
         outF.write('\t'.join(dv) + '\n');
 
 with open('nextstrainAnyDiscordant.txt', 'w') as outF:
     for varName in anyDiscordantVariants:
         outF.write(varName + '\n');
 
 # bedGraph for parsimony scores and mutation counts
 with open('nextstrainParsimony.bedGraph', 'w') as outF:
     for ps in parsimonyScores:
         pos, score = ps
         outF.write('\t'.join(map(str, [ chrom, pos-1, pos, score ])) + '\n')
 
 with open('nextstrainMutCounts.bedGraph', 'w') as outF:
     for ps in mutationCounts:
         pos, score = ps
         outF.write('\t'.join(map(str, [ chrom, pos-1, pos, score ])) + '\n')
 
 # Informative-only VCF
 with open('nextstrainRecurrentBiallelic.vcf', 'w') as outF:
     writeVcfHeaderExceptSamples(outF)
     outF.write('\t'.join(sampleNames) + '\n')
     for iv in informativeVariants:
         pos, ref, alt = iv
         varName = ref + str(pos) + alt
         # Ignore any discarded variants at this position, but handle back-mutations if any,
         # by calling mergeVariants on this and its back-mutation
         backMutVarName = alt + str(pos) + ref
         variants = [ [ pos, varName, ref, alt ] ]
         if (variantCounts.get(backMutVarName)):
             variants.append([ pos, backMutVarName, alt, ref ])
         pv, alts, altCounts, sampleAlleles, backMutSamples = mergeVariants(variants)
         if (len(alts) != 1):
             warn('Expected exactly one alt from merging ' + varName + ' and ' + backMutVarName +
                  ', but got [' + ', '.join(alts) + ']')
         info = 'AC=' + str(altCounts[0])
         info += ';AN=' + str(sampleCount)
         aaChange = tallyAaChanges(varName)
         if (len(aaChange)):
             info += ';AACHANGE=' + aaChange
         if (len(backMutSamples)):
             info += ';BACKMUTS=' + ','.join(backMutSamples)
         genotypes = []
         for sample, alIx in zip(samples, sampleAlleles):
             gt = str(alIx)
             genotypes.append(gt + ':' + sample['clade'])
         outF.write('\t'.join([ '\t'.join([ chrom, str(pos), varName, ref, alt,
                                            '.', 'PASS', info, 'GT:CLADE']),
                                '\t'.join(genotypes) ]) + '\n')