6b76c158adf80081e080a8ebcb9157721663f1f5 angie Sun Feb 26 11:12:43 2023 -0800 Updated nextstrain clades & coloring, got rid of extra files for ancient oldClades, fixed bug that caused errors for reverted deletions, got rid of unused and now incredibly verbose mutation-labels in newick. diff --git src/hg/utils/otto/nextstrainNcov/nextstrain.py src/hg/utils/otto/nextstrainNcov/nextstrain.py index 3913611..e181536 100755 --- src/hg/utils/otto/nextstrainNcov/nextstrain.py +++ src/hg/utils/otto/nextstrainNcov/nextstrain.py @@ -12,61 +12,71 @@ # Genes (to which AA mutations are relative): genePos = {} geneBed = [] for gene,anno in ncov['meta']['genome_annotations'].items(): if (gene != 'nuc'): genePos[gene] = anno['start'] - 1 geneBed.append([chrom, anno['start']-1, anno['end'], gene]) def bedStart(bed): return bed[1] geneBed.sort(key=bedStart) with open('nextstrainGene.bed', 'w') as outG: for bed in geneBed: outG.write('\t'.join(map(str, bed)) + '\n') -# Variants and "clades" +# Variants and clades snvRe = re.compile('^([ACGT-])([0-9]+)([ACGT-])$') snvAaRe = re.compile('^([A-Z*-])([0-9]+)([A-Z*-])$') newClades = {} -oldClades = {} variantCounts = {} variantAaChanges = {} samples = [] -# Clades from March 15th, 2020 to early morning of June 2nd, 2020: -oldCladeColors = { 'A1a': '73,75,225', 'A2': '75,131,233', 'A2a': '92,173,207', - 'A3': '119,199,164', 'A6': '154,212,122', 'A7': '173,189,81', - 'B': '233,205,74', 'B1': '255,176,65', 'B2': '255,122,53', - 'B4': '249,53,41' } - -# Clades from Nov. 2021: -newCladeColors = { '20H (Beta, V2)': '68,51,190', - '20I (Alpha, V1)': '62,90,207', - '20J (Gamma, V3)': '69,127,203', - '21A (Delta)' : '82,154,182', - '21I (Delta)': '100,173,152', - '21J (Delta)': '123,183,122', - '21B (Kappa)': '150,189,96', - '21C (Epsilon)': '179,189,77', - '21D (Eta)': '205,182,66', - '21E (Theta)': '223,164,59', - '21F (Iota)': '230,132,52', - '21G (Lambda)': '226,88,44', - '21H (Mu)': '219,40,35', +# Clades from Feb. 2023: +# nColors=23 +# head -$nColors ~/github/ncov/defaults/color_schemes.tsv | tail -1 \ +# | perl -wne 'chomp; @cols = grep { s/^#//; } split; +# foreach $hexC (@cols) { +# if ($hexC =~ m/^(..)(..)(..)$/) { +# print sprintf("%d,%d,%d\n", hex($1), hex($2), hex($3)); +# } else { die $hexC; } }' +newCladeColors = { '20H (Beta, V2)': '94,29,157', + '20I (Alpha, V1)': '74,39,178', + '20J (Gamma, V3)': '64,58,196', + '21A (Delta)' : '63,82,205', + '21I (Delta)': '65,105,207', + '21J (Delta)': '69,126,203', + '21B (Kappa)': '76,144,192', + '21C (Epsilon)': '85,158,177', + '21D (Eta)': '95,169,160', + '21E (Theta)': '107,177,142', + '21F (Iota)': '121,183,124', + '21G (Lambda)': '137,187,107', + '21H (Mu)': '153,189,93', + '21M (Omicron)': '170,189,82', # B.1.1.529 + '21K (Omicron)': '187,188,73', # BA.1 + '21L (Omicron)': '203,184,66', # BA.2 + '22A (Omicron)': '215,175,62', # BA.4 + '22B (Omicron)': '224,162,58', # BA.5 + '22C (Omicron)': '230,146,55', # BA.2.12.1 + '22D (Omicron)': '230,123,50', # BA.2.75 + '22E (Omicron)': '227,97,45', # BQ.1 + '22F (Omicron)': '223,69,40', # XBB + '23A (Omicron)': '219,40,35', # XBB.1.5 # Grayscale for pre-VoC lineages '19A': '216,216,216', '19B': '209,209,209', '20A': '202,202,202', '20B': '195,195,195', '20C': '188,188,188', '20D': '181,181,181', '20E (EU1)': '174,174,174', '20F': '167,167,167', '20G': '160,160,160', } def cladeColorFromName(cladeName, cladeColors): color = cladeColors.get(cladeName); if (not color): @@ -121,31 +131,31 @@ if (numDateAttrs): clade['dateInferred'] = numDateAttrs['value'] clade['dateConfMin'] = numDateAttrs['confidence'][0] clade['dateConfMax'] = numDateAttrs['confidence'][1] else: clade['dateInferred'] = clade['dateConfMin'] = clade['dateConfMax'] = '' def addCountryToClade(clade, countryAttrs): """Add country data from ncov.json node_attrs.country to clade""" clade['countryInferred'] = countryAttrs['value'] conf = countryAttrs.get('confidence') clade['countryConf'] = ', '.join([ "%s: %0.5f" % (country, conf) for country, conf in conf.items()]) if conf else '' def processClade(branch, tag, branchVariants, branchVarStr, clades): - """If this is the first time we've seen (old or new) clade, add it to clades""" + """If this is the first time we've seen clade, add it to clades""" nodeAttrs = branch['node_attrs'] if (nodeAttrs.get(tag)): cladeName = nodeAttrs[tag]['value'] if (cladeName != 'unassigned' and not cladeName in clades): clades[cladeName] = cladeFromVariants(cladeName, branchVariants, branchVarStr) addDatesToClade(clades[cladeName], nodeAttrs.get('num_date')) if (nodeAttrs.get('country')): addCountryToClade(clades[cladeName], nodeAttrs['country']) elif (nodeAttrs.get('division')): addCountryToClade(clades[cladeName], nodeAttrs['division']) clades[cladeName]['topNode'] = branch def numDateToYmd(numDate): """Convert numeric date (decimal year) to integer year, month, day""" year = int(numDate) @@ -240,131 +250,137 @@ # Make an ordered variant string as David requested: semicolons between nodes, # comma-separated within a node: branchVarStr = '' for varName in localVariants: if (len(branchVarStr)): branchVarStr += ', ' branchVarStr += varName aaVar = variantAaChanges.get(varName) if (aaVar): branchVarStr += ' (' + aaVar + ')' if (len(parentVarStr) and len(branchVarStr)): branchVarStr = parentVarStr + '; ' + branchVarStr elif (not len(branchVarStr)): branchVarStr = parentVarStr processClade(branch, 'clade_membership', branchVariants, branchVarStr, newClades) - processClade(branch, 'legacy_clade_membership', branchVariants, branchVarStr, oldClades) kids = branch.get('children') if (kids): for child in kids: rUnpackNextstrainTree(child, branchVariants, branchVarStr); else: for varName in branchVariants: variantCounts[varName] += 1 nodeAttrs = branch['node_attrs'] if (nodeAttrs.get('submitting_lab')): lab = nodeAttrs['submitting_lab']['value'] else: lab = '' - if (nodeAttrs.get('legacy_clade_membership')): - oldClade = nodeAttrs['legacy_clade_membership']['value'] - else: - oldClade = '' epiNode = nodeAttrs.get('gisaid_epi_isl') epiId = epiNode['value'] if epiNode else branch['name'] numDateNode = nodeAttrs.get('num_date') numDate = numDateNode['value'] if numDateNode else '' samples.append({ 'id': epiId, 'name': branch['name'], 'clade': nodeAttrs['clade_membership']['value'], - 'oldClade': oldClade, 'date': numDateToMonthDay(numDate), 'lab': lab, 'variants': branchVariants, 'varStr': branchVarStr }) rUnpackNextstrainTree(ncov['tree'], {}, '') def sampleName(sample): + if sample['id'] != sample['name']: return '|'.join([sample['id'], sample['name'], sample['date']]) + else: + return '|'.join([sample['name'], sample['date']]) sampleCount = len(samples) sampleNames = [ sampleName(sample) for sample in samples ] # Parse variant names like 'G11083T' into pos and alleles; bundle in VCF column order parsedVars = [] for varName in variantCounts.keys(): m = snvRe.match(varName) if (m): ref, pos, alt = m.groups() parsedVars.append([int(pos), varName, ref, alt]) else: warn("Can't match " + varName) # Sort by position def parsedVarPos(pv): return pv[0] parsedVars.sort(key=parsedVarPos) def parsedVarAlleleCount(pv): + if not variantCounts.get(pv[1]): + print(f'No variantCount for pv {pv}') + exit(1) return variantCounts[pv[1]] def mergeVariants(pvList): """Given a list of parsedVars [pos, varName, ref, alt] at the same pos, resolve the actual allele at each sample, handling back-mutations and serial mutations.""" # Sort by descending allele count, assuming that the ref allele of the most frequently # observed variant is the true ref allele. For back-muts, ref and alt are swapped; # for serial muts, alt of the first is ref of the second. pvList.sort(key=parsedVarAlleleCount) pvList.reverse() pos, varName, trueRef, firstAlt = pvList[0] alts = [] sampleAlleles = [ 0 for sample in samples ] backMutSamples = [] for pv in pvList: thisPos, thisName, thisRef, thisAlt = pv + count = variantCounts[thisName] + mergedName = trueRef + str(pos) + thisAlt if (thisPos != pos): warn("mergeVariants: inconsistent pos " + pos + " and " + thisPos) if (thisAlt == trueRef): # Back-mutation, not a true alt alIx = 0 elif (thisAlt == '-'): - # No-call, not a true alt + # Deletion but we'll pretend it's no-call for VCF alIx = -1 else: # Add to list of alts - unless it's an alt we've already seen, but from a different # serial mutation. For example, there might be T>A but also T>G+G>A; don't add A twice. if (not thisAlt in alts): alts.append(thisAlt) if (thisName != varName): - varName += "," + thisName + if varName == trueRef + str(pos) + '-': + varName = mergedName + else: + varName += "," + mergedName alIx = alts.index(thisAlt) + 1 for ix, sample in enumerate(samples): if (sample['variants'].get(thisName)): sampleAlleles[ix] = alIx if (alIx == 0 and thisRef != '-'): backMutSamples.append(sampleName(sample)) + if thisName != varName: + # If this is a new merged mutation, add it to variantCounts. If merging makes an + # already-seen mutation then add to the existing count. + if variantCounts.get(mergedName): + variantCounts[mergedName] += count + else: + variantCounts[mergedName] = count # After handling back- and serial mutations, figure out true counts of each alt allele: altCounts = [ 0 for alt in alts ] for alIx in sampleAlleles: if (alIx > 0): altCounts[alIx - 1] += 1 - startDash = trueRef + str(pos) + '-,' - dashToDash = startDash + '-' - if varName.startswith(dashToDash): - varName = trueRef + varName[len(dashToDash):] - elif varName.startswith(startDash): - varName = varName[len(startDash):] return [ [pos, varName, trueRef, ','.join(alts)], alts, altCounts, sampleAlleles, backMutSamples ] mergedVars = [] variantsAtPos = [] for pv in parsedVars: pos = pv[0] if (len(variantsAtPos) == 0 or pos == variantsAtPos[0][0]): variantsAtPos.append(pv) else: mv = mergeVariants(variantsAtPos) if (mv[1]): mergedVars.append(mv) variantsAtPos = [pv] @@ -432,37 +448,38 @@ if (kids): sampleIds = [] for kid in kids: if (kid not in cladeTops): sampleIds += sampleIdsFromNode(kid, cladeTops) else: epiNode = node['node_attrs'].get('gisaid_epi_isl') sampleId = epiNode['value'] if epiNode else node['name'] sampleIds = [sampleId] return sampleIds def sanitizeFileName(filename): """Remove or replace characters that cause trouble in filenames""" filename = filename.replace('/', '_').replace(' ', '_') filename = filename.replace('(', '').replace(')', '') + filename = filename.replace(',', ''); return filename cladeSampleCounts = {} cladeSampleNames = {} def vcfForClades(clades, cladeTops=()): - """Given a set of clades (old or new), dump out VCF for each clade. + """Given a set of clades, dump out VCF for each clade. Stop at nodes in cladeTops (for Nextstrain's new clade scheme where 19A is root, 20A fully contains 20B and 20C, etc.""" for cladeName in clades: node = clades[cladeName]['topNode'] cladeSampleIds = set(sampleIdsFromNode(node, cladeTops)) cladeSampleList = [ sample for sample in samples if sample['id'] in cladeSampleIds ] cladeSampleCounts[cladeName] = len(cladeSampleList) cladeSampleNames[cladeName] = [ sampleName(sample) for sample in cladeSampleList ] sanitizedCladeName = sanitizeFileName(cladeName) with open('nextstrainSamples' + sanitizedCladeName + '.vcf', 'w') as outV: writeVcfHeaderExceptSamples(outV) outV.write('\t'.join(cladeSampleNames[cladeName]) + '\n') for mv in mergedVars: pv, alts, overallAltCounts, sampleAlleles, backMutSamples = mv varNameMerged = pv[1] @@ -516,88 +533,39 @@ len(clade['varSizes']) + 2, ','.join(map(str, ([1] + clade['varSizes']) + [1])), ','.join(map(str, ([0] + clade['varStarts']) + [29902])), clade['varNames'], numDateToYmdStr(clade['dateInferred']), numDateToYmdStr(clade['dateConfMin']), numDateToYmdStr(clade['dateConfMax']), countryInferred, countryConf, cladeSampleCounts[name], ', '.join(cladeSampleNames[name]) ])) + '\n') newCladeTops = [ newClades[cladeName]['topNode'] for cladeName in newClades ] vcfForClades(newClades, newCladeTops) bedForClades('nextstrainClade.bed', newClades, newCladeColors) -if (len(oldClades)): - vcfForClades(oldClades) - bedForClades('nextstrainOldClade.bed', oldClades, oldCladeColors) # Newick-formatted tree of samples for VCF display def cladeRgbFromName(cladeName, cladeColors): """Look up the r,g,b string color for clade; convert to int RGB.""" rgbCommaStr = cladeColorFromName(cladeName, cladeColors) r, g, b = [ int(x) for x in rgbCommaStr.split(',') ] rgb = (r << 16) | (g << 8) | b return rgb -def resolveVarStrDashMuts(varStr): - """If there is a sequence of mutations like T28144- followed by -28144C, get rid of the T>- - and replace the ->C with T>C. Get rid of solo basePos- because they can occur in very long - lists when samples have long stretches of Ns, not helpful.""" - if (not varStr): - return varStr - nodes = [ node.split('+') for node in varStr.split('$') ] - posWithDash = defaultdict(list) - for node in nodes: - for varName in node: - (ref, pos, alt) = snvRe.match(varName).groups() - if ref == '-' or alt == '-': - posWithDash[pos].append([ref, alt]) - if (len(posWithDash)): - removals = set() - replacements = {} - for pos, changes in posWithDash.items(): - firstRef, firstAlt = changes[0] - if (len(changes) == 1): - if (firstAlt == '-'): - removals.add(firstRef + pos + firstAlt) - elif (len(changes) == 2): - secondRef, secondAlt = changes[1] - if (firstAlt == '-' and secondRef == '-'): - removals.add(firstRef + pos + firstAlt) - replacements[secondRef + pos + secondAlt] = firstRef + pos + secondAlt - else: - warn("resolveVarStrDashMuts: what do I do with %s%s%s, %s%s%s in %s?" % - (firstRef, pos, firstAlt, secondRef, pos, secondAlt, varStr)) - else: - warn("resolveVarStrDashMuts: more changes at %s than expected: %s" % - (pos, varStr)) - if (removals or replacements): - newNodes = [] - for node in nodes: - newVars = [] - for varName in node: - if varName in replacements: - newVars.append(replacements[varName]) - elif varName not in removals: - newVars.append(varName) - if (newVars): - newNodes.append(newVars) - varStr = '$'.join([ '+'.join(node) for node in newNodes ]) - return varStr - def rNextstrainToNewick(node, cladeColors, cladeTops=(), parentClade=None, parentVarStr=''): """Recursively descend ncov.tree and build Newick tree string of samples to file. Exclude nodes in cladeTops.""" kids = node.get('children') if (kids): # Make a more concise variant path string than the one we make for the clade track, # to embed in internal node labels for Yatish's tree explorations. localVariants = [] if (node.get('branch_attrs') and node['branch_attrs'].get('mutations') and node['branch_attrs']['mutations'].get('nuc')): # Nucleotide variants specific to this branch for varName in node['branch_attrs']['mutations']['nuc']: if (snvRe.match(varName)): localVariants.append(varName) varStr = '+'.join(localVariants) @@ -606,61 +574,55 @@ elif (not len(varStr)): varStr = parentVarStr nodeAttrs = node['node_attrs'] if (nodeAttrs.get('clade_membership')): cladeName = nodeAttrs['clade_membership']['value'] elif (parentClade): cladeName = parentClade else: cladeName = 'unassigned' color = str(cladeRgbFromName(cladeName, cladeColors)) cladeShortened = cladeName.split(' ', 1)[0] cladeShortened = cladeShortened.split('/', 1)[0] descendants = ','.join([ rNextstrainToNewick(child, cladeColors, cladeTops, cladeName, varStr) for child in kids if child not in cladeTops ]) - label = '#'.join([cladeShortened, resolveVarStrDashMuts(varStr)]) - treeString = '(' + descendants + ')' + label + ':' + color + treeString = '(' + descendants + '):' + color else: nodeAttrs = node['node_attrs'] epiNode = nodeAttrs.get('gisaid_epi_isl') gId = epiNode['value'] if epiNode else node['name'] name = node['name'] numDateNode = nodeAttrs.get('num_date') date = numDateToMonthDay(numDateNode['value'] if numDateNode else '') cladeName = nodeAttrs['clade_membership']['value'] color = str(cladeRgbFromName(cladeName, cladeColors)) treeString = sampleName({ 'id': gId, 'name': name, 'date': date }) + ':' + color return treeString with open('nextstrain.nh', 'w') as outF: outF.write(rNextstrainToNewick(ncov['tree'], newCladeColors) + ';\n') -if (len(oldClades)): - with open('nextstrainOldCladeColors.nh', 'w') as outF: - outF.write(rNextstrainToNewick(ncov['tree'], oldCladeColors) + ';\n') def newickForClades(clades, cladeColors, cladeTops=()): for cladeName in clades: filename = 'nextstrain' + sanitizeFileName(cladeName) + '.nh' node = clades[cladeName]['topNode'] with open(filename, 'w') as outF: outF.write(rNextstrainToNewick(node, cladeColors, cladeTops) + ';\n') newickForClades(newClades, newCladeColors, newCladeTops) -if (len(oldClades)): - newickForClades(oldClades, oldCladeColors) # File with samples and their clades, labs and variant paths apostropheSRe = re.compile("'s"); firstLetterRe = re.compile('(\w)\w+'); spacePunctRe = re.compile('\W'); def abbreviateLab(lab): """Lab names are very long and sometimes differ by punctuation or typos. Abbreviate for easier comparison.""" labAbbrev = apostropheSRe.sub('', lab) labAbbrev = firstLetterRe.sub(r'\1', labAbbrev, count=0) labAbbrev = spacePunctRe.sub('', labAbbrev, count=0) return labAbbrev with open('nextstrainSamples.varPaths', 'w') as outF: @@ -688,32 +650,30 @@ pv, alts, altCounts, sampleAlleles, backMutSamples = mv pos, varNameMerged, ref, altStr = pv recurrentAlts = [] recurrentAltCounts = [] for alt, altCount in zip(alts, altCounts): if (altCount < minSamples): discardedAlleles.append([ chrom, pos-1, pos, ref + str(pos) + alt ]) else: recurrentAlts.append(alt) recurrentAltCounts.append(altCount) if (len(recurrentAlts) > 1): multiRecurrentName = ','.join([ ref + str(pos) + alt for alt in recurrentAlts ]) blacklist.append([ chrom, pos-1, pos, multiRecurrentName ]) elif (len(recurrentAlts) == 1): informativeVariants.append([ pos, ref, recurrentAlts[0] ]) - if varNameMerged not in variantCounts: - variantCounts[varNameMerged] = recurrentAltCounts[0] # Dump out BED files for the categories: with open('nextstrainDiscarded.bed', 'w') as outF: for da in discardedAlleles: outF.write('\t'.join(map(str, da)) + '\n'); with open('nextstrainBlacklisted.bed', 'w') as outF: for bl in blacklist: outF.write('\t'.join(map(str, bl)) + '\n'); with open('nextstrainInformative.bed', 'w') as outF: for iv in informativeVariants: pos, ref, alt = iv outF.write('\t'.join(map(str, [ chrom, pos-1, pos, ref + str(pos) + alt ])) + '\n') @@ -786,30 +746,33 @@ with open('nextstrainMutCounts.bedGraph', 'w') as outF: for ps in mutationCounts: pos, score = ps outF.write('\t'.join(map(str, [ chrom, pos-1, pos, score ])) + '\n') # Informative-only VCF with open('nextstrainRecurrentBiallelic.vcf', 'w') as outF: writeVcfHeaderExceptSamples(outF) outF.write('\t'.join(sampleNames) + '\n') for iv in informativeVariants: pos, ref, alt = iv varName = ref + str(pos) + alt # Ignore any discarded variants at this position, but handle back-mutations if any, # by calling mergeVariants on this and its back-mutation + #*** This doesn't work when iv has already been merged -- samples have the original pv, + #*** not the merged iv, so altCounts end up being 0 for those. But nobody uses this anyway, + #*** should probably get rid of it... backMutVarName = alt + str(pos) + ref variants = [ [ pos, varName, ref, alt ] ] if (variantCounts.get(backMutVarName)): variants.append([ pos, backMutVarName, alt, ref ]) pv, alts, altCounts, sampleAlleles, backMutSamples = mergeVariants(variants) if (len(alts) != 1): warn('Expected exactly one alt from merging ' + varName + ' and ' + backMutVarName + ', but got [' + ', '.join(alts) + ']') info = 'AC=' + str(altCounts[0]) info += ';AN=' + str(sampleCount) aaChange = tallyAaChanges(varName) if (len(aaChange)): info += ';AACHANGE=' + aaChange if (len(backMutSamples)): info += ';BACKMUTS=' + ','.join(backMutSamples)