5696c2a99447dd591aef26258f4dbef57e4e5d82 angie Thu Jan 14 15:39:40 2021 -0800 Nextstrain added new year-letter clades, and their JSON no longer includes the old clades; update script and subtrack stanzas. diff --git src/hg/utils/otto/nextstrainNcov/nextstrain.py src/hg/utils/otto/nextstrainNcov/nextstrain.py index 620af5b..076c068 100755 --- src/hg/utils/otto/nextstrainNcov/nextstrain.py +++ src/hg/utils/otto/nextstrainNcov/nextstrain.py @@ -29,36 +29,44 @@ snvRe = re.compile('^([ACGT-])([0-9]+)([ACGT-])$') snvAaRe = re.compile('^([A-Z*-])([0-9]+)([A-Z*-])$') newClades = {} oldClades = {} variantCounts = {} variantAaChanges = {} samples = [] # Clades from March 15th, 2020 to early morning of June 2nd, 2020: oldCladeColors = { 'A1a': '73,75,225', 'A2': '75,131,233', 'A2a': '92,173,207', 'A3': '119,199,164', 'A6': '154,212,122', 'A7': '173,189,81', 'B': '233,205,74', 'B1': '255,176,65', 'B2': '255,122,53', 'B4': '249,53,41' } -# Clades from late morning of June 2nd, 2020: -newCladeColors = { '19A': '76,135,232', - '19B': '110,194,178', - '20A': '168,214,110', - '20B': '232,206,75', - '20C': '255,146,58' } +# Clades from Jan. 2021: +newCladeColors = { '19A': '61,21,199', + '19B': '56,72,230', + '20A': '64,124,223', + '20B': '80,163,190', + '20C': '103,189,147', + '20D': '132,206,108', + '20E (EU1)': '171,201,81', + '20F': '211,204,64', + '20G': '242,185,54', + '20H/501Y.V2': '252,147,48', + '20I/501Y.V1': '252,89,41', + '20J/501Y.V3': '243,28,32', + } def cladeColorFromName(cladeName, cladeColors): color = cladeColors.get(cladeName); if (not color): color = '0,0,0' return color def cladeFromVariants(name, variants, varStr): """Extract bed12 info from an object whose keys are SNV variant names""" clade = {} snvEnds = [] # Watch out for back-mutations which make invalid BED because then we have multiple "blocks" # at the same position. Instead, make a back-mutation cancel out the mutation because the # mutation is not found at this node. changesByPos = defaultdict(list) @@ -409,44 +417,51 @@ def sampleIdsFromNode(node, cladeTops=()): """Return a list of IDs of all samples found under node.""" kids = node.get('children') if (kids): sampleIds = [] for kid in kids: if (kid not in cladeTops): sampleIds += sampleIdsFromNode(kid, cladeTops) else: epiNode = node['node_attrs'].get('gisaid_epi_isl') sampleId = epiNode['value'] if epiNode else node['name'] sampleIds = [sampleId] return sampleIds +def sanitizeFileName(filename): + """Remove or replace characters that cause trouble in filenames""" + filename = filename.replace('/', '_').replace(' ', '_') + filename = filename.replace('(', '').replace(')', '') + return filename + cladeSampleCounts = {} cladeSampleNames = {} def vcfForClades(clades, cladeTops=()): """Given a set of clades (old or new), dump out VCF for each clade. Stop at nodes in cladeTops (for Nextstrain's new clade scheme where 19A is root, 20A fully contains 20B and 20C, etc.""" for cladeName in clades: node = clades[cladeName]['topNode'] cladeSampleIds = set(sampleIdsFromNode(node, cladeTops)) cladeSampleList = [ sample for sample in samples if sample['id'] in cladeSampleIds ] cladeSampleCounts[cladeName] = len(cladeSampleList) cladeSampleNames[cladeName] = [ sampleName(sample) for sample in cladeSampleList ] - with open('nextstrainSamples' + cladeName + '.vcf', 'w') as outV: + sanitizedCladeName = sanitizeFileName(cladeName) + with open('nextstrainSamples' + sanitizedCladeName + '.vcf', 'w') as outV: writeVcfHeaderExceptSamples(outV) outV.write('\t'.join(cladeSampleNames[cladeName]) + '\n') for mv in mergedVars: pv, alts, overallAltCounts, sampleAlleles, backMutSamples = mv varNameMerged = pv[1] genotypes = [] altCounts = [ 0 for alt in alts ] acTotal=0 for sample, alIx in zip(samples, sampleAlleles): if (sample['id'] in cladeSampleIds): gt = '.' if (alIx < 0) else str(alIx) genotypes.append(gt + ':' + cladeName) if (alIx > 0): altCounts[alIx - 1] += 1 acTotal += 1 @@ -487,36 +502,30 @@ [ chrom, 0, 29903, name, 0, '.', clade['thickStart'], clade['thickEnd'], cladeColorFromName(name, cladeColors), len(clade['varSizes']) + 2, ','.join(map(str, ([1] + clade['varSizes']) + [1])), ','.join(map(str, ([0] + clade['varStarts']) + [29902])), clade['varNames'], numDateToYmdStr(clade['dateInferred']), numDateToYmdStr(clade['dateConfMin']), numDateToYmdStr(clade['dateConfMax']), countryInferred, countryConf, cladeSampleCounts[name], ', '.join(cladeSampleNames[name]) ])) + '\n') -if (len(oldClades) == 0): - # This ncov.json must be from when old clades were the new clades, and the new clades - # had not yet arrived. Revert to old colors and don't exclude subclades. - newCladeColors = oldCladeColors - newCladeTops = () -else: newCladeTops = [ newClades[cladeName]['topNode'] for cladeName in newClades ] vcfForClades(newClades, newCladeTops) bedForClades('nextstrainClade.bed', newClades, newCladeColors) if (len(oldClades)): vcfForClades(oldClades) bedForClades('nextstrainOldClade.bed', oldClades, oldCladeColors) # Newick-formatted tree of samples for VCF display def cladeRgbFromName(cladeName, cladeColors): """Look up the r,g,b string color for clade; convert to int RGB.""" rgbCommaStr = cladeColorFromName(cladeName, cladeColors) r, g, b = [ int(x) for x in rgbCommaStr.split(',') ] rgb = (r << 16) | (g << 8) | b return rgb @@ -581,57 +590,59 @@ if (snvRe.match(varName)): localVariants.append(varName) varStr = '+'.join(localVariants) if (len(parentVarStr) and len(varStr)): varStr = '$'.join([parentVarStr, varStr]) elif (not len(varStr)): varStr = parentVarStr nodeAttrs = node['node_attrs'] if (nodeAttrs.get('clade_membership')): cladeName = nodeAttrs['clade_membership']['value'] elif (parentClade): cladeName = parentClade else: cladeName = 'unassigned' color = str(cladeRgbFromName(cladeName, cladeColors)) + cladeShortened = cladeName.split(' ', 1)[0] + cladeShortened = cladeShortened.split('/', 1)[0] descendants = ','.join([ rNextstrainToNewick(child, cladeColors, cladeTops, cladeName, varStr) for child in kids if child not in cladeTops ]) - label = '#'.join([cladeName, resolveVarStrDashMuts(varStr)]) + label = '#'.join([cladeShortened, resolveVarStrDashMuts(varStr)]) treeString = '(' + descendants + ')' + label + ':' + color else: nodeAttrs = node['node_attrs'] epiNode = nodeAttrs.get('gisaid_epi_isl') gId = epiNode['value'] if epiNode else node['name'] name = node['name'] numDateNode = nodeAttrs.get('num_date') date = numDateToMonthDay(numDateNode['value'] if numDateNode else '') cladeName = nodeAttrs['clade_membership']['value'] color = str(cladeRgbFromName(cladeName, cladeColors)) treeString = sampleName({ 'id': gId, 'name': name, 'date': date }) + ':' + color return treeString with open('nextstrain.nh', 'w') as outF: outF.write(rNextstrainToNewick(ncov['tree'], newCladeColors) + ';\n') if (len(oldClades)): with open('nextstrainOldCladeColors.nh', 'w') as outF: outF.write(rNextstrainToNewick(ncov['tree'], oldCladeColors) + ';\n') def newickForClades(clades, cladeColors, cladeTops=()): for cladeName in clades: - filename = 'nextstrain' + cladeName + '.nh' + filename = 'nextstrain' + sanitizeFileName(cladeName) + '.nh' node = clades[cladeName]['topNode'] with open(filename, 'w') as outF: outF.write(rNextstrainToNewick(node, cladeColors, cladeTops) + ';\n') newickForClades(newClades, newCladeColors, newCladeTops) if (len(oldClades)): newickForClades(oldClades, oldCladeColors) # File with samples and their clades, labs and variant paths apostropheSRe = re.compile("'s"); firstLetterRe = re.compile('(\w)\w+'); spacePunctRe = re.compile('\W'); def abbreviateLab(lab): @@ -647,40 +658,44 @@ labAbbrev = abbreviateLab(lab) outF.write('\t'.join([sampleName(sample), sample['clade'], labAbbrev, lab, sample['varStr']]) + '\n'); # Narrow down variants to "informative" set (bi-allelic, each allele supported by # sufficient number of samples): minSamples = 2 discardedAlleles = [] blacklist = [] informativeVariants = [] for mv in mergedVars: pv, alts, altCounts, sampleAlleles, backMutSamples = mv pos, varNameMerged, ref, altStr = pv recurrentAlts = [] + recurrentAltCounts = [] for alt, altCount in zip(alts, altCounts): if (altCount < minSamples): discardedAlleles.append([ chrom, pos-1, pos, ref + str(pos) + alt ]) else: recurrentAlts.append(alt) + recurrentAltCounts.append(altCount) if (len(recurrentAlts) > 1): multiRecurrentName = ','.join([ ref + str(pos) + alt for alt in recurrentAlts ]) blacklist.append([ chrom, pos-1, pos, multiRecurrentName ]) elif (len(recurrentAlts) == 1): informativeVariants.append([ pos, ref, recurrentAlts[0] ]) + if varNameMerged not in variantCounts: + variantCounts[varNameMerged] = recurrentAltCounts[0] # Dump out BED files for the categories: with open('nextstrainDiscarded.bed', 'w') as outF: for da in discardedAlleles: outF.write('\t'.join(map(str, da)) + '\n'); with open('nextstrainBlacklisted.bed', 'w') as outF: for bl in blacklist: outF.write('\t'.join(map(str, bl)) + '\n'); with open('nextstrainInformative.bed', 'w') as outF: for iv in informativeVariants: pos, ref, alt = iv outF.write('\t'.join(map(str, [ chrom, pos-1, pos, ref + str(pos) + alt ])) + '\n')