abf5dba0ce1f1cf2d3f15624272bde0854aec681 angie Sat Jun 20 14:24:44 2020 -0700 Don't ignore the 'mutations' that are changes between [ACGT] and '-' (missing info), because they can occasionally happen in the path of a nucleotide change, for example T28144- to -T28144C. fixes #25751 diff --git src/hg/utils/otto/nextstrainNcov/nextstrain.py src/hg/utils/otto/nextstrainNcov/nextstrain.py index a49cef1..620af5b 100755 --- src/hg/utils/otto/nextstrainNcov/nextstrain.py +++ src/hg/utils/otto/nextstrainNcov/nextstrain.py @@ -14,32 +14,32 @@ genePos = {} geneBed = [] for gene,anno in ncov['meta']['genome_annotations'].items(): if (gene != 'nuc'): genePos[gene] = anno['start'] - 1 geneBed.append([chrom, anno['start']-1, anno['end'], gene]) def bedStart(bed): return bed[1] geneBed.sort(key=bedStart) with open('nextstrainGene.bed', 'w') as outG: for bed in geneBed: outG.write('\t'.join(map(str, bed)) + '\n') # Variants and "clades" -snvRe = re.compile('^([ACGT])([0-9]+)([ACGT])$') -snvAaRe = re.compile('^([A-Z*])([0-9]+)([A-Z*])$') +snvRe = re.compile('^([ACGT-])([0-9]+)([ACGT-])$') +snvAaRe = re.compile('^([A-Z*-])([0-9]+)([A-Z*-])$') newClades = {} oldClades = {} variantCounts = {} variantAaChanges = {} samples = [] # Clades from March 15th, 2020 to early morning of June 2nd, 2020: oldCladeColors = { 'A1a': '73,75,225', 'A2': '75,131,233', 'A2a': '92,173,207', 'A3': '119,199,164', 'A6': '154,212,122', 'A7': '173,189,81', 'B': '233,205,74', 'B1': '255,176,65', 'B2': '255,122,53', 'B4': '249,53,41' } # Clades from late morning of June 2nd, 2020: newCladeColors = { '19A': '76,135,232', @@ -67,30 +67,32 @@ m = snvRe.match(varName) if (m): ref, pos, alt = m.groups() prevMut = changesByPos[pos] if (prevMut): # If multi-allelic, leave the position in the list; but if back-mutation, # remove the original mutation. In either case, don't add this pos again. prevIx, prevRef, prevAlt = prevMut if (prevAlt == ref and prevRef == alt): ixsToRemove.append(prevIx) changesByPos[pos] = [] else: ix = len(snvEnds) changesByPos[pos] = (ix, ref, alt) snvEnds.append(int(pos)) + else: + warn("cladeFromVariants: no match for SNV '%s'" % (varName)) if ixsToRemove: ixsToRemove.sort(reverse=True) for ix in ixsToRemove: del snvEnds[ix] if snvEnds: snvEnds.sort() snvStarts = [ e-1 for e in snvEnds ] snvSizes = [ 1 for e in snvEnds ] clade['thickStart'] = min(snvStarts) clade['thickEnd'] = max(snvEnds) clade['name'] = name clade['varSizes'] = snvSizes clade['varStarts'] = snvStarts clade['varNames'] = varStr return clade @@ -177,30 +179,32 @@ else: return '' def rUnpackNextstrainTree(branch, parentVariants, parentVarStr): """Recursively descend ncov.tree and build data structures for genome browser tracks""" # Gather variants specific to this node/branch (if any) localVariants = [] if (branch.get('branch_attrs') and branch['branch_attrs'].get('mutations') and branch['branch_attrs']['mutations'].get('nuc')): # Nucleotide variants specific to this branch for varName in branch['branch_attrs']['mutations']['nuc']: if (snvRe.match(varName)): localVariants.append(varName) if (not variantCounts.get(varName)): variantCounts[varName] = 0; + else: + warn("rUnpackNextstrainTree: no snvRe match for '%s'" % (varName)) # Amino acid variants: figure out which nucleotide variant goes with each for geneName in branch['branch_attrs']['mutations'].keys(): if (geneName != 'nuc'): for change in branch['branch_attrs']['mutations'][geneName]: # Get nucleotide coords & figure out which nuc var this aa change corresponds to aaM = snvAaRe.match(change) if (aaM): aaRef, aaPos, aaAlt = aaM.groups() varStartMin = (int(aaPos) - 1) * 3 if (genePos.get(geneName)): cdsStart = genePos.get(geneName) varStartMin += cdsStart varStartMax = varStartMin + 2 for varName in localVariants: ref, pos, alt = snvRe.match(varName).groups() @@ -273,79 +277,93 @@ m = snvRe.match(varName) if (m): ref, pos, alt = m.groups() parsedVars.append([int(pos), varName, ref, alt]) else: warn("Can't match " + varName) # Sort by position def parsedVarPos(pv): return pv[0] parsedVars.sort(key=parsedVarPos) def parsedVarAlleleCount(pv): return variantCounts[pv[1]] def mergeVariants(pvList): - """Given a list of parsedVars [pos, varName, ref, alt] at the same pos, resolve the actual allele at each sample, handling back-mutations and serial mutations.""" + """Given a list of parsedVars [pos, varName, ref, alt] at the same pos, resolve the actual + allele at each sample, handling back-mutations and serial mutations.""" # Sort by descending allele count, assuming that the ref allele of the most frequently # observed variant is the true ref allele. For back-muts, ref and alt are swapped; # for serial muts, alt of the first is ref of the second. pvList.sort(key=parsedVarAlleleCount) pvList.reverse() pos, varName, trueRef, firstAlt = pvList[0] alts = [] sampleAlleles = [ 0 for sample in samples ] backMutSamples = [] for pv in pvList: thisPos, thisName, thisRef, thisAlt = pv if (thisPos != pos): warn("mergeVariants: inconsistent pos " + pos + " and " + thisPos) if (thisAlt == trueRef): # Back-mutation, not a true alt alIx = 0 + elif (thisAlt == '-'): + # No-call, not a true alt + alIx = -1 else: # Add to list of alts - unless it's an alt we've already seen, but from a different # serial mutation. For example, there might be T>A but also T>G+G>A; don't add A twice. if (not thisAlt in alts): alts.append(thisAlt) if (thisName != varName): varName += "," + thisName alIx = alts.index(thisAlt) + 1 for ix, sample in enumerate(samples): if (sample['variants'].get(thisName)): sampleAlleles[ix] = alIx - if (alIx == 0): + if (alIx == 0 and thisRef != '-'): backMutSamples.append(sampleName(sample)) # After handling back- and serial mutations, figure out true counts of each alt allele: altCounts = [ 0 for alt in alts ] for alIx in sampleAlleles: if (alIx > 0): altCounts[alIx - 1] += 1 + startDash = trueRef + str(pos) + '-,' + dashToDash = startDash + '-' + if varName.startswith(dashToDash): + varName = trueRef + varName[len(dashToDash):] + elif varName.startswith(startDash): + varName = varName[len(startDash):] return [ [pos, varName, trueRef, ','.join(alts)], alts, altCounts, sampleAlleles, backMutSamples ] mergedVars = [] variantsAtPos = [] for pv in parsedVars: pos = pv[0] if (len(variantsAtPos) == 0 or pos == variantsAtPos[0][0]): variantsAtPos.append(pv) else: - mergedVars.append(mergeVariants(variantsAtPos)) + mv = mergeVariants(variantsAtPos) + if (mv[1]): + mergedVars.append(mv) variantsAtPos = [pv] -mergedVars.append(mergeVariants(variantsAtPos)) +mv = mergeVariants(variantsAtPos) +if (mv[1]): + mergedVars.append(mv) def writeVcfHeaderExceptSamples(outF): """Write VCF header lines -- except for sample names (this ends with a \t not a \n).""" outF.write('##fileformat=VCFv4.3\n') outF.write('##source=nextstrain.org\n') outF.write('\t'.join(['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT']) + '\t'); def tallyAaChanges(varNameMerged): """If any of the merged variants cause a coding change, then produce a comma-sep string in same order as variants with corresponding change(s) or '-' if a variant does not cause a coding change. If none of the variants cause a coding change, return the empty string.""" varNames = varNameMerged.split(',') gotAaChange = 0 aaChanges = [] for varName in varNames: aaChange = variantAaChanges.get(varName) @@ -364,31 +382,31 @@ def vcfForAll(fileName, cladeTag): with open(fileName, 'w') as outC: writeVcfHeaderExceptSamples(outC) outC.write('\t'.join(sampleNames) + '\n') for mv in mergedVars: pv, alts, altCounts, sampleAlleles, backMutSamples = mv info = 'AC=' + ','.join(map(str, altCounts)) + ';AN=' + str(sampleCount) varNameMerged = pv[1] aaChange = tallyAaChanges(varNameMerged) if (len(aaChange)): info += ';AACHANGE=' + aaChange if (len(backMutSamples)): info += ';BACKMUTS=' + ','.join(backMutSamples) genotypes = [] for sample, alIx in zip(samples, sampleAlleles): - gt = str(alIx) + gt = '.' if alIx < 0 else str(alIx) genotypes.append(gt + ':' + sample[cladeTag]) outC.write('\t'.join([ chrom, '\t'.join(map(str, pv)), '\t'.join(['.', 'PASS', info, 'GT:CLADE']), '\t'.join(genotypes) ]) + '\n') vcfForAll('nextstrainSamples.vcf', 'clade') # I'll skip writing an enormous file with wasteful clades-in-genotypes... really need # a sample metadata file! # Assign samples to clades; a sample can appear in multiple clades if they are nested. def sampleIdsFromNode(node, cladeTops=()): """Return a list of IDs of all samples found under node.""" kids = node.get('children') @@ -415,31 +433,31 @@ cladeSampleIds = set(sampleIdsFromNode(node, cladeTops)) cladeSampleList = [ sample for sample in samples if sample['id'] in cladeSampleIds ] cladeSampleCounts[cladeName] = len(cladeSampleList) cladeSampleNames[cladeName] = [ sampleName(sample) for sample in cladeSampleList ] with open('nextstrainSamples' + cladeName + '.vcf', 'w') as outV: writeVcfHeaderExceptSamples(outV) outV.write('\t'.join(cladeSampleNames[cladeName]) + '\n') for mv in mergedVars: pv, alts, overallAltCounts, sampleAlleles, backMutSamples = mv varNameMerged = pv[1] genotypes = [] altCounts = [ 0 for alt in alts ] acTotal=0 for sample, alIx in zip(samples, sampleAlleles): if (sample['id'] in cladeSampleIds): - gt = str(alIx) + gt = '.' if (alIx < 0) else str(alIx) genotypes.append(gt + ':' + cladeName) if (alIx > 0): altCounts[alIx - 1] += 1 acTotal += 1 if (acTotal > 0): info = 'AC=' + ','.join(map(str, altCounts)) info += ';AN=' + str(cladeSampleCounts[cladeName]) aaChange = tallyAaChanges(varNameMerged) if (len(aaChange)): info += ';AACHANGE=' + aaChange cladeBackMuts = [ sampleName for sampleName in backMutSamples if sampleName in cladeSampleNames[cladeName] ] if (len(cladeBackMuts)): info += ';BACKMUTS=' + ','.join(cladeBackMuts) outV.write('\t'.join([ chrom, @@ -490,69 +508,115 @@ newCladeTops = [ newClades[cladeName]['topNode'] for cladeName in newClades ] vcfForClades(newClades, newCladeTops) bedForClades('nextstrainClade.bed', newClades, newCladeColors) if (len(oldClades)): vcfForClades(oldClades) bedForClades('nextstrainOldClade.bed', oldClades, oldCladeColors) # Newick-formatted tree of samples for VCF display def cladeRgbFromName(cladeName, cladeColors): """Look up the r,g,b string color for clade; convert to int RGB.""" rgbCommaStr = cladeColorFromName(cladeName, cladeColors) r, g, b = [ int(x) for x in rgbCommaStr.split(',') ] rgb = (r << 16) | (g << 8) | b return rgb +def resolveVarStrDashMuts(varStr): + """If there is a sequence of mutations like T28144- followed by -28144C, get rid of the T>- + and replace the ->C with T>C. Get rid of solo basePos- because they can occur in very long + lists when samples have long stretches of Ns, not helpful.""" + if (not varStr): + return varStr + nodes = [ node.split('+') for node in varStr.split('$') ] + posWithDash = defaultdict(list) + for node in nodes: + for varName in node: + (ref, pos, alt) = snvRe.match(varName).groups() + if ref == '-' or alt == '-': + posWithDash[pos].append([ref, alt]) + if (len(posWithDash)): + removals = set() + replacements = {} + for pos, changes in posWithDash.items(): + firstRef, firstAlt = changes[0] + if (len(changes) == 1): + if (firstAlt == '-'): + removals.add(firstRef + pos + firstAlt) + elif (len(changes) == 2): + secondRef, secondAlt = changes[1] + if (firstAlt == '-' and secondRef == '-'): + removals.add(firstRef + pos + firstAlt) + replacements[secondRef + pos + secondAlt] = firstRef + pos + secondAlt + else: + warn("resolveVarStrDashMuts: what do I do with %s%s%s, %s%s%s in %s?" % + (firstRef, pos, firstAlt, secondRef, pos, secondAlt, varStr)) + else: + warn("resolveVarStrDashMuts: more changes at %s than expected: %s" % + (pos, varStr)) + if (removals or replacements): + newNodes = [] + for node in nodes: + newVars = [] + for varName in node: + if varName in replacements: + newVars.append(replacements[varName]) + elif varName not in removals: + newVars.append(varName) + if (newVars): + newNodes.append(newVars) + varStr = '$'.join([ '+'.join(node) for node in newNodes ]) + return varStr + def rNextstrainToNewick(node, cladeColors, cladeTops=(), parentClade=None, parentVarStr=''): """Recursively descend ncov.tree and build Newick tree string of samples to file. Exclude nodes in cladeTops.""" kids = node.get('children') if (kids): # Make a more concise variant path string than the one we make for the clade track, # to embed in internal node labels for Yatish's tree explorations. localVariants = [] if (node.get('branch_attrs') and node['branch_attrs'].get('mutations') and node['branch_attrs']['mutations'].get('nuc')): # Nucleotide variants specific to this branch for varName in node['branch_attrs']['mutations']['nuc']: if (snvRe.match(varName)): localVariants.append(varName) varStr = '+'.join(localVariants) if (len(parentVarStr) and len(varStr)): varStr = '$'.join([parentVarStr, varStr]) elif (not len(varStr)): varStr = parentVarStr nodeAttrs = node['node_attrs'] if (nodeAttrs.get('clade_membership')): cladeName = nodeAttrs['clade_membership']['value'] elif (parentClade): cladeName = parentClade else: cladeName = 'unassigned' color = str(cladeRgbFromName(cladeName, cladeColors)) descendants = ','.join([ rNextstrainToNewick(child, cladeColors, cladeTops, cladeName, varStr) for child in kids if child not in cladeTops ]) - label = '#'.join([cladeName, varStr]) + label = '#'.join([cladeName, resolveVarStrDashMuts(varStr)]) treeString = '(' + descendants + ')' + label + ':' + color else: nodeAttrs = node['node_attrs'] epiNode = nodeAttrs.get('gisaid_epi_isl') gId = epiNode['value'] if epiNode else node['name'] name = node['name'] numDateNode = nodeAttrs.get('num_date') - date = numDateNode['value'] if numDateNode else '' + date = numDateToMonthDay(numDateNode['value'] if numDateNode else '') cladeName = nodeAttrs['clade_membership']['value'] color = str(cladeRgbFromName(cladeName, cladeColors)) treeString = sampleName({ 'id': gId, 'name': name, 'date': date }) + ':' + color return treeString with open('nextstrain.nh', 'w') as outF: outF.write(rNextstrainToNewick(ncov['tree'], newCladeColors) + ';\n') if (len(oldClades)): with open('nextstrainOldCladeColors.nh', 'w') as outF: outF.write(rNextstrainToNewick(ncov['tree'], oldCladeColors) + ';\n') def newickForClades(clades, cladeColors, cladeTops=()): for cladeName in clades: filename = 'nextstrain' + cladeName + '.nh' @@ -627,30 +691,32 @@ def refAltCosts(node, pos, ref, alt, parentValue): nodeValue = parentValue mutCount = 0 # If there are mutations on this branch, and one of them is at pos and is ref or alt # (i.e. not a discarded allele that we're ignoring), then change nodeValue. if (node.get('branch_attrs') and node['branch_attrs'].get('mutations') and node['branch_attrs']['mutations'].get('nuc')): for varName in node['branch_attrs']['mutations']['nuc']: m = snvRe.match(varName) if (m): mRef, mPos, mAlt = m.groups() mPos = int(mPos) if (mPos == pos and (mAlt == ref or mAlt == alt)): nodeValue = mAlt mutCount += 1 + else: + warn("refAltCosts: no snvRe match for '%s'" % (varName)) kids = node.get('children') if (kids): refCost, altCost = [0, 0] for kid in kids: kidRefCost, kidAltCost, kidMutCount = refAltCosts(kid, pos, ref, alt, nodeValue) refCost += min(kidRefCost, kidAltCost+1) altCost += min(kidRefCost+1, kidAltCost) mutCount += kidMutCount else: refCost = 0 if (nodeValue == ref) else 1 altCost = 1 if (nodeValue == ref) else 0 if ((refCost < altCost and nodeValue == alt) or (altCost < refCost and nodeValue == ref)): anyDiscordantVariants.append('\t'.join([ref + str(pos) + alt, node['name']])) return refCost, altCost, mutCount @@ -704,20 +770,20 @@ if (variantCounts.get(backMutVarName)): variants.append([ pos, backMutVarName, alt, ref ]) pv, alts, altCounts, sampleAlleles, backMutSamples = mergeVariants(variants) if (len(alts) != 1): warn('Expected exactly one alt from merging ' + varName + ' and ' + backMutVarName + ', but got [' + ', '.join(alts) + ']') info = 'AC=' + str(altCounts[0]) info += ';AN=' + str(sampleCount) aaChange = tallyAaChanges(varName) if (len(aaChange)): info += ';AACHANGE=' + aaChange if (len(backMutSamples)): info += ';BACKMUTS=' + ','.join(backMutSamples) genotypes = [] for sample, alIx in zip(samples, sampleAlleles): - gt = str(alIx) + gt = '.' if (alIx < 0) else str(alIx) genotypes.append(gt + ':' + sample['clade']) outF.write('\t'.join([ '\t'.join([ chrom, str(pos), varName, ref, alt, '.', 'PASS', info, 'GT:CLADE']), '\t'.join(genotypes) ]) + '\n')