5e880d2ed96fe5ad464fc9b98b07ea8dab70d7f6 angie Fri May 15 10:29:11 2020 -0700 Back-mutations on the path to clade nodes cause duplicate blocks in the nextstrainClade bed, which causes sporadic bedToBigBed errors. Handle those by removing both the forward and back mutations because they cancel each other out in terms of the alleles at the clade node. Leave them in the variant string as a reminder that the tree has funky back-mutations. refs #25188 diff --git src/hg/utils/otto/nextstrainNcov/nextstrain.py src/hg/utils/otto/nextstrainNcov/nextstrain.py index 08906aa..37b0be1 100755 --- src/hg/utils/otto/nextstrainNcov/nextstrain.py +++ src/hg/utils/otto/nextstrainNcov/nextstrain.py @@ -1,20 +1,21 @@ #!/usr/bin/env python3 import json import re from warnings import warn +from collections import defaultdict with open('ncov.json') as f: ncov = json.load(f) chrom = 'NC_045512v2' # Genes (to which AA mutations are relative): genePos = {} geneBed = [] for gene,anno in ncov['meta']['genome_annotations'].items(): if (gene != 'nuc'): genePos[gene] = anno['start'] - 1 geneBed.append([chrom, anno['start']-1, anno['end'], gene]) def bedStart(bed): return bed[1] @@ -33,47 +34,63 @@ variantCounts = {} variantAaChanges = {} samples = [] cladeColors = { 'A1a': '73,75,225', 'A2': '75,131,233', 'A2a': '92,173,207', 'A3': '119,199,164', 'A6': '154,212,122', 'A7': '173,189,81', 'B': '233,205,74', 'B1': '255,176,65', 'B2': '255,122,53', 'B4': '249,53,41' } def cladeColorFromName(cladeName): color = cladeColors.get(cladeName); if (not color): color = '0,0,0' return color -def subtractStart(coord, start): - return coord - start - def cladeFromVariants(name, variants, varStr): """Extract bed12 info from an object whose keys are SNV variant names""" clade = {} snvEnds = [] - varNames = [] + # Watch out for back-mutations which make invalid BED because then we have multiple "blocks" + # at the same position. Instead, make a back-mutation cancel out the mutation because the + # mutation is not found at this node. + changesByPos = defaultdict(list) + ixsToRemove = [] for varName in variants: m = snvRe.match(varName) if (m): - snvEnds.append(int(m.group(2))) - varNames.append(varName) + ref, pos, alt = m.groups() + prevMut = changesByPos[pos] + if (prevMut): + # If multi-allelic, leave the position in the list; but if back-mutation, + # remove the original mutation. In either case, don't add this pos again. + prevIx, prevRef, prevAlt = prevMut + if (prevAlt == ref and prevRef == alt): + ixsToRemove.append(prevIx) + changesByPos[pos] = [] + else: + ix = len(snvEnds) + changesByPos[pos] = (ix, ref, alt) + snvEnds.append(int(pos)) + if ixsToRemove: + ixsToRemove.sort(reverse=True) + for ix in ixsToRemove: + del snvEnds[ix] if snvEnds: snvEnds.sort() - snvStarts = list(map(lambda x: x-1, snvEnds)) - snvSizes = list(map(lambda x: 1, snvEnds)) + snvStarts = [ e-1 for e in snvEnds ] + snvSizes = [ 1 for e in snvEnds ] clade['thickStart'] = min(snvStarts) clade['thickEnd'] = max(snvEnds) clade['name'] = name clade['color'] = cladeColorFromName(name) clade['varSizes'] = snvSizes clade['varStarts'] = snvStarts clade['varNames'] = varStr return clade def addDatesToClade(clade, numDateAttrs): """Add the numeric dates from ncov.json node_attrs.num_date to clade record""" clade['dateInferred'] = numDateAttrs['value'] clade['dateConfMin'] = numDateAttrs['confidence'][0] clade['dateConfMax'] = numDateAttrs['confidence'][1]