618f548682d24816a2898b4e29c5787ac591f67f angie Sun Feb 26 11:05:27 2023 -0800 Adding comments after code review, thanks Jonathan. diff --git src/hg/utils/otto/sarscov2phylo/branchSpecificMask.py src/hg/utils/otto/sarscov2phylo/branchSpecificMask.py index 52b2e59..26a6766 100755 --- src/hg/utils/otto/sarscov2phylo/branchSpecificMask.py +++ src/hg/utils/otto/sarscov2phylo/branchSpecificMask.py @@ -82,35 +82,40 @@ rep = spec[branch]['representative'] repNodes[rep] = '' samplePaths = tempfile.NamedTemporaryFile(delete=False) samplePaths.close() # matUtils SEGVs if given a full path output file name unless output dir is '/', need to fix that run(['matUtils', 'extract', '-i', pbIn, '-d', '/', '--sample-paths', samplePaths.name]) with open(samplePaths.name) as f: for line in f: try: [fullName, path] = line.rstrip().split('\t') except ValueError as e: continue name = fullName.split('|')[0] if repNodes.get(name) is not None: nodes = path.split(' ') + # In most cases we want the last node in the path [-1] nodeIx = -1 + # ... but if the last word starts with the sample name (with private mutations) + # then we do not want to mask just that sample, so backtrack to [-2] if nodes[-1].startswith(name): nodeIx = nodeIx - 1 + # ... and the spec might say to backtrack even more (e.g. parent or grandparent): nodeIx = nodeIx - getBacktrack(spec, name) nodeMuts = nodes[nodeIx] + # Strip to just the node ID, discard mutations node = nodeMuts.split(':')[0] repNodes[name] = node; # Make sure we found all of them for rep in repNodes: if repNodes[rep] == '': die("sample-paths file {samplePaths.name} does not have name {rep}") os.unlink(samplePaths.name) return repNodes def makeMaskFile(spec, repNodes, maskFileName): """Create a file to use as input for matUtils mask --mask-mutations, generated from spec with node IDs from repNodes.""" with open(maskFileName, 'w') as maskFile: for branch in spec: branchSpec = spec[branch]