b226cb3af001057dea4635de2d4c6c52f0ebe047 angie Sat Jun 20 14:34:08 2020 -0700 New scripts & vcf module for working with non-Nextstrain VCF and trees, e.g. Rob Lanfear's 40k sample build. Updates to other VCF & tree utils. sorta refs #25278, #25382 diff --git src/hg/utils/otto/nextstrainNcov/treeLineage.py src/hg/utils/otto/nextstrainNcov/treeLineage.py index 3dd87bc..54779e0 100755 --- src/hg/utils/otto/nextstrainNcov/treeLineage.py +++ src/hg/utils/otto/nextstrainNcov/treeLineage.py @@ -1,19 +1,19 @@ #!/usr/bin/env python3 import logging, argparse, sys -import lineageColors, newick, nextstrainVcf, utils, virusNames +import lineageColors, newick, vcf, utils, virusNames def assignColors(node, idLookup, labelToLineage): if (node['kids']): # Internal node: do all descendants have same lineage? kids = node['kids'] for kid in kids: assignColors(kid, idLookup, labelToLineage) kidLineages = set([ kid.get('lineage') for kid in kids ]); if (len(kidLineages) == 1): node['lineage'] = list(kidLineages)[0] else: node['lineage'] = '' kidColors = set([ kid.get('color') for kid in kids ]) if (len(kidColors) == 1): node['color'] = list(kidColors)[0] @@ -25,37 +25,37 @@ print("\t".join([ label, node['lineage'], node['color']])) else: # Leaf: look up lineage by label lineage = labelToLineage.get(virusNames.maybeLookupSeqName(node['label'], idLookup)) if (not lineage): logging.warn('No lineage for "' + node['label'] + '"') lineage = '' node['lineage'] = lineage node['color'] = "#%06x" % (lineageColors.lineageToColor(lineage)) print('\t'.join([ node['label'], lineage, node['color'] ])) def main(): parser = argparse.ArgumentParser(description=""" Read tree from Newick treeFile. Read sample IDs that are a concatenation of EPI ID, sample name and approximate date, -for resolving sampleFile IDs and lineageFile IDs, from a Nextstrain VCF file. +for resolving sampleFile IDs and lineageFile IDs, from a VCF file. Read lineage assignments from lineageFile. Figure out what lineage and color (if any) are assigned to each leaf, and then work back towards root assigning color to each named node whose descendants all have same color. Write out 3 tab-sep columns: sampleOrNode, lineage, lineageColor. """ ) - parser.add_argument('treeFile', help='File containing sample IDs') - parser.add_argument('vcfFile', help='VCF file derived from Nextstrain data') + parser.add_argument('treeFile', help='Newick tree whose leaf labels are sample IDs') + parser.add_argument('vcfFile', help='VCF file with genotype columns for the sample samples') parser.add_argument('lineageFile', help='Two-column tab-sep file mapping sample to lineage') args = parser.parse_args() tree = newick.parseFile(args.treeFile) - (vcfSamples, vcfSampleClades) = nextstrainVcf.readVcfSamples(args.vcfFile) + vcfSamples = vcf.readSamples(args.vcfFile) idLookup = virusNames.makeIdLookup(vcfSamples) lineages = utils.dictFromFile(args.lineageFile) nsLineages = dict([ (virusNames.maybeLookupSeqName(name, idLookup), lin) for name, lin in lineages.items() ]) assignColors(tree, idLookup, nsLineages) main()