#!/usr/bin/env python3
#
# Provenance (from the commit record this script was extracted from):
#   commit 775f8e290daefbb3312fd1d8962344690550c887
#   author angie, Mon Jun 8 16:20:24 2020 -0700
#   "Extending and updating tree manipulation & SARS-CoV-2 lineage coloring
#    python scripts and modules that I've been working on for David et al.
#    sorta refs #25278, #25382"
#   new file (mode 100755, index 0000000..e3459fa):
#   src/hg/utils/otto/nextstrainNcov/sampleCladeLineage.py

import logging
import argparse
import sys

import nextstrainVcf
import utils
import virusNames


def main():
    """Print one 'sample<TAB>clade<TAB>lineage' line per sample in the VCF.

    Command-line arguments:
      vcfFile      passed to nextstrainVcf.readVcfSamples(), which yields the
                   sample list and a sample -> clade mapping.
      lineageFile  passed to utils.dictFromFile(), which yields a
                   name -> lineage mapping.

    Each name from lineageFile is run through virusNames.maybeLookupSeqName()
    against a lookup built from the VCF samples, so that lineages end up keyed
    the same way as the clade mapping (presumably Nextstrain sample IDs --
    confirm in virusNames).  Samples without a lineage, or with a falsy one,
    get an empty third column.
    """
    argParser = argparse.ArgumentParser(description="""
Read samples and clade assignments from a Nextstrain VCF file.
Read lineage assignments from lineageFile. Write out 3 tab-sep columns:
NS sample ID, clade, lineage.
"""
    )
    argParser.add_argument('vcfFile',
                           help='VCF file derived from Nextstrain data')
    argParser.add_argument('lineageFile',
                           help='Two-column tab-sep file mapping sample to lineage')
    opts = argParser.parse_args()

    sampleNames, cladeBySample = nextstrainVcf.readVcfSamples(opts.vcfFile)
    nameIndex = virusNames.makeIdLookup(sampleNames)
    lineageByName = utils.dictFromFile(opts.lineageFile)

    # Re-key the lineage table by whatever maybeLookupSeqName returns for each
    # name; if two names resolve to the same key, the later entry wins.
    lineageBySample = {}
    for rawName, lin in lineageByName.items():
        lineageBySample[virusNames.maybeLookupSeqName(rawName, nameIndex)] = lin

    for sampleId, clade in cladeBySample.items():
        # A missing or empty lineage is written as an empty column.
        row = [sampleId, clade, lineageBySample.get(sampleId) or '']
        print('\t'.join(row))


main()