775f8e290daefbb3312fd1d8962344690550c887
angie
  Mon Jun 8 16:20:24 2020 -0700
Extending and updating tree manipulation & SARS-CoV-2 lineage coloring python scripts and modules that I've been working on for David et al.  sorta refs #25278, #25382

diff --git src/hg/utils/otto/nextstrainNcov/sampleCladeLineage.py src/hg/utils/otto/nextstrainNcov/sampleCladeLineage.py
new file mode 100755
index 0000000..e3459fa
--- /dev/null
+++ src/hg/utils/otto/nextstrainNcov/sampleCladeLineage.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python3
+
+import logging, argparse, sys
+import nextstrainVcf, utils, virusNames
+
+def main():
+    """Print sample, clade and lineage (tab-separated) for each VCF sample/clade pair."""
+    parser = argparse.ArgumentParser(description="""
+Read samples and clade assignments from a Nextstrain VCF file.
+Read lineage assignments from lineageFile.  Write out 3 tab-sep columns:
+NS sample ID, clade, lineage.
+"""
+    )
+    parser.add_argument('vcfFile', help='VCF file derived from Nextstrain data')
+    parser.add_argument('lineageFile', help='Two-column tab-sep file mapping sample to lineage')
+    args = parser.parse_args()
+    vcfSamples, vcfSampleClades = nextstrainVcf.readVcfSamples(args.vcfFile)
+    idLookup = virusNames.makeIdLookup(vcfSamples)
+    # Re-key the lineage table by the name maybeLookupSeqName returns for each entry.
+    nsLineages = {virusNames.maybeLookupSeqName(name, idLookup): lin
+                  for name, lin in utils.dictFromFile(args.lineageFile).items()}
+    for sample, clade in vcfSampleClades.items():
+        # A sample with no (or an empty) lineage still gets a row, with an empty third column.
+        print('\t'.join([sample, clade, nsLineages.get(sample) or '']))
+
+# Run the command-line entry point only when executed as a script, not on import.
+if __name__ == '__main__':
+    main()