src/hg/utils/otto/nextstrainNcov/nextstrainVcf.py 775f8e290daefbb3312fd1d8962344690550c887

775f8e290daefbb3312fd1d8962344690550c887
angie
  Mon Jun 8 16:20:24 2020 -0700
Extending and updating tree manipulation & SARS-CoV-2 lineage coloring python scripts and modules that I've been working on for David et al.  sorta refs #25278, #25382

diff --git src/hg/utils/otto/nextstrainNcov/nextstrainVcf.py src/hg/utils/otto/nextstrainNcov/nextstrainVcf.py
new file mode 100644
index 0000000..d338368
--- /dev/null
+++ src/hg/utils/otto/nextstrainNcov/nextstrainVcf.py
@@ -0,0 +1,27 @@
+# Utilities for working with VCF files generated by nextstrain.py, which use a particular
+# style of ID and embed clades in the genotype columns.
+
+from collections import defaultdict
+
+def readVcfSamples(vcfFile):
+    """Read VCF sample IDs from the #CHROM line, and parse out clades from the first row GT cols.
+
+    Only the header lines and the first data row are read.  Each genotype in that row is
+    expected to look like 'GT:clade'; any further ':'-separated subfields are ignored.
+
+    Args:
+        vcfFile: path of the VCF file to read.
+
+    Returns:
+        (samples, sampleClades): the list of sample IDs in column order (columns 10 and up of
+        the #CHROM line), and a dict mapping each sample ID to its clade.  sampleClades is
+        empty when the file contains no data row.
+
+    Raises:
+        ValueError: if the first data row's genotype count differs from the number of samples,
+            or if a genotype has no ':'-separated clade.
+    """
+    samples = []
+    # Plain dict: looking up an unknown sample raises KeyError.
+    sampleClades = {}
+    with open(vcfFile, 'r') as vcfF:
+        for rawLine in vcfF:
+            # Strip only the line terminator: trailing tabs mark empty columns and must still
+            # be counted when comparing genotypes against samples.
+            line = rawLine.rstrip('\r\n')
+            if not line.strip():
+                # A blank line is skipped rather than treated as end of input.
+                continue
+            if line.startswith('#CHROM'):
+                samples = line.split('\t')[9:]
+            elif not line.startswith('#'):
+                gts = line.split('\t')[9:]
+                if len(gts) != len(samples):
+                    # Raise rather than exit so that callers decide how to report the problem.
+                    raise ValueError("VCF file '%s' has %d samples but %d genotypes in first row" %
+                                     (vcfFile, len(samples), len(gts)))
+                for sample, gt in zip(samples, gts):
+                    gtFields = gt.split(':')
+                    if len(gtFields) < 2:
+                        raise ValueError("VCF file '%s': genotype '%s' for sample '%s' has no clade" %
+                                         (vcfFile, gt, sample))
+                    sampleClades[sample] = gtFields[1]
+                # Clades are taken from the first data row only.
+                break
+    return samples, sampleClades
+