775f8e290daefbb3312fd1d8962344690550c887
angie
  Mon Jun 8 16:20:24 2020 -0700
Extending and updating tree manipulation & SARS-CoV-2 lineage coloring python scripts and modules that I've been working on for David et al.  sorta refs #25278, #25382

diff --git src/hg/utils/otto/nextstrainNcov/treeIntersectNextstrain.py src/hg/utils/otto/nextstrainNcov/treeIntersectNextstrain.py
index 2e3a83a..b270821 100755
--- src/hg/utils/otto/nextstrainNcov/treeIntersectNextstrain.py
+++ src/hg/utils/otto/nextstrainNcov/treeIntersectNextstrain.py
@@ -1,44 +1,22 @@
 #!/usr/bin/env python3
 
 import logging, argparse, sys
 import random
 from collections import defaultdict
 from utils import die
-import newick, cladeColors, virusNames
-
-def readVcfSamples(vcfFile):
-    """Read VCF sample IDs from the #CHROM line, and parse out clades from the first row GT cols"""
-    samples = []
-    sampleClades = defaultdict()
-    with open(vcfFile, 'r') as vcfF:
-        line = vcfF.readline().strip()
-        while (line):
-            if (line.startswith('#CHROM')):
-                samples = line.split('\t')[9:]
-            elif (not line.startswith('#')):
-                gts = line.split('\t')[9:]
-                if (len(gts) != len(samples)):
-                    die("VCF file '%s' has %d samples but %d genotypes in first row" %
-                        (vcfFile, len(samples), len(gts)));
-                for sample, gt in zip(samples, gts):
-                    gtVal, clade = gt.split(':')
-                    sampleClades[sample] = clade
-                break
-            line = vcfF.readline().strip()
-        vcfF.close()
-    return samples, sampleClades
+import newick, cladeColors, nextstrainVcf, virusNames
 
 def treeIntersectIds(node, idLookup, sampleSet):
     """For each leaf in node, attempt to look up its label in idLookup; replace if found.
     Prune nodes with no matching leaves.  Store new leaf labels in sampleSet."""
     if (node['kids']):
         # Internal node: prune
         prunedKids = []
         for kid in (node['kids']):
             kidIntersected = treeIntersectIds(kid, idLookup, sampleSet)
             if (kidIntersected):
                 prunedKids.append(kidIntersected)
         if (len(prunedKids) > 1):
             node['kids'] = prunedKids
         elif (len(prunedKids) == 1):
             node = prunedKids[0]
@@ -92,31 +70,31 @@
             outF.close()
 
 def main():
     parser = argparse.ArgumentParser(description="""
 Read in tree, read in samples from VCF, attempt to match tree's leaf IDs with VCF IDs,
 prune tree to only branches with leaves found in VCF, output pruned tree with VCF IDs.
 """
     )
     parser.add_argument('treeFile', help='Newick file with IDs similar to Nextstrain')
     parser.add_argument('vcfFile', help='VCF file derived from Nextstrain data')
     args = parser.parse_args()
     # Very large, deeply nested trees can exceed the default recursion limit of 1000.
     sys.setrecursionlimit(100000)
     # logging.basicConfig(level=logging.DEBUG, filename='intersect.log')
     tree = newick.parseFile(args.treeFile)
-    (vcfSamples, vcfSampleClades) = readVcfSamples(args.vcfFile)
+    (vcfSamples, vcfSampleClades) = nextstrainVcf.readVcfSamples(args.vcfFile)
     idLookup = virusNames.makeIdLookup(vcfSamples)
     for key in idLookup:
         values = idLookup[key]
         if (len(values) != 1):
             logging.warn('Duplicate name/component in VCF: ' + key + " -> " + ", ".join(values))
     sampleSet = set()
     tree = treeIntersectIds(tree, idLookup, sampleSet)
     cladeColors.addCladesAsBogusLength(tree, vcfSampleClades)
     newick.printTree(tree)
     if (len(sampleSet) < len(vcfSamples)):
         logging.warn("VCF has %d samples but pruned tree has %d leaves (%d VCF samples not found)" %
                      (len(vcfSamples), len(sampleSet), len(vcfSamples) - len(sampleSet)))
         vcfSampleSet = set(vcfSamples)
         vcfNotTree = vcfSampleSet - sampleSet
         logging.warn("Example VCF samples not found:\n" +