src/hg/utils/otto/nextstrainNcov/vcf.py b226cb3af001057dea4635de2d4c6c52f0ebe047

b226cb3af001057dea4635de2d4c6c52f0ebe047
angie
  Sat Jun 20 14:34:08 2020 -0700
New scripts & vcf module for working with non-Nextstrain VCF and trees, e.g. Rob Lanfear's 40k sample build.  Updates to other VCF & tree utils.  sorta refs #25278, #25382

diff --git src/hg/utils/otto/nextstrainNcov/vcf.py src/hg/utils/otto/nextstrainNcov/vcf.py
new file mode 100644
index 0000000..c9b7142
--- /dev/null
+++ src/hg/utils/otto/nextstrainNcov/vcf.py
@@ -0,0 +1,85 @@
+
+import gzip
+import utils
+
+def countAlleles(columns):
+    """For each alt allele in the 5th column, keep a count of how many times that alt's index
+    appears in the genotype columns, for the AC tag in the INFO field.  For the AN tag, tally up
+    the genotype columns that don't have '.' (no call).  Return the total number of calls for AN
+    and an array of alternate allele counts for AC.
+    NOTE: this does not handle multiploid VCF where genotypes can be /- or |-separated."""
+    totalCalls = 0
+    alts = columns[4]
+    altCounts = [ 0 for alt in alts.split(',')]
+    for colVal in columns[9:]:
+        subVals = colVal.split(':')
+        gt = subVals[0]
+        if (gt is not '.'):
+            totalCalls += 1
+            altIx = int(gt)-1
+            if (altIx >= 0):
+                altCounts[altIx] += 1
+    return totalCalls, altCounts
+
+def infoReplaceAcAn(oldInfo, totalCalls, altCounts):
+    """If oldInfo includes AC and AN, strip them out.  Add AN (totalCalls) and AC (altCounts)."""
+    infoParts = oldInfo.split(';')
+    newInfoParts = []
+    for tagVal in infoParts:
+        (tag, val) = tagVal.split('=')
+        if (tag not in ('AC', 'AN')):
+            newInfoParts.append(tagVal)
+    newInfoParts.append('AC=' + ','.join(map(str, altCounts)))
+    newInfoParts.append('AN=%d' % (totalCalls))
+    return ';'.join(newInfoParts)
+
+def pruneToSamples(vcfIn, sampleSet, vcfOut):
+    """Copy data in vcfIn to vcfOut, but keeping only the samples in sampleSet.
+    Update AC and AN with new allele counts."""
+    with gzip.open(vcfIn, 'rt') if vcfIn.endswith('.gz') else open(vcfIn, 'r') as inF:
+        with open(vcfOut, 'w') as outF:
+            colOutCount = 0
+            for line in inF:
+                if (line.startswith('#CHROM')):
+                    colNamesIn = line.split('\t')
+                    colNamesIn[-1] = colNamesIn[-1].rstrip()
+                    colNamesOut = colNamesIn[0:9]
+                    columnsKept = [True for i in range(0, 9)]
+                    for vcfSample in colNamesIn[9:]:
+                        if (vcfSample in sampleSet):
+                            colNamesOut.append(vcfSample)
+                            columnsKept.append(True)
+                        else:
+                            columnsKept.append(False)
+                    colOutCount = len(colNamesOut)
+                    outF.write('\t'.join(colNamesOut) + '\n');
+                elif (line[0] == '#'):
+                    outF.write(line);
+                else:
+                    columnsIn = line.split('\t')
+                    columnsIn[-1] = columnsIn[-1].rstrip()
+                    columnsOut = []
+                    for ix, colVal in enumerate(columnsIn):
+                        if (columnsKept[ix]):
+                            columnsOut.append(colVal)
+                    # Update allele counts from new set of genotypes.
+                    (totalCalls, altCounts) = countAlleles(columnsOut)
+                    # If this set of genotypes has none of the alt alleles, skip it.
+                    if (sum(altCounts) > 0):
+                        columnsOut[7] = infoReplaceAcAn(columnsIn[7], totalCalls, altCounts)
+                        outF.write('\t'.join(columnsOut) + '\n')
+
+def readSamples(vcfFile):
+    """Read VCF sample IDs from the #CHROM line"""
+    samples = []
+    with gzip.open(vcfFile, 'rt') if vcfFile.endswith('.gz') else open(vcfFile, 'r') as vcfF:
+        for line in vcfF:
+            if (line.startswith('#CHROM')):
+                samples = line.split('\t')[9:]
+                samples[-1] = samples[-1].rstrip()
+                break
+            else:
+                if (len(line) > 100):
+                   utils.die("Line too long (%d) and no #chrom yet" % (len(line)))
+    return samples
+