edd8fa03488ce1ee3fa7b12b64de39ad0e3fc216
angie
  Wed Oct 21 12:57:36 2020 -0700
New utils for making the public-sequences-only version of the SARS-CoV-2
Phylogeny track.
Forgot to add these back in commit 0901c8be.  refs #26364

diff --git src/hg/utils/vcfRenameAndPrune/vcfRenameAndPrune.c src/hg/utils/vcfRenameAndPrune/vcfRenameAndPrune.c
new file mode 100644
index 0000000..04071cf
--- /dev/null
+++ src/hg/utils/vcfRenameAndPrune/vcfRenameAndPrune.c
@@ -0,0 +1,166 @@
+/* vcfRenameAndPrune - Rename or remove samples from VCF with genotypes. */
+#include "common.h"
+#include "linefile.h"
+#include "hash.h"
+#include "obscure.h"
+#include "options.h"
+#include "vcf.h"
+
+void usage()
+/* Explain usage and exit. */
+{
+errAbort(
+  "vcfRenameAndPrune - Rename samples in VCF; if new name not found, remove sample.\n"
+  "usage:\n"
+  "   vcfRenameAndPrune vcfIn.vcf[.gz] renaming.tab vcfOut.vcf\n"
+//  "options:\n"
+//  "   -xxx=XXX\n"
+  "renaming.tab has two columns: old name (must uniquely match some sample\n"
+  "named in #CHROM header line) and new name.\n"
+  );
+}
+
+/* Command line validation table. */
+static struct optionSpec options[] = {
+   {NULL, 0},
+};
+
+void vcfRenameAndPrune(char *vcfInFile, char *renamingFile, char *vcfOutFile)
+/* vcfRenameAndPrune - Rename or remove samples from VCF with genotypes. */
+{
+struct hash *renaming = hashTwoColumnFile(renamingFile);
+struct lineFile *lf = lineFileOpen(vcfInFile, TRUE);
+FILE *outF = mustOpen(vcfOutFile, "w");
+int headerColCount = 0;
+int keeperCountMax = hashNumEntries(renaming);
+int keeperColumns[keeperCountMax];
+int keeperCount = 0;
+int keeperIx = 0;
+char *line;
+while (lineFileNext(lf, &line, NULL))
+    {
+    if (startsWith("#CHROM", line))
+        {
+        // Parse & replace sample names, build array of genotype columns that we're keeping
+        headerColCount = chopString(line, "\t", NULL, 0);
+        lineFileExpectAtLeast(lf, VCF_NUM_COLS_BEFORE_GENOTYPES+1, headerColCount);
+        char *words[headerColCount];
+        chopTabs(line, words);
+        fputs(words[0], outF);
+        int i;
+        for (i = 1;  i < VCF_NUM_COLS_BEFORE_GENOTYPES;  i++)
+            fprintf(outF, "\t%s", words[i]);
+        for (i = VCF_NUM_COLS_BEFORE_GENOTYPES;  i < headerColCount;  i++)
+            {
+            char *newName = hashFindVal(renaming, words[i]);
+            if (newName)
+                {
+                fprintf(outF, "\t%s", newName);
+                if (keeperIx >= keeperCountMax)
+                    lineFileAbort(lf, "Too many matching names in #CHROM line -- "
+                                  "duplicate values in input? "
+                                  "(%d values in renaming, too many matching names at column %d",
+                                  keeperCountMax, i);
+                keeperColumns[keeperIx++] = i;
+                }
+            }
+        keeperCount = keeperIx;
+        fputc('\n', outF);
+        verbose(2, "Found %d keepers (out of %d genotype columns and %d entries in %s)\n",
+                keeperCount, headerColCount - VCF_NUM_COLS_BEFORE_GENOTYPES, keeperCountMax,
+                renamingFile);
+        }
+    else if (line[0] == '#')
+        {
+        // Pass through other header lines
+        fputs(line, outF);
+        fputc('\n', outF);
+        }
+    else
+        {
+        // Data line: print out only the genotype columns that we're keeping
+        if (headerColCount == 0)
+            lineFileAbort(lf, "Missing #CHROM header line -- can't rename.");
+        char *words[headerColCount+1];
+        int wordCount = chopTabs(line, words);
+        lineFileExpectWords(lf, headerColCount, wordCount);
+        // Recompute the counts of reference and alternate alleles in genotypes that we're keeping.
+        // Keep only the alternate alleles that have a nonzero count.
+        // Discard a row if there are no alternate alleles with nonzero count.
+        char altCopy[strlen(words[4])+1];
+        safecpy(altCopy, sizeof altCopy, words[4]);
+        int altCount = chopString(altCopy, ",", NULL, 0);
+        char *alts[altCount];
+        chopCommas(altCopy, alts);
+        int newAltCount = 0;
+        char *newAlts[altCount];
+        int newAltCounts[altCount];
+        memset(newAltCounts, 0, sizeof newAltCounts);
+        int altIxOldToNew[altCount];
+        int i;
+        for (i = 0;  i < altCount;  i++)
+            altIxOldToNew[i] = -1;
+        int totalCalls = keeperCount;
+        for (i = 0;  i < keeperCount;  i++)
+            {
+            char *gt = words[keeperColumns[i]];
+            if (sameString(gt, "."))
+                totalCalls--;
+            else if (!isAllDigits(gt))
+                lineFileAbort(lf, "Expected genotype to be '.' or a number, but got '%s' "
+                              "in keeperColumns[%d] = column %d",
+                              gt, i, keeperColumns[i]);
+            else
+                {
+                int alIx = atoi(gt);
+                if (alIx > 0)
+                    {
+                    // Alternate allele; if this is the first time we've seen this one, add it to
+                    // newAlts.
+                    int oldAltIx = alIx - 1;
+                    int newAltIx = altIxOldToNew[oldAltIx];
+                    if (newAltIx < 0)
+                        {
+                        newAltIx = newAltCount++;
+                        newAlts[newAltIx] = alts[oldAltIx];
+                        newAltCounts[newAltIx] = 1;
+                        altIxOldToNew[oldAltIx] = newAltIx;
+                        }
+                    else
+                        newAltCounts[newAltIx]++;
+                    // Update gt, i.e. words[keeperColumns[i]], with the new allele index.
+                    int newAlIx = newAltIx + 1;
+                    safef(gt, strlen(gt)+1, "%d", newAlIx);
+                    }
+                }
+            }
+        if (newAltCount > 0)
+            {
+            // Write out line with updated alts, info, genotype columns
+            fprintf(outF, "%s\t%s\t%s\t%s\t%s", words[0], words[1], words[2], words[3],
+                    newAlts[0]);
+            for (i = 1;  i < newAltCount;  i++)
+                fprintf(outF, ",%s", newAlts[i]);
+            fprintf(outF, "\t%s\t%s\tAC=%d", words[5], words[6], newAltCounts[0]);
+            for (i = 1;  i < newAltCount;  i++)
+                fprintf(outF, ",%d", newAltCounts[i]);
+            fprintf(outF, ";AN=%d\tGT", totalCalls);
+            for (i = 0;  i < keeperCount;  i++)
+                fprintf(outF, "\t%s", words[keeperColumns[i]]);
+            fputc('\n', outF);
+            }
+        }
+    }
+lineFileClose(&lf);
+carefulClose(&outF);
+}
+
+int main(int argc, char *argv[])
+/* Process command line. */
+{
+optionInit(&argc, argv, options);
+if (argc != 4)
+    usage();
+vcfRenameAndPrune(argv[1], argv[2], argv[3]);
+return 0;
+}