db42a8012f0dad34f7dbe68a948970d3a17e1524
angie
Mon Oct 26 09:38:10 2020 -0700
hashTwoColumnFile expects whitespace-sep not tab-sep -- update usage message for {phylo,vcf}RenameAndPrune.
diff --git src/hg/utils/vcfRenameAndPrune/vcfRenameAndPrune.c src/hg/utils/vcfRenameAndPrune/vcfRenameAndPrune.c
index 04071cf..61a4240 100644
--- src/hg/utils/vcfRenameAndPrune/vcfRenameAndPrune.c
+++ src/hg/utils/vcfRenameAndPrune/vcfRenameAndPrune.c
@@ -1,166 +1,166 @@
/* vcfRenameAndPrune - Rename or remove samples from VCF with genotypes. */
#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "obscure.h"
#include "options.h"
#include "vcf.h"
void usage()
/* Explain usage and exit. */
{
errAbort(
"vcfRenameAndPrune - Rename samples in VCF; if new name not found, remove sample.\n"
"usage:\n"
- " vcfRenameAndPrune vcfIn.vcf[.gz] renaming.tab vcfOut.vcf\n"
+ " vcfRenameAndPrune vcfIn.vcf[.gz] renaming.txt vcfOut.vcf\n"
// "options:\n"
// " -xxx=XXX\n"
- "renaming.tab has two columns: old name (must uniquely match some sample\n"
- "named in #CHROM header line) and new name.\n"
+ "renaming.txt has two whitespace-separated columns: old name (must uniquely match\n"
+ "some sample named in #CHROM header line) and new name.\n"
);
}
/* Command line validation table. */
static struct optionSpec options[] = {
{NULL, 0},
};
void vcfRenameAndPrune(char *vcfInFile, char *renamingFile, char *vcfOutFile)
/* vcfRenameAndPrune - Rename or remove samples from VCF with genotypes. */
{
struct hash *renaming = hashTwoColumnFile(renamingFile);
struct lineFile *lf = lineFileOpen(vcfInFile, TRUE);
FILE *outF = mustOpen(vcfOutFile, "w");
int headerColCount = 0;
int keeperCountMax = hashNumEntries(renaming);
int keeperColumns[keeperCountMax];
int keeperCount = 0;
int keeperIx = 0;
char *line;
while (lineFileNext(lf, &line, NULL))
{
if (startsWith("#CHROM", line))
{
// Parse & replace sample names, build array of genotype columns that we're keeping
headerColCount = chopString(line, "\t", NULL, 0);
lineFileExpectAtLeast(lf, VCF_NUM_COLS_BEFORE_GENOTYPES+1, headerColCount);
char *words[headerColCount];
chopTabs(line, words);
fputs(words[0], outF);
int i;
for (i = 1; i < VCF_NUM_COLS_BEFORE_GENOTYPES; i++)
fprintf(outF, "\t%s", words[i]);
for (i = VCF_NUM_COLS_BEFORE_GENOTYPES; i < headerColCount; i++)
{
char *newName = hashFindVal(renaming, words[i]);
if (newName)
{
fprintf(outF, "\t%s", newName);
if (keeperIx >= keeperCountMax)
lineFileAbort(lf, "Too many matching names in #CHROM line -- "
"duplicate values in input? "
"(%d values in renaming, too many matching names at column %d",
keeperCountMax, i);
keeperColumns[keeperIx++] = i;
}
}
keeperCount = keeperIx;
fputc('\n', outF);
verbose(2, "Found %d keepers (out of %d genotype columns and %d entries in %s)\n",
keeperCount, headerColCount - VCF_NUM_COLS_BEFORE_GENOTYPES, keeperCountMax,
renamingFile);
}
else if (line[0] == '#')
{
// Pass through other header lines
fputs(line, outF);
fputc('\n', outF);
}
else
{
// Data line: print out only the genotype columns that we're keeping
if (headerColCount == 0)
lineFileAbort(lf, "Missing #CHROM header line -- can't rename.");
char *words[headerColCount+1];
int wordCount = chopTabs(line, words);
lineFileExpectWords(lf, headerColCount, wordCount);
// Recompute the counts of reference and alternate alleles in genotypes that we're keeping.
// Keep only the alternate alleles that have a nonzero count.
// Discard a row if there are no alternate alleles with nonzero count.
char altCopy[strlen(words[4])+1];
safecpy(altCopy, sizeof altCopy, words[4]);
int altCount = chopString(altCopy, ",", NULL, 0);
char *alts[altCount];
chopCommas(altCopy, alts);
int newAltCount = 0;
char *newAlts[altCount];
int newAltCounts[altCount];
memset(newAltCounts, 0, sizeof newAltCounts);
int altIxOldToNew[altCount];
int i;
for (i = 0; i < altCount; i++)
altIxOldToNew[i] = -1;
int totalCalls = keeperCount;
for (i = 0; i < keeperCount; i++)
{
char *gt = words[keeperColumns[i]];
if (sameString(gt, "."))
totalCalls--;
else if (!isAllDigits(gt))
lineFileAbort(lf, "Expected genotype to be '.' or a number, but got '%s' "
"in keeperColumns[%d] = column %d",
gt, i, keeperColumns[i]);
else
{
int alIx = atoi(gt);
if (alIx > 0)
{
// Alternate allele; if this is the first time we've seen this one, add it to
// newAlts.
int oldAltIx = alIx - 1;
int newAltIx = altIxOldToNew[oldAltIx];
if (newAltIx < 0)
{
newAltIx = newAltCount++;
newAlts[newAltIx] = alts[oldAltIx];
newAltCounts[newAltIx] = 1;
altIxOldToNew[oldAltIx] = newAltIx;
}
else
newAltCounts[newAltIx]++;
// Update gt, i.e. words[keeperColumns[i]], with the new allele index.
int newAlIx = newAltIx + 1;
safef(gt, strlen(gt)+1, "%d", newAlIx);
}
}
}
if (newAltCount > 0)
{
// Write out line with updated alts, info, genotype columns
fprintf(outF, "%s\t%s\t%s\t%s\t%s", words[0], words[1], words[2], words[3],
newAlts[0]);
for (i = 1; i < newAltCount; i++)
fprintf(outF, ",%s", newAlts[i]);
fprintf(outF, "\t%s\t%s\tAC=%d", words[5], words[6], newAltCounts[0]);
for (i = 1; i < newAltCount; i++)
fprintf(outF, ",%d", newAltCounts[i]);
fprintf(outF, ";AN=%d\tGT", totalCalls);
for (i = 0; i < keeperCount; i++)
fprintf(outF, "\t%s", words[keeperColumns[i]]);
fputc('\n', outF);
}
}
}
lineFileClose(&lf);
carefulClose(&outF);
}
int main(int argc, char *argv[])
/* Process command line. */
{
optionInit(&argc, argv, options);
if (argc != 4)
usage();
vcfRenameAndPrune(argv[1], argv[2], argv[3]);
return 0;
}