edd8fa03488ce1ee3fa7b12b64de39ad0e3fc216 angie Wed Oct 21 12:57:36 2020 -0700 New utils for making the public-sequences-only version of the SARS-CoV-2 Phylogeny track. Forgot to add these back in commit 0901c8be. refs #26364 diff --git src/hg/utils/vcfRenameAndPrune/vcfRenameAndPrune.c src/hg/utils/vcfRenameAndPrune/vcfRenameAndPrune.c new file mode 100644 index 0000000..04071cf --- /dev/null +++ src/hg/utils/vcfRenameAndPrune/vcfRenameAndPrune.c @@ -0,0 +1,166 @@ +/* vcfRenameAndPrune - Rename or remove samples from VCF with genotypes. */ +#include "common.h" +#include "linefile.h" +#include "hash.h" +#include "obscure.h" +#include "options.h" +#include "vcf.h" + +void usage() +/* Explain usage and exit. */ +{ +errAbort( + "vcfRenameAndPrune - Rename samples in VCF; if new name not found, remove sample.\n" + "usage:\n" + " vcfRenameAndPrune vcfIn.vcf[.gz] renaming.tab vcfOut.vcf\n" +// "options:\n" +// " -xxx=XXX\n" + "renaming.tab has two columns: old name (must uniquely match some sample\n" + "named in #CHROM header line) and new name.\n" + ); +} + +/* Command line validation table. */ +static struct optionSpec options[] = { + {NULL, 0}, +}; + +void vcfRenameAndPrune(char *vcfInFile, char *renamingFile, char *vcfOutFile) +/* vcfRenameAndPrune - Rename or remove samples from VCF with genotypes. */ +{ +struct hash *renaming = hashTwoColumnFile(renamingFile); +struct lineFile *lf = lineFileOpen(vcfInFile, TRUE); +FILE *outF = mustOpen(vcfOutFile, "w"); +int headerColCount = 0; +int keeperCountMax = hashNumEntries(renaming); +int keeperColumns[keeperCountMax]; +int keeperCount = 0; +int keeperIx = 0; +char *line; +while (lineFileNext(lf, &line, NULL)) + { + if (startsWith("#CHROM", line)) + { + // Parse & replace sample names, build array of genotype columns that we're keeping + headerColCount = chopString(line, "\t", NULL, 0); + lineFileExpectAtLeast(lf, VCF_NUM_COLS_BEFORE_GENOTYPES+1, headerColCount); + char *words[headerColCount]; + chopTabs(line, words); + fputs(words[0], outF); + int i; + for (i = 1; i < VCF_NUM_COLS_BEFORE_GENOTYPES; i++) + fprintf(outF, "\t%s", words[i]); + for (i = VCF_NUM_COLS_BEFORE_GENOTYPES; i < headerColCount; i++) + { + char *newName = hashFindVal(renaming, words[i]); + if (newName) + { + fprintf(outF, "\t%s", newName); + if (keeperIx >= keeperCountMax) + lineFileAbort(lf, "Too many matching names in #CHROM line -- " + "duplicate values in input? " + "(%d values in renaming, too many matching names at column %d", + keeperCountMax, i); + keeperColumns[keeperIx++] = i; + } + } + keeperCount = keeperIx; + fputc('\n', outF); + verbose(2, "Found %d keepers (out of %d genotype columns and %d entries in %s)\n", + keeperCount, headerColCount - VCF_NUM_COLS_BEFORE_GENOTYPES, keeperCountMax, + renamingFile); + } + else if (line[0] == '#') + { + // Pass through other header lines + fputs(line, outF); + fputc('\n', outF); + } + else + { + // Data line: print out only the genotype columns that we're keeping + if (headerColCount == 0) + lineFileAbort(lf, "Missing #CHROM header line -- can't rename."); + char *words[headerColCount+1]; + int wordCount = chopTabs(line, words); + lineFileExpectWords(lf, headerColCount, wordCount); + // Recompute the counts of reference and alternate alleles in genotypes that we're keeping. + // Keep only the alternate alleles that have a nonzero count. + // Discard a row if there are no alternate alleles with nonzero count. + char altCopy[strlen(words[4])+1]; + safecpy(altCopy, sizeof altCopy, words[4]); + int altCount = chopString(altCopy, ",", NULL, 0); + char *alts[altCount]; + chopCommas(altCopy, alts); + int newAltCount = 0; + char *newAlts[altCount]; + int newAltCounts[altCount]; + memset(newAltCounts, 0, sizeof newAltCounts); + int altIxOldToNew[altCount]; + int i; + for (i = 0; i < altCount; i++) + altIxOldToNew[i] = -1; + int totalCalls = keeperCount; + for (i = 0; i < keeperCount; i++) + { + char *gt = words[keeperColumns[i]]; + if (sameString(gt, ".")) + totalCalls--; + else if (!isAllDigits(gt)) + lineFileAbort(lf, "Expected genotype to be '.' or a number, but got '%s' " + "in keeperColumns[%d] = column %d", + gt, i, keeperColumns[i]); + else + { + int alIx = atoi(gt); + if (alIx > 0) + { + // Alternate allele; if this is the first time we've seen this one, add it to + // newAlts. + int oldAltIx = alIx - 1; + int newAltIx = altIxOldToNew[oldAltIx]; + if (newAltIx < 0) + { + newAltIx = newAltCount++; + newAlts[newAltIx] = alts[oldAltIx]; + newAltCounts[newAltIx] = 1; + altIxOldToNew[oldAltIx] = newAltIx; + } + else + newAltCounts[newAltIx]++; + // Update gt, i.e. words[keeperColumns[i]], with the new allele index. + int newAlIx = newAltIx + 1; + safef(gt, strlen(gt)+1, "%d", newAlIx); + } + } + } + if (newAltCount > 0) + { + // Write out line with updated alts, info, genotype columns + fprintf(outF, "%s\t%s\t%s\t%s\t%s", words[0], words[1], words[2], words[3], + newAlts[0]); + for (i = 1; i < newAltCount; i++) + fprintf(outF, ",%s", newAlts[i]); + fprintf(outF, "\t%s\t%s\tAC=%d", words[5], words[6], newAltCounts[0]); + for (i = 1; i < newAltCount; i++) + fprintf(outF, ",%d", newAltCounts[i]); + fprintf(outF, ";AN=%d\tGT", totalCalls); + for (i = 0; i < keeperCount; i++) + fprintf(outF, "\t%s", words[keeperColumns[i]]); + fputc('\n', outF); + } + } + } +lineFileClose(&lf); +carefulClose(&outF); +} + +int main(int argc, char *argv[]) +/* Process command line. */ +{ +optionInit(&argc, argv, options); +if (argc != 4) + usage(); +vcfRenameAndPrune(argv[1], argv[2], argv[3]); +return 0; +}