55093dc24432e86e100890a206c4bab473e74276 angie Fri Oct 16 15:59:25 2020 -0700 faToVcf: add option -maskSites to remove variants at positions recommended for masking in in Problematic Sites VCF file. diff --git src/hg/utils/faToVcf/faToVcf.c src/hg/utils/faToVcf/faToVcf.c index 9cda34f..ed9e691 100644 --- src/hg/utils/faToVcf/faToVcf.c +++ src/hg/utils/faToVcf/faToVcf.c @@ -6,51 +6,54 @@ #include "iupac.h" #include "options.h" void usage() /* Explain usage and exit. */ { errAbort( "faToVcf - Convert a FASTA alignment file to Variant Call Format (VCF) single-nucleotide diffs\n" "usage:\n" " faToVcf in.fa out.vcf\n" "options:\n" " -ambiguousToN Treat all IUPAC ambiguous bases (N, R, V etc) as N (no call).\n" " -excludeFile=file Exclude sequences named in file which has one sequence name per line\n" " -includeRef Include the reference in the genotype columns\n" " (default: omitted as redundant)\n" + " -maskSites=file Exclude variants in positions recommended for masking in file\n" + " (typically https://github.com/W-L/ProblematicSites_SARS-CoV2/raw/master/problematic_sites_sarsCov2.vcf)\n" " -minAc=N Ignore alternate alleles observed fewer than N times\n" " -noGenotypes Output 8-column VCF, without the sample genotype columns\n" " -ref=seqName Use seqName as the reference sequence; must be present in faFile\n" " (default: first sequence in faFile)\n" " -resolveAmbiguous For IUPAC ambiguous characters like R (A or G), if the character\n" " represents two bases and one is the reference base, convert it to the\n" " non-reference base. Otherwise convert it to N.\n" " -startOffset=N Add N bases to each position (for trimmed alignments)\n" " -vcfChrom=seqName Use seqName for the CHROM column in VCF (default: ref sequence)\n" "in.fa must contain a series of sequences with different names and the same length.\n" "Both N and - are treated as missing information.\n" ); } #define MAX_ALTS 256 /* Command line validation table. */ static struct optionSpec options[] = { { "ambiguousToN", OPTION_BOOLEAN }, { "excludeFile", OPTION_STRING }, { "includeRef", OPTION_BOOLEAN }, + { "maskSites", OPTION_STRING }, { "minAc", OPTION_INT }, { "noGenotypes", OPTION_BOOLEAN }, { "ref", OPTION_STRING }, { "resolveAmbiguous", OPTION_BOOLEAN }, { "startOffset", OPTION_INT}, { "vcfChrom", OPTION_STRING }, { NULL, 0 }, }; static struct dnaSeq *removeExcludedSequences(struct dnaSeq *sequences) /* If -excludeFile was passed in, remove any sequences whose names are found in the file and * return the modified list. */ { char *excludeFile = optionVal("excludeFile", NULL); if (excludeFile) @@ -134,72 +137,112 @@ return sequences; } static void writeVcfHeader(FILE *outF, char *faFile, char *vcfFile, struct dnaSeq *sequences, boolean includeRef, boolean includeGenotypes) /* Write a simple VCF header with sequence names as genotype columns. */ { fprintf(outF, "##fileformat=VCFv4.2\n" "##reference=%s:%s\n" "##source=faToVcf %s %s\n" //#*** Should document options used. "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO", faFile, sequences->name, faFile, vcfFile); if (includeGenotypes) { - printf("\tFORMAT"); + fprintf(outF, "\tFORMAT"); struct dnaSeq *seqsForGt = (includeRef ? sequences : sequences->next), *seq; for (seq = seqsForGt; seq != NULL; seq = seq->next) fprintf(outF, "\t%s", seq->name); } fprintf(outF, "\n"); } +static boolean *getMaskSites(char *maskSitesFile, struct dnaSeq *ref) +/* If -maskSites=file is given, parse sites to be masked from file (expecting VCF with 'mask' in + * the filter column at reference positions to be masked) and return an array of booleans indexed + * by reference position, TRUE for mask. Otherwise return NULL. */ +{ +boolean *maskSites = NULL; +if (maskSitesFile) + { + int chromSize = strlen(ref->dna) - countChars(ref->dna, '-'); + AllocArray(maskSites, chromSize); + struct lineFile *lf = lineFileOpen(maskSitesFile, TRUE); + char *line; + while (lineFileNext(lf, &line, NULL)) + { + if (line[0] == '#') + continue; + char *words[9]; + int wordCount = chopTabs(line, words); + lineFileExpectWords(lf, 8, wordCount); + if (!isAllDigits(words[1])) + lineFileAbort(lf, "Expected second column to contain numeric position but got '%s'", + words[1]); + int pos = atol(words[1]); + if (pos < 1 || pos > chromSize) + { + warn("Warning line %d of %s: " + "Expected second column to contain positions between 1 and %d but got %d", + lf->lineIx, lf->fileName, chromSize, pos); + } + else if (sameString(words[6], "mask")) + maskSites[pos-1] = TRUE; + } + lineFileClose(&lf); + } +return maskSites; +} + void faToVcf(char *faFile, char *vcfFile) /* faToVcf - Convert a FASTA alignment file to VCF. */ { struct dnaSeq *sequences = readSequences(faFile); int seqCount = slCount(sequences); int seqSize = sequences->size; boolean includeRef = optionExists("includeRef"); verbose(2, "Writing VCF to %s\n", vcfFile); FILE *outF = mustOpen(vcfFile, "w"); boolean noGenotypes = optionExists("noGenotypes"); writeVcfHeader(outF, faFile, vcfFile, sequences, includeRef, !noGenotypes); -char *refName = sequences->name; -char *vcfChrom = optionVal("vcfChrom", refName); +struct dnaSeq *refSeq = sequences; +char *vcfChrom = optionVal("vcfChrom", refSeq->name); int startOffset = optionInt("startOffset", 0); +boolean *maskSites = getMaskSites(optionVal("maskSites", NULL), refSeq); struct dnaSeq *seqsForGt = (includeRef ? sequences : sequences->next); int gtCount = includeRef ? seqCount : seqCount - 1; int *genotypes = needMem(sizeof(int) * gtCount); boolean *missing = needMem(sizeof(boolean) * gtCount); int baseIx, chromStart = 0; for (baseIx = 0, chromStart = 0; baseIx < seqSize; baseIx++, chromStart++) { - char ref = toupper(sequences->dna[baseIx]); + char ref = toupper(refSeq->dna[baseIx]); if (ref == '-') { // Insertion into reference -- ignore and adjust reference coordinate. chromStart--; continue; } if (ref == 'N') continue; + if (maskSites && maskSites[chromStart]) + continue; if (ref == 'U') ref = 'T'; char altAlleles[MAX_ALTS]; int altAlleleCounts[MAX_ALTS]; int altCount = 0; memset(altAlleles, 0, sizeof(altAlleles)); memset(genotypes, 0, sizeof(int) * gtCount); memset(missing, 0, sizeof(boolean) * gtCount); int nonNCount = seqCount; struct dnaSeq *seq; int gtIx; for (seq = seqsForGt, gtIx = 0; seq != NULL; seq = seq->next, gtIx++) { char al = toupper(seq->dna[baseIx]); if (al == 'U')