37ec02a06719324a9c6cb2b167893b9b0559fbdd angie Thu Sep 30 09:34:43 2021 -0700 Added -windowSize and -minAmbigInWindow options to mask bases with too many Ns/ambigs/gaps within +-windowSize bases. Idea taken from Nicola De Maio and Rob Lanfear's mask_seq.py script in sarscov2phylo. diff --git src/hg/utils/faToVcf/faToVcf.c src/hg/utils/faToVcf/faToVcf.c index c6341a4..0ff67ae 100644 --- src/hg/utils/faToVcf/faToVcf.c +++ src/hg/utils/faToVcf/faToVcf.c @@ -1,75 +1,88 @@ /* faToVcf - Convert a FASTA alignment file to VCF. */ #include "common.h" #include "errCatch.h" #include "fa.h" #include "linefile.h" #include "hash.h" #include "iupac.h" #include "options.h" +// Command line option defaults +#define DEFAULT_MIN_AMBIG_IN_WINDOW 2 + void usage(int exitCode) /* Explain usage and exit. */ { -fputs( +fprintf(stderr, "faToVcf - Convert a FASTA alignment file to Variant Call Format (VCF) single-nucleotide diffs\n" "usage:\n" " faToVcf in.fa out.vcf\n" "options:\n" " -ambiguousToN Treat all IUPAC ambiguous bases (N, R, V etc) as N (no call).\n" " -excludeFile=file Exclude sequences named in file which has one sequence name per line\n" " -includeNoAltN Include base positions with no alternate alleles observed, but at\n" " least one N (missing base / no-call)\n" " -includeRef Include the reference in the genotype columns\n" " (default: omitted as redundant)\n" " -maskSites=file Exclude variants in positions recommended for masking in file\n" " (typically https://github.com/W-L/ProblematicSites_SARS-CoV2/raw/master/problematic_sites_sarsCov2.vcf)\n" " -maxDiff=N Exclude sequences with more than N mismatches with the reference\n" + " (if -windowSize is used, sequences are masked accordingly first)\n" " -minAc=N Ignore alternate alleles observed fewer than N times\n" " -minAf=F Ignore alternate alleles observed in less than F of non-N bases\n" + " -minAmbigInWindow=N When -windowSize is provided, mask any base for which there are at\n" + " least this many N, ambiguous or gap characters within the window.\n" + " (default: %d)\n" " -noGenotypes Output 8-column VCF, without the sample genotype columns\n" " -ref=seqName Use seqName as the reference sequence; must be present in faFile\n" " (default: first sequence in faFile)\n" " -resolveAmbiguous For IUPAC ambiguous characters like R (A or G), if the character\n" " represents two bases and one is the reference base, convert it to the\n" " non-reference base. Otherwise convert it to N.\n" " -startOffset=N Add N bases to each position (for trimmed alignments)\n" " -vcfChrom=seqName Use seqName for the CHROM column in VCF (default: ref sequence)\n" + " -windowSize=N Mask any base for which there are at least -minAmbigWindow bases in a\n" + " window of +-N bases around the base. Masking approach adapted from\n" + " https://github.com/roblanf/sarscov2phylo/ file scripts/mask_seq.py\n" + " Use -windowSize=7 for same results.\n" "in.fa must contain a series of sequences with different names and the same length.\n" "Both N and - are treated as missing information.\n" - , stderr + , DEFAULT_MIN_AMBIG_IN_WINDOW ); exit(exitCode); } #define MAX_ALTS 256 /* Command line validation table. */ static struct optionSpec options[] = { { "ambiguousToN", OPTION_BOOLEAN }, { "excludeFile", OPTION_STRING }, { "includeNoAltN", OPTION_BOOLEAN }, { "includeRef", OPTION_BOOLEAN }, { "maskSites", OPTION_STRING }, { "maxDiff", OPTION_INT }, { "minAc", OPTION_INT }, { "minAf", OPTION_DOUBLE }, + { "minAmbigInWindow", OPTION_INT }, { "noGenotypes", OPTION_BOOLEAN }, { "ref", OPTION_STRING }, { "resolveAmbiguous", OPTION_BOOLEAN }, { "startOffset", OPTION_INT}, { "vcfChrom", OPTION_STRING }, + { "windowSize", OPTION_INT }, { "h", OPTION_BOOLEAN }, { "-help", OPTION_BOOLEAN }, { NULL, 0 }, }; static struct dnaSeq *removeExcludedSequences(struct dnaSeq *sequences) /* If -excludeFile was passed in, remove any sequences whose names are found in the file and * return the modified list. */ { char *excludeFile = optionVal("excludeFile", NULL); if (excludeFile) { struct lineFile *lf = lineFileOpen(excludeFile, TRUE); struct hash *excludedSeqs = hashNew(0); char *line; @@ -137,30 +150,85 @@ nextSeq = seq->next; int diff = countDiffs(ref, seq); if (diff > maxDiff) excludeCount++; else slAddHead(&newList, seq); } slReverse(&newList); sequences = newList; verbose(2, "Excluded %d sequences with >%d differences with the reference (%d sequences remaining including reference)\n", excludeCount, maxDiff, slCount(sequences)); } return sequences; } +INLINE boolean isAmbig(char c) +/* Return TRUE if c is '-' or upper or lowercase IUPAC ambiguous including N. */ +{ +return (c == '-' || isIupacAmbiguous(c)); +} + +static void maskAmbigInWindow(struct dnaSeq *sequences) +/* If -windowSize was passed in (possibly with -minAmbigInWindow), mask any base with at least + * minAmbigInWindow N/ambiguous/gap characters within +- windowSize bases. + * Adapted from https://github.com/roblanf/sarscov2phylo/ file scripts/mask_seq.py which has + * a windowSize of 7, minAmbigInWindow of 2, counts only gap or N (not other ambiguous bases), + * and also masks the first and last 30 non-N bases. */ +{ +int windowSize = optionInt("windowSize", 0); +if (windowSize > 0) + { + int minAmbigInWindow = optionInt("minAmbigInWindow", DEFAULT_MIN_AMBIG_IN_WINDOW); + struct dnaSeq *seq; + for (seq = sequences->next; seq != NULL; seq = seq->next) + { + char *newSeq = needMem(seq->size + 1); + int numAmbig = 0; + int i; + // Initialize numAmbig + for (i = 0; i <= windowSize; i++) + { + if (isAmbig(seq->dna[i])) + numAmbig++; + } + // Decide whether to mask each base in seq + for (i = 0; i < seq->size; i++) + { + if (numAmbig >= minAmbigInWindow) + { + if (seq->dna[i] == '-') + newSeq[i] = '-'; + else + newSeq[i] = 'N'; + } + else + newSeq[i] = seq->dna[i]; + // Remove the leftmost base from numAmbig unless it's before the start of seq + if (i >= windowSize && isAmbig(seq->dna[i - windowSize])) + numAmbig--; + // Add the rightmost base for the next iteration unless it's past the end of seq + if (i < seq->size - windowSize - 1 && isAmbig(seq->dna[i + 1 + windowSize])) + numAmbig++; + } + safecpy(seq->dna, seq->size + 1, newSeq); + freeMem(newSeq); + } + } +} + + static struct dnaSeq *readSequences(char *faFile) /* Read all sequences in faFile. Make sure there are at least 2 sequences and that * all have the same length. If a reference sequence is specified and is not the first * sequence in the file, then move the reference sequence to the head of the list. */ { verbose(2, "Reading sequences from %s\n", faFile); struct dnaSeq *sequences = faReadAllMixed(faFile); int seqCount = slCount(sequences); verbose(2, "Read %d sequences.\n", seqCount); if (seqCount < 2) errAbort("faToVcf: expecting multiple sequences in %s but found only %d.", faFile, seqCount); int seqSize = sequences->size; struct dnaSeq *seq; @@ -181,30 +249,31 @@ { if (sameString(seq->next->name, refName)) { struct dnaSeq *ref = seq->next; seq->next = ref->next; ref->next = sequences; sequences = ref; break; } } if (differentString(sequences->name, refName)) errAbort("Could not find -ref sequence '%s' in %s.", refName, faFile); } sequences = removeExcludedSequences(sequences); +maskAmbigInWindow(sequences); sequences = filterMaxDiff(sequences); return sequences; } static void writeVcfHeader(FILE *outF, char *faFile, char *vcfFile, struct dnaSeq *sequences, boolean includeRef, boolean includeGenotypes) /* Write a simple VCF header with sequence names as genotype columns. */ { fprintf(outF, "##fileformat=VCFv4.2\n" "##reference=%s:%s\n" "##source=faToVcf %s %s\n" //#*** Should document options used. "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO", faFile, sequences->name, faFile, vcfFile); if (includeGenotypes)