9da4f41aff8bc9265c4538a3af39736faed03d51 angie Wed May 26 19:34:38 2021 -0700 Added -minAf (alt allele frequency ignoring N bases). diff --git src/hg/utils/faToVcf/faToVcf.c src/hg/utils/faToVcf/faToVcf.c index d2eb9a4..8c9cbf3 100644 --- src/hg/utils/faToVcf/faToVcf.c +++ src/hg/utils/faToVcf/faToVcf.c @@ -10,54 +10,56 @@ void usage(int exitCode) /* Explain usage and exit. */ { fputs( "faToVcf - Convert a FASTA alignment file to Variant Call Format (VCF) single-nucleotide diffs\n" "usage:\n" " faToVcf in.fa out.vcf\n" "options:\n" " -ambiguousToN Treat all IUPAC ambiguous bases (N, R, V etc) as N (no call).\n" " -excludeFile=file Exclude sequences named in file which has one sequence name per line\n" " -includeRef Include the reference in the genotype columns\n" " (default: omitted as redundant)\n" " -maskSites=file Exclude variants in positions recommended for masking in file\n" " (typically https://github.com/W-L/ProblematicSites_SARS-CoV2/raw/master/problematic_sites_sarsCov2.vcf)\n" " -minAc=N Ignore alternate alleles observed fewer than N times\n" + " -minAf=F Ignore alternate alleles observed in less than F of non-N bases\n" " -noGenotypes Output 8-column VCF, without the sample genotype columns\n" " -ref=seqName Use seqName as the reference sequence; must be present in faFile\n" " (default: first sequence in faFile)\n" " -resolveAmbiguous For IUPAC ambiguous characters like R (A or G), if the character\n" " represents two bases and one is the reference base, convert it to the\n" " non-reference base. Otherwise convert it to N.\n" " -startOffset=N Add N bases to each position (for trimmed alignments)\n" " -vcfChrom=seqName Use seqName for the CHROM column in VCF (default: ref sequence)\n" "in.fa must contain a series of sequences with different names and the same length.\n" "Both N and - are treated as missing information.\n" , stderr ); exit(exitCode); } #define MAX_ALTS 256 /* Command line validation table. */ static struct optionSpec options[] = { { "ambiguousToN", OPTION_BOOLEAN }, { "excludeFile", OPTION_STRING }, { "includeRef", OPTION_BOOLEAN }, { "maskSites", OPTION_STRING }, { "minAc", OPTION_INT }, + { "minAf", OPTION_DOUBLE }, { "noGenotypes", OPTION_BOOLEAN }, { "ref", OPTION_STRING }, { "resolveAmbiguous", OPTION_BOOLEAN }, { "startOffset", OPTION_INT}, { "vcfChrom", OPTION_STRING }, { "h", OPTION_BOOLEAN }, { "-help", OPTION_BOOLEAN }, { NULL, 0 }, }; static struct dnaSeq *removeExcludedSequences(struct dnaSeq *sequences) /* If -excludeFile was passed in, remove any sequences whose names are found in the file and * return the modified list. */ { char *excludeFile = optionVal("excludeFile", NULL); @@ -234,31 +236,31 @@ chromStart--; continue; } if (ref == 'N') continue; if (maskSites && maskSites[chromStart]) continue; if (ref == 'U') ref = 'T'; char altAlleles[MAX_ALTS]; int altAlleleCounts[MAX_ALTS]; int altCount = 0; memset(altAlleles, 0, sizeof(altAlleles)); memset(genotypes, 0, sizeof(int) * gtCount); memset(missing, 0, sizeof(boolean) * gtCount); - int nonNCount = seqCount; + int nonNCount = gtCount; struct dnaSeq *seq; int gtIx; for (seq = seqsForGt, gtIx = 0; seq != NULL; seq = seq->next, gtIx++) { char al = toupper(seq->dna[baseIx]); if (al == 'U') al = 'T'; if (isIupacAmbiguous(al)) { if (optionExists("ambiguousToN")) al = 'N'; else if (optionExists("resolveAmbiguous")) { char *bases = iupacAmbiguousToString(al); if (strlen(bases) > 2 || @@ -285,40 +287,43 @@ if (altCount == MAX_ALTS) errAbort("faToVcf: fasta base %d (reference base %d) has at least %d alternate alleles; increase " "MAX_ALTS in faToVcf.c!", baseIx, chromStart, MAX_ALTS); altAlleleCounts[altIx] = 1; altAlleles[altIx] = al; } else { altIx = altFound - altAlleles; altAlleleCounts[altIx]++; } genotypes[gtIx] = altIx+1; } } int minAc = optionInt("minAc", 0); - if (minAc > 0) + double minAf = optionDouble("minAf", 0.0); + if (minAc > 0 || minAf > 0.0) { int oldToNewIx[altCount]; char newAltAlleles[altCount]; int newAltAlleleCounts[altCount]; int newAltCount = 0; + double nonNDouble = nonNCount; int altIx; for (altIx = 0; altIx < altCount; altIx++) { - if (altAlleleCounts[altIx] >= minAc) + if (altAlleleCounts[altIx] >= minAc && + (altAlleleCounts[altIx] / nonNDouble) >= minAf) { // This alt passes the filter. int newAltIx = newAltCount++; newAltAlleleCounts[newAltIx] = altAlleleCounts[altIx]; newAltAlleles[newAltIx] = altAlleles[altIx]; oldToNewIx[altIx] = newAltIx; } else { // We're removing this alt; count its genotypes as missing since they're not // really reference. for (gtIx = 0; gtIx < gtCount; gtIx++) if (genotypes[gtIx] == altIx+1) { missing[gtIx] = TRUE;