b69d77d71d0bf2464ed68a165fd1988a7044753e angie Sun Jun 27 20:20:21 2021 -0700 Added -includeNoAltN option to include info about Ns in the VCF. diff --git src/hg/utils/faToVcf/faToVcf.c src/hg/utils/faToVcf/faToVcf.c index 454cb1e..c6341a4 100644 --- src/hg/utils/faToVcf/faToVcf.c +++ src/hg/utils/faToVcf/faToVcf.c @@ -5,58 +5,61 @@ #include "linefile.h" #include "hash.h" #include "iupac.h" #include "options.h" void usage(int exitCode) /* Explain usage and exit. */ { fputs( "faToVcf - Convert a FASTA alignment file to Variant Call Format (VCF) single-nucleotide diffs\n" "usage:\n" " faToVcf in.fa out.vcf\n" "options:\n" " -ambiguousToN Treat all IUPAC ambiguous bases (N, R, V etc) as N (no call).\n" " -excludeFile=file Exclude sequences named in file which has one sequence name per line\n" + " -includeNoAltN Include base positions with no alternate alleles observed, but at\n" + " least one N (missing base / no-call)\n" " -includeRef Include the reference in the genotype columns\n" " (default: omitted as redundant)\n" " -maskSites=file Exclude variants in positions recommended for masking in file\n" " (typically https://github.com/W-L/ProblematicSites_SARS-CoV2/raw/master/problematic_sites_sarsCov2.vcf)\n" " -maxDiff=N Exclude sequences with more than N mismatches with the reference\n" " -minAc=N Ignore alternate alleles observed fewer than N times\n" " -minAf=F Ignore alternate alleles observed in less than F of non-N bases\n" " -noGenotypes Output 8-column VCF, without the sample genotype columns\n" " -ref=seqName Use seqName as the reference sequence; must be present in faFile\n" " (default: first sequence in faFile)\n" " -resolveAmbiguous For IUPAC ambiguous characters like R (A or G), if the character\n" " represents two bases and one is the reference base, convert it to the\n" " non-reference base. Otherwise convert it to N.\n" " -startOffset=N Add N bases to each position (for trimmed alignments)\n" " -vcfChrom=seqName Use seqName for the CHROM column in VCF (default: ref sequence)\n" "in.fa must contain a series of sequences with different names and the same length.\n" "Both N and - are treated as missing information.\n" , stderr ); exit(exitCode); } #define MAX_ALTS 256 /* Command line validation table. */ static struct optionSpec options[] = { { "ambiguousToN", OPTION_BOOLEAN }, { "excludeFile", OPTION_STRING }, + { "includeNoAltN", OPTION_BOOLEAN }, { "includeRef", OPTION_BOOLEAN }, { "maskSites", OPTION_STRING }, { "maxDiff", OPTION_INT }, { "minAc", OPTION_INT }, { "minAf", OPTION_DOUBLE }, { "noGenotypes", OPTION_BOOLEAN }, { "ref", OPTION_STRING }, { "resolveAmbiguous", OPTION_BOOLEAN }, { "startOffset", OPTION_INT}, { "vcfChrom", OPTION_STRING }, { "h", OPTION_BOOLEAN }, { "-help", OPTION_BOOLEAN }, { NULL, 0 }, }; @@ -386,53 +389,58 @@ { int oldGt = genotypes[gtIx]; if (oldGt == 0) genotypes[gtIx] = 0; else genotypes[gtIx] = oldToNewIx[oldGt-1] + 1; } altCount = newAltCount; for (altIx = 0; altIx < newAltCount; altIx++) { altAlleles[altIx] = newAltAlleles[altIx]; altAlleleCounts[altIx] = newAltAlleleCounts[altIx]; } } } - if (altCount) + if (altCount || (optionExists("includeNoAltN") && nonNCount < gtCount)) { int pos = chromStart + startOffset + 1; fprintf(outF, "%s\t%d\t", vcfChrom, pos); + if (altCount == 0) + fprintf(outF, "%c%d%c\t%c\t*\t.\t.\tAC=0;AN=%d", ref, pos, ref, ref, nonNCount); + else + { int altIx; for (altIx = 0; altIx < altCount; altIx++) fprintf(outF, "%s%c%d%c", (altIx > 0) ? "," : "", ref, pos, altAlleles[altIx]); fprintf(outF, "\t%c\t", ref); for (altIx = 0; altIx < altCount; altIx++) { if (altIx > 0) fprintf(outF, ","); fprintf(outF, "%c", altAlleles[altIx]); } fprintf(outF, "\t.\t.\tAC="); for (altIx = 0; altIx < altCount; altIx++) { if (altIx > 0) fprintf(outF, ","); fprintf(outF, "%d", altAlleleCounts[altIx]); } fprintf(outF, ";AN=%d", nonNCount); + } if (!noGenotypes) { fputs("\tGT", outF); for (gtIx = 0; gtIx < gtCount; gtIx++) { if (missing[gtIx]) fprintf(outF, "\t."); else fprintf(outF, "\t%d", genotypes[gtIx]); } } fprintf(outF, "\n"); } } carefulClose(&outF);