73d02b0696465e3bbc8634b0650ec5230c379caa angie Wed Jul 15 16:46:09 2020 -0700 faToVcf: added -ambiguousToN option to treat all IUPAC ambiguous bases as missing info (as opposed to observations of other alleles as with -resolveAmbiguous). refs #25904 diff --git src/hg/utils/faToVcf/faToVcf.c src/hg/utils/faToVcf/faToVcf.c index 3272bfa..3d4fafc 100644 --- src/hg/utils/faToVcf/faToVcf.c +++ src/hg/utils/faToVcf/faToVcf.c @@ -2,50 +2,52 @@ #include "common.h" #include "fa.h" #include "linefile.h" #include "hash.h" #include "iupac.h" #include "options.h" void usage() /* Explain usage and exit. */ { errAbort( "faToVcf - Convert a FASTA alignment file to Variant Call Format (VCF) single-nucleotide diffs\n" "usage:\n" " faToVcf in.fa out.vcf\n" "options:\n" + " -ambiguousToN Treat all IUPAC ambiguous bases (N, R, V etc) as N (no call).\n" " -excludeFile=file Exclude sequences named in file which has one sequence name per line\n" " -includeRef Include the reference in the genotype columns\n" " (default: omitted as redundant)\n" " -minAc=N Ignore alternate alleles observed fewer than N times\n" " -ref=seqName Use seqName as the reference sequence; must be present in faFile\n" " (default: first sequence in faFile)\n" " -resolveAmbiguous For IUPAC ambiguous characters like R (A or G), if the character\n" " represents two bases and one is the reference base, convert it to the\n" " non-reference base. Otherwise convert it to N.\n" " -startOffset=N Add N bases to each position (for trimmed alignments)\n" " -vcfChrom=seqName Use seqName for the CHROM column in VCF (default: ref sequence)\n" "in.fa must contain a series of sequences with different names and the same length.\n" "Both N and - are treated as missing information.\n" ); } #define MAX_ALTS 256 /* Command line validation table. */ static struct optionSpec options[] = { + { "ambiguousToN", OPTION_BOOLEAN }, { "excludeFile", OPTION_STRING }, { "includeRef", OPTION_BOOLEAN }, { "minAc", OPTION_INT }, { "ref", OPTION_STRING }, { "resolveAmbiguous", OPTION_BOOLEAN }, { "startOffset", OPTION_INT}, { "vcfChrom", OPTION_STRING }, { NULL, 0 }, }; static struct dnaSeq *removeExcludedSequences(struct dnaSeq *sequences) /* If -excludeFile was passed in, remove any sequences whose names are found in the file and * return the modified list. */ { char *excludeFile = optionVal("excludeFile", NULL); @@ -177,41 +179,46 @@ if (ref == 'U') ref = 'T'; char altAlleles[MAX_ALTS]; int altAlleleCounts[MAX_ALTS]; int altCount = 0; memset(altAlleles, 0, sizeof(altAlleles)); memset(genotypes, 0, sizeof(int) * gtCount); memset(missing, 0, sizeof(boolean) * gtCount); struct dnaSeq *seq; int gtIx; for (seq = seqsForGt, gtIx = 0; seq != NULL; seq = seq->next, gtIx++) { char al = toupper(seq->dna[chromStart]); if (al == 'U') al = 'T'; - if (isIupacAmbiguous(al) && optionExists("resolveAmbiguous")) + if (isIupacAmbiguous(al)) + { + if (optionExists("ambiguousToN")) + al = 'N'; + else if (optionExists("resolveAmbiguous")) { char *bases = iupacAmbiguousToString(al); if (strlen(bases) > 2 || (toupper(bases[0]) != ref && toupper(bases[1]) != ref)) al = 'N'; else if (toupper(bases[0]) == ref) al = toupper(bases[1]); else al = toupper(bases[0]); } + } if (al == 'N' || al == '-') missing[gtIx] = TRUE; else if (al != ref) { char *altFound = strchr(altAlleles, al); int altIx = 0; if (! altFound) { altIx = altCount++; if (altCount == MAX_ALTS) errAbort("faToVcf: base %d has at least %d alternate alleles; increase " "MAX_ALTS in faToVcf.c!", chromStart, MAX_ALTS); altAlleleCounts[altIx] = 1; altAlleles[altIx] = al; } @@ -297,20 +304,23 @@ if (missing[gtIx]) fprintf(outF, "\t."); else fprintf(outF, "\t%d", genotypes[gtIx]); } fprintf(outF, "\n"); } } carefulClose(&outF); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); +if (optionExists("ambiguousToN") && optionExists("resolveAmbiguous")) + errAbort("-ambiguousToN and -resolveAmbiguous conflict with each other; " + "please use only one."); if (argc != 3) usage(); faToVcf(argv[1], argv[2]); return 0; }