73d02b0696465e3bbc8634b0650ec5230c379caa
angie
  Wed Jul 15 16:46:09 2020 -0700
faToVcf: added -ambiguousToN option to treat all IUPAC ambiguous bases as missing info (as opposed to observations of other alleles as with -resolveAmbiguous).  refs #25904

diff --git src/hg/utils/faToVcf/faToVcf.c src/hg/utils/faToVcf/faToVcf.c
index 3272bfa..3d4fafc 100644
--- src/hg/utils/faToVcf/faToVcf.c
+++ src/hg/utils/faToVcf/faToVcf.c
@@ -2,50 +2,52 @@
 #include "common.h"
 #include "fa.h"
 #include "linefile.h"
 #include "hash.h"
 #include "iupac.h"
 #include "options.h"
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "faToVcf - Convert a FASTA alignment file to Variant Call Format (VCF) single-nucleotide diffs\n"
   "usage:\n"
   "   faToVcf in.fa out.vcf\n"
   "options:\n"
+  "   -ambiguousToN         Treat all IUPAC ambiguous bases (N, R, V etc) as N (no call).\n"
   "   -excludeFile=file     Exclude sequences named in file which has one sequence name per line\n"
   "   -includeRef           Include the reference in the genotype columns\n"
   "                         (default: omitted as redundant)\n"
   "   -minAc=N              Ignore alternate alleles observed fewer than N times\n"
   "   -ref=seqName          Use seqName as the reference sequence; must be present in faFile\n"
   "                         (default: first sequence in faFile)\n"
   "   -resolveAmbiguous     For IUPAC ambiguous characters like R (A or G), if the character\n"
   "                         represents two bases and one is the reference base, convert it to the\n"
   "                         non-reference base.  Otherwise convert it to N.\n"
   "   -startOffset=N        Add N bases to each position (for trimmed alignments)\n"
   "   -vcfChrom=seqName     Use seqName for the CHROM column in VCF (default: ref sequence)\n"
   "in.fa must contain a series of sequences with different names and the same length.\n"
   "Both N and - are treated as missing information.\n"
   );
 }
 
 #define MAX_ALTS 256
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
+    { "ambiguousToN", OPTION_BOOLEAN },
     { "excludeFile", OPTION_STRING },
     { "includeRef", OPTION_BOOLEAN },
     { "minAc", OPTION_INT },
     { "ref", OPTION_STRING },
     { "resolveAmbiguous", OPTION_BOOLEAN },
     { "startOffset", OPTION_INT},
     { "vcfChrom", OPTION_STRING },
     { NULL, 0 },
 };
 
 static struct dnaSeq *removeExcludedSequences(struct dnaSeq *sequences)
 /* If -excludeFile was passed in, remove any sequences whose names are found in the file and
  * return the modified list. */
 {
 char *excludeFile = optionVal("excludeFile", NULL);
@@ -177,41 +179,46 @@
     if (ref == 'U')
         ref = 'T';
     char altAlleles[MAX_ALTS];
     int altAlleleCounts[MAX_ALTS];
     int altCount = 0;
     memset(altAlleles, 0, sizeof(altAlleles));
     memset(genotypes, 0, sizeof(int) * gtCount);
     memset(missing, 0, sizeof(boolean) * gtCount);
     struct dnaSeq *seq;
     int gtIx;
     for (seq = seqsForGt, gtIx = 0;  seq != NULL;  seq = seq->next, gtIx++)
         {
         char al = toupper(seq->dna[chromStart]);
         if (al == 'U')
             al = 'T';
-        if (isIupacAmbiguous(al) && optionExists("resolveAmbiguous"))
+        if (isIupacAmbiguous(al))
+            {
+            if (optionExists("ambiguousToN"))
+                al = 'N';
+            else if (optionExists("resolveAmbiguous"))
                 {
                 char *bases = iupacAmbiguousToString(al);
                 if (strlen(bases) > 2 ||
                     (toupper(bases[0]) != ref && toupper(bases[1]) != ref))
                     al = 'N';
                 else if (toupper(bases[0]) == ref)
                     al = toupper(bases[1]);
                 else
                     al = toupper(bases[0]);
                 }
+            }
         if (al == 'N' || al == '-')
             missing[gtIx] = TRUE;
         else if (al != ref)
             {
             char *altFound = strchr(altAlleles, al);
             int altIx = 0;
             if (! altFound)
                 {
                 altIx = altCount++;
                 if (altCount == MAX_ALTS)
                     errAbort("faToVcf: base %d has at least %d alternate alleles; increase "
                              "MAX_ALTS in faToVcf.c!", chromStart, MAX_ALTS);
                 altAlleleCounts[altIx] = 1;
                 altAlleles[altIx] = al;
                 }
@@ -297,20 +304,23 @@
             if (missing[gtIx])
                 fprintf(outF, "\t.");
             else
                 fprintf(outF, "\t%d", genotypes[gtIx]);
             }
         fprintf(outF, "\n");
         }
     }
 carefulClose(&outF);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
+if (optionExists("ambiguousToN") && optionExists("resolveAmbiguous"))
+    errAbort("-ambiguousToN and -resolveAmbiguous conflict with each other; "
+             "please use only one.");
 if (argc != 3)
     usage();
 faToVcf(argv[1], argv[2]);
 return 0;
 }