9da4f41aff8bc9265c4538a3af39736faed03d51
angie
  Wed May 26 19:34:38 2021 -0700
Added -minAf (alt allele frequency ignoring N bases).

diff --git src/hg/utils/faToVcf/faToVcf.c src/hg/utils/faToVcf/faToVcf.c
index d2eb9a4..8c9cbf3 100644
--- src/hg/utils/faToVcf/faToVcf.c
+++ src/hg/utils/faToVcf/faToVcf.c
@@ -10,54 +10,56 @@
 void usage(int exitCode)
 /* Explain usage and exit. */
 {
 fputs(
   "faToVcf - Convert a FASTA alignment file to Variant Call Format (VCF) single-nucleotide diffs\n"
   "usage:\n"
   "   faToVcf in.fa out.vcf\n"
   "options:\n"
   "   -ambiguousToN         Treat all IUPAC ambiguous bases (N, R, V etc) as N (no call).\n"
   "   -excludeFile=file     Exclude sequences named in file which has one sequence name per line\n"
   "   -includeRef           Include the reference in the genotype columns\n"
   "                         (default: omitted as redundant)\n"
   "   -maskSites=file       Exclude variants in positions recommended for masking in file\n"
   "                         (typically https://github.com/W-L/ProblematicSites_SARS-CoV2/raw/master/problematic_sites_sarsCov2.vcf)\n"
   "   -minAc=N              Ignore alternate alleles observed fewer than N times\n"
+  "   -minAf=F              Ignore alternate alleles observed in less than F of non-N bases\n"
   "   -noGenotypes          Output 8-column VCF, without the sample genotype columns\n"
   "   -ref=seqName          Use seqName as the reference sequence; must be present in faFile\n"
   "                         (default: first sequence in faFile)\n"
   "   -resolveAmbiguous     For IUPAC ambiguous characters like R (A or G), if the character\n"
   "                         represents two bases and one is the reference base, convert it to the\n"
   "                         non-reference base.  Otherwise convert it to N.\n"
   "   -startOffset=N        Add N bases to each position (for trimmed alignments)\n"
   "   -vcfChrom=seqName     Use seqName for the CHROM column in VCF (default: ref sequence)\n"
   "in.fa must contain a series of sequences with different names and the same length.\n"
   "Both N and - are treated as missing information.\n"
   , stderr
   );
 exit(exitCode);
 }
 
 #define MAX_ALTS 256
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
     { "ambiguousToN", OPTION_BOOLEAN },
     { "excludeFile", OPTION_STRING },
     { "includeRef", OPTION_BOOLEAN },
     { "maskSites", OPTION_STRING },
     { "minAc", OPTION_INT },
+    { "minAf", OPTION_DOUBLE },
     { "noGenotypes", OPTION_BOOLEAN },
     { "ref", OPTION_STRING },
     { "resolveAmbiguous", OPTION_BOOLEAN },
     { "startOffset", OPTION_INT},
     { "vcfChrom", OPTION_STRING },
     { "h", OPTION_BOOLEAN },
     { "-help", OPTION_BOOLEAN },
     { NULL, 0 },
 };
 
 static struct dnaSeq *removeExcludedSequences(struct dnaSeq *sequences)
 /* If -excludeFile was passed in, remove any sequences whose names are found in the file and
  * return the modified list. */
 {
 char *excludeFile = optionVal("excludeFile", NULL);
@@ -234,31 +236,31 @@
         chromStart--;
         continue;
         }
     if (ref == 'N')
         continue;
     if (maskSites && maskSites[chromStart])
         continue;
     if (ref == 'U')
         ref = 'T';
     char altAlleles[MAX_ALTS];
     int altAlleleCounts[MAX_ALTS];
     int altCount = 0;
     memset(altAlleles, 0, sizeof(altAlleles));
     memset(genotypes, 0, sizeof(int) * gtCount);
     memset(missing, 0, sizeof(boolean) * gtCount);
-    int nonNCount = seqCount;
+    int nonNCount = gtCount;
     struct dnaSeq *seq;
     int gtIx;
     for (seq = seqsForGt, gtIx = 0;  seq != NULL;  seq = seq->next, gtIx++)
         {
         char al = toupper(seq->dna[baseIx]);
         if (al == 'U')
             al = 'T';
         if (isIupacAmbiguous(al))
             {
             if (optionExists("ambiguousToN"))
                 al = 'N';
             else if (optionExists("resolveAmbiguous"))
                 {
                 char *bases = iupacAmbiguousToString(al);
                 if (strlen(bases) > 2 ||
@@ -285,40 +287,43 @@
                 if (altCount == MAX_ALTS)
                     errAbort("faToVcf: fasta base %d (reference base %d) has at least %d alternate alleles; increase "
                              "MAX_ALTS in faToVcf.c!", baseIx, chromStart, MAX_ALTS);
                 altAlleleCounts[altIx] = 1;
                 altAlleles[altIx] = al;
                 }
             else
                 {
                 altIx = altFound - altAlleles;
                 altAlleleCounts[altIx]++;
                 }
             genotypes[gtIx] = altIx+1;
             }
         }
     int minAc = optionInt("minAc", 0);
-    if (minAc > 0)
+    double minAf = optionDouble("minAf", 0.0);
+    if (minAc > 0 || minAf > 0.0)
         {
         int oldToNewIx[altCount];
         char newAltAlleles[altCount];
         int newAltAlleleCounts[altCount];
         int newAltCount = 0;
+        double nonNDouble = nonNCount;
         int altIx;
         for (altIx = 0;  altIx < altCount;  altIx++)
             {
-            if (altAlleleCounts[altIx] >= minAc)
+            if (altAlleleCounts[altIx] >= minAc &&
+                (altAlleleCounts[altIx] / nonNDouble) >= minAf)
                 {
                 // This alt passes the filter.
                 int newAltIx = newAltCount++;
                 newAltAlleleCounts[newAltIx] = altAlleleCounts[altIx];
                 newAltAlleles[newAltIx] = altAlleles[altIx];
                 oldToNewIx[altIx] = newAltIx;
                 }
             else
                 {
                 // We're removing this alt; count its genotypes as missing since they're not
                 // really reference.
                 for (gtIx = 0;  gtIx < gtCount;  gtIx++)
                     if (genotypes[gtIx] == altIx+1)
                         {
                         missing[gtIx] = TRUE;