b69d77d71d0bf2464ed68a165fd1988a7044753e
angie
  Sun Jun 27 20:20:21 2021 -0700
Added -includeNoAltN option to include info about Ns in the VCF.

diff --git src/hg/utils/faToVcf/faToVcf.c src/hg/utils/faToVcf/faToVcf.c
index 454cb1e..c6341a4 100644
--- src/hg/utils/faToVcf/faToVcf.c
+++ src/hg/utils/faToVcf/faToVcf.c
@@ -5,58 +5,61 @@
 #include "linefile.h"
 #include "hash.h"
 #include "iupac.h"
 #include "options.h"
 
 void usage(int exitCode)
 /* Explain usage and exit. */
 {
 fputs(
   "faToVcf - Convert a FASTA alignment file to Variant Call Format (VCF) single-nucleotide diffs\n"
   "usage:\n"
   "   faToVcf in.fa out.vcf\n"
   "options:\n"
   "   -ambiguousToN         Treat all IUPAC ambiguous bases (N, R, V etc) as N (no call).\n"
   "   -excludeFile=file     Exclude sequences named in file which has one sequence name per line\n"
+  "   -includeNoAltN        Include base positions with no alternate alleles observed, but at\n"
+  "                         least one N (missing base / no-call)\n"
   "   -includeRef           Include the reference in the genotype columns\n"
   "                         (default: omitted as redundant)\n"
   "   -maskSites=file       Exclude variants in positions recommended for masking in file\n"
   "                         (typically https://github.com/W-L/ProblematicSites_SARS-CoV2/raw/master/problematic_sites_sarsCov2.vcf)\n"
   "   -maxDiff=N            Exclude sequences with more than N mismatches with the reference\n"
   "   -minAc=N              Ignore alternate alleles observed fewer than N times\n"
   "   -minAf=F              Ignore alternate alleles observed in less than F of non-N bases\n"
   "   -noGenotypes          Output 8-column VCF, without the sample genotype columns\n"
   "   -ref=seqName          Use seqName as the reference sequence; must be present in faFile\n"
   "                         (default: first sequence in faFile)\n"
   "   -resolveAmbiguous     For IUPAC ambiguous characters like R (A or G), if the character\n"
   "                         represents two bases and one is the reference base, convert it to the\n"
   "                         non-reference base.  Otherwise convert it to N.\n"
   "   -startOffset=N        Add N bases to each position (for trimmed alignments)\n"
   "   -vcfChrom=seqName     Use seqName for the CHROM column in VCF (default: ref sequence)\n"
   "in.fa must contain a series of sequences with different names and the same length.\n"
   "Both N and - are treated as missing information.\n"
   , stderr
   );
 exit(exitCode);
 }
 
 #define MAX_ALTS 256
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
     { "ambiguousToN", OPTION_BOOLEAN },
     { "excludeFile", OPTION_STRING },
+    { "includeNoAltN", OPTION_BOOLEAN },
     { "includeRef", OPTION_BOOLEAN },
     { "maskSites", OPTION_STRING },
     { "maxDiff", OPTION_INT },
     { "minAc", OPTION_INT },
     { "minAf", OPTION_DOUBLE },
     { "noGenotypes", OPTION_BOOLEAN },
     { "ref", OPTION_STRING },
     { "resolveAmbiguous", OPTION_BOOLEAN },
     { "startOffset", OPTION_INT},
     { "vcfChrom", OPTION_STRING },
     { "h", OPTION_BOOLEAN },
     { "-help", OPTION_BOOLEAN },
     { NULL, 0 },
 };
 
@@ -386,53 +389,58 @@
                 {
                 int oldGt = genotypes[gtIx];
                 if (oldGt == 0)
                     genotypes[gtIx] = 0;
                 else
                     genotypes[gtIx] = oldToNewIx[oldGt-1] + 1;
                 }
             altCount = newAltCount;
             for (altIx = 0;  altIx < newAltCount;  altIx++)
                 {
                 altAlleles[altIx] = newAltAlleles[altIx];
                 altAlleleCounts[altIx] = newAltAlleleCounts[altIx];
                 }
             }
         }
-    if (altCount)
+    if (altCount || (optionExists("includeNoAltN") && nonNCount < gtCount))
         {
         int pos = chromStart + startOffset + 1;
         fprintf(outF, "%s\t%d\t", vcfChrom, pos);
+        if (altCount == 0)
+            fprintf(outF, "%c%d%c\t%c\t*\t.\t.\tAC=0;AN=%d", ref, pos, ref, ref, nonNCount);
+        else
+            {
             int altIx;
             for (altIx = 0;  altIx < altCount;  altIx++)
                 fprintf(outF, "%s%c%d%c", (altIx > 0) ? "," : "",
                         ref, pos, altAlleles[altIx]);
             fprintf(outF, "\t%c\t", ref);
             for (altIx = 0;  altIx < altCount;  altIx++)
                 {
                 if (altIx > 0)
                     fprintf(outF, ",");
                 fprintf(outF, "%c", altAlleles[altIx]);
                 }
             fprintf(outF, "\t.\t.\tAC=");
             for (altIx = 0;  altIx < altCount;  altIx++)
                 {
                 if (altIx > 0)
                     fprintf(outF, ",");
                 fprintf(outF, "%d", altAlleleCounts[altIx]);
                 }
             fprintf(outF, ";AN=%d", nonNCount);
+            }
         if (!noGenotypes)
             {
             fputs("\tGT", outF);
             for (gtIx = 0;  gtIx < gtCount;  gtIx++)
                 {
                 if (missing[gtIx])
                     fprintf(outF, "\t.");
                 else
                     fprintf(outF, "\t%d", genotypes[gtIx]);
                 }
             }
         fprintf(outF, "\n");
         }
     }
 carefulClose(&outF);