src/hg/utils/faToVcf/faToVcf.c 37ec02a06719324a9c6cb2b167893b9b0559fbdd

37ec02a06719324a9c6cb2b167893b9b0559fbdd
angie
  Thu Sep 30 09:34:43 2021 -0700
Added -windowSize and -minAmbigInWindow options to mask bases with too many Ns/ambigs/gaps within +-windowSize bases.
Idea taken from Nicola De Maio and Rob Lanfear's mask_seq.py script in sarscov2phylo.

diff --git src/hg/utils/faToVcf/faToVcf.c src/hg/utils/faToVcf/faToVcf.c
index c6341a4..0ff67ae 100644
--- src/hg/utils/faToVcf/faToVcf.c
+++ src/hg/utils/faToVcf/faToVcf.c
@@ -1,75 +1,88 @@
 /* faToVcf - Convert a FASTA alignment file to VCF. */
 #include "common.h"
 #include "errCatch.h"
 #include "fa.h"
 #include "linefile.h"
 #include "hash.h"
 #include "iupac.h"
 #include "options.h"
 
+// Command line option defaults
+#define DEFAULT_MIN_AMBIG_IN_WINDOW 2
+
 void usage(int exitCode)
 /* Explain usage and exit. */
 {
-fputs(
+fprintf(stderr,
   "faToVcf - Convert a FASTA alignment file to Variant Call Format (VCF) single-nucleotide diffs\n"
   "usage:\n"
   "   faToVcf in.fa out.vcf\n"
   "options:\n"
   "   -ambiguousToN         Treat all IUPAC ambiguous bases (N, R, V etc) as N (no call).\n"
   "   -excludeFile=file     Exclude sequences named in file which has one sequence name per line\n"
   "   -includeNoAltN        Include base positions with no alternate alleles observed, but at\n"
   "                         least one N (missing base / no-call)\n"
   "   -includeRef           Include the reference in the genotype columns\n"
   "                         (default: omitted as redundant)\n"
   "   -maskSites=file       Exclude variants in positions recommended for masking in file\n"
   "                         (typically https://github.com/W-L/ProblematicSites_SARS-CoV2/raw/master/problematic_sites_sarsCov2.vcf)\n"
   "   -maxDiff=N            Exclude sequences with more than N mismatches with the reference\n"
+  "                         (if -windowSize is used, sequences are masked accordingly first)\n"
   "   -minAc=N              Ignore alternate alleles observed fewer than N times\n"
   "   -minAf=F              Ignore alternate alleles observed in less than F of non-N bases\n"
+  "   -minAmbigInWindow=N   When -windowSize is provided, mask any base for which there are at\n"
+  "                         least this many N, ambiguous or gap characters within the window.\n"
+  "                         (default: %d)\n"
   "   -noGenotypes          Output 8-column VCF, without the sample genotype columns\n"
   "   -ref=seqName          Use seqName as the reference sequence; must be present in faFile\n"
   "                         (default: first sequence in faFile)\n"
   "   -resolveAmbiguous     For IUPAC ambiguous characters like R (A or G), if the character\n"
   "                         represents two bases and one is the reference base, convert it to the\n"
   "                         non-reference base.  Otherwise convert it to N.\n"
   "   -startOffset=N        Add N bases to each position (for trimmed alignments)\n"
   "   -vcfChrom=seqName     Use seqName for the CHROM column in VCF (default: ref sequence)\n"
+  "   -windowSize=N         Mask any base for which there are at least -minAmbigWindow bases in a\n"
+  "                         window of +-N bases around the base.  Masking approach adapted from\n"
+  "                         https://github.com/roblanf/sarscov2phylo/ file scripts/mask_seq.py\n"
+  "                         Use -windowSize=7 for same results.\n"
   "in.fa must contain a series of sequences with different names and the same length.\n"
   "Both N and - are treated as missing information.\n"
-  , stderr
+  , DEFAULT_MIN_AMBIG_IN_WINDOW
   );
 exit(exitCode);
 }
 
 #define MAX_ALTS 256
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
     { "ambiguousToN", OPTION_BOOLEAN },
     { "excludeFile", OPTION_STRING },
     { "includeNoAltN", OPTION_BOOLEAN },
     { "includeRef", OPTION_BOOLEAN },
     { "maskSites", OPTION_STRING },
     { "maxDiff", OPTION_INT },
     { "minAc", OPTION_INT },
     { "minAf", OPTION_DOUBLE },
+    { "minAmbigInWindow", OPTION_INT },
     { "noGenotypes", OPTION_BOOLEAN },
     { "ref", OPTION_STRING },
     { "resolveAmbiguous", OPTION_BOOLEAN },
     { "startOffset", OPTION_INT},
     { "vcfChrom", OPTION_STRING },
+    { "windowSize", OPTION_INT },
     { "h", OPTION_BOOLEAN },
     { "-help", OPTION_BOOLEAN },
     { NULL, 0 },
 };
 
 static struct dnaSeq *removeExcludedSequences(struct dnaSeq *sequences)
 /* If -excludeFile was passed in, remove any sequences whose names are found in the file and
  * return the modified list. */
 {
 char *excludeFile = optionVal("excludeFile", NULL);
 if (excludeFile)
     {
     struct lineFile *lf = lineFileOpen(excludeFile, TRUE);
     struct hash *excludedSeqs = hashNew(0);
     char *line;
@@ -137,30 +150,85 @@
         nextSeq = seq->next;
         int diff = countDiffs(ref, seq);
         if (diff > maxDiff)
             excludeCount++;
         else
             slAddHead(&newList, seq);
         }
     slReverse(&newList);
     sequences = newList;
     verbose(2, "Excluded %d sequences with >%d differences with the reference (%d sequences remaining including reference)\n",
             excludeCount, maxDiff, slCount(sequences));
     }
 return sequences;
 }
 
+INLINE boolean isAmbig(char c)
+/* Return TRUE if c is '-' or upper or lowercase IUPAC ambiguous including N. */
+{
+return (c == '-' || isIupacAmbiguous(c));
+}
+
+static void maskAmbigInWindow(struct dnaSeq *sequences)
+/* If -windowSize was passed in (possibly with -minAmbigInWindow), mask any base with at least
+ * minAmbigInWindow N/ambiguous/gap characters within +- windowSize bases.
+ * Adapted from https://github.com/roblanf/sarscov2phylo/ file scripts/mask_seq.py which has
+ * a windowSize of 7, minAmbigInWindow of 2, counts only gap or N (not other ambiguous bases),
+ * and also masks the first and last 30 non-N bases. */
+{
+int windowSize = optionInt("windowSize", 0);
+if (windowSize > 0)
+    {
+    int minAmbigInWindow = optionInt("minAmbigInWindow", DEFAULT_MIN_AMBIG_IN_WINDOW);
+    struct dnaSeq *seq;
+    for (seq = sequences->next;  seq != NULL;  seq = seq->next)
+        {
+        char *newSeq = needMem(seq->size + 1);
+        int numAmbig = 0;
+        int i;
+        // Initialize numAmbig
+        for (i = 0;  i <= windowSize;  i++)
+            {
+            if (isAmbig(seq->dna[i]))
+                numAmbig++;
+            }
+        // Decide whether to mask each base in seq
+        for (i = 0;  i < seq->size;  i++)
+            {
+            if (numAmbig >= minAmbigInWindow)
+                {
+                if (seq->dna[i] == '-')
+                    newSeq[i] = '-';
+                else
+                    newSeq[i] = 'N';
+                }
+            else
+                newSeq[i] = seq->dna[i];
+            // Remove the leftmost base from numAmbig unless it's before the start of seq
+            if (i >= windowSize && isAmbig(seq->dna[i - windowSize]))
+                numAmbig--;
+            // Add the rightmost base for the next iteration unless it's past the end of seq
+            if (i < seq->size - windowSize - 1 && isAmbig(seq->dna[i + 1 + windowSize]))
+                numAmbig++;
+            }
+        safecpy(seq->dna, seq->size + 1, newSeq);
+        freeMem(newSeq);
+        }
+    }
+}
+
+
 static struct dnaSeq *readSequences(char *faFile)
 /* Read all sequences in faFile.  Make sure there are at least 2 sequences and that
  * all have the same length.  If a reference sequence is specified and is not the first
  * sequence in the file, then move the reference sequence to the head of the list. */
 {
 verbose(2, "Reading sequences from %s\n", faFile);
 struct dnaSeq *sequences = faReadAllMixed(faFile);
 int seqCount = slCount(sequences);
 verbose(2, "Read %d sequences.\n", seqCount);
 
 if (seqCount < 2)
     errAbort("faToVcf: expecting multiple sequences in %s but found only %d.", faFile, seqCount);
 
 int seqSize = sequences->size;
 struct dnaSeq *seq;
@@ -181,30 +249,31 @@
         {
         if (sameString(seq->next->name, refName))
             {
             struct dnaSeq *ref = seq->next;
             seq->next = ref->next;
             ref->next = sequences;
             sequences = ref;
             break;
             }
         }
     if (differentString(sequences->name, refName))
         errAbort("Could not find -ref sequence '%s' in %s.", refName, faFile);
     }
 
 sequences = removeExcludedSequences(sequences);
+maskAmbigInWindow(sequences);
 sequences = filterMaxDiff(sequences);
 return sequences;
 }
 
 static void writeVcfHeader(FILE *outF, char *faFile, char *vcfFile,
                            struct dnaSeq *sequences, boolean includeRef, boolean includeGenotypes)
 /* Write a simple VCF header with sequence names as genotype columns. */
 {
 fprintf(outF,
         "##fileformat=VCFv4.2\n"
         "##reference=%s:%s\n"
         "##source=faToVcf %s %s\n"  //#*** Should document options used.
         "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO",
         faFile, sequences->name, faFile, vcfFile);
 if (includeGenotypes)