55093dc24432e86e100890a206c4bab473e74276
angie
  Fri Oct 16 15:59:25 2020 -0700
faToVcf: add option -maskSites to remove variants at positions recommended for masking in in Problematic Sites VCF file.

diff --git src/hg/utils/faToVcf/faToVcf.c src/hg/utils/faToVcf/faToVcf.c
index 9cda34f..ed9e691 100644
--- src/hg/utils/faToVcf/faToVcf.c
+++ src/hg/utils/faToVcf/faToVcf.c
@@ -6,51 +6,54 @@
 #include "iupac.h"
 #include "options.h"
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "faToVcf - Convert a FASTA alignment file to Variant Call Format (VCF) single-nucleotide diffs\n"
   "usage:\n"
   "   faToVcf in.fa out.vcf\n"
   "options:\n"
   "   -ambiguousToN         Treat all IUPAC ambiguous bases (N, R, V etc) as N (no call).\n"
   "   -excludeFile=file     Exclude sequences named in file which has one sequence name per line\n"
   "   -includeRef           Include the reference in the genotype columns\n"
   "                         (default: omitted as redundant)\n"
+  "   -maskSites=file       Exclude variants in positions recommended for masking in file\n"
+  "                         (typically https://github.com/W-L/ProblematicSites_SARS-CoV2/raw/master/problematic_sites_sarsCov2.vcf)\n"
   "   -minAc=N              Ignore alternate alleles observed fewer than N times\n"
   "   -noGenotypes          Output 8-column VCF, without the sample genotype columns\n"
   "   -ref=seqName          Use seqName as the reference sequence; must be present in faFile\n"
   "                         (default: first sequence in faFile)\n"
   "   -resolveAmbiguous     For IUPAC ambiguous characters like R (A or G), if the character\n"
   "                         represents two bases and one is the reference base, convert it to the\n"
   "                         non-reference base.  Otherwise convert it to N.\n"
   "   -startOffset=N        Add N bases to each position (for trimmed alignments)\n"
   "   -vcfChrom=seqName     Use seqName for the CHROM column in VCF (default: ref sequence)\n"
   "in.fa must contain a series of sequences with different names and the same length.\n"
   "Both N and - are treated as missing information.\n"
   );
 }
 
 #define MAX_ALTS 256
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
     { "ambiguousToN", OPTION_BOOLEAN },
     { "excludeFile", OPTION_STRING },
     { "includeRef", OPTION_BOOLEAN },
+    { "maskSites", OPTION_STRING },
     { "minAc", OPTION_INT },
     { "noGenotypes", OPTION_BOOLEAN },
     { "ref", OPTION_STRING },
     { "resolveAmbiguous", OPTION_BOOLEAN },
     { "startOffset", OPTION_INT},
     { "vcfChrom", OPTION_STRING },
     { NULL, 0 },
 };
 
 static struct dnaSeq *removeExcludedSequences(struct dnaSeq *sequences)
 /* If -excludeFile was passed in, remove any sequences whose names are found in the file and
  * return the modified list. */
 {
 char *excludeFile = optionVal("excludeFile", NULL);
 if (excludeFile)
@@ -134,72 +137,112 @@
 return sequences;
 }
 
 static void writeVcfHeader(FILE *outF, char *faFile, char *vcfFile,
                            struct dnaSeq *sequences, boolean includeRef, boolean includeGenotypes)
 /* Write a simple VCF header with sequence names as genotype columns. */
 {
 fprintf(outF,
         "##fileformat=VCFv4.2\n"
         "##reference=%s:%s\n"
         "##source=faToVcf %s %s\n"  //#*** Should document options used.
         "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO",
         faFile, sequences->name, faFile, vcfFile);
 if (includeGenotypes)
     {
-    printf("\tFORMAT");
+    fprintf(outF, "\tFORMAT");
     struct dnaSeq *seqsForGt = (includeRef ? sequences : sequences->next), *seq;
     for (seq = seqsForGt;  seq != NULL;  seq = seq->next)
         fprintf(outF, "\t%s", seq->name);
     }
 fprintf(outF, "\n");
 }
 
+static boolean *getMaskSites(char *maskSitesFile, struct dnaSeq *ref)
+/* If -maskSites=file is given, parse sites to be masked from file (expecting VCF with 'mask' in
+ * the filter column at reference positions to be masked) and return an array of booleans indexed
+ * by reference position, TRUE for mask.  Otherwise return NULL. */
+{
+boolean *maskSites = NULL;
+if (maskSitesFile)
+    {
+    int chromSize = strlen(ref->dna) - countChars(ref->dna, '-');
+    AllocArray(maskSites, chromSize);
+    struct lineFile *lf = lineFileOpen(maskSitesFile, TRUE);
+    char *line;
+    while (lineFileNext(lf, &line, NULL))
+        {
+        if (line[0] == '#')
+            continue;
+        char *words[9];
+        int wordCount = chopTabs(line, words);
+        lineFileExpectWords(lf, 8, wordCount);
+        if (!isAllDigits(words[1]))
+            lineFileAbort(lf, "Expected second column to contain numeric position but got '%s'",
+                          words[1]);
+        int pos = atol(words[1]);
+        if (pos < 1 || pos > chromSize)
+            {
+            warn("Warning line %d of %s: "
+                 "Expected second column to contain positions between 1 and %d but got %d",
+                 lf->lineIx, lf->fileName, chromSize, pos);
+            }
+        else if (sameString(words[6], "mask"))
+            maskSites[pos-1] = TRUE;
+        }
+    lineFileClose(&lf);
+    }
+return maskSites;
+}
+
 void faToVcf(char *faFile, char *vcfFile)
 /* faToVcf - Convert a FASTA alignment file to VCF. */
 {
 struct dnaSeq *sequences = readSequences(faFile);
 int seqCount = slCount(sequences);
 int seqSize = sequences->size;
 boolean includeRef = optionExists("includeRef");
 
 verbose(2, "Writing VCF to %s\n", vcfFile);
 FILE *outF = mustOpen(vcfFile, "w");
 boolean noGenotypes = optionExists("noGenotypes");
 writeVcfHeader(outF, faFile, vcfFile, sequences, includeRef, !noGenotypes);
 
-char *refName = sequences->name;
-char *vcfChrom = optionVal("vcfChrom", refName);
+struct dnaSeq *refSeq = sequences;
+char *vcfChrom = optionVal("vcfChrom", refSeq->name);
 int startOffset = optionInt("startOffset", 0);
+boolean *maskSites = getMaskSites(optionVal("maskSites", NULL), refSeq);
 
 struct dnaSeq *seqsForGt = (includeRef ? sequences : sequences->next);
 int gtCount = includeRef ? seqCount : seqCount - 1;
 int *genotypes = needMem(sizeof(int) * gtCount);
 boolean *missing = needMem(sizeof(boolean) * gtCount);
 
 int baseIx, chromStart = 0;
 for (baseIx = 0, chromStart = 0;  baseIx < seqSize;  baseIx++, chromStart++)
     {
-    char ref = toupper(sequences->dna[baseIx]);
+    char ref = toupper(refSeq->dna[baseIx]);
     if (ref == '-')
         {
         // Insertion into reference -- ignore and adjust reference coordinate.
         chromStart--;
         continue;
         }
     if (ref == 'N')
         continue;
+    if (maskSites && maskSites[chromStart])
+        continue;
     if (ref == 'U')
         ref = 'T';
     char altAlleles[MAX_ALTS];
     int altAlleleCounts[MAX_ALTS];
     int altCount = 0;
     memset(altAlleles, 0, sizeof(altAlleles));
     memset(genotypes, 0, sizeof(int) * gtCount);
     memset(missing, 0, sizeof(boolean) * gtCount);
     int nonNCount = seqCount;
     struct dnaSeq *seq;
     int gtIx;
     for (seq = seqsForGt, gtIx = 0;  seq != NULL;  seq = seq->next, gtIx++)
         {
         char al = toupper(seq->dna[baseIx]);
         if (al == 'U')