src/hg/utils/faToVcf/faToVcf.c 0410ce3f2bff7c04762f60551db166f4cae460f7

0410ce3f2bff7c04762f60551db166f4cae460f7
angie
  Tue Jun 22 10:13:57 2021 -0700
faToVcf: added -maxDiff to exclude sequences with too many mismatches w/reference (causes usher mem usage increase, no good placement).

diff --git src/hg/utils/faToVcf/faToVcf.c src/hg/utils/faToVcf/faToVcf.c
index 8c9cbf3..454cb1e 100644
--- src/hg/utils/faToVcf/faToVcf.c
+++ src/hg/utils/faToVcf/faToVcf.c
@@ -9,55 +9,57 @@
 
 void usage(int exitCode)
 /* Explain usage and exit. */
 {
 fputs(
   "faToVcf - Convert a FASTA alignment file to Variant Call Format (VCF) single-nucleotide diffs\n"
   "usage:\n"
   "   faToVcf in.fa out.vcf\n"
   "options:\n"
   "   -ambiguousToN         Treat all IUPAC ambiguous bases (N, R, V etc) as N (no call).\n"
   "   -excludeFile=file     Exclude sequences named in file which has one sequence name per line\n"
   "   -includeRef           Include the reference in the genotype columns\n"
   "                         (default: omitted as redundant)\n"
   "   -maskSites=file       Exclude variants in positions recommended for masking in file\n"
   "                         (typically https://github.com/W-L/ProblematicSites_SARS-CoV2/raw/master/problematic_sites_sarsCov2.vcf)\n"
+  "   -maxDiff=N            Exclude sequences with more than N mismatches with the reference\n"
   "   -minAc=N              Ignore alternate alleles observed fewer than N times\n"
   "   -minAf=F              Ignore alternate alleles observed in less than F of non-N bases\n"
   "   -noGenotypes          Output 8-column VCF, without the sample genotype columns\n"
   "   -ref=seqName          Use seqName as the reference sequence; must be present in faFile\n"
   "                         (default: first sequence in faFile)\n"
   "   -resolveAmbiguous     For IUPAC ambiguous characters like R (A or G), if the character\n"
   "                         represents two bases and one is the reference base, convert it to the\n"
   "                         non-reference base.  Otherwise convert it to N.\n"
   "   -startOffset=N        Add N bases to each position (for trimmed alignments)\n"
   "   -vcfChrom=seqName     Use seqName for the CHROM column in VCF (default: ref sequence)\n"
   "in.fa must contain a series of sequences with different names and the same length.\n"
   "Both N and - are treated as missing information.\n"
   , stderr
   );
 exit(exitCode);
 }
 
 #define MAX_ALTS 256
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
     { "ambiguousToN", OPTION_BOOLEAN },
     { "excludeFile", OPTION_STRING },
     { "includeRef", OPTION_BOOLEAN },
     { "maskSites", OPTION_STRING },
+    { "maxDiff", OPTION_INT },
     { "minAc", OPTION_INT },
     { "minAf", OPTION_DOUBLE },
     { "noGenotypes", OPTION_BOOLEAN },
     { "ref", OPTION_STRING },
     { "resolveAmbiguous", OPTION_BOOLEAN },
     { "startOffset", OPTION_INT},
     { "vcfChrom", OPTION_STRING },
     { "h", OPTION_BOOLEAN },
     { "-help", OPTION_BOOLEAN },
     { NULL, 0 },
 };
 
 static struct dnaSeq *removeExcludedSequences(struct dnaSeq *sequences)
 /* If -excludeFile was passed in, remove any sequences whose names are found in the file and
  * return the modified list. */
@@ -87,30 +89,75 @@
         nextSeq = seq->next;
         if (hashLookup(excludedSeqs, seq->name))
             excludeCount++;
         else
             slAddHead(&newList, seq);
         }
     hashFree(&excludedSeqs);
     slReverse(&newList);
     sequences = newList;
     verbose(2, "Excluded %d sequences named in %s (%d sequences remaining including reference)\n",
             excludeCount, excludeFile, slCount(sequences));
     }
 return sequences;
 }
 
+static int countDiffs(struct dnaSeq *ref, struct dnaSeq *seq)
+/* Return the number of bases that differ between ref and seq ignoring 'N'. */
+{
+if (ref->size != seq->size)
+    errAbort("countDiffs: expecting equally sized sequences but %s size %d != %s size %d",
+             ref->name, ref->size, seq->name, seq->size);
+int diffs = 0;
+int i;
+for (i = 0;  i < ref->size;  i++)
+    {
+    char refBase = toupper(ref->dna[i]);
+    char seqBase = toupper(seq->dna[i]);
+    if (refBase != 'N' && seqBase != 'N' && seqBase != refBase)
+        diffs++;
+    }
+return diffs;
+}
+
+static struct dnaSeq *filterMaxDiff(struct dnaSeq *sequences)
+/* If -maxDiff was passed in, remove any sequences with more than that number of differences
+ * from the reference (ignoring Ns but not IUPAC ambiguous bases). */
+{
+int maxDiff = optionInt("maxDiff", 0);
+if (maxDiff > 0)
+    {
+    int excludeCount = 0;
+    struct dnaSeq *ref = sequences;
+    struct dnaSeq *newList = NULL, *seq, *nextSeq = NULL;
+    for (seq = sequences;  seq != NULL;  seq = nextSeq)
+        {
+        nextSeq = seq->next;
+        int diff = countDiffs(ref, seq);
+        if (diff > maxDiff)
+            excludeCount++;
+        else
+            slAddHead(&newList, seq);
+        }
+    slReverse(&newList);
+    sequences = newList;
+    verbose(2, "Excluded %d sequences with >%d differences with the reference (%d sequences remaining including reference)\n",
+            excludeCount, maxDiff, slCount(sequences));
+    }
+return sequences;
+}
+
 static struct dnaSeq *readSequences(char *faFile)
 /* Read all sequences in faFile.  Make sure there are at least 2 sequences and that
  * all have the same length.  If a reference sequence is specified and is not the first
  * sequence in the file, then move the reference sequence to the head of the list. */
 {
 verbose(2, "Reading sequences from %s\n", faFile);
 struct dnaSeq *sequences = faReadAllMixed(faFile);
 int seqCount = slCount(sequences);
 verbose(2, "Read %d sequences.\n", seqCount);
 
 if (seqCount < 2)
     errAbort("faToVcf: expecting multiple sequences in %s but found only %d.", faFile, seqCount);
 
 int seqSize = sequences->size;
 struct dnaSeq *seq;
@@ -131,30 +178,31 @@
         {
         if (sameString(seq->next->name, refName))
             {
             struct dnaSeq *ref = seq->next;
             seq->next = ref->next;
             ref->next = sequences;
             sequences = ref;
             break;
             }
         }
     if (differentString(sequences->name, refName))
         errAbort("Could not find -ref sequence '%s' in %s.", refName, faFile);
     }
 
 sequences = removeExcludedSequences(sequences);
+sequences = filterMaxDiff(sequences);
 return sequences;
 }
 
 static void writeVcfHeader(FILE *outF, char *faFile, char *vcfFile,
                            struct dnaSeq *sequences, boolean includeRef, boolean includeGenotypes)
 /* Write a simple VCF header with sequence names as genotype columns. */
 {
 fprintf(outF,
         "##fileformat=VCFv4.2\n"
         "##reference=%s:%s\n"
         "##source=faToVcf %s %s\n"  //#*** Should document options used.
         "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO",
         faFile, sequences->name, faFile, vcfFile);
 if (includeGenotypes)
     {