0083ef3c88c18261d0cf52ec3259a1add58c9d8f
angie
  Sun Sep 27 18:26:24 2020 -0700
Added -noGenotypes option.  Handle insertions in reference sequence.  Don't count N bases in INFO column AN.  refs MLQ #26227

diff --git src/hg/utils/faToVcf/faToVcf.c src/hg/utils/faToVcf/faToVcf.c
index 3d4fafc..9cda34f 100644
--- src/hg/utils/faToVcf/faToVcf.c
+++ src/hg/utils/faToVcf/faToVcf.c
@@ -7,50 +7,52 @@
 #include "options.h"
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "faToVcf - Convert a FASTA alignment file to Variant Call Format (VCF) single-nucleotide diffs\n"
   "usage:\n"
   "   faToVcf in.fa out.vcf\n"
   "options:\n"
   "   -ambiguousToN         Treat all IUPAC ambiguous bases (N, R, V etc) as N (no call).\n"
   "   -excludeFile=file     Exclude sequences named in file which has one sequence name per line\n"
   "   -includeRef           Include the reference in the genotype columns\n"
   "                         (default: omitted as redundant)\n"
   "   -minAc=N              Ignore alternate alleles observed fewer than N times\n"
+  "   -noGenotypes          Output 8-column VCF, without the sample genotype columns\n"
   "   -ref=seqName          Use seqName as the reference sequence; must be present in faFile\n"
   "                         (default: first sequence in faFile)\n"
   "   -resolveAmbiguous     For IUPAC ambiguous characters like R (A or G), if the character\n"
   "                         represents two bases and one is the reference base, convert it to the\n"
   "                         non-reference base.  Otherwise convert it to N.\n"
   "   -startOffset=N        Add N bases to each position (for trimmed alignments)\n"
   "   -vcfChrom=seqName     Use seqName for the CHROM column in VCF (default: ref sequence)\n"
   "in.fa must contain a series of sequences with different names and the same length.\n"
   "Both N and - are treated as missing information.\n"
   );
 }
 
 #define MAX_ALTS 256
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
     { "ambiguousToN", OPTION_BOOLEAN },
     { "excludeFile", OPTION_STRING },
     { "includeRef", OPTION_BOOLEAN },
     { "minAc", OPTION_INT },
+    { "noGenotypes", OPTION_BOOLEAN },
     { "ref", OPTION_STRING },
     { "resolveAmbiguous", OPTION_BOOLEAN },
     { "startOffset", OPTION_INT},
     { "vcfChrom", OPTION_STRING },
     { NULL, 0 },
 };
 
 static struct dnaSeq *removeExcludedSequences(struct dnaSeq *sequences)
 /* If -excludeFile was passed in, remove any sequences whose names are found in the file and
  * return the modified list. */
 {
 char *excludeFile = optionVal("excludeFile", NULL);
 if (excludeFile)
     {
     struct lineFile *lf = lineFileOpen(excludeFile, TRUE);
@@ -121,116 +123,130 @@
             seq->next = ref->next;
             ref->next = sequences;
             sequences = ref;
             break;
             }
         }
     if (differentString(sequences->name, refName))
         errAbort("Could not find -ref sequence '%s' in %s.", refName, faFile);
     }
 
 sequences = removeExcludedSequences(sequences);
 return sequences;
 }
 
 static void writeVcfHeader(FILE *outF, char *faFile, char *vcfFile,
-                           struct dnaSeq *sequences, boolean includeRef)
+                           struct dnaSeq *sequences, boolean includeRef, boolean includeGenotypes)
 /* Write a simple VCF header with sequence names as genotype columns. */
 {
 fprintf(outF,
         "##fileformat=VCFv4.2\n"
         "##reference=%s:%s\n"
         "##source=faToVcf %s %s\n"  //#*** Should document options used.
-        "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT",
+        "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO",
         faFile, sequences->name, faFile, vcfFile);
-
+if (includeGenotypes)
+    {
+    printf("\tFORMAT");
     struct dnaSeq *seqsForGt = (includeRef ? sequences : sequences->next), *seq;
     for (seq = seqsForGt;  seq != NULL;  seq = seq->next)
         fprintf(outF, "\t%s", seq->name);
+    }
 fprintf(outF, "\n");
 }
 
 void faToVcf(char *faFile, char *vcfFile)
 /* faToVcf - Convert a FASTA alignment file to VCF. */
 {
 struct dnaSeq *sequences = readSequences(faFile);
 int seqCount = slCount(sequences);
 int seqSize = sequences->size;
 boolean includeRef = optionExists("includeRef");
 
 verbose(2, "Writing VCF to %s\n", vcfFile);
 FILE *outF = mustOpen(vcfFile, "w");
-writeVcfHeader(outF, faFile, vcfFile, sequences, includeRef);
+boolean noGenotypes = optionExists("noGenotypes");
+writeVcfHeader(outF, faFile, vcfFile, sequences, includeRef, !noGenotypes);
 
 char *refName = sequences->name;
 char *vcfChrom = optionVal("vcfChrom", refName);
 int startOffset = optionInt("startOffset", 0);
 
 struct dnaSeq *seqsForGt = (includeRef ? sequences : sequences->next);
 int gtCount = includeRef ? seqCount : seqCount - 1;
 int *genotypes = needMem(sizeof(int) * gtCount);
 boolean *missing = needMem(sizeof(boolean) * gtCount);
 
-int chromStart;
-for (chromStart = 0;  chromStart < seqSize;  chromStart++)
+int baseIx, chromStart = 0;
+for (baseIx = 0, chromStart = 0;  baseIx < seqSize;  baseIx++, chromStart++)
     {
-    char ref = toupper(sequences->dna[chromStart]);
+    char ref = toupper(sequences->dna[baseIx]);
+    if (ref == '-')
+        {
+        // Insertion into reference -- ignore and adjust reference coordinate.
+        chromStart--;
+        continue;
+        }
     if (ref == 'N')
         continue;
     if (ref == 'U')
         ref = 'T';
     char altAlleles[MAX_ALTS];
     int altAlleleCounts[MAX_ALTS];
     int altCount = 0;
     memset(altAlleles, 0, sizeof(altAlleles));
     memset(genotypes, 0, sizeof(int) * gtCount);
     memset(missing, 0, sizeof(boolean) * gtCount);
+    int nonNCount = seqCount;
     struct dnaSeq *seq;
     int gtIx;
     for (seq = seqsForGt, gtIx = 0;  seq != NULL;  seq = seq->next, gtIx++)
         {
-        char al = toupper(seq->dna[chromStart]);
+        char al = toupper(seq->dna[baseIx]);
         if (al == 'U')
             al = 'T';
         if (isIupacAmbiguous(al))
             {
             if (optionExists("ambiguousToN"))
                 al = 'N';
             else if (optionExists("resolveAmbiguous"))
                 {
                 char *bases = iupacAmbiguousToString(al);
                 if (strlen(bases) > 2 ||
                     (toupper(bases[0]) != ref && toupper(bases[1]) != ref))
                     al = 'N';
                 else if (toupper(bases[0]) == ref)
                     al = toupper(bases[1]);
                 else
                     al = toupper(bases[0]);
                 }
             }
         if (al == 'N' || al == '-')
+            {
             missing[gtIx] = TRUE;
+            nonNCount--;
+            }
         else if (al != ref)
             {
             char *altFound = strchr(altAlleles, al);
             int altIx = 0;
             if (! altFound)
                 {
                 altIx = altCount++;
                 if (altCount == MAX_ALTS)
-                    errAbort("faToVcf: base %d has at least %d alternate alleles; increase "
-                             "MAX_ALTS in faToVcf.c!", chromStart, MAX_ALTS);
+                    errAbort("faToVcf: fasta base %d (reference base %d) has at least %d alternate alleles; increase "
+                             "MAX_ALTS in faToVcf.c!", baseIx, chromStart, MAX_ALTS);
                 altAlleleCounts[altIx] = 1;
                 altAlleles[altIx] = al;
                 }
             else
                 {
                 altIx = altFound - altAlleles;
                 altAlleleCounts[altIx]++;
                 }
             genotypes[gtIx] = altIx+1;
             }
         }
     int minAc = optionInt("minAc", 0);
     if (minAc > 0)
         {
         int oldToNewIx[altCount];
@@ -242,31 +258,34 @@
             {
             if (altAlleleCounts[altIx] >= minAc)
                 {
                 // This alt passes the filter.
                 int newAltIx = newAltCount++;
                 newAltAlleleCounts[newAltIx] = altAlleleCounts[altIx];
                 newAltAlleles[newAltIx] = altAlleles[altIx];
                 oldToNewIx[altIx] = newAltIx;
                 }
             else
                 {
                 // We're removing this alt; count its genotypes as missing since they're not
                 // really reference.
                 for (gtIx = 0;  gtIx < gtCount;  gtIx++)
                     if (genotypes[gtIx] == altIx+1)
+                        {
                         missing[gtIx] = TRUE;
+                        nonNCount--;
+                        }
                 oldToNewIx[altIx] = -1;
                 }
             }
         if (newAltCount != altCount)
             {
             for (gtIx = 0;  gtIx < gtCount;  gtIx++)
                 {
                 int oldGt = genotypes[gtIx];
                 if (oldGt == 0)
                     genotypes[gtIx] = 0;
                 else
                     genotypes[gtIx] = oldToNewIx[oldGt-1] + 1;
                 }
             altCount = newAltCount;
             for (altIx = 0;  altIx < newAltCount;  altIx++)
@@ -286,38 +305,42 @@
                     ref, pos, altAlleles[altIx]);
         fprintf(outF, "\t%c\t", ref);
         for (altIx = 0;  altIx < altCount;  altIx++)
             {
             if (altIx > 0)
                 fprintf(outF, ",");
             fprintf(outF, "%c", altAlleles[altIx]);
             }
         fprintf(outF, "\t.\t.\tAC=");
         for (altIx = 0;  altIx < altCount;  altIx++)
             {
             if (altIx > 0)
                 fprintf(outF, ",");
             fprintf(outF, "%d", altAlleleCounts[altIx]);
             }
-        fprintf(outF, ";AN=%d\tGT", seqCount);
+        fprintf(outF, ";AN=%d", nonNCount);
+        if (!noGenotypes)
+            {
+            fputs("\tGT", outF);
             for (gtIx = 0;  gtIx < gtCount;  gtIx++)
                 {
                 if (missing[gtIx])
                     fprintf(outF, "\t.");
                 else
                     fprintf(outF, "\t%d", genotypes[gtIx]);
                 }
+            }
         fprintf(outF, "\n");
         }
     }
 carefulClose(&outF);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (optionExists("ambiguousToN") && optionExists("resolveAmbiguous"))
     errAbort("-ambiguousToN and -resolveAmbiguous conflict with each other; "
              "please use only one.");
 if (argc != 3)
     usage();