1562b816fff70fb76a2c67341e5488c562e51857
chmalee
  Fri Sep 25 14:52:52 2020 -0700
Code cleanup in vcfToBed and fix wrong array length variable for looping over info fields, refs #25010

diff --git src/hg/utils/vcfToBed/vcfToBed.c src/hg/utils/vcfToBed/vcfToBed.c
index 605b46e..ad7b49c 100644
--- src/hg/utils/vcfToBed/vcfToBed.c
+++ src/hg/utils/vcfToBed/vcfToBed.c
@@ -1,26 +1,26 @@
 /* vcfToBed - Convert VCF to BED9+ with optional extra fields.
  * Extra VCF tags get placed into a separate tab file for later indexing.. */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "vcf.h"
 #include "dnautil.h"
 
 #define MAX_BED_EXTRA 50 // how many extra fields  can we put into the bigBed itself (for filtering, labels, etc)
-#define bed9Header "#chrom\tchromStart\tchromEnd\tname\tscore\tstrand\tthickStart\tthickEnd\titemRgb\tref\talt"
+#define bed9Header "#chrom\tchromStart\tchromEnd\tname\tscore\tstrand\tthickStart\tthickEnd\titemRgb\tref\talt\tFILTER"
 
 // hash up all the info tags and their vcfInfoDef structs for faster searching
 struct hash *infoHash = NULL;
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
     "vcfToBed - Convert VCF to BED9+ with optional extra fields.\n"
     "usage:\n"
     "    vcfToBed in.vcf outPrefix\n"
     "options:\n"
     "    -fields=comma-sep list of tags to include in the bed file, other fields will be placed into\n"
     "           out.extraFields.tab\n"
     "\n"
@@ -55,144 +55,156 @@
 return keepIx;
 }
 
 char *fixupChromName(char *chrom)
 /* Prepend "chr" if missing */
 {
 if (!startsWith(chrom, "chr"))
     return catTwoStrings("chr", chrom);
 else
     return chrom;
 }
 
 #define VCF_MAX_INFO (4*1024)
 
 void fixupLeftPadding(int *start, char *ref, char *alt)
-/* If an indel is left padded fix up the star position */
+/* If an indel is left padded fix up the start position */
 {
 char *altTmp = cloneString(alt);
 char *alleles[VCF_MAX_INFO+1];
 alleles[0] = cloneString(ref);
 char *altAlleles[VCF_MAX_INFO];
 int altCount = chopCommas(altTmp, altAlleles);
 int alleleCount = 1 + altCount;
 if (alleleCount > 1)
     {
     int i;
     for (i = 0; i < altCount; i++)
         alleles[1+i] = altAlleles[i];
     boolean hasPaddingBase = allelesHavePaddingBase(alleles, alleleCount);
     if (hasPaddingBase)
         (*start)++;
     }
 }
 
+void shortenRefAltString(struct dyString *shortenedRef, struct dyString *shortenedAlt, char *ref, char *alt)
+/* Shorten up the Ref and Alt strings for bed labels */
+{
+int i;
+if (strlen(ref) > 10)
+    {
+    for (i = 0; i < 10; i++)
+        dyStringAppendC(shortenedRef, ref[i]);
+    dyStringAppend(shortenedRef, "...");
+    }
+else
+    shortenedRef->string = ref;
+if (strlen(alt) > 10)
+    {
+    for (i = 0; i < 10; i++)
+        dyStringAppendC(shortenedAlt, alt[i]);
+    dyStringAppend(shortenedAlt, "...");
+    }
+else
+    shortenedAlt->string = alt;
+}
+
+void printExtraFieldInfo(FILE *outBed, char *info, char **infoTags, int infoCount, char **keepFields, int extraFieldCount)
+{
+int i,j;
+int numTags = chopByChar(info, ';', infoTags, infoCount);
+for (i = 0; i < extraFieldCount; i++)
+    {
+    char *extraField = keepFields[i];
+    for (j = 0; j < numTags && infoTags[j]; j++)
+        {
+        boolean isKeyVal = (strchr(infoTags[j], '=') != NULL);
+        if (isKeyVal && startsWithWordByDelimiter(extraField, '=', infoTags[j]))
+            {
+            char *infoTagVal = infoTags[j];
+            char *val = strchr(infoTagVal, '=');
+            *(val++) = '\0';
+            if (sameString(infoTagVal, extraField))
+                {
+                fprintf(outBed, "%s", val);
+                break;
+                }
+            }
+        else if (sameString(infoTags[j], extraField))
+            {
+            fprintf(outBed, "Yes");
+            break;
+            }
+        }
+    if (i < (extraFieldCount - 1))
+        fprintf(outBed, "\t");
+    }
+}
+
 void printHeaders(struct vcfFile *vcff, char **keepFields, int keepCount, FILE *outBed)
-/* Print the required header into extraFields.tab, and put a technically optional
- * comment into outBed */
+/* Print a comment describing all the fields into outBed */
 {
 int i;
 fprintf(outBed, "%s", bed9Header);
 for (i = 0; i < keepCount; i++)
     if (keepFields[i] != NULL)
         fprintf(outBed, "\t%s", keepFields[i]);
 fprintf(outBed, "\n");
 }
 
 void vcfLinesToBed(struct vcfFile *vcff, char **keepFields, int extraFieldCount,
                     FILE *outBed) //, FILE *outExtra)
 /* Turn a VCF line into a bed9+ line */
 {
 struct dyString *name = dyStringNew(0);
-struct dyString *maybeShortenedRef = dyStringNew(0);
-struct dyString *maybeShortenedAlt = dyStringNew(0);
+struct dyString *shortenedRef = dyStringNew(0);
+struct dyString *shortenedAlt = dyStringNew(0);
 char *line = NULL;
-char *chopped[9];
+char *chopped[9]; // 9 fields in case VCF has genotype fields, keep them separate from info fields
 int infoCount = slCount(vcff->infoDefs);
 char *infoTags[infoCount];
-int i;
 while (lineFileNext(vcff->lf, &line, NULL))
     {
     int fieldCount = chopTabs(line, chopped);
     if (fieldCount < 8)
         errAbort("ERROR: malformed VCF, missing fields at line: '%d'", vcff->lf->lineIx);
     char *chrom = fixupChromName(chopped[0]);
     int start = atoi(chopped[1]);
     int end = start + 1;
     char *ref = chopped[3];
     char *alt = chopped[4];
     if (strlen(ref) == dnaFilteredSize(ref))
         end = start + strlen(ref);
     fixupLeftPadding(&start, ref, alt);
     if (!sameString(chopped[2], "."))
         dyStringPrintf(name, "%s", chopped[2]);
     else
         {
-        if (strlen(ref) > 10)
-            {
-            for (i = 0; i < 10; i++)
-                dyStringAppendC(maybeShortenedRef, ref[i]);
-            dyStringAppend(maybeShortenedRef, "...");
-            }
-        else
-            maybeShortenedRef->string = ref;
-        if (strlen(alt) > 10)
-            {
-            for (i = 0; i < 10; i++)
-                dyStringAppendC(maybeShortenedAlt, alt[i]);
-            dyStringAppend(maybeShortenedAlt, "...");
-            }
-        else
-            maybeShortenedAlt->string = alt;
-        dyStringPrintf(name, "%s_%d:%s/%s", chrom,start,maybeShortenedRef->string,maybeShortenedAlt->string);
+        shortenRefAltString(shortenedRef, shortenedAlt, ref, alt);
+        dyStringPrintf(name, "%s_%d:%s/%s", chrom,start,shortenedRef->string,shortenedAlt->string);
         }
     fprintf(outBed, "%s\t%d\t%d\t%s\t0\t.\t%d\t%d\t0,0,0", chrom,start,end,name->string,start,end);
-    fprintf(outBed, "\t%s\t%s", ref, alt);
+    fprintf(outBed, "\t%s\t%s\t%s", ref, alt, chopped[6]);
     if (extraFieldCount)
         {
         fprintf(outBed, "\t");
-        chopByChar(chopped[7], ';', infoTags, infoCount);
-        int i;
-        for (i = 0; i < extraFieldCount; i++)
-            {
-            char *extraField = keepFields[i];
-            char *infoTagVal = NULL;
-            int j;
-            for (j = 0; j < infoCount; j++)
-                {
-                if (infoTags[j] && startsWith(extraField, infoTags[j]))
-                    {
-                    infoTagVal = cloneString(infoTags[j]);
-                    break;
-                    }
-                }
-            if (infoTagVal)
-                {
-                char *val = strchr(infoTagVal, '=');
-                if (val)
-                    {
-                    val += 1;
-                    fprintf(outBed, "%s", val);
-                    }
-                }
-            fprintf(outBed, "\t");
-            }
+        printExtraFieldInfo(outBed, chopped[7], infoTags, infoCount, keepFields, extraFieldCount);
         }
     fprintf(outBed, "\n");
     dyStringClear(name);
-    dyStringClear(maybeShortenedRef);
-    dyStringClear(maybeShortenedAlt);
+    dyStringClear(shortenedRef);
+    dyStringClear(shortenedAlt);
     }
 }
 
 void vcfToBed(char *vcfFileName, char *outPrefix, char *tagsToKeep)
 /* vcfToBed - Convert VCF to BED9+ with optional extra fields.
  * Extra VCF tags get placed into a separate tab file for later indexing. */
 {
 FILE *outBed;
 struct vcfFile *vcff = NULL;
 char tbiFile[4096];
 int i;
 int vcfTagCount = 0; // no need for extra file if there's only a few tags
 int keepCount = 0; // count of comma-sep tagsToKeep
 char *keepFields[MAX_BED_EXTRA]; // Max 50 extra fields to put into bigBed, also needed for
                                  // comment string header