13db5365510728c1e96d96110a6fe86065949b8a
chmalee
  Thu Sep 24 16:17:54 2020 -0700
Make vcfToBed way more basic  and don't do any special parsing of VCF specific stuff, just chop out requested fields, refs #25010

diff --git src/hg/utils/vcfToBed/vcfToBed.c src/hg/utils/vcfToBed/vcfToBed.c
index 6b9f6bc..605b46e 100644
--- src/hg/utils/vcfToBed/vcfToBed.c
+++ src/hg/utils/vcfToBed/vcfToBed.c
@@ -1,282 +1,254 @@
 /* vcfToBed - Convert VCF to BED9+ with optional extra fields.
  * Extra VCF tags get placed into a separate tab file for later indexing.. */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "vcf.h"
+#include "dnautil.h"
 
 #define MAX_BED_EXTRA 50 // how many extra fields  can we put into the bigBed itself (for filtering, labels, etc)
-#define bed9Header "#chrom\tchromStart\tchromEnd\tname\tscore\tstrand\tthickStart\tthickEnd\titemRgb"
+#define bed9Header "#chrom\tchromStart\tchromEnd\tname\tscore\tstrand\tthickStart\tthickEnd\titemRgb\tref\talt"
 
 // hash up all the info tags and their vcfInfoDef structs for faster searching
 struct hash *infoHash = NULL;
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
     "vcfToBed - Convert VCF to BED9+ with optional extra fields.\n"
     "usage:\n"
     "    vcfToBed in.vcf outPrefix\n"
     "options:\n"
     "    -fields=comma-sep list of tags to include in the bed file, other fields will be placed into\n"
     "           out.extraFields.tab\n"
     "\n"
     "NOTE: Extra VCF tags (that aren't listed in -fields)  get placed into a separate tab\n"
     "file for later indexing.\n"
   );
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
     {"fields", OPTION_STRING},
     {NULL, 0},
 };
 
-void hashInfoTags(struct vcfFile *vcff)
-{
-if (!infoHash)
-    infoHash = hashNew(0);
-struct vcfInfoDef *def = NULL;
-for (def = vcff->infoDefs; def != NULL; def = def->next)
-    {
-    hashAdd(infoHash, def->key, def);
-    }
-}
-
 int trimMissingTagsFromFieldList(char **keepFields, char **tempKeepFields, int keepCount, struct vcfFile *vcff)
 /* If there are requested fields in keepFields that don't exist in vcff, warn about them
  * and remove them from the keepFields array. Return number of valid entries */
 {
 int i = 0;
 int keepIx = 0;
 for (; i < keepCount; i++)
     {
     char *tag = tempKeepFields[i];
     struct vcfInfoDef *def = NULL;
     if ( (def = vcfInfoDefForKey(vcff, tag)) == NULL)
         verbose(2, "Warning: skipping desired tag '%s', does not exist in INFO fields.\n", tag);
     else
         {
         keepFields[keepIx++] = cloneString(tag);
         }
     }
 return keepIx;
 }
 
 char *fixupChromName(char *chrom)
 /* Prepend "chr" if missing */
 {
 if (!startsWith(chrom, "chr"))
     return catTwoStrings("chr", chrom);
 else
     return chrom;
 }
 
-char *fixupVcfRecordName(struct vcfRecord *rec, int fixedChromStart)
-/* Fill in for often missing vcf record name (subs out '.' for something useful) */
-{
-static struct dyString *ds;
-if (!ds)
-    ds = dyStringNew(0);
-char *name = NULL;
-if (isNotEmpty(rec->name) && !sameString(rec->name,"."))
-    name = cloneString(rec->name);
-else
-    {
-    dyStringPrintf(ds, "%s_%d:", fixupChromName(rec->chrom), fixedChromStart);
-    if (rec->alleleCount < 2)
-        dyStringPrintf(ds, "?");
-    else
-        dyStringPrintf(ds, "%s/%s", rec->alleles[0], rec->alleles[1]);
-    name = cloneString(ds->string);
-    }
-dyStringClear(ds);
-return name;
-}
+#define VCF_MAX_INFO (4*1024)
 
-void printVcfInfoElem(struct vcfFile *vcff, struct vcfRecord *rec, char *tag, FILE *out, boolean printTabular)
-/* Maybe print a VCF INFO tag element to out */
-{
-const struct vcfInfoElement *elem = vcfRecordFindInfo(rec, tag);
-if (elem)
-    {
-    const struct vcfInfoDef *def = hashMustFindVal(infoHash, elem->key);
-    //const struct vcfInfoDef *def = vcfInfoDefForKey(vcff, elem->key);
-    enum vcfInfoType type = def ? def->type : vcfInfoString;
-    if (elem != NULL)
+void fixupLeftPadding(int *start, char *ref, char *alt)
+/* If an indel is left padded fix up the star position */
 {
-        if (type == vcfInfoFlag && elem->count == 0)
-            fprintf(out, "Yes");
-        if (looksTabular(def, elem) && !printTabular)
-            errAbort("Error: Tables not supported in bigBed fields, tag '%s'", elem->key);
-        else
-            {
-            int j;
-            for (j = 0; j < elem->count; j++)
+char *altTmp = cloneString(alt);
+char *alleles[VCF_MAX_INFO+1];
+alleles[0] = cloneString(ref);
+char *altAlleles[VCF_MAX_INFO];
+int altCount = chopCommas(altTmp, altAlleles);
+int alleleCount = 1 + altCount;
+if (alleleCount > 1)
     {
-                if (j > 0)
-                    fprintf(out, ", ");
-                if (elem->missingData[j])
-                    fprintf(out, ".");
-                else
-                    vcfPrintDatum(out, elem->values[j], type);
-                }
-            }
-        }
+    int i;
+    for (i = 0; i < altCount; i++)
+        alleles[1+i] = altAlleles[i];
+    boolean hasPaddingBase = allelesHavePaddingBase(alleles, alleleCount);
+    if (hasPaddingBase)
+        (*start)++;
     }
 }
 
-void printHeaders(struct vcfFile *vcff, char **keepFields, int keepCount, FILE *outBed, FILE *outExtra)
+void printHeaders(struct vcfFile *vcff, char **keepFields, int keepCount, FILE *outBed)
 /* Print the required header into extraFields.tab, and put a technically optional
  * comment into outBed */
 {
 int i;
 fprintf(outBed, "%s", bed9Header);
 for (i = 0; i < keepCount; i++)
     if (keepFields[i] != NULL)
         fprintf(outBed, "\t%s", keepFields[i]);
 fprintf(outBed, "\n");
+}
 
-if (outExtra)
+void vcfLinesToBed(struct vcfFile *vcff, char **keepFields, int extraFieldCount,
+                    FILE *outBed) //, FILE *outExtra)
+/* Turn a VCF line into a bed9+ line */
 {
-    struct vcfInfoDef *def;
-    boolean first = TRUE;
-    for (def = vcff->infoDefs; def != NULL; def = def->next)
+struct dyString *name = dyStringNew(0);
+struct dyString *maybeShortenedRef = dyStringNew(0);
+struct dyString *maybeShortenedAlt = dyStringNew(0);
+char *line = NULL;
+char *chopped[9];
+int infoCount = slCount(vcff->infoDefs);
+char *infoTags[infoCount];
+int i;
+while (lineFileNext(vcff->lf, &line, NULL))
+    {
+    int fieldCount = chopTabs(line, chopped);
+    if (fieldCount < 8)
+        errAbort("ERROR: malformed VCF, missing fields at line: '%d'", vcff->lf->lineIx);
+    char *chrom = fixupChromName(chopped[0]);
+    int start = atoi(chopped[1]);
+    int end = start + 1;
+    char *ref = chopped[3];
+    char *alt = chopped[4];
+    if (strlen(ref) == dnaFilteredSize(ref))
+        end = start + strlen(ref);
+    fixupLeftPadding(&start, ref, alt);
+    if (!sameString(chopped[2], "."))
+        dyStringPrintf(name, "%s", chopped[2]);
+    else
         {
-        if (stringArrayIx(def->key, keepFields, keepCount) == -1)
+        if (strlen(ref) > 10)
             {
-            if (!first)
-                fprintf(outExtra, "\t");
-            fprintf(outExtra, "%s", def->key);
-            first = FALSE;
-            }
+            for (i = 0; i < 10; i++)
+                dyStringAppendC(maybeShortenedRef, ref[i]);
+            dyStringAppend(maybeShortenedRef, "...");
             }
-    fprintf(outExtra, "\n");
+        else
+            maybeShortenedRef->string = ref;
+        if (strlen(alt) > 10)
+            {
+            for (i = 0; i < 10; i++)
+                dyStringAppendC(maybeShortenedAlt, alt[i]);
+            dyStringAppend(maybeShortenedAlt, "...");
             }
+        else
+            maybeShortenedAlt->string = alt;
+        dyStringPrintf(name, "%s_%d:%s/%s", chrom,start,maybeShortenedRef->string,maybeShortenedAlt->string);
         }
-
-void vcfLinesToBed(struct vcfFile *vcff, char **keepFields, int extraFieldCount,
-                    FILE *outBed, FILE *outExtra)
-/* Turn a VCF line into a bed9+ line */
+    fprintf(outBed, "%s\t%d\t%d\t%s\t0\t.\t%d\t%d\t0,0,0", chrom,start,end,name->string,start,end);
+    fprintf(outBed, "\t%s\t%s", ref, alt);
+    if (extraFieldCount)
         {
-struct vcfRecord *rec;
+        fprintf(outBed, "\t");
+        chopByChar(chopped[7], ';', infoTags, infoCount);
         int i;
-while ( (rec = vcfNextRecord(vcff)) != NULL)
-    {
-    int start = vcfRecordTrimIndelLeftBase(rec);
-    char *name = fixupVcfRecordName(rec, start);
-    fprintf(outBed, "%s\t%d\t%d\t%s", fixupChromName(rec->chrom), start, rec->chromEnd, name);
-    fprintf(outBed, "0\t.\t%d\t%d\t0,0,0", start, rec->chromEnd);
         for (i = 0; i < extraFieldCount; i++)
             {
-        fprintf(outBed, "\t");
-        printVcfInfoElem(vcff, rec, keepFields[i], outBed, FALSE);
-        }
-    fprintf(outBed, "\n");
-
-    // write out extra tags for this line, if any:
-    if (outExtra)
+            char *extraField = keepFields[i];
+            char *infoTagVal = NULL;
+            int j;
+            for (j = 0; j < infoCount; j++)
                 {
-        fprintf(outExtra, "%s", name);
-        struct vcfInfoDef *def;
-        for (def = vcff->infoDefs; def != NULL; def = def->next)
+                if (infoTags[j] && startsWith(extraField, infoTags[j]))
                     {
-            if (stringArrayIx(def->key, keepFields, extraFieldCount) == -1)
+                    infoTagVal = cloneString(infoTags[j]);
+                    break;
+                    }
+                }
+            if (infoTagVal)
+                {
+                char *val = strchr(infoTagVal, '=');
+                if (val)
                     {
-                fprintf(outExtra, "\t");
-                printVcfInfoElem(vcff, rec, def->key, outExtra, TRUE);
+                    val += 1;
+                    fprintf(outBed, "%s", val);
                     }
                 }
-        fprintf(outExtra, "\n");
+            fprintf(outBed, "\t");
             }
-    fflush(outBed);
-    fflush(outExtra);
+        }
+    fprintf(outBed, "\n");
+    dyStringClear(name);
+    dyStringClear(maybeShortenedRef);
+    dyStringClear(maybeShortenedAlt);
     }
 }
 
 void vcfToBed(char *vcfFileName, char *outPrefix, char *tagsToKeep)
 /* vcfToBed - Convert VCF to BED9+ with optional extra fields.
  * Extra VCF tags get placed into a separate tab file for later indexing. */
 {
-FILE *outBed, *outExtra;
+FILE *outBed;
 struct vcfFile *vcff = NULL;
 char tbiFile[4096];
 int i;
 int vcfTagCount = 0; // no need for extra file if there's only a few tags
 int keepCount = 0; // count of comma-sep tagsToKeep
 char *keepFields[MAX_BED_EXTRA]; // Max 50 extra fields to put into bigBed, also needed for
                                  // comment string header
 char *tempKeepFields[MAX_BED_EXTRA]; // allow requesting fields that don't exist, just don't output
 memset(keepFields, 0, MAX_BED_EXTRA);
 memset(tempKeepFields, 0, MAX_BED_EXTRA);
 
 // open up VCF for reading
 safef(tbiFile, sizeof(tbiFile), "%s.tbi", vcfFileName);
 if (fileExists(tbiFile))
     {
     vcff = vcfTabixFileAndIndexMayOpen(vcfFileName, tbiFile, NULL, 0,0,VCF_IGNORE_ERRS,-1);
     if (vcff == NULL)
         errAbort("error opening %s\n", vcfFileName);
     vcfTagCount = slCount(vcff->infoDefs);
-    hashInfoTags(vcff);
     // do a batch read and use the reuse pool
     vcfFileMakeReusePool(vcff, 64*1024*1024);
     }
 else
     errAbort("no tabix index for %s\n", vcfFileName);
 
 if (tagsToKeep)
     {
     keepCount = chopCommas(tagsToKeep, tempKeepFields);
     if (keepCount > vcfTagCount)
         verbose(2, "Warning: fewer fields in VCF than -fields specification.");
     keepCount = trimMissingTagsFromFieldList(keepFields, tempKeepFields, keepCount, vcff);
     }
 
 // open output for writing
-char *outBedName = NULL, *outExtraName = NULL;
+char *outBedName = NULL;
 if (endsWith(outPrefix, ".bed"))
-    {
     outBedName = cloneString(outPrefix);
-    if (keepCount <= vcfTagCount)
-        {
-        outExtraName = replaceChars(outPrefix, ".bed", ".extraFields.tab");
-        outExtra = mustOpen(outExtraName, "w");
-        }
-    }
 else
-    {
     outBedName = catTwoStrings(outPrefix, ".bed");
-    if (keepCount <= vcfTagCount)
-        {
-        outExtraName =  catTwoStrings(outPrefix, ".extraFields.tab");
-        outExtra = mustOpen(outExtraName, "w");
-        }
-    }
 outBed = mustOpen(outBedName, "w");
 
 verbose(2, "input vcf tag count = %d\n", vcfTagCount);
 verbose(2, "number of valid requested tags = %d\n", keepCount);
 for (i = 0; i < keepCount; i++)
     if (keepFields[i] != NULL)
         verbose(2, "found tag: '%s'\n", keepFields[i]);
 
-printHeaders(vcff, keepFields, keepCount, outBed, outExtra);
-vcfLinesToBed(vcff, keepFields, keepCount, outBed, outExtra);
+printHeaders(vcff, keepFields, keepCount, outBed);
+vcfLinesToBed(vcff, keepFields, keepCount, outBed);
 vcfFileFree(&vcff);
 carefulClose(&outBed);
-carefulClose(&outExtra);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 3)
     usage();
 char *fields = optionVal("fields", NULL);
 vcfToBed(argv[1],argv[2], fields);
 return 0;
 }