c5225b06da002ec4b1f5cf3946300d5d43f1cb5d
chmalee
  Wed Feb 7 15:43:09 2024 -0800
Check array is not full when reading INFO tags to keep from a file, refs #32973

diff --git src/hg/utils/vcfToBed/vcfToBed.c src/hg/utils/vcfToBed/vcfToBed.c
index 08e4705..7ad6f63 100644
--- src/hg/utils/vcfToBed/vcfToBed.c
+++ src/hg/utils/vcfToBed/vcfToBed.c
@@ -1,282 +1,282 @@
 /* vcfToBed - Convert VCF to BED9+ with optional extra fields.
  * Extra VCF tags get placed into a separate tab file for later indexing.. */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "vcf.h"
 #include "dnautil.h"
 
 // how many extra fields  can we put into the bigBed itself (for filtering, labels, etc):
 #define MAX_BED_EXTRA 1000
 #define bed9Header "#chrom\tchromStart\tchromEnd\tname\tscore\tstrand\tthickStart\tthickEnd\titemRgb\tref\talt\tFILTER"
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
     "vcfToBed - Convert VCF to BED9+ with optional extra fields.\n"
     "usage:\n"
     "    vcfToBed in.vcf outPrefix\n"
     "options:\n"
     "    -fixChromNames If present, prepend 'chr' to chromosome names\n"
     "    -fields=comma-sep list of tags to include in the bed file, other fields\n"
     "           will be placed into out.extraFields.tab\n"
     "    -fieldsIsFile If present, the -fields argument is a file with one tag per\n"
     "           line to keep. Note only the first word (white space delimited) will\n"
     "           be kept per line\n"
     "\n"
     "NOTE: Extra VCF tags (that aren't listed in -fields)  get placed into a separate tab\n"
     "file for later indexing.\n"
   );
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
     {"fields", OPTION_STRING},
     {"fixChromNames", OPTION_BOOLEAN},
     {"fieldsIsFile", OPTION_BOOLEAN},
     {NULL, 0},
 };
 
 int trimMissingTagsFromFieldList(char **keepFields, char **tempKeepFields, int keepCount, struct vcfFile *vcff)
 /* If there are requested fields in keepFields that don't exist in vcff, warn about them
  * and remove them from the keepFields array. Return number of valid entries */
 {
 int i = 0;
 int keepIx = 0;
 for (; i < keepCount; i++)
     {
     char *tag = tempKeepFields[i];
     struct vcfInfoDef *def = NULL;
     if ( (def = vcfInfoDefForKey(vcff, tag)) == NULL)
         verbose(2, "Warning: skipping desired tag '%s', does not exist in INFO fields.\n", tag);
     else
         {
         keepFields[keepIx++] = cloneString(tag);
         }
     }
 return keepIx;
 }
 
 char *fixupChromName(char *chrom)
 /* Prepend "chr" if missing */
 {
 if (startsWith("chr", chrom))
     return chrom;
 else
     return catTwoStrings("chr", chrom);
 }
 
 #define VCF_MAX_INFO (4*1024)
 
 void fixupLeftPadding(int *start, char *ref, char *alt)
 /* If an indel is left padded fix up the start position */
 {
 char *altTmp = cloneString(alt);
 char *alleles[VCF_MAX_INFO+1];
 alleles[0] = cloneString(ref);
 char *altAlleles[VCF_MAX_INFO];
 int altCount = chopCommas(altTmp, altAlleles);
 int alleleCount = 1 + altCount;
 if (alleleCount > 1)
     {
     int i;
     for (i = 0; i < altCount; i++)
         alleles[1+i] = altAlleles[i];
     boolean hasPaddingBase = allelesHavePaddingBase(alleles, alleleCount);
     if (hasPaddingBase)
         (*start)++;
     }
 }
 
 void shortenRefAltString(struct dyString *shortenedRef, struct dyString *shortenedAlt, char *ref, char *alt)
 /* Shorten up the Ref and Alt strings for bed labels */
 {
 int i;
 if (strlen(ref) > 10)
     {
     for (i = 0; i < 10; i++)
         dyStringAppendC(shortenedRef, ref[i]);
     dyStringAppend(shortenedRef, "...");
     }
 else
     shortenedRef->string = ref;
 if (strlen(alt) > 10)
     {
     for (i = 0; i < 10; i++)
         dyStringAppendC(shortenedAlt, alt[i]);
     dyStringAppend(shortenedAlt, "...");
     }
 else
     shortenedAlt->string = alt;
 }
 
 void printExtraFieldInfo(FILE *outBed, char *info, char **infoTags, int infoCount, char **keepFields, int extraFieldCount)
 {
 int i,j;
 int numTags = chopByChar(info, ';', infoTags, infoCount);
 for (i = 0; i < extraFieldCount; i++)
     {
     char *extraField = keepFields[i];
     for (j = 0; j < numTags && infoTags[j]; j++)
         {
         boolean isKeyVal = (strchr(infoTags[j], '=') != NULL);
         if (isKeyVal && startsWithWordByDelimiter(extraField, '=', infoTags[j]))
             {
             char *infoTagVal = infoTags[j];
             char *val = strchr(infoTagVal, '=');
             *(val++) = '\0';
             if (sameString(infoTagVal, extraField))
                 {
                 fprintf(outBed, "%s", val);
                 break;
                 }
             }
         else if (sameString(infoTags[j], extraField))
             {
             fprintf(outBed, "Yes");
             break;
             }
         }
     if (i < (extraFieldCount - 1))
         fprintf(outBed, "\t");
     }
 }
 
 void printHeaders(char **keepFields, int keepCount, FILE *outBed)
 /* Print a comment describing all the fields into outBed */
 {
 int i;
 fprintf(outBed, "%s", bed9Header);
 for (i = 0; i < keepCount; i++)
     if (keepFields[i] != NULL)
         fprintf(outBed, "\t%s", keepFields[i]);
 fprintf(outBed, "\n");
 }
 
 void vcfLinesToBed(struct vcfFile *vcff, char **keepFields, int extraFieldCount,
                     boolean fixChromNames, boolean fieldsIsFile, FILE *outBed) //, FILE *outExtra)
 /* Turn a VCF line into a bed9+ line */
 {
 struct dyString *name = dyStringNew(0);
 struct dyString *shortenedRef = dyStringNew(0);
 struct dyString *shortenedAlt = dyStringNew(0);
 char *line = NULL;
 char *chopped[9]; // 9 fields in case VCF has genotype fields, keep them separate from info fields
 int infoCount = slCount(vcff->infoDefs);
 char *infoTags[infoCount];
 while (lineFileNext(vcff->lf, &line, NULL))
     {
     int fieldCount = chopTabs(line, chopped);
     if (fieldCount < 8)
         errAbort("ERROR: malformed VCF, missing fields at line: '%d'", vcff->lf->lineIx);
     char *chrom = chopped[0];
     if (fixChromNames)
         chrom = fixupChromName(chrom);
     int start = atoi(chopped[1]) - 1;
     int end = start;
     char *ref = cloneString(chopped[3]);
     char *alt = cloneString(chopped[4]);
     if (strlen(ref) == dnaFilteredSize(ref))
         end = start + strlen(ref);
     fixupLeftPadding(&start, ref, alt);
     if (!sameString(chopped[2], "."))
         dyStringPrintf(name, "%s", chopped[2]);
     else
         {
         shortenRefAltString(shortenedRef, shortenedAlt, ref, alt);
         dyStringPrintf(name, "%s_%d:%s/%s", chrom,start,shortenedRef->string,shortenedAlt->string);
         }
     fprintf(outBed, "%s\t%d\t%d\t%s\t0\t.\t%d\t%d\t0,0,0", chrom,start,end,name->string,start,end);
     fprintf(outBed, "\t%s\t%s\t%s", ref, alt, chopped[6]);
     if (extraFieldCount)
         {
         fprintf(outBed, "\t");
         printExtraFieldInfo(outBed, chopped[7], infoTags, infoCount, keepFields, extraFieldCount);
         }
     fprintf(outBed, "\n");
     dyStringClear(name);
     dyStringClear(shortenedRef);
     dyStringClear(shortenedAlt);
     }
 }
 
 void vcfToBed(char *vcfFileName, char *outPrefix, char *tagsToKeep, boolean fixChromNames, boolean fieldsIsFile)
 /* vcfToBed - Convert VCF to BED9+ with optional extra fields.
  * Extra VCF tags get placed into a separate tab file for later indexing. */
 {
 FILE *outBed;
 struct vcfFile *vcff = NULL;
 char tbiFile[4096];
 int i;
 int vcfTagCount = 0; // no need for extra file if there's only a few tags
 int keepCount = 0; // count of comma-sep tagsToKeep
 char *keepFields[MAX_BED_EXTRA]; // Max 1000 extra fields to put into bigBed, also needed for
                                  // comment string header
 char *tempKeepFields[MAX_BED_EXTRA]; // allow requesting fields that don't exist, just don't output
 memset(keepFields, 0, MAX_BED_EXTRA);
 memset(tempKeepFields, 0, MAX_BED_EXTRA);
 
 // open up VCF for reading
 safef(tbiFile, sizeof(tbiFile), "%s.tbi", vcfFileName);
 if (fileExists(tbiFile))
     {
     vcff = vcfTabixFileAndIndexMayOpen(vcfFileName, tbiFile, NULL, 0,0,VCF_IGNORE_ERRS,-1);
     if (vcff == NULL)
         errAbort("error opening %s\n", vcfFileName);
     vcfTagCount = slCount(vcff->infoDefs);
     // do a batch read and use the reuse pool
     vcfFileMakeReusePool(vcff, 64*1024*1024);
     }
 else
     errAbort("no tabix index for %s\n", vcfFileName);
 
 if (tagsToKeep)
     {
     if (fieldsIsFile)
         {
         struct lineFile *lf = lineFileOpen(tagsToKeep, FALSE);
         char *tag;
-        while (lineFileNextRow(lf, &tag, 1))
+        while ( (keepCount < MAX_BED_EXTRA) && lineFileNextRow(lf, &tag, 1))
             tempKeepFields[keepCount++] = cloneString(tag);
         }
     else
         keepCount = chopCommas(tagsToKeep, tempKeepFields);
     if (keepCount > vcfTagCount)
         verbose(2, "Warning: fewer fields in VCF than -fields specification.");
     keepCount = trimMissingTagsFromFieldList(keepFields, tempKeepFields, keepCount, vcff);
     }
 
 // open output for writing
 char *outBedName = NULL;
 if (endsWith(outPrefix, ".bed"))
     outBedName = cloneString(outPrefix);
 else
     outBedName = catTwoStrings(outPrefix, ".bed");
 outBed = mustOpen(outBedName, "w");
 
 verbose(2, "input vcf tag count = %d\n", vcfTagCount);
 verbose(2, "number of valid requested tags = %d\n", keepCount);
 for (i = 0; i < keepCount; i++)
     if (keepFields[i] != NULL)
         verbose(2, "found tag: '%s'\n", keepFields[i]);
 
 printHeaders(keepFields, keepCount, outBed);
 vcfLinesToBed(vcff, keepFields, keepCount, fixChromNames, fieldsIsFile, outBed);
 vcfFileFree(&vcff);
 carefulClose(&outBed);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 3)
     usage();
 char *fields = optionVal("fields", NULL);
 boolean fixChromNames = optionExists("fixChromNames");
 boolean fieldsIsFile  = optionExists("fieldsIsFile");
 vcfToBed(argv[1],argv[2], fields, fixChromNames, fieldsIsFile);
 return 0;
 }