6ba8692a29a9cdd90cf4a133a4d8a3ee488dc899 chmalee Thu Feb 1 17:22:40 2024 -0800 Add a flag -fieldsIsFile to vcfToBed to read the -fields argument from a file rather than the command line diff --git src/hg/utils/vcfToBed/vcfToBed.c src/hg/utils/vcfToBed/vcfToBed.c index dcbe3cc..08e4705 100644 --- src/hg/utils/vcfToBed/vcfToBed.c +++ src/hg/utils/vcfToBed/vcfToBed.c @@ -1,52 +1,53 @@ /* vcfToBed - Convert VCF to BED9+ with optional extra fields. * Extra VCF tags get placed into a separate tab file for later indexing.. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "vcf.h" #include "dnautil.h" // how many extra fields can we put into the bigBed itself (for filtering, labels, etc): -#define MAX_BED_EXTRA 100 +#define MAX_BED_EXTRA 1000 #define bed9Header "#chrom\tchromStart\tchromEnd\tname\tscore\tstrand\tthickStart\tthickEnd\titemRgb\tref\talt\tFILTER" -// hash up all the info tags and their vcfInfoDef structs for faster searching -struct hash *infoHash = NULL; - void usage() /* Explain usage and exit. */ { errAbort( "vcfToBed - Convert VCF to BED9+ with optional extra fields.\n" "usage:\n" " vcfToBed in.vcf outPrefix\n" "options:\n" " -fixChromNames If present, prepend 'chr' to chromosome names\n" - " -fields=comma-sep list of tags to include in the bed file, other fields will be placed into\n" - " out.extraFields.tab\n" + " -fields=comma-sep list of tags to include in the bed file, other fields\n" + " will be placed into out.extraFields.tab\n" + " -fieldsIsFile If present, the -fields argument is a file with one tag per\n" + " line to keep. Note only the first word (white space delimited) will\n" + " be kept per line\n" "\n" "NOTE: Extra VCF tags (that aren't listed in -fields) get placed into a separate tab\n" "file for later indexing.\n" ); } /* Command line validation table. */ static struct optionSpec options[] = { {"fields", OPTION_STRING}, {"fixChromNames", OPTION_BOOLEAN}, + {"fieldsIsFile", OPTION_BOOLEAN}, {NULL, 0}, }; int trimMissingTagsFromFieldList(char **keepFields, char **tempKeepFields, int keepCount, struct vcfFile *vcff) /* If there are requested fields in keepFields that don't exist in vcff, warn about them * and remove them from the keepFields array. Return number of valid entries */ { int i = 0; int keepIx = 0; for (; i < keepCount; i++) { char *tag = tempKeepFields[i]; struct vcfInfoDef *def = NULL; if ( (def = vcfInfoDefForKey(vcff, tag)) == NULL) verbose(2, "Warning: skipping desired tag '%s', does not exist in INFO fields.\n", tag); @@ -131,43 +132,43 @@ fprintf(outBed, "%s", val); break; } } else if (sameString(infoTags[j], extraField)) { fprintf(outBed, "Yes"); break; } } if (i < (extraFieldCount - 1)) fprintf(outBed, "\t"); } } -void printHeaders(struct vcfFile *vcff, char **keepFields, int keepCount, FILE *outBed) +void printHeaders(char **keepFields, int keepCount, FILE *outBed) /* Print a comment describing all the fields into outBed */ { int i; fprintf(outBed, "%s", bed9Header); for (i = 0; i < keepCount; i++) if (keepFields[i] != NULL) fprintf(outBed, "\t%s", keepFields[i]); fprintf(outBed, "\n"); } void vcfLinesToBed(struct vcfFile *vcff, char **keepFields, int extraFieldCount, - boolean fixChromNames, FILE *outBed) //, FILE *outExtra) + boolean fixChromNames, boolean fieldsIsFile, FILE *outBed) //, FILE *outExtra) /* Turn a VCF line into a bed9+ line */ { struct dyString *name = dyStringNew(0); struct dyString *shortenedRef = dyStringNew(0); struct dyString *shortenedAlt = dyStringNew(0); char *line = NULL; char *chopped[9]; // 9 fields in case VCF has genotype fields, keep them separate from info fields int infoCount = slCount(vcff->infoDefs); char *infoTags[infoCount]; while (lineFileNext(vcff->lf, &line, NULL)) { int fieldCount = chopTabs(line, chopped); if (fieldCount < 8) errAbort("ERROR: malformed VCF, missing fields at line: '%d'", vcff->lf->lineIx); char *chrom = chopped[0]; @@ -189,84 +190,93 @@ } fprintf(outBed, "%s\t%d\t%d\t%s\t0\t.\t%d\t%d\t0,0,0", chrom,start,end,name->string,start,end); fprintf(outBed, "\t%s\t%s\t%s", ref, alt, chopped[6]); if (extraFieldCount) { fprintf(outBed, "\t"); printExtraFieldInfo(outBed, chopped[7], infoTags, infoCount, keepFields, extraFieldCount); } fprintf(outBed, "\n"); dyStringClear(name); dyStringClear(shortenedRef); dyStringClear(shortenedAlt); } } -void vcfToBed(char *vcfFileName, char *outPrefix, char *tagsToKeep, boolean fixChromNames) +void vcfToBed(char *vcfFileName, char *outPrefix, char *tagsToKeep, boolean fixChromNames, boolean fieldsIsFile) /* vcfToBed - Convert VCF to BED9+ with optional extra fields. * Extra VCF tags get placed into a separate tab file for later indexing. */ { FILE *outBed; struct vcfFile *vcff = NULL; char tbiFile[4096]; int i; int vcfTagCount = 0; // no need for extra file if there's only a few tags int keepCount = 0; // count of comma-sep tagsToKeep -char *keepFields[MAX_BED_EXTRA]; // Max 50 extra fields to put into bigBed, also needed for +char *keepFields[MAX_BED_EXTRA]; // Max 1000 extra fields to put into bigBed, also needed for // comment string header char *tempKeepFields[MAX_BED_EXTRA]; // allow requesting fields that don't exist, just don't output memset(keepFields, 0, MAX_BED_EXTRA); memset(tempKeepFields, 0, MAX_BED_EXTRA); // open up VCF for reading safef(tbiFile, sizeof(tbiFile), "%s.tbi", vcfFileName); if (fileExists(tbiFile)) { vcff = vcfTabixFileAndIndexMayOpen(vcfFileName, tbiFile, NULL, 0,0,VCF_IGNORE_ERRS,-1); if (vcff == NULL) errAbort("error opening %s\n", vcfFileName); vcfTagCount = slCount(vcff->infoDefs); // do a batch read and use the reuse pool vcfFileMakeReusePool(vcff, 64*1024*1024); } else errAbort("no tabix index for %s\n", vcfFileName); if (tagsToKeep) { + if (fieldsIsFile) + { + struct lineFile *lf = lineFileOpen(tagsToKeep, FALSE); + char *tag; + while (lineFileNextRow(lf, &tag, 1)) + tempKeepFields[keepCount++] = cloneString(tag); + } + else keepCount = chopCommas(tagsToKeep, tempKeepFields); if (keepCount > vcfTagCount) verbose(2, "Warning: fewer fields in VCF than -fields specification."); keepCount = trimMissingTagsFromFieldList(keepFields, tempKeepFields, keepCount, vcff); } // open output for writing char *outBedName = NULL; if (endsWith(outPrefix, ".bed")) outBedName = cloneString(outPrefix); else outBedName = catTwoStrings(outPrefix, ".bed"); outBed = mustOpen(outBedName, "w"); verbose(2, "input vcf tag count = %d\n", vcfTagCount); verbose(2, "number of valid requested tags = %d\n", keepCount); for (i = 0; i < keepCount; i++) if (keepFields[i] != NULL) verbose(2, "found tag: '%s'\n", keepFields[i]); -printHeaders(vcff, keepFields, keepCount, outBed); -vcfLinesToBed(vcff, keepFields, keepCount, fixChromNames, outBed); +printHeaders(keepFields, keepCount, outBed); +vcfLinesToBed(vcff, keepFields, keepCount, fixChromNames, fieldsIsFile, outBed); vcfFileFree(&vcff); carefulClose(&outBed); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 3) usage(); char *fields = optionVal("fields", NULL); boolean fixChromNames = optionExists("fixChromNames"); -vcfToBed(argv[1],argv[2], fields, fixChromNames); +boolean fieldsIsFile = optionExists("fieldsIsFile"); +vcfToBed(argv[1],argv[2], fields, fixChromNames, fieldsIsFile); return 0; }