a3e03473bbde188141b61d0c3db48c85d2ec6aa3 chmalee Tue Sep 22 17:38:13 2020 -0700 Basic vcf to bed converter, with a -fields argument for what VCF INFO tags to include in the bigBed itself, refs #25010 diff --git src/hg/utils/vcfToBed/vcfToBed.c src/hg/utils/vcfToBed/vcfToBed.c new file mode 100644 index 0000000..6b9f6bc --- /dev/null +++ src/hg/utils/vcfToBed/vcfToBed.c @@ -0,0 +1,282 @@ +/* vcfToBed - Convert VCF to BED9+ with optional extra fields. + * Extra VCF tags get placed into a separate tab file for later indexing.. */ +#include "common.h" +#include "linefile.h" +#include "hash.h" +#include "options.h" +#include "vcf.h" + +#define MAX_BED_EXTRA 50 // how many extra fields can we put into the bigBed itself (for filtering, labels, etc) +#define bed9Header "#chrom\tchromStart\tchromEnd\tname\tscore\tstrand\tthickStart\tthickEnd\titemRgb" + +// hash up all the info tags and their vcfInfoDef structs for faster searching +struct hash *infoHash = NULL; + +void usage() +/* Explain usage and exit. */ +{ +errAbort( + "vcfToBed - Convert VCF to BED9+ with optional extra fields.\n" + "usage:\n" + " vcfToBed in.vcf outPrefix\n" + "options:\n" + " -fields=comma-sep list of tags to include in the bed file, other fields will be placed into\n" + " out.extraFields.tab\n" + "\n" + "NOTE: Extra VCF tags (that aren't listed in -fields) get placed into a separate tab\n" + "file for later indexing.\n" + ); +} + +/* Command line validation table. */ +static struct optionSpec options[] = { + {"fields", OPTION_STRING}, + {NULL, 0}, +}; + +void hashInfoTags(struct vcfFile *vcff) +{ +if (!infoHash) + infoHash = hashNew(0); +struct vcfInfoDef *def = NULL; +for (def = vcff->infoDefs; def != NULL; def = def->next) + { + hashAdd(infoHash, def->key, def); + } +} + +int trimMissingTagsFromFieldList(char **keepFields, char **tempKeepFields, int keepCount, struct vcfFile *vcff) +/* If there are requested fields in keepFields that don't exist in vcff, warn about them + * and remove them from the keepFields array. Return number of valid entries */ +{ +int i = 0; +int keepIx = 0; +for (; i < keepCount; i++) + { + char *tag = tempKeepFields[i]; + struct vcfInfoDef *def = NULL; + if ( (def = vcfInfoDefForKey(vcff, tag)) == NULL) + verbose(2, "Warning: skipping desired tag '%s', does not exist in INFO fields.\n", tag); + else + { + keepFields[keepIx++] = cloneString(tag); + } + } +return keepIx; +} + +char *fixupChromName(char *chrom) +/* Prepend "chr" if missing */ +{ +if (!startsWith(chrom, "chr")) + return catTwoStrings("chr", chrom); +else + return chrom; +} + +char *fixupVcfRecordName(struct vcfRecord *rec, int fixedChromStart) +/* Fill in for often missing vcf record name (subs out '.' for something useful) */ +{ +static struct dyString *ds; +if (!ds) + ds = dyStringNew(0); +char *name = NULL; +if (isNotEmpty(rec->name) && !sameString(rec->name,".")) + name = cloneString(rec->name); +else + { + dyStringPrintf(ds, "%s_%d:", fixupChromName(rec->chrom), fixedChromStart); + if (rec->alleleCount < 2) + dyStringPrintf(ds, "?"); + else + dyStringPrintf(ds, "%s/%s", rec->alleles[0], rec->alleles[1]); + name = cloneString(ds->string); + } +dyStringClear(ds); +return name; +} + +void printVcfInfoElem(struct vcfFile *vcff, struct vcfRecord *rec, char *tag, FILE *out, boolean printTabular) +/* Maybe print a VCF INFO tag element to out */ +{ +const struct vcfInfoElement *elem = vcfRecordFindInfo(rec, tag); +if (elem) + { + const struct vcfInfoDef *def = hashMustFindVal(infoHash, elem->key); + //const struct vcfInfoDef *def = vcfInfoDefForKey(vcff, elem->key); + enum vcfInfoType type = def ? def->type : vcfInfoString; + if (elem != NULL) + { + if (type == vcfInfoFlag && elem->count == 0) + fprintf(out, "Yes"); + if (looksTabular(def, elem) && !printTabular) + errAbort("Error: Tables not supported in bigBed fields, tag '%s'", elem->key); + else + { + int j; + for (j = 0; j < elem->count; j++) + { + if (j > 0) + fprintf(out, ", "); + if (elem->missingData[j]) + fprintf(out, "."); + else + vcfPrintDatum(out, elem->values[j], type); + } + } + } + } +} + +void printHeaders(struct vcfFile *vcff, char **keepFields, int keepCount, FILE *outBed, FILE *outExtra) +/* Print the required header into extraFields.tab, and put a technically optional + * comment into outBed */ +{ +int i; +fprintf(outBed, "%s", bed9Header); +for (i = 0; i < keepCount; i++) + if (keepFields[i] != NULL) + fprintf(outBed, "\t%s", keepFields[i]); +fprintf(outBed, "\n"); + +if (outExtra) + { + struct vcfInfoDef *def; + boolean first = TRUE; + for (def = vcff->infoDefs; def != NULL; def = def->next) + { + if (stringArrayIx(def->key, keepFields, keepCount) == -1) + { + if (!first) + fprintf(outExtra, "\t"); + fprintf(outExtra, "%s", def->key); + first = FALSE; + } + } + fprintf(outExtra, "\n"); + } +} + +void vcfLinesToBed(struct vcfFile *vcff, char **keepFields, int extraFieldCount, + FILE *outBed, FILE *outExtra) +/* Turn a VCF line into a bed9+ line */ +{ +struct vcfRecord *rec; +int i; +while ( (rec = vcfNextRecord(vcff)) != NULL) + { + int start = vcfRecordTrimIndelLeftBase(rec); + char *name = fixupVcfRecordName(rec, start); + fprintf(outBed, "%s\t%d\t%d\t%s", fixupChromName(rec->chrom), start, rec->chromEnd, name); + fprintf(outBed, "0\t.\t%d\t%d\t0,0,0", start, rec->chromEnd); + for (i = 0; i < extraFieldCount; i++) + { + fprintf(outBed, "\t"); + printVcfInfoElem(vcff, rec, keepFields[i], outBed, FALSE); + } + fprintf(outBed, "\n"); + + // write out extra tags for this line, if any: + if (outExtra) + { + fprintf(outExtra, "%s", name); + struct vcfInfoDef *def; + for (def = vcff->infoDefs; def != NULL; def = def->next) + { + if (stringArrayIx(def->key, keepFields, extraFieldCount) == -1) + { + fprintf(outExtra, "\t"); + printVcfInfoElem(vcff, rec, def->key, outExtra, TRUE); + } + } + fprintf(outExtra, "\n"); + } + fflush(outBed); + fflush(outExtra); + } +} + +void vcfToBed(char *vcfFileName, char *outPrefix, char *tagsToKeep) +/* vcfToBed - Convert VCF to BED9+ with optional extra fields. + * Extra VCF tags get placed into a separate tab file for later indexing. */ +{ +FILE *outBed, *outExtra; +struct vcfFile *vcff = NULL; +char tbiFile[4096]; +int i; +int vcfTagCount = 0; // no need for extra file if there's only a few tags +int keepCount = 0; // count of comma-sep tagsToKeep +char *keepFields[MAX_BED_EXTRA]; // Max 50 extra fields to put into bigBed, also needed for + // comment string header +char *tempKeepFields[MAX_BED_EXTRA]; // allow requesting fields that don't exist, just don't output +memset(keepFields, 0, MAX_BED_EXTRA); +memset(tempKeepFields, 0, MAX_BED_EXTRA); + +// open up VCF for reading +safef(tbiFile, sizeof(tbiFile), "%s.tbi", vcfFileName); +if (fileExists(tbiFile)) + { + vcff = vcfTabixFileAndIndexMayOpen(vcfFileName, tbiFile, NULL, 0,0,VCF_IGNORE_ERRS,-1); + if (vcff == NULL) + errAbort("error opening %s\n", vcfFileName); + vcfTagCount = slCount(vcff->infoDefs); + hashInfoTags(vcff); + // do a batch read and use the reuse pool + vcfFileMakeReusePool(vcff, 64*1024*1024); + } +else + errAbort("no tabix index for %s\n", vcfFileName); + +if (tagsToKeep) + { + keepCount = chopCommas(tagsToKeep, tempKeepFields); + if (keepCount > vcfTagCount) + verbose(2, "Warning: fewer fields in VCF than -fields specification."); + keepCount = trimMissingTagsFromFieldList(keepFields, tempKeepFields, keepCount, vcff); + } + +// open output for writing +char *outBedName = NULL, *outExtraName = NULL; +if (endsWith(outPrefix, ".bed")) + { + outBedName = cloneString(outPrefix); + if (keepCount <= vcfTagCount) + { + outExtraName = replaceChars(outPrefix, ".bed", ".extraFields.tab"); + outExtra = mustOpen(outExtraName, "w"); + } + } +else + { + outBedName = catTwoStrings(outPrefix, ".bed"); + if (keepCount <= vcfTagCount) + { + outExtraName = catTwoStrings(outPrefix, ".extraFields.tab"); + outExtra = mustOpen(outExtraName, "w"); + } + } +outBed = mustOpen(outBedName, "w"); + +verbose(2, "input vcf tag count = %d\n", vcfTagCount); +verbose(2, "number of valid requested tags = %d\n", keepCount); +for (i = 0; i < keepCount; i++) + if (keepFields[i] != NULL) + verbose(2, "found tag: '%s'\n", keepFields[i]); + +printHeaders(vcff, keepFields, keepCount, outBed, outExtra); +vcfLinesToBed(vcff, keepFields, keepCount, outBed, outExtra); +vcfFileFree(&vcff); +carefulClose(&outBed); +carefulClose(&outExtra); +} + +int main(int argc, char *argv[]) +/* Process command line. */ +{ +optionInit(&argc, argv, options); +if (argc != 3) + usage(); +char *fields = optionVal("fields", NULL); +vcfToBed(argv[1],argv[2], fields); +return 0; +}