06d7be056190c14b85e71bc12523f18ea6815b5e markd Mon Dec 7 00:50:29 2020 -0800 BLAT mmap index support merge with master diff --git src/hg/utils/vcfToBed/vcfToBed.c src/hg/utils/vcfToBed/vcfToBed.c index dfae97e..b36c374 100644 --- src/hg/utils/vcfToBed/vcfToBed.c +++ src/hg/utils/vcfToBed/vcfToBed.c @@ -1,25 +1,26 @@ /* vcfToBed - Convert VCF to BED9+ with optional extra fields. * Extra VCF tags get placed into a separate tab file for later indexing.. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "vcf.h" #include "dnautil.h" -#define MAX_BED_EXTRA 50 // how many extra fields can we put into the bigBed itself (for filtering, labels, etc) +// how many extra fields can we put into the bigBed itself (for filtering, labels, etc): +#define MAX_BED_EXTRA 100 #define bed9Header "#chrom\tchromStart\tchromEnd\tname\tscore\tstrand\tthickStart\tthickEnd\titemRgb\tref\talt\tFILTER" // hash up all the info tags and their vcfInfoDef structs for faster searching struct hash *infoHash = NULL; void usage() /* Explain usage and exit. */ { errAbort( "vcfToBed - Convert VCF to BED9+ with optional extra fields.\n" "usage:\n" " vcfToBed in.vcf outPrefix\n" "options:\n" " -fields=comma-sep list of tags to include in the bed file, other fields will be placed into\n" " out.extraFields.tab\n" @@ -46,34 +47,34 @@ char *tag = tempKeepFields[i]; struct vcfInfoDef *def = NULL; if ( (def = vcfInfoDefForKey(vcff, tag)) == NULL) verbose(2, "Warning: skipping desired tag '%s', does not exist in INFO fields.\n", tag); else { keepFields[keepIx++] = cloneString(tag); } } return keepIx; } char *fixupChromName(char *chrom) /* Prepend "chr" if missing */ { -if (!startsWith(chrom, "chr")) - return catTwoStrings("chr", chrom); -else +if (startsWith("chr", chrom)) return chrom; +else + return catTwoStrings("chr", chrom); } #define VCF_MAX_INFO (4*1024) void fixupLeftPadding(int *start, char *ref, char *alt) /* If an indel is left padded fix up the start position */ { char *altTmp = cloneString(alt); char *alleles[VCF_MAX_INFO+1]; alleles[0] = cloneString(ref); char *altAlleles[VCF_MAX_INFO]; int altCount = chopCommas(altTmp, altAlleles); int alleleCount = 1 + altCount; if (alleleCount > 1) { @@ -156,32 +157,32 @@ /* Turn a VCF line into a bed9+ line */ { struct dyString *name = dyStringNew(0); struct dyString *shortenedRef = dyStringNew(0); struct dyString *shortenedAlt = dyStringNew(0); char *line = NULL; char *chopped[9]; // 9 fields in case VCF has genotype fields, keep them separate from info fields int infoCount = slCount(vcff->infoDefs); char *infoTags[infoCount]; while (lineFileNext(vcff->lf, &line, NULL)) { int fieldCount = chopTabs(line, chopped); if (fieldCount < 8) errAbort("ERROR: malformed VCF, missing fields at line: '%d'", vcff->lf->lineIx); char *chrom = fixupChromName(chopped[0]); - int start = atoi(chopped[1]); - int end = start + 1; + int start = atoi(chopped[1]) - 1; + int end = start; char *ref = cloneString(chopped[3]); char *alt = cloneString(chopped[4]); if (strlen(ref) == dnaFilteredSize(ref)) end = start + strlen(ref); fixupLeftPadding(&start, ref, alt); if (!sameString(chopped[2], ".")) dyStringPrintf(name, "%s", chopped[2]); else { shortenRefAltString(shortenedRef, shortenedAlt, ref, alt); dyStringPrintf(name, "%s_%d:%s/%s", chrom,start,shortenedRef->string,shortenedAlt->string); } fprintf(outBed, "%s\t%d\t%d\t%s\t0\t.\t%d\t%d\t0,0,0", chrom,start,end,name->string,start,end); fprintf(outBed, "\t%s\t%s\t%s", ref, alt, chopped[6]); if (extraFieldCount)