99e41c7f76633c836d8788f2f99aa77d996519a2 markd Mon Oct 28 12:27:30 2019 -0700 Added bedPartition program to calculate non-overlapping ranges in a BED file diff --git src/hg/utils/bedPartition/bedPartition.c src/hg/utils/bedPartition/bedPartition.c new file mode 100644 index 0000000..143e778 --- /dev/null +++ src/hg/utils/bedPartition/bedPartition.c @@ -0,0 +1,172 @@ +/* bedPartition - split BED ranges into non-overlapping ranges */ + +/* Copyright (C) 2019 The Regents of the University of California + * See README in this or parent directory for licensing information. */ +#include "common.h" +#include "options.h" +#include "pipeline.h" +#include "basicBed.h" +#include "sqlNum.h" +#include "dystring.h" +#include "portable.h" + + +/* command line options and values */ +static struct optionSpec optionSpecs[] = +{ + {NULL, 0} +}; + +static void usage(char *msg) +/* Explain usage and exit. */ +{ +errAbort("Error: %s\n" + "bedPartition - split BED ranges into non-overlapping ranges\n" + "usage:\n" + " bedPartition [options] bedFile rangesBed\n" + "\n" + "Split ranges in a BED into non-overlapping sets for use in cluster jobs.\n" + "Output is a BED 3 of the ranges.\n" + "The bedFile maybe compressed and no ordering is assumed.\n" + "\n" + "options:\n" + "\n", msg); +} + +struct bedInput +/* object to read a bed */ +{ + struct pipeline *pl; /* sorting pipeline */ + struct lineFile *lf; /* lineFile to pipeline */ + struct bed3 *pending; /* next bed to read, if not NULL */ +}; + +static struct pipeline *openBedSortPipe(char *bedFile) +/* open pipeline that sorts bed */ +{ +static char *zcatCmd[] = {"zcat", NULL}; +static char *bzcatCmd[] = {"zcat", NULL}; +static char *sortCmd[] = {"sort", "-k", "1,1", "-k", "2,2n", "-k", "3,3nr", NULL}; +int iCmd = 0; +char **cmds[3]; + +if (endsWith(bedFile, ".gz") || endsWith(bedFile, ".Z")) + cmds[iCmd++] = zcatCmd; +else if (endsWith(bedFile, ".bz2")) + cmds[iCmd++] = bzcatCmd; +cmds[iCmd++] = sortCmd; +cmds[iCmd++] = NULL; + +return pipelineOpen(cmds, pipelineRead, bedFile, NULL); +} + +static struct bedInput *bedInputNew(char *bedFile) +/* create object to read BEDs */ +{ +struct bedInput *bi; +AllocVar(bi); +bi->pl = openBedSortPipe(bedFile); +bi->lf = pipelineLineFile(bi->pl); +return bi; +} + +static void bedInputFree(struct bedInput **biPtr) +/* free bedInput object */ +{ +struct bedInput *bi = *biPtr; +if (bi != NULL) + { + assert(bi->pending == NULL); + pipelineClose(&bi->pl); + freez(biPtr); + } +} + +static struct bed3 *bed3Next(struct lineFile *lf) +/* read a BED3 */ +{ +char *row[3]; +if (!lineFileRow(lf, row)) + return NULL; +return bed3New(row[0], sqlUnsigned(row[1]), sqlUnsigned(row[2])); +} + + +static struct bed3 *bedInputNext(struct bedInput *bi) +/* read next bed */ +{ +struct bed3 *bed = bi->pending; +if (bed != NULL) + bi->pending = NULL; +else + bed = bed3Next(bi->lf); +return bed; +} + +static void bedInputPutBack(struct bedInput *bi, struct bed3 *bed) +/* save bed pending slot. */ +{ +if (bi->pending) + errAbort("only one bed maybe pending"); +bi->pending = bed; +} + +static boolean isOverlapped(struct bed3 *bed, struct bed3 *bedPart) +/* determine if a bed is in the partition */ +{ +return (bed->chromStart < bedPart->chromEnd) && (bed->chromEnd > bedPart->chromStart) + && sameString(bed->chrom, bedPart->chrom); +} + +static struct bed3 *readPartition(struct bedInput *bi) +/* read next set of overlapping beds */ +{ +struct bed3 *bedPart = bedInputNext(bi); +struct bed3 *bed; + +if (bedPart == NULL) + return NULL; /* no more */ +/* add more beds while they overlap, easy since input is sorted */ +while ((bed = bedInputNext(bi)) != NULL) + { + if (isOverlapped(bed, bedPart)) + { + bedPart->chromStart = min(bedPart->chromStart, bed->chromStart); + bedPart->chromEnd = max(bedPart->chromEnd, bed->chromEnd); + bed3Free(&bed); + } + else + { + bedInputPutBack(bi, bed); + break; + } + } + +return bedPart; +} + +static void bedPartition(char *bedFile, char *rangesBed) +/* split BED files into non-overlapping sets */ +{ +struct bedInput *bi = bedInputNew(bedFile); +struct bed3 *bedPart; +FILE *outFh = mustOpen(rangesBed, "w"); + +while ((bedPart = readPartition(bi)) != NULL) + { + fprintf(outFh, "%s\t%d\t%d\n", bedPart->chrom, bedPart->chromStart, bedPart->chromEnd); + bed3Free(&bedPart); + } +carefulClose(&outFh); +bedInputFree(&bi); +} + +int main(int argc, char *argv[]) +/* Process command line. */ +{ +optionInit(&argc, argv, optionSpecs); +if (argc != 3) + usage("wrong # args"); +bedPartition(argv[1], argv[2]); +return 0; +}