99e41c7f76633c836d8788f2f99aa77d996519a2
markd
  Mon Oct 28 12:27:30 2019 -0700
Added bedPartition program to calculate non-overlapping ranges in a BED file

diff --git src/hg/utils/bedPartition/bedPartition.c src/hg/utils/bedPartition/bedPartition.c
new file mode 100644
index 0000000..143e778
--- /dev/null
+++ src/hg/utils/bedPartition/bedPartition.c
@@ -0,0 +1,172 @@
+/* bedPartition - split BED ranges into non-overlapping ranges  */
+
+/* Copyright (C) 2019 The Regents of the University of California 
+ * See README in this or parent directory for licensing information. */
+#include "common.h"
+#include "options.h"
+#include "pipeline.h"
+#include "basicBed.h"
+#include "sqlNum.h"
+#include "dystring.h"
+#include "portable.h"
+
+
+/* command line options and values */
+static struct optionSpec optionSpecs[] =
+{
+    {NULL, 0}
+};
+
+static void usage(char *msg)
+/* Explain usage and exit. */
+{
+errAbort("Error: %s\n"
+  "bedPartition - split BED ranges into non-overlapping ranges\n"
+  "usage:\n"
+  "   bedPartition [options] bedFile rangesBed\n"
+  "\n"
+  "Split ranges in a BED into non-overlapping sets for use in cluster jobs.\n"
+  "Output is a BED 3 of the ranges.\n"
+  "The bedFile maybe compressed and no ordering is assumed.\n"
+  "\n"
+  "options:\n"
+  "\n", msg);
+}
+
+struct bedInput
+/* object to read a bed */
+{
+    struct pipeline *pl;     /* sorting pipeline */
+    struct lineFile *lf;     /* lineFile to pipeline */
+    struct bed3 *pending;     /* next bed to read, if not NULL */
+};
+
+static struct pipeline *openBedSortPipe(char *bedFile)
+/* open pipeline that sorts bed */
+{
+static char *zcatCmd[] = {"zcat", NULL};
+static char *bzcatCmd[] = {"zcat", NULL};
+static char *sortCmd[] = {"sort", "-k", "1,1", "-k", "2,2n", "-k", "3,3nr", NULL};
+int iCmd = 0;
+char **cmds[3];
+
+if (endsWith(bedFile, ".gz") || endsWith(bedFile, ".Z"))
+    cmds[iCmd++] = zcatCmd;
+else if (endsWith(bedFile, ".bz2"))
+    cmds[iCmd++] = bzcatCmd;
+cmds[iCmd++] = sortCmd;
+cmds[iCmd++] = NULL;
+
+return pipelineOpen(cmds, pipelineRead, bedFile, NULL);
+}
+
+static struct bedInput *bedInputNew(char *bedFile)
+/* create object to read BEDs */
+{
+struct bedInput *bi;
+AllocVar(bi);
+bi->pl = openBedSortPipe(bedFile);
+bi->lf = pipelineLineFile(bi->pl);
+return bi;
+}
+
+static void bedInputFree(struct bedInput **biPtr)
+/* free bedInput object */
+{
+struct bedInput *bi = *biPtr;
+if (bi != NULL)
+    {
+    assert(bi->pending == NULL);
+    pipelineClose(&bi->pl);
+    freez(biPtr);
+    }
+}
+
+static struct bed3 *bed3Next(struct lineFile *lf)
+/* read a BED3 */
+{
+char *row[3];
+if (!lineFileRow(lf, row))
+    return NULL;
+return bed3New(row[0], sqlUnsigned(row[1]), sqlUnsigned(row[2]));
+}
+
+
+static struct bed3 *bedInputNext(struct bedInput *bi)
+/* read next bed */
+{
+struct bed3 *bed = bi->pending;
+if (bed != NULL)
+    bi->pending = NULL;
+else
+    bed = bed3Next(bi->lf);
+return bed;
+}
+
+static void bedInputPutBack(struct bedInput *bi, struct bed3 *bed)
+/* save bed pending slot. */
+{
+if (bi->pending)
+    errAbort("only one bed maybe pending");
+bi->pending = bed;
+}
+
+static boolean isOverlapped(struct bed3 *bed, struct bed3 *bedPart)
+/* determine if a bed is in the partition */
+{
+return (bed->chromStart < bedPart->chromEnd) && (bed->chromEnd > bedPart->chromStart)
+    && sameString(bed->chrom, bedPart->chrom);
+}
+
+static struct bed3 *readPartition(struct bedInput *bi)
+/* read next set of overlapping beds */
+{
+struct bed3 *bedPart = bedInputNext(bi);
+struct bed3 *bed;
+
+if (bedPart == NULL)
+    return NULL;  /* no more */
+/* add more beds while they overlap, easy since input is sorted */
+while ((bed = bedInputNext(bi)) != NULL)
+    {
+    if (isOverlapped(bed, bedPart))
+        {
+        bedPart->chromStart = min(bedPart->chromStart, bed->chromStart);
+        bedPart->chromEnd = max(bedPart->chromEnd, bed->chromEnd);
+        bed3Free(&bed);
+        }
+    else
+        {
+        bedInputPutBack(bi, bed);
+        break;
+        }
+    }
+
+return bedPart;
+}
+
+static void bedPartition(char *bedFile, char *rangesBed)
+/* split BED files into non-overlapping sets */
+{
+struct bedInput *bi = bedInputNew(bedFile);
+struct bed3 *bedPart;
+FILE *outFh = mustOpen(rangesBed, "w");
+
+while ((bedPart = readPartition(bi)) != NULL)
+    {
+    fprintf(outFh, "%s\t%d\t%d\n", bedPart->chrom, bedPart->chromStart, bedPart->chromEnd);
+    bed3Free(&bedPart);
+    }
+carefulClose(&outFh);
+bedInputFree(&bi);
+}
+
+int main(int argc, char *argv[])
+/* Process command line. */
+{
+optionInit(&argc, argv, optionSpecs);
+if (argc != 3)
+    usage("wrong # args");
+bedPartition(argv[1], argv[2]);
+return 0;
+}