30b1ed267a38a0136b3463d3638ce108bfb62960 markd Tue Mar 18 17:29:07 2025 -0700 bedPartition: add -minPartitionItems, -partMergeDist, and statistics output diff --git src/hg/utils/bedPartition/bedPartition.c src/hg/utils/bedPartition/bedPartition.c index b2f71ce3f0e..24b16b0a403 100644 --- src/hg/utils/bedPartition/bedPartition.c +++ src/hg/utils/bedPartition/bedPartition.c @@ -1,56 +1,64 @@ /* bedPartition - split BED ranges into non-overlapping ranges */ /* Copyright (C) 2019 The Regents of the University of California * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */ +#include #include "common.h" #include "options.h" +#include "verbose.h" #include "partitionSort.h" #include "basicBed.h" #include "sqlNum.h" #include "dystring.h" #include "portable.h" /* command line options and values */ static struct optionSpec optionSpecs[] = { - {"partMergeSize", OPTION_INT}, + {"minPartitionItems", OPTION_INT}, + {"partMergeDist", OPTION_INT}, + {"partMergeSize", OPTION_INT}, // obsolete {"parallel", OPTION_INT}, {NULL, 0} }; -static int gPartMergeSize = 0; +static int gMinPartitionItems = 0; +static int gPartMergeDist = 0; static int gParallel = 0; static void usage() /* Explain usage and exit. */ { errAbort("bedPartition - split BED ranges into non-overlapping ranges\n" "usage:\n" " bedPartition [options] bedFile rangesBed\n" "\n" "Split ranges in a BED into non-overlapping sets for use in cluster jobs.\n" "Output is a BED 4 of the ranges and a generated name.\n" "The bedFile maybe compressed and no ordering is assumed.\n" "\n" "options:\n" - " -partMergeSize=0 - will combine adjacent non-overlapping partitions, that are\n" - " separated by no more that this number of bases.\n" - " per set of overlapping records.\n" + " -verbose=1 - print statistics if >= 1\n" + " -minPartitionItems=0 - minimum number of input items in a partition. Partitions with\n" + " fewer items will merged with into subsequent partitions\n" + " -partMergeDist=0 - will combine adjacent non-overlapping partitions that are\n" + " separated by no more that this number of bases. -partMergeSize is an obsolete\n" + " name for this option.\n" " -parallel=n - use this many cores for parallel sorting\n" "notes:\n" - " - Generate name is useful for testing if two ranges are in the same partition\n"); + " - Generate name is useful for identifying partition\n"); } struct bedInput /* object to read a bed */ { struct pipeline *pl; /* sorting pipeline */ struct lineFile *lf; /* lineFile to pipeline */ struct bed3 *pending; /* next bed to read, if not NULL */ }; static struct bedInput *bedInputNew(char *bedFile) /* create object to read BEDs */ { struct bedInput *bi; AllocVar(bi); @@ -66,35 +74,31 @@ struct bedInput *bi = *biPtr; if (bi != NULL) { assert(bi->pending == NULL); pipelineClose(&bi->pl); freez(biPtr); } } static struct bed3 *bed3Next(struct lineFile *lf) /* read a BED3 */ { char *row[3]; if (!lineFileRow(lf, row)) return NULL; -// adjust start to allow for merge padding -unsigned start = sqlUnsigned(row[1]); -if (start > gPartMergeSize) // don't underflow - start -= gPartMergeSize; -return bed3New(row[0], start, sqlUnsigned(row[2])); +return bed3New(row[0], sqlUnsigned(row[1]), sqlUnsigned(row[2])); } static struct bed3 *bedInputNext(struct bedInput *bi) /* read next bed */ { struct bed3 *bed = bi->pending; if (bed != NULL) bi->pending = NULL; else bed = bed3Next(bi->lf); return bed; } static void bedInputPutBack(struct bedInput *bi, struct bed3 *bed) @@ -106,72 +110,110 @@ } static boolean sameChrom(struct bed3 *bed, struct bed3 *bedPart) /* chrom the same? */ { return sameString(bed->chrom, bedPart->chrom); } static boolean isOverlapped(struct bed3 *bed, struct bed3 *bedPart) /* determine if a bed is in the partition */ { return (bed->chromStart < bedPart->chromEnd) && (bed->chromEnd > bedPart->chromStart) && sameChrom(bed, bedPart); } +static boolean shouldMergeAdjacent(struct bed3 *bed, struct bed3 *bedPart) +/* should this be merged with adjacent due to distance */ +{ +return (bedPart->chromEnd < bed->chromStart) && + ((bed->chromStart - bedPart->chromEnd) < gPartMergeDist); +} + +static boolean inclInPartation(struct bed3 *bed, struct bed3 *bedPart, + int currentItemCnt) +/* should this BED be included in the current partition? */ +{ +return sameChrom(bed, bedPart) && + ((isOverlapped(bed, bedPart) || (currentItemCnt < gMinPartitionItems) || + shouldMergeAdjacent(bed, bedPart))); +} -static struct bed3 *readPartition(struct bedInput *bi) +static struct bed3 *readPartition(struct bedInput *bi, int *itemCntP, int *minPartItemsP, + int *maxPartItemsP) /* read next set of overlapping beds */ { struct bed3 *bedPart = bedInputNext(bi); struct bed3 *bed; +int currentItemCnt = 0; if (bedPart == NULL) return NULL; /* no more */ +currentItemCnt++; + /* add more beds while they overlap, due to way input is sorted * with by start and then reverse end */ - while ((bed = bedInputNext(bi)) != NULL) { - if (isOverlapped(bed, bedPart)) + if (inclInPartation(bed, bedPart, currentItemCnt)) { bedPart->chromStart = min(bedPart->chromStart, bed->chromStart); bedPart->chromEnd = max(bedPart->chromEnd, bed->chromEnd); bed3Free(&bed); + currentItemCnt++; } else { bedInputPutBack(bi, bed); break; } } + +(*itemCntP) += currentItemCnt; +*minPartItemsP = min(*minPartItemsP, currentItemCnt); +*maxPartItemsP = max(*maxPartItemsP, currentItemCnt); return bedPart; } static void bedPartition(char *bedFile, char *rangesBed) /* split BED files into non-overlapping sets */ { struct bedInput *bi = bedInputNew(bedFile); struct bed3 *bedPart; FILE *outFh = mustOpen(rangesBed, "w"); -int ibed = 0; -while ((bedPart = readPartition(bi)) != NULL) - { - fprintf(outFh, "%s\t%d\t%d\tP%d\n", bedPart->chrom, bedPart->chromStart, bedPart->chromEnd, ibed); - ibed++; +int partCnt = 0; +int itemCnt = 0; +int minPartItems = INT_MAX; +int maxPartItems = 0; +while ((bedPart = readPartition(bi, &itemCnt, &minPartItems, &maxPartItems)) != NULL) + { + fprintf(outFh, "%s\t%d\t%d\tP%d\n", bedPart->chrom, bedPart->chromStart, bedPart->chromEnd, partCnt); + partCnt++; bed3Free(&bedPart); } carefulClose(&outFh); bedInputFree(&bi); +verbose(1, "Number of items: %d\n", itemCnt); +verbose(1, "Number of partitions: %d\n", partCnt); +verbose(1, "Min items per partition: %d\n", minPartItems); +verbose(1, "Max items per partition: %d\n", maxPartItems); +if (partCnt > 0) + verbose(1, "Mean items per partition: %0.1f\n", (double)itemCnt / (double)partCnt); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, optionSpecs); if (argc != 3) usage(); -gPartMergeSize = optionInt("partMergeSize", gPartMergeSize); +verboseSetLevel(1); +gMinPartitionItems = optionInt("minPartitionItems", gMinPartitionItems); +if (optionExists("partMergeSize")) + fprintf(stderr, "Note: -partMergeSize is an obsolete name, use -partMergeDist\n"); +// if both are specified, partMergeDist wins +gPartMergeDist = optionInt("partMergeSize", gPartMergeDist); +gPartMergeDist = optionInt("partMergeDist", gPartMergeDist); gParallel = optionInt("parallel", gParallel); bedPartition(argv[1], argv[2]); return 0; }