7cc9ee442be0bb8ab7a90e77566f5df30bdb7277 markd Thu Jan 9 08:29:12 2020 -0800 add option to use parallel sort for pslPartition and bedPartition diff --git src/hg/pslPartition/pslPartition.c src/hg/pslPartition/pslPartition.c index f32a09d..da252d3 100644 --- src/hg/pslPartition/pslPartition.c +++ src/hg/pslPartition/pslPartition.c @@ -1,98 +1,82 @@ /* pslPartition - split PSL files into non-overlapping sets */ /* Copyright (C) 2011 The Regents of the University of California * See README in this or parent directory for licensing information. */ #include "common.h" #include "options.h" -#include "pipeline.h" +#include "partitionSort.h" #include "psl.h" #include "dystring.h" #include "portable.h" /* command line options and values */ static struct optionSpec optionSpecs[] = { {"outLevels", OPTION_INT}, {"partSize", OPTION_INT}, {"dropContained", OPTION_BOOLEAN}, + {"parallel", OPTION_INT}, {NULL, 0} }; static int gOutLevels = 0; static int gPartSize = 20000; static boolean gDropContained = FALSE; +static int gParallel = 0; static void usage(char *msg) /* Explain usage and exit. */ { errAbort("Error: %s\n" "pslPartition - split PSL files into non-overlapping sets\n" "usage:\n" " pslPartition [options] pslFile outDir\n" "\n" "Split psl files into non-overlapping sets for use in cluster jobs,\n" "limiting memory usage, etc. Multiple levels of directories can be are\n" "created under outDir to prevent slow access to huge directories.\n" "The pslFile maybe compressed and no ordering is assumed.\n" "\n" "options:\n" " -outLevels=0 - number of output subdirectory levels. 0 puts all files\n" " directly in outDir, 2, will create files in the form outDir/0/0/00.psl\n" " -partSize=20000 - will combine non-overlapping partitions, while attempting\n" " to keep them under this number of PSLs. This reduces the number of\n" " files that are created while ensuring that there are no overlaps\n" " between any two PSL files. A value of 0 creates a PSL file per set of\n" " overlapping PSLs.\n" " -dropContained - drop PSLs that are completely contained in a block of\n" " another PSL.\n" + " -parallel=n - use this many cores for parallel sorting\n" "\n", msg); } struct pslInput /* object to read a psl */ { struct pipeline *pl; /* sorting pipeline */ struct lineFile *lf; /* lineFile to pipeline */ struct psl *pending; /* next psl to read, if not NULL */ }; -static struct pipeline *openPslSortPipe(char *pslFile) -/* open pipeline that sorts psl */ -{ -static char *zcatCmd[] = {"zcat", NULL}; -static char *bzcatCmd[] = {"zcat", NULL}; -static char *sortCmd[] = {"sort", "-k", "14,14", "-k", "16,16n", "-k", "17,17nr", NULL}; -int iCmd = 0; -char **cmds[3]; - -if (endsWith(pslFile, ".gz") || endsWith(pslFile, ".Z")) - cmds[iCmd++] = zcatCmd; -else if (endsWith(pslFile, ".bz2")) - cmds[iCmd++] = bzcatCmd; -cmds[iCmd++] = sortCmd; -cmds[iCmd++] = NULL; - -return pipelineOpen(cmds, pipelineRead, pslFile, NULL); -} - static struct pslInput *pslInputNew(char *pslFile) /* create object to read PSLs */ { struct pslInput *pi; AllocVar(pi); -pi->pl = openPslSortPipe(pslFile); +pi->pl = partitionSortOpenPipeline(pslFile, 13, 15, 16, gParallel); pi->lf = pipelineLineFile(pi->pl); return pi; } static void pslInputFree(struct pslInput **piPtr) /* free pslInput object */ { struct pslInput *pi = *piPtr; if (pi != NULL) { assert(pi->pending == NULL); pipelineClose(&pi->pl); freez(piPtr); } } @@ -279,18 +263,19 @@ } if (parts.psls != NULL) pslPartsWrite(&parts, outDir); pslInputFree(&pi); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, optionSpecs); if (argc != 3) usage("wrong # args"); gOutLevels = optionInt("outLevels", gOutLevels); gPartSize = optionInt("partSize", gPartSize); gDropContained = optionExists("dropContained"); +gParallel = optionInt("parallel", gParallel); pslPartition(argv[1], argv[2]); return 0; }