aee5cb870a32c26ba410232a896e5dda0e92fb5c braney Wed Feb 19 17:50:40 2025 -0800 add sort option to bedToBigBed refs #29104 diff --git src/utils/bedToBigBed/bedToBigBed.c src/utils/bedToBigBed/bedToBigBed.c index a8dd192d883..2b859727451 100644 --- src/utils/bedToBigBed/bedToBigBed.c +++ src/utils/bedToBigBed/bedToBigBed.c @@ -8,30 +8,31 @@ #include "options.h" #include "dystring.h" #include "obscure.h" #include "asParse.h" #include "basicBed.h" #include "memalloc.h" #include "sig.h" #include "rangeTree.h" #include "zlibFace.h" #include "sqlNum.h" #include "bPlusTree.h" #include "bigBed.h" #include "bbiAlias.h" #include "twoBit.h" #include "portable.h" +#include "errCatch.h" char *version = "2.10"; // when changing, consider changing in bedToBigBed, bedGraphToBigWig, and wigToBigWig /* Version history from 2.6 on at least - * 2.10 - allow chromsomes to be in non-lexicographic order * 2.9 - ability to specify chromAlias bigBed as chromSizes file * 2.8 - Various changes where developer didn't increment version id * 2.7 - Added check for duplicate field names in asParse.c * 2.6 - Made it not crash on empty input. * */ /* Things set directly or indirectly by command lne in main() routine. */ int blockSize = 256; int itemsPerSlot = 512; char *extraIndex = NULL; int bedN = 0; /* number of standard bed fields */ @@ -66,77 +67,81 @@ "twoBitInfo on the assembly .2bit file or the 2bit file or used directly\n" "if the -sizesIs2Bit option is specified.\n" "\n" "The chrom.sizes file may also be a chromAlias bigBed file, or a URL to\n" "such a file, by specifying the -sizesIsChromAliasBb option. When using\n" "a chromAlias bigBed file, the input BED file may have chromosome names\n" "matching any of the sequence name aliases in the chromAlias file.\n" "\n" "For UCSC provided genomes, the chromAlias files can be found under:\n" " https://hgdownload.soe.ucsc.edu/goldenPath/<db>/bigZips/<db>.chromAlias.bb\n" "For UCSC GenArk assembly hubs, the chrom aliases are namedd in the form:\n" " https://hgdownload.soe.ucsc.edu/hubs/GCF/006/542/625/GCF_006542625.1/GCF_006542625.1.chromAlias.bb\n" "For a description of generating chromAlias files for your own assembly hub, see:\n" " http://genomewiki.ucsc.edu/index.php/Chrom_Alias\n" "\n" - "The in.bed file must be sorted by chromosome,start,\n" + "Without the -sort option the in.bed file must be sorted by chromosome,start,\n" " to sort a bed file, use the unix sort command:\n" " sort -k1,1 -k2,2n unsorted.bed > sorted.bed\n" "Sequences must be sorted by name so all sequences with the same name\n" "are collected together, but they don't need to be in any particular order.\n" + "If the -sort=tempName option is specified then the input file will be sorted using\n" + "using a file called tempName, which will be removed after use.\n" "\n" "options:\n" " -type=bedN[+[P]] : \n" " N is between 3 and 15, \n" " optional (+) if extra \"bedPlus\" fields, \n" " optional P specifies the number of extra fields. Not required, but preferred.\n" " Examples: -type=bed6 or -type=bed6+ or -type=bed6+3 \n" " (see http://genome.ucsc.edu/FAQ/FAQformat.html#format1)\n" " -as=fields.as - If you have non-standard \"bedPlus\" fields, it's great to put a definition\n" " of each field in a row in AutoSql format here.\n" " -blockSize=N - Number of items to bundle in r-tree. Default %d\n" " -itemsPerSlot=N - Number of data points bundled at lowest level. Default %d\n" " -unc - If set, do not use compression.\n" " -tab - If set, expect fields to be tab separated, normally\n" " expects white space separator.\n" " -extraIndex=fieldList - If set, make an index on each field in a comma separated list\n" " extraIndex=name and extraIndex=name,id are commonly used.\n" " -sizesIs2Bit -- If set, the chrom.sizes file is assumed to be a 2bit file.\n" " -sizesIsChromAliasBb -- If set, then chrom.sizes file is assumed to be a chromAlias\n" " bigBed file or a URL to a such a file (see above).\n" " -sizesIsBb -- Obsolete name for -sizesIsChromAliasBb.\n" " -udcDir=/path/to/udcCacheDir -- sets the UDC cache dir for caching of remote files.\n" " -allow1bpOverlap -- allow exons to overlap by at most one base pair\n" " -maxAlloc=N -- Set the maximum memory allocation size to N bytes\n" + " -sort=tempFile -- sort the input file using tempFile. Removes tempFile afterwards. Use something like $TMPDIR/fileName\n" , version, bbiCurrentVersion, blockSize, itemsPerSlot ); } static struct optionSpec options[] = { {"blockSize", OPTION_INT}, {"itemsPerSlot", OPTION_INT}, {"type", OPTION_STRING}, {"as", OPTION_STRING}, {"unc", OPTION_BOOLEAN}, {"tab", OPTION_BOOLEAN}, {"sizesIs2Bit", OPTION_BOOLEAN}, {"sizesIsChromAliasBb", OPTION_BOOLEAN}, {"sizesIsBb", OPTION_BOOLEAN}, {"extraIndex", OPTION_STRING}, {"udcDir", OPTION_STRING}, {"allow1bpOverlap", OPTION_BOOLEAN}, {"maxAlloc", OPTION_LONG_LONG}, + {"sort", OPTION_STRING}, {NULL, 0}, }; static struct lineFile *rewindFile(char *inName, struct lineFile *lf) /* set up lineFile to point at the beginning of the file. It we're reading from a decompressing * pipe, we need to close and reopen the pipe. */ { if (lf->pl) { lineFileClose(&lf); lf = lineFileOpen(inName, TRUE); } else lineFileRewind(lf); @@ -941,21 +946,45 @@ * or if more than or maximum defined fields, then for bed15+ */ bedN = fieldCount; if (bedN > bedKnownFields) { bedP = bedN - bedKnownFields; bedN = bedKnownFields; } } /* Make sure that fields are defined, from bed spec if nowhere else. */ if (asFile) readInGulp(asFile, &asText, NULL); else asText = bedAsDef(bedN, bedN + bedP); +char *sortFile = optionVal("sort", NULL); +if (sortFile) + { + char sysBuf[4096]; + + safef(sysBuf, sizeof sysBuf, "sort -k1,1 -k2,2n %s > %s", bedFileName, sortFile); + if (system(sysBuf) != 0) + errAbort("Couldn't sort %s into %s. Error code %d\n", bedFileName, sortFile, errno); + + bedFileName = sortFile; + } + +struct errCatch *errCatch = errCatchNew(); +if (errCatchStart(errCatch)) + { bedToBigBed(bedFileName, argv[2], argv[3]); + } +if (errCatch->gotError || errCatch->gotWarning) + { + fprintf(stderr, "%s",errCatch->message->string); + } +errCatchEnd(errCatch); + +if (sortFile) + unlink(sortFile); optionFree(); if (verboseLevel() > 1) printVmPeak(); return 0; }