8fdef9e866ecc228bfe4ea54102d0079818f0366 braney Fri Apr 8 14:49:38 2022 -0700 let bedGraphToBigWig use chromAlias.bb as chromSizes file. Add tests to both bedGraphToBigWig and bedToBigBed diff --git src/utils/bedGraphToBigWig/bedGraphToBigWig.c src/utils/bedGraphToBigWig/bedGraphToBigWig.c index deb33ac..6e89e40 100644 --- src/utils/bedGraphToBigWig/bedGraphToBigWig.c +++ src/utils/bedGraphToBigWig/bedGraphToBigWig.c @@ -8,71 +8,75 @@ #include "linefile.h" #include "localmem.h" #include "hash.h" #include "options.h" #include "sqlNum.h" #include "dystring.h" #include "cirTree.h" #include "sig.h" #include "zlibFace.h" #include "bPlusTree.h" #include "bbiFile.h" #include "bwgInternal.h" #include "bigWig.h" -char *version = "2.8"; // when changing, change in bedToBigBed, bedGraphToBigWig, and wigToBigWig +char *version = "2.9"; // when changing, change in bedToBigBed, bedGraphToBigWig, and wigToBigWig /* Version history from 2.8 on at least - + * 2.9 - ability to specify chromAlias bigBed as chromSizes file * 2.8 sync up version numbers with bedToBigBed */ static int blockSize = 256; static int itemsPerSlot = 1024; static boolean doCompress = FALSE; static int maxGigs = 100; // Maximum number of gigs to allocate in one block. // Undocumented on purpose. +static boolean sizesIsBb = FALSE; void usage() /* Explain usage and exit. */ { errAbort( "bedGraphToBigWig v %s - Convert a bedGraph file to bigWig format (bbi version: %d).\n" "usage:\n" " bedGraphToBigWig in.bedGraph chrom.sizes out.bw\n" "where in.bedGraph is a four column file in the format:\n" " <chrom> <start> <end> <value>\n" "and chrom.sizes is a two-column file/URL: <chromosome name> <size in bases>\n" "and out.bw is the output indexed big wig file.\n" "If the assembly <db> is hosted by UCSC, chrom.sizes can be a URL like\n" " http://hgdownload.soe.ucsc.edu/goldenPath/<db>/bigZips/<db>.chrom.sizes\n" "or you may use the script fetchChromSizes to download the chrom.sizes file.\n" "If not hosted by UCSC, a chrom.sizes file can be generated by running\n" "twoBitInfo on the assembly .2bit file.\n" "The input bedGraph file must be sorted, use the unix sort command:\n" " sort -k1,1 -k2,2n unsorted.bedGraph > sorted.bedGraph\n" "options:\n" " -blockSize=N - Number of items to bundle in r-tree. Default %d\n" " -itemsPerSlot=N - Number of data points bundled at lowest level. Default %d\n" + " -sizesIsBb -- If set, the chrom.sizes file is assumed to be a bigBed file.\n" " -unc - If set, do not use compression." , version, bbiCurrentVersion, blockSize, itemsPerSlot ); } static struct optionSpec options[] = { {"blockSize", OPTION_INT}, {"itemsPerSlot", OPTION_INT}, + {"sizesIsBb", OPTION_BOOLEAN}, {"unc", OPTION_BOOLEAN}, {"maxGigs", OPTION_INT}, {NULL, 0}, }; struct sectionItem /* An item in a section of a bedGraph. */ { bits32 start, end; /* Position in chromosome, half open. */ float val; /* Single precision value. */ }; void writeSections(struct bbiChromUsage *usageList, struct lineFile *lf, int itemsPerSlot, struct bbiBoundsArray *bounds, int sectionCount, FILE *f, int resTryCount, int resScales[], int resSizes[], @@ -360,38 +364,45 @@ assert(boundsPt == boundsEnd); cirTreeFileBulkIndexToOpenFile(boundsArray, sizeof(boundsArray[0]), initialReductionCount, blockSize, itemsPerSlot, NULL, bbiBoundsArrayFetchKey, bbiBoundsArrayFetchOffset, indexOffset, f); freez(&boundsArray); slReverse(&twiceReducedList); return twiceReducedList; } void bedGraphToBigWig(char *inName, char *chromSizes, char *outName) /* bedGraphToBigWig - Convert a bedGraph program to bigWig.. */ { verboseTimeInit(); struct lineFile *lf = lineFileOpen(inName, TRUE); -struct hash *chromSizesHash = bbiChromSizesFromFile(chromSizes); -verbose(2, "%d chroms in %s\n", chromSizesHash->elCount, chromSizes); int minDiff = 0, i; double aveSize = 0; bits64 bedCount = 0; bits32 uncompressBufSize = 0; -struct bbiChromUsage *usageList = bbiChromUsageFromBedFile(lf, chromSizesHash, NULL, +struct bbiChromUsage *usageList; + +if (sizesIsBb) + usageList = bbiChromUsageFromBedFileAlias(lf, chromSizes, NULL, &minDiff, &aveSize, &bedCount, FALSE); +else + { + struct hash *chromSizesHash = bbiChromSizesFromFile(chromSizes); + verbose(2, "%d chroms in %s\n", chromSizesHash->elCount, chromSizes); + usageList = bbiChromUsageFromBedFile(lf, chromSizesHash, NULL, &minDiff, &aveSize, &bedCount, FALSE); + } verboseTime(2, "pass1"); verbose(2, "%d chroms in %s, minDiff=%d, aveSize=%g, bedCount=%lld\n", slCount(usageList), inName, minDiff, aveSize, bedCount); /* Write out dummy header, zoom offsets. */ FILE *f = mustOpen(outName, "wb"); bbiWriteDummyHeader(f); bbiWriteDummyZooms(f); /* Write out dummy total summary. */ struct bbiSummaryElement totalSum; ZeroVar(&totalSum); bits64 totalSummaryOffset = ftell(f); bbiSummaryElementWrite(f, &totalSum); @@ -493,23 +504,24 @@ fseek(f, 0L, SEEK_END); writeOne(f, sig); lineFileClose(&lf); carefulClose(&f); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); maxGigs = optionInt("maxGigs", maxGigs); setMaxAlloc(maxGigs*1000000000L); blockSize = optionInt("blockSize", blockSize); itemsPerSlot = optionInt("itemsPerSlot", itemsPerSlot); +sizesIsBb = optionExists("sizesIsBb"); doCompress = !optionExists("unc"); if (argc != 4) usage(); bedGraphToBigWig(argv[1], argv[2], argv[3]); if (verboseLevel() > 1) printVmPeak(); return 0; }