src/utils/bedGraphToBigWig/bedGraphToBigWig.c 1.20
1.20 2009/11/13 19:02:38 kent
Adding compression to bigBed. Improving bigWigInfo and bigBedInfo a little.
Index: src/utils/bedGraphToBigWig/bedGraphToBigWig.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/utils/bedGraphToBigWig/bedGraphToBigWig.c,v
retrieving revision 1.19
retrieving revision 1.20
diff -b -B -U 4 -r1.19 -r1.20
--- src/utils/bedGraphToBigWig/bedGraphToBigWig.c 12 Nov 2009 23:15:52 -0000 1.19
+++ src/utils/bedGraphToBigWig/bedGraphToBigWig.c 13 Nov 2009 19:02:38 -0000 1.20
@@ -4,19 +4,22 @@
#include "localmem.h"
#include "hash.h"
#include "options.h"
#include "sqlNum.h"
+#include "dystring.h"
#include "cirTree.h"
#include "sig.h"
+#include "zlibFace.h"
#include "bPlusTree.h"
#include "bbiFile.h"
#include "bwgInternal.h"
#include "bigWig.h"
static char const rcsid[] = "$Id$";
-int blockSize = 256;
-int itemsPerSlot = 1024;
+static int blockSize = 256;
+static int itemsPerSlot = 1024;
+static boolean doCompress = FALSE;
void usage()
/* Explain usage and exit. */
@@ -31,15 +34,17 @@
"and out.bw is the output indexed big wig file.\n"
"options:\n"
" -blockSize=N - Number of items to bundle in r-tree. Default %d\n"
" -itemsPerSlot=N - Number of data points bundled at lowest level. Default %d\n"
+ " -compress - If set use zlib compression."
, bbiCurrentVersion, blockSize, itemsPerSlot
);
}
static struct optionSpec options[] = {
{"blockSize", OPTION_INT},
{"itemsPerSlot", OPTION_INT},
+ {"compress", OPTION_BOOLEAN},
{NULL, 0},
};
struct sectionItem
@@ -50,12 +55,14 @@
};
void writeSections(struct bbiChromUsage *usageList, struct lineFile *lf,
int itemsPerSlot, struct bbiBoundsArray *bounds, int sectionCount, FILE *f,
- int resTryCount, int resScales[], int resSizes[])
+ int resTryCount, int resScales[], int resSizes[],
+ boolean doCompress, bits32 *retMaxSectionSize)
/* Read through lf, chunking it into sections that get written to f. Save info
* about sections in bounds. */
{
+int maxSectionSize = 0;
struct bbiChromUsage *usage = usageList;
int itemIx = 0, sectionIx = 0;
bits32 reserved32 = 0;
UBYTE reserved8 = 0;
@@ -64,8 +71,10 @@
bits32 resEnds[resTryCount];
int resTry;
for (resTry = 0; resTry < resTryCount; ++resTry)
resEnds[resTry] = 0;
+struct dyString *stream = dyStringNew(0);
+
for (;;)
{
/* Get next line of input if any. */
char *row[5];
@@ -89,29 +98,44 @@
section->range.chromIx = chromId;
section->range.start = sectionStart;
section->range.end = sectionEnd;
- /* Output section header to file. */
+ /* Output section header to stream. */
+ dyStringClear(stream);
UBYTE type = bwgTypeBedGraph;
bits16 itemCount = itemIx;
- writeOne(f, chromId); // chromId
- writeOne(f, sectionStart); // start
- writeOne(f, sectionEnd); // end
- writeOne(f, reserved32); // itemStep
- writeOne(f, reserved32); // itemSpan
- writeOne(f, type); // type
- writeOne(f, reserved8); // reserved
- writeOne(f, itemCount); // itemCount
+ dyStringWriteOne(stream, chromId); // chromId
+ dyStringWriteOne(stream, sectionStart); // start
+ dyStringWriteOne(stream, sectionEnd); // end
+ dyStringWriteOne(stream, reserved32); // itemStep
+ dyStringWriteOne(stream, reserved32); // itemSpan
+ dyStringWriteOne(stream, type); // type
+ dyStringWriteOne(stream, reserved8); // reserved
+ dyStringWriteOne(stream, itemCount); // itemCount
- /* Output each item in section to file. */
+ /* Output each item in section to stream. */
int i;
for (i=0; i<itemIx; ++i)
{
struct sectionItem *item = &items[i];
- writeOne(f, item->start);
- writeOne(f, item->end);
- writeOne(f, item->val);
+ dyStringWriteOne(stream, item->start);
+ dyStringWriteOne(stream, item->end);
+ dyStringWriteOne(stream, item->val);
+ }
+
+ /* Save stream to file, compressing if need be. */
+ if (stream->stringSize > maxSectionSize)
+ maxSectionSize = stream->stringSize;
+ if (doCompress)
+ {
+ size_t maxCompSize = zCompBufSize(stream->stringSize);
+ char compBuf[maxCompSize];
+ int compSize = zCompress(stream->string, stream->stringSize, compBuf, maxCompSize);
+ mustWrite(f, compBuf, compSize);
}
+ else
+ mustWrite(f, stream->string, stream->stringSize);
+
/* If at end of input we are done. */
if (rowSize == 0)
break;
@@ -172,13 +196,15 @@
lastB = b;
itemIx += 1;
}
assert(sectionIx == sectionCount);
+
+*retMaxSectionSize = maxSectionSize;
}
static struct bbiSummary *writeReducedOnceReturnReducedTwice(struct bbiChromUsage *usageList,
struct lineFile *lf, bits32 initialReduction, bits32 initialReductionCount,
- int zoomIncrement, int blockSize, int itemsPerSlot,
+ int zoomIncrement, int blockSize, int itemsPerSlot, boolean doCompress,
struct lm *lm, FILE *f, bits64 *retDataStart, bits64 *retIndexStart,
struct bbiSummaryElement *totalSum)
/* Write out data reduced by factor of initialReduction. Also calculate and keep in memory
* next reduction level. This is more work than some ways, but it keeps us from having to
@@ -194,8 +220,10 @@
*retDataStart = ftell(f);
writeOne(f, initialReductionCount);
boolean firstRow = TRUE;
+
+struct bbiSumOutStream *stream = bbiSumOutStreamOpen(itemsPerSlot, f, doCompress);
for (;;)
{
/* Get next line of input if any. */
char *row[5];
@@ -204,9 +232,9 @@
/* Output last section and break if at end of file. */
if (rowSize == 0 && sum != NULL)
{
bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize,
- &boundsPt, boundsEnd, usage->size, lm, f);
+ &boundsPt, boundsEnd, usage->size, lm, stream);
break;
}
/* Parse out row. */
@@ -238,17 +266,17 @@
if (differentString(chrom, usage->name))
{
usage = usage->next;
bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize,
- &boundsPt, boundsEnd, usage->size, lm, f);
+ &boundsPt, boundsEnd, usage->size, lm, stream);
sum = NULL;
}
/* If start past existing block then output it. */
else if (sum != NULL && sum->end <= start)
{
bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize,
- &boundsPt, boundsEnd, usage->size, lm, f);
+ &boundsPt, boundsEnd, usage->size, lm, stream);
sum = NULL;
}
/* If don't have a summary we're working on now, make one. */
@@ -276,9 +304,9 @@
if (sum->maxVal < val) sum->maxVal = val;
sum->sumData += val * overlap;
sum->sumSquares += val*val * overlap;
bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize,
- &boundsPt, boundsEnd, usage->size, lm, f);
+ &boundsPt, boundsEnd, usage->size, lm, stream);
size -= overlap;
/* Move summary to next part. */
sum->start = start = sum->end;
@@ -295,8 +323,9 @@
if (sum->maxVal < val) sum->maxVal = val;
sum->sumData += val * size;
sum->sumSquares += val*val * size;
}
+bbiSumOutStreamClose(&stream);
/* Write out 1st zoom index. */
int indexOffset = *retIndexStart = ftell(f);
assert(boundsPt == boundsEnd);
@@ -318,8 +347,9 @@
verbose(2, "%d chroms in %s\n", chromSizesHash->elCount, chromSizes);
int minDiff = 0, i;
double aveSize = 0;
bits64 bedCount = 0;
+bits32 uncompressBufSize = 0;
struct bbiChromUsage *usageList = bbiChromUsageFromBedFile(lf, chromSizesHash, &minDiff, &aveSize, &bedCount);
verboseTime(2, "pass1");
verbose(2, "%d chroms in %s\n", slCount(usageList), inName);
@@ -361,10 +391,11 @@
writeOne(f, sectionCount);
struct bbiBoundsArray *boundsArray;
AllocArray(boundsArray, sectionCount);
lineFileRewind(lf);
+bits32 maxSectionSize = 0;
writeSections(usageList, lf, itemsPerSlot, boundsArray, sectionCount, f,
- resTryCount, resScales, resSizes);
+ resTryCount, resScales, resSizes, doCompress, &maxSectionSize);
verboseTime(2, "pass2");
/* Write out primary data index. */
bits64 indexOffset = ftell(f);
@@ -389,8 +420,10 @@
/* Figure out initialReduction for zoom. */
for (resTry = 0; resTry < resTryCount; ++resTry)
{
bits64 reducedSize = resSizes[resTry] * sizeof(struct bbiSummaryOnDisk);
+ if (doCompress)
+ reducedSize /= 2; // Estimate!
if (reducedSize <= maxReducedSize)
{
initialReduction = resScales[resTry];
initialReducedCount = resSizes[resTry];
@@ -406,9 +439,9 @@
int zoomIncrement = 4;
lineFileRewind(lf);
struct bbiSummary *rezoomedList = writeReducedOnceReturnReducedTwice(usageList,
lf, initialReduction, initialReducedCount,
- resIncrement, blockSize, itemsPerSlot, lm,
+ resIncrement, blockSize, itemsPerSlot, doCompress, lm,
f, &zoomDataOffsets[0], &zoomIndexOffsets[0], &totalSum);
verboseTime(2, "writeReducedOnceReturnReducedTwice");
zoomAmounts[0] = initialReduction;
zoomLevels = 1;
@@ -422,9 +455,9 @@
break;
zoomCount = rezoomCount;
zoomDataOffsets[zoomLevels] = ftell(f);
zoomIndexOffsets[zoomLevels] = bbiWriteSummaryAndIndex(rezoomedList,
- blockSize, itemsPerSlot, FALSE, f);
+ blockSize, itemsPerSlot, doCompress, f);
zoomAmounts[zoomLevels] = reduction;
++zoomLevels;
reduction *= zoomIncrement;
rezoomedList = bbiSummarySimpleReduce(rezoomedList, reduction, lm);
@@ -434,14 +467,21 @@
}
}
+/* Figure out buffer size needed for uncompression if need be. */
+if (doCompress)
+ {
+ int maxZoomUncompSize = itemsPerSlot * sizeof(struct bbiSummaryOnDisk);
+ uncompressBufSize = max(maxSectionSize, maxZoomUncompSize);
+ }
+
/* Go back and rewrite header. */
rewind(f);
bits32 sig = bigWigSig;
bits16 version = bbiCurrentVersion;
bits16 summaryCount = zoomLevels;
-bits32 reserved16 = 0;
+bits16 reserved16 = 0;
bits32 reserved32 = 0;
bits64 reserved64 = 0;
/* Write fixed header */
@@ -454,10 +494,12 @@
writeOne(f, reserved16); // fieldCount
writeOne(f, reserved16); // definedFieldCount
writeOne(f, reserved64); // autoSqlOffset
writeOne(f, totalSummaryOffset);
-for (i=0; i<3; ++i)
+writeOne(f, uncompressBufSize);
+for (i=0; i<2; ++i)
writeOne(f, reserved32);
+assert(ftell(f) == 64);
/* Write summary headers with data. */
verbose(2, "Writing %d levels of zoom\n", zoomLevels);
for (i=0; i<zoomLevels; ++i)
@@ -490,8 +532,9 @@
{
optionInit(&argc, argv, options);
blockSize = optionInt("blockSize", blockSize);
itemsPerSlot = optionInt("itemsPerSlot", itemsPerSlot);
+doCompress = optionExists("compress");
if (argc != 4)
usage();
bedGraphToBigWig(argv[1], argv[2], argv[3]);
return 0;