src/lib/bwgCreate.c 1.20
1.20 2009/11/12 23:15:52 kent
First cut of compressed bigWig/bigBed stuff. So far read side should be complete including Genome Browser, Table Browser, wigToBigWig and bigWig utility functions. Still to do bedGraphToBigWig and bedToBigBed.
Index: src/lib/bwgCreate.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/lib/bwgCreate.c,v
retrieving revision 1.19
retrieving revision 1.20
diff -b -B -U 4 -r1.19 -r1.20
--- src/lib/bwgCreate.c 7 Nov 2009 19:27:02 -0000 1.19
+++ src/lib/bwgCreate.c 12 Nov 2009 23:15:52 -0000 1.20
@@ -7,8 +7,9 @@
#include "localmem.h"
#include "errabort.h"
#include "sqlNum.h"
#include "sig.h"
+#include "zlibFace.h"
#include "bPlusTree.h"
#include "cirTree.h"
#include "bbiFile.h"
#include "bwgInternal.h"
@@ -105,10 +106,10 @@
sum->chromId, sum->start, sum->end, sum->minVal, sum->maxVal, sum->sumData,
sum->sumSquares, sum->validCount, sum->sumData/sum->validCount);
}
-static void bwgSectionWrite(struct bwgSection *section, FILE *f)
-/* Write out section to file, filling in section->fileOffset. */
+static int bwgSectionWriteUnc(struct bwgSection *section, FILE *f)
+/* Write out compressed section to file, filling in section->fileOffset. */
{
UBYTE type = section->type;
UBYTE reserved8 = 0;
@@ -160,8 +161,107 @@
default:
internalErr();
break;
}
+return 0;
+}
+
+static int bwgSectionWriteComp(struct bwgSection *section, FILE *f)
+/* Write out compressed section to file, filling in section->fileOffset.
+ * Returns uncompressed size. */
+{
+UBYTE type = section->type;
+UBYTE reserved8 = 0;
+int itemSize;
+switch (section->type)
+ {
+ case bwgTypeBedGraph:
+ itemSize = 12;
+ break;
+ case bwgTypeVariableStep:
+ itemSize = 8;
+ break;
+ case bwgTypeFixedStep:
+ itemSize = 4;
+ break;
+ default:
+ itemSize = 0; // Suppress compiler warning
+ internalErr();
+ break;
+ }
+int fixedSize = sizeof(section->chromId) + sizeof(section->start) + sizeof(section->end) +
+ sizeof(section->itemStep) + sizeof(section->itemSpan) + sizeof(type) + sizeof(reserved8) +
+ sizeof(section->itemCount);
+int bufSize = section->itemCount * itemSize + fixedSize;
+char buf[bufSize];
+char *bufPt = buf;
+
+section->fileOffset = ftell(f);
+memWriteOne(&bufPt, section->chromId);
+memWriteOne(&bufPt, section->start);
+memWriteOne(&bufPt, section->end);
+memWriteOne(&bufPt, section->itemStep);
+memWriteOne(&bufPt, section->itemSpan);
+memWriteOne(&bufPt, type);
+memWriteOne(&bufPt, reserved8);
+memWriteOne(&bufPt, section->itemCount);
+
+int i;
+switch (section->type)
+ {
+ case bwgTypeBedGraph:
+ {
+ struct bwgBedGraphItem *item = section->items.bedGraphList;
+ for (item = section->items.bedGraphList; item != NULL; item = item->next)
+ {
+ memWriteOne(&bufPt, item->start);
+ memWriteOne(&bufPt, item->end);
+ memWriteOne(&bufPt, item->val);
+ }
+ break;
+ }
+ case bwgTypeVariableStep:
+ {
+ struct bwgVariableStepPacked *items = section->items.variableStepPacked;
+ for (i=0; i<section->itemCount; ++i)
+ {
+ memWriteOne(&bufPt, items->start);
+ memWriteOne(&bufPt, items->val);
+ items += 1;
+ }
+ break;
+ }
+ case bwgTypeFixedStep:
+ {
+ struct bwgFixedStepPacked *items = section->items.fixedStepPacked;
+ for (i=0; i<section->itemCount; ++i)
+ {
+ memWriteOne(&bufPt, items->val);
+ items += 1;
+ }
+ break;
+ }
+ default:
+ internalErr();
+ break;
+ }
+assert(bufSize == (bufPt - buf) );
+
+size_t maxCompSize = zCompBufSize(bufSize);
+char compBuf[maxCompSize];
+int compSize = zCompress(buf, bufSize, compBuf, maxCompSize);
+mustWrite(f, compBuf, compSize);
+return bufSize;
+}
+
+
+static int bwgSectionWrite(struct bwgSection *section, boolean doCompress, FILE *f)
+/* Write out section to file, filling in section->fileOffset. */
+{
+if (doCompress)
+ return bwgSectionWriteComp(section, f);
+else
+ return bwgSectionWriteUnc(section, f);
}
static int bwgSectionCmp(const void *va, const void *vb)
@@ -795,9 +895,9 @@
return outList;
}
static void bwgCreate(struct bwgSection *sectionList, struct hash *chromSizeHash,
- int blockSize, int itemsPerSlot, char *fileName)
+ int blockSize, int itemsPerSlot, boolean doCompress, char *fileName)
/* Create a bigWig file out of a sorted sectionList. */
{
bits64 sectionCount = slCount(sectionList);
FILE *f = mustOpen(fileName, "wb");
@@ -810,8 +910,10 @@
bits64 dataOffset = 0, dataOffsetPos;
bits64 indexOffset = 0, indexOffsetPos;
bits64 chromTreeOffset = 0, chromTreeOffsetPos;
bits64 totalSummaryOffset = 0, totalSummaryOffsetPos;
+bits32 uncompressBufSize = 0;
+bits64 uncompressBufSizePos;
struct bbiSummary *reduceSummaries[10];
bits32 reductionAmounts[10];
bits64 reductionDataOffsetPos[10];
bits64 reductionDataOffsets[10];
@@ -884,9 +986,11 @@
writeOne(f, reserved16); /* definedFieldCount */
writeOne(f, reserved64); /* autoSqlOffset. */
totalSummaryOffsetPos = ftell(f);
writeOne(f, totalSummaryOffset);
-for (i=0; i<3; ++i)
+uncompressBufSizePos = ftell(f);
+writeOne(f, uncompressBufSize);
+for (i=0; i<2; ++i)
writeOne(f, reserved32);
/* Write summary headers */
for (i=0; i<summaryCount; ++i)
@@ -916,9 +1020,13 @@
dataOffset = ftell(f);
writeOne(f, sectionCount);
struct bwgSection *section;
for (section = sectionList; section != NULL; section = section->next)
- bwgSectionWrite(section, f);
+ {
+ bits32 uncSizeOne = bwgSectionWrite(section, doCompress, f);
+ if (uncSizeOne > uncompressBufSize)
+ uncompressBufSize = uncSizeOne;
+ }
/* Write out index - creating a temporary array rather than list representation of
* sections in the process. */
indexOffset = ftell(f);
@@ -935,9 +1043,9 @@
verbose(2, "bwgCreate writing %d summaries\n", summaryCount);
for (i=0; i<summaryCount; ++i)
{
reductionDataOffsets[i] = ftell(f);
- reductionIndexOffsets[i] = bbiWriteSummaryAndIndex(reduceSummaries[i], blockSize, itemsPerSlot, f);
+ reductionIndexOffsets[i] = bbiWriteSummaryAndIndex(reduceSummaries[i], blockSize, itemsPerSlot, doCompress, f);
verbose(3, "wrote %d of data, %d of index on level %d\n", (int)(reductionIndexOffsets[i] - reductionDataOffsets[i]), (int)(ftell(f) - reductionIndexOffsets[i]), i);
}
/* Calculate summary */
@@ -973,8 +1081,17 @@
writeOne(f, chromTreeOffset);
fseek(f, totalSummaryOffsetPos, SEEK_SET);
writeOne(f, totalSummaryOffset);
+if (doCompress)
+ {
+ int maxZoomUncompSize = itemsPerSlot * sizeof(struct bbiSummaryOnDisk);
+ if (maxZoomUncompSize > uncompressBufSize)
+ uncompressBufSize = maxZoomUncompSize;
+ fseek(f, uncompressBufSizePos, SEEK_SET);
+ writeOne(f, uncompressBufSize);
+ }
+
/* Also fill in offsets in zoom headers. */
for (i=0; i<summaryCount; ++i)
{
fseek(f, reductionDataOffsetPos[i], SEEK_SET);
@@ -1048,8 +1165,9 @@
char *chromSizes, /* Two column tab-separated file: <chromosome> <size>. */
int blockSize, /* Number of items to bundle in r-tree. 1024 is good. */
int itemsPerSlot, /* Number of items in lowest level of tree. 512 is good. */
boolean clipDontDie, /* If TRUE then clip items off end of chrom rather than dying. */
+ boolean compress, /* If TRUE then compress data. */
char *outName)
/* Convert ascii format wig file (in fixedStep, variableStep or bedGraph format)
* to binary big wig format. */
{
@@ -1063,8 +1181,8 @@
struct lm *lm = lmInit(0);
struct bwgSection *sectionList = bwgParseWig(inName, clipDontDie, chromSizeHash, itemsPerSlot, lm);
if (sectionList == NULL)
errAbort("%s is empty of data", inName);
-bwgCreate(sectionList, chromSizeHash, blockSize, itemsPerSlot, outName);
+bwgCreate(sectionList, chromSizeHash, blockSize, itemsPerSlot, compress, outName);
lmCleanup(&lm);
}