src/lib/bwgCreate.c 1.20

1.20 2009/11/12 23:15:52 kent
First cut of compressed bigWig/bigBed stuff. So far read side should be complete including Genome Browser, Table Browser, wigToBigWig and bigWig utility functions. Still to do bedGraphToBigWig and bedToBigBed.
Index: src/lib/bwgCreate.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/lib/bwgCreate.c,v
retrieving revision 1.19
retrieving revision 1.20
diff -b -B -U 4 -r1.19 -r1.20
--- src/lib/bwgCreate.c	7 Nov 2009 19:27:02 -0000	1.19
+++ src/lib/bwgCreate.c	12 Nov 2009 23:15:52 -0000	1.20
@@ -7,8 +7,9 @@
 #include "localmem.h"
 #include "errabort.h"
 #include "sqlNum.h"
 #include "sig.h"
+#include "zlibFace.h"
 #include "bPlusTree.h"
 #include "cirTree.h"
 #include "bbiFile.h"
 #include "bwgInternal.h"
@@ -105,10 +106,10 @@
      sum->chromId, sum->start, sum->end, sum->minVal, sum->maxVal, sum->sumData,
      sum->sumSquares, sum->validCount, sum->sumData/sum->validCount);
 }
 
-static void bwgSectionWrite(struct bwgSection *section, FILE *f)
-/* Write out section to file, filling in section->fileOffset. */
+static int bwgSectionWriteUnc(struct bwgSection *section, FILE *f)
+/* Write out compressed section to file, filling in section->fileOffset. */
 {
 UBYTE type = section->type;
 UBYTE reserved8 = 0;
 
@@ -160,8 +161,107 @@
     default:
         internalErr();
 	break;
     }
+return 0;
+}
+
+static int bwgSectionWriteComp(struct bwgSection *section, FILE *f)
+/* Write out compressed section to file, filling in section->fileOffset. 
+ * Returns uncompressed size. */
+{
+UBYTE type = section->type;
+UBYTE reserved8 = 0;
+int itemSize;
+switch (section->type)
+    {
+    case bwgTypeBedGraph:
+        itemSize = 12;
+	break;
+    case bwgTypeVariableStep:
+        itemSize = 8;
+	break;
+    case bwgTypeFixedStep:
+        itemSize = 4;
+	break;
+    default:
+        itemSize = 0;  // Suppress compiler warning
+	internalErr();
+	break;
+    }
+int fixedSize = sizeof(section->chromId) + sizeof(section->start) + sizeof(section->end) + 
+     sizeof(section->itemStep) + sizeof(section->itemSpan) + sizeof(type) + sizeof(reserved8) +
+     sizeof(section->itemCount);
+int bufSize = section->itemCount * itemSize + fixedSize;
+char buf[bufSize];
+char *bufPt = buf;
+
+section->fileOffset = ftell(f);
+memWriteOne(&bufPt, section->chromId);
+memWriteOne(&bufPt, section->start);
+memWriteOne(&bufPt, section->end);
+memWriteOne(&bufPt, section->itemStep);
+memWriteOne(&bufPt, section->itemSpan);
+memWriteOne(&bufPt, type);
+memWriteOne(&bufPt, reserved8);
+memWriteOne(&bufPt, section->itemCount);
+
+int i;
+switch (section->type)
+    {
+    case bwgTypeBedGraph:
+	{
+	struct bwgBedGraphItem *item = section->items.bedGraphList;
+	for (item = section->items.bedGraphList; item != NULL; item = item->next)
+	    {
+	    memWriteOne(&bufPt, item->start);
+	    memWriteOne(&bufPt, item->end);
+	    memWriteOne(&bufPt, item->val);
+	    }
+	break;
+	}
+    case bwgTypeVariableStep:
+	{
+	struct bwgVariableStepPacked *items = section->items.variableStepPacked;
+	for (i=0; i<section->itemCount; ++i)
+	    {
+	    memWriteOne(&bufPt, items->start);
+	    memWriteOne(&bufPt, items->val);
+	    items += 1;
+	    }
+	break;
+	}
+    case bwgTypeFixedStep:
+	{
+	struct bwgFixedStepPacked *items = section->items.fixedStepPacked;
+	for (i=0; i<section->itemCount; ++i)
+	    {
+	    memWriteOne(&bufPt, items->val);
+	    items += 1;
+	    }
+	break;
+	}
+    default:
+        internalErr();
+	break;
+    }
+assert(bufSize == (bufPt - buf) );
+
+size_t maxCompSize = zCompBufSize(bufSize);
+char compBuf[maxCompSize];
+int compSize = zCompress(buf, bufSize, compBuf, maxCompSize);
+mustWrite(f, compBuf, compSize);
+return bufSize;
+}
+
+
+static int bwgSectionWrite(struct bwgSection *section, boolean doCompress, FILE *f)
+/* Write out section to file, filling in section->fileOffset. */
+{
+if (doCompress)
+    return bwgSectionWriteComp(section, f);
+else
+    return bwgSectionWriteUnc(section, f);
 }
 
 
 static int bwgSectionCmp(const void *va, const void *vb)
@@ -795,9 +895,9 @@
 return outList;
 }
 
 static void bwgCreate(struct bwgSection *sectionList, struct hash *chromSizeHash, 
-	int blockSize, int itemsPerSlot, char *fileName)
+	int blockSize, int itemsPerSlot, boolean doCompress, char *fileName)
 /* Create a bigWig file out of a sorted sectionList. */
 {
 bits64 sectionCount = slCount(sectionList);
 FILE *f = mustOpen(fileName, "wb");
@@ -810,8 +910,10 @@
 bits64 dataOffset = 0, dataOffsetPos;
 bits64 indexOffset = 0, indexOffsetPos;
 bits64 chromTreeOffset = 0, chromTreeOffsetPos;
 bits64 totalSummaryOffset = 0, totalSummaryOffsetPos;
+bits32 uncompressBufSize = 0;
+bits64 uncompressBufSizePos;
 struct bbiSummary *reduceSummaries[10];
 bits32 reductionAmounts[10];
 bits64 reductionDataOffsetPos[10];
 bits64 reductionDataOffsets[10];
@@ -884,9 +986,11 @@
 writeOne(f, reserved16);  /* definedFieldCount */
 writeOne(f, reserved64);  /* autoSqlOffset. */
 totalSummaryOffsetPos = ftell(f);
 writeOne(f, totalSummaryOffset);
-for (i=0; i<3; ++i)
+uncompressBufSizePos = ftell(f);
+writeOne(f, uncompressBufSize);
+for (i=0; i<2; ++i)
     writeOne(f, reserved32);
 
 /* Write summary headers */
 for (i=0; i<summaryCount; ++i)
@@ -916,9 +1020,13 @@
 dataOffset = ftell(f);
 writeOne(f, sectionCount);
 struct bwgSection *section;
 for (section = sectionList; section != NULL; section = section->next)
-    bwgSectionWrite(section, f);
+    {
+    bits32 uncSizeOne = bwgSectionWrite(section, doCompress, f);
+    if (uncSizeOne > uncompressBufSize)
+         uncompressBufSize = uncSizeOne;
+    }
 
 /* Write out index - creating a temporary array rather than list representation of
  * sections in the process. */
 indexOffset = ftell(f);
@@ -935,9 +1043,9 @@
 verbose(2, "bwgCreate writing %d summaries\n", summaryCount);
 for (i=0; i<summaryCount; ++i)
     {
     reductionDataOffsets[i] = ftell(f);
-    reductionIndexOffsets[i] = bbiWriteSummaryAndIndex(reduceSummaries[i], blockSize, itemsPerSlot, f);
+    reductionIndexOffsets[i] = bbiWriteSummaryAndIndex(reduceSummaries[i], blockSize, itemsPerSlot, doCompress, f);
     verbose(3, "wrote %d of data, %d of index on level %d\n", (int)(reductionIndexOffsets[i] - reductionDataOffsets[i]), (int)(ftell(f) - reductionIndexOffsets[i]), i);
     }
 
 /* Calculate summary */
@@ -973,8 +1081,17 @@
 writeOne(f, chromTreeOffset);
 fseek(f, totalSummaryOffsetPos, SEEK_SET);
 writeOne(f, totalSummaryOffset);
 
+if (doCompress)
+    {
+    int maxZoomUncompSize = itemsPerSlot * sizeof(struct bbiSummaryOnDisk);
+    if (maxZoomUncompSize > uncompressBufSize)
+	uncompressBufSize = maxZoomUncompSize;
+    fseek(f, uncompressBufSizePos, SEEK_SET);
+    writeOne(f, uncompressBufSize);
+    }
+
 /* Also fill in offsets in zoom headers. */
 for (i=0; i<summaryCount; ++i)
     {
     fseek(f, reductionDataOffsetPos[i], SEEK_SET);
@@ -1048,8 +1165,9 @@
 	char *chromSizes, 	/* Two column tab-separated file: <chromosome> <size>. */
 	int blockSize,		/* Number of items to bundle in r-tree.  1024 is good. */
 	int itemsPerSlot,	/* Number of items in lowest level of tree.  512 is good. */
 	boolean clipDontDie,	/* If TRUE then clip items off end of chrom rather than dying. */
+	boolean compress,	/* If TRUE then compress data. */
 	char *outName)
 /* Convert ascii format wig file (in fixedStep, variableStep or bedGraph format) 
  * to binary big wig format. */
 {
@@ -1063,8 +1181,8 @@
 struct lm *lm = lmInit(0);
 struct bwgSection *sectionList = bwgParseWig(inName, clipDontDie, chromSizeHash, itemsPerSlot, lm);
 if (sectionList == NULL)
     errAbort("%s is empty of data", inName);
-bwgCreate(sectionList, chromSizeHash, blockSize, itemsPerSlot, outName);
+bwgCreate(sectionList, chromSizeHash, blockSize, itemsPerSlot, compress, outName);
 lmCleanup(&lm);
 }