src/utils/bedGraphToBigWig/bedGraphToBigWig.c 1.20

1.20 2009/11/13 19:02:38 kent
Adding compression to bigBed. Improving bigWigInfo and bigBedInfo a little.
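For bedGraphToBigWig the change adds a -compress option: each data section is first serialized into an in-memory dyString, optionally deflated through the zlibFace wrappers (zCompBufSize/zCompress), and the largest uncompressed section size is recorded in a new uncompressBufSize field of the fixed header so a reader knows how large a decode buffer it needs. Compressing section by section, rather than the file as a whole, keeps each block independently reachable through the r-tree index. The sketch below shows the shape of that per-section scheme using plain zlib calls (compressBound/compress2) in place of the kent wrappers; writeSection is a hypothetical helper, not a function from this commit, though doCompress and maxSectionSize mirror names in the patch.

/* Illustrative sketch only: per-section compression as done in this patch,
 * written against plain zlib instead of the kent zlibFace wrappers.
 * writeSection is hypothetical and not part of the commit. */
#include <stdio.h>
#include <stdlib.h>
#include <zlib.h>

void writeSection(FILE *f, char *buf, size_t size, int doCompress,
	size_t *maxSectionSize)
/* Write one serialized section to f, deflating it when doCompress is set.
 * Track the largest uncompressed section so the file header can later
 * record the decode-buffer size a reader needs (uncompressBufSize). */
{
if (size > *maxSectionSize)
    *maxSectionSize = size;
if (doCompress)
    {
    uLongf compSize = compressBound(size);	/* worst-case compressed size */
    Bytef *compBuf = malloc(compSize);
    if (compress2(compBuf, &compSize, (Bytef *)buf, size, Z_DEFAULT_COMPRESSION) != Z_OK)
	{
	fprintf(stderr, "zlib compression failed\n");
	exit(1);
	}
    fwrite(compBuf, compSize, 1, f);	/* compSize now holds the actual size */
    free(compBuf);
    }
else
    fwrite(buf, size, 1, f);
}
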
Index: src/utils/bedGraphToBigWig/bedGraphToBigWig.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/utils/bedGraphToBigWig/bedGraphToBigWig.c,v
retrieving revision 1.19
retrieving revision 1.20
diff -b -B -U 4 -r1.19 -r1.20
--- src/utils/bedGraphToBigWig/bedGraphToBigWig.c	12 Nov 2009 23:15:52 -0000	1.19
+++ src/utils/bedGraphToBigWig/bedGraphToBigWig.c	13 Nov 2009 19:02:38 -0000	1.20
@@ -4,19 +4,22 @@
 #include "localmem.h"
 #include "hash.h"
 #include "options.h"
 #include "sqlNum.h"
+#include "dystring.h"
 #include "cirTree.h"
 #include "sig.h"
+#include "zlibFace.h"
 #include "bPlusTree.h"
 #include "bbiFile.h"
 #include "bwgInternal.h"
 #include "bigWig.h"
 
 static char const rcsid[] = "$Id$";
 
-int blockSize = 256;
-int itemsPerSlot = 1024;
+static int blockSize = 256;
+static int itemsPerSlot = 1024;
+static boolean doCompress = FALSE;
 
 
 void usage()
 /* Explain usage and exit. */
@@ -31,15 +34,17 @@
   "and out.bw is the output indexed big wig file.\n"
   "options:\n"
   "   -blockSize=N - Number of items to bundle in r-tree.  Default %d\n"
   "   -itemsPerSlot=N - Number of data points bundled at lowest level. Default %d\n"
+  "   -compress - If set use zlib compression."
   , bbiCurrentVersion, blockSize, itemsPerSlot
   );
 }
 
 static struct optionSpec options[] = {
    {"blockSize", OPTION_INT},
    {"itemsPerSlot", OPTION_INT},
+   {"compress", OPTION_BOOLEAN},
    {NULL, 0},
 };
 
 struct sectionItem
@@ -50,12 +55,14 @@
     };
 
 void writeSections(struct bbiChromUsage *usageList, struct lineFile *lf, 
 	int itemsPerSlot, struct bbiBoundsArray *bounds, int sectionCount, FILE *f,
-	int resTryCount, int resScales[], int resSizes[])
+	int resTryCount, int resScales[], int resSizes[], 
+	boolean doCompress, bits32 *retMaxSectionSize)
 /* Read through lf, chunking it into sections that get written to f.  Save info
  * about sections in bounds. */
 {
+int maxSectionSize = 0;
 struct bbiChromUsage *usage = usageList;
 int itemIx = 0, sectionIx = 0;
 bits32 reserved32 = 0;
 UBYTE reserved8 = 0;
@@ -64,8 +71,10 @@
 bits32 resEnds[resTryCount];
 int resTry;
 for (resTry = 0; resTry < resTryCount; ++resTry)
     resEnds[resTry] = 0;
+struct dyString *stream = dyStringNew(0);
+
 for (;;)
     {
     /* Get next line of input if any. */
     char *row[5];
@@ -89,29 +98,44 @@
 	section->range.chromIx = chromId;
 	section->range.start = sectionStart;
 	section->range.end = sectionEnd;
 
-	/* Output section header to file. */
+	/* Output section header to stream. */
+	dyStringClear(stream);
 	UBYTE type = bwgTypeBedGraph;
 	bits16 itemCount = itemIx;
-	writeOne(f, chromId);			// chromId
-	writeOne(f, sectionStart);		// start
-	writeOne(f, sectionEnd);	// end
-	writeOne(f, reserved32);		// itemStep
-	writeOne(f, reserved32);		// itemSpan
-	writeOne(f, type);			// type
-	writeOne(f, reserved8);			// reserved
-	writeOne(f, itemCount);			// itemCount
+	dyStringWriteOne(stream, chromId);			// chromId
+	dyStringWriteOne(stream, sectionStart);		// start
+	dyStringWriteOne(stream, sectionEnd);	// end
+	dyStringWriteOne(stream, reserved32);		// itemStep
+	dyStringWriteOne(stream, reserved32);		// itemSpan
+	dyStringWriteOne(stream, type);			// type
+	dyStringWriteOne(stream, reserved8);			// reserved
+	dyStringWriteOne(stream, itemCount);			// itemCount
 
-	/* Output each item in section to file. */
+	/* Output each item in section to stream. */
 	int i;
 	for (i=0; i<itemIx; ++i)
 	    {
 	    struct sectionItem *item = &items[i];
-	    writeOne(f, item->start);
-	    writeOne(f, item->end);
-	    writeOne(f, item->val);
+	    dyStringWriteOne(stream, item->start);
+	    dyStringWriteOne(stream, item->end);
+	    dyStringWriteOne(stream, item->val);
+	    }
+
+	/* Save stream to file, compressing if need be. */
+	if (stream->stringSize > maxSectionSize)
+	    maxSectionSize = stream->stringSize;
+	if (doCompress)
+	    {
+	    size_t maxCompSize = zCompBufSize(stream->stringSize);
+	    char compBuf[maxCompSize];
+	    int compSize = zCompress(stream->string, stream->stringSize, compBuf, maxCompSize);
+	    mustWrite(f, compBuf, compSize);
 	    }
+	else
+	    mustWrite(f, stream->string, stream->stringSize);
+
 
 	/* If at end of input we are done. */
 	if (rowSize == 0)
 	    break;
@@ -172,13 +196,15 @@
     lastB = b;
     itemIx += 1;
     }
 assert(sectionIx == sectionCount);
+
+*retMaxSectionSize = maxSectionSize;
 }
 
 static struct bbiSummary *writeReducedOnceReturnReducedTwice(struct bbiChromUsage *usageList, 
 	struct lineFile *lf, bits32 initialReduction, bits32 initialReductionCount, 
-	int zoomIncrement, int blockSize, int itemsPerSlot, 
+	int zoomIncrement, int blockSize, int itemsPerSlot, boolean doCompress,
 	struct lm *lm, FILE *f, bits64 *retDataStart, bits64 *retIndexStart,
 	struct bbiSummaryElement *totalSum)
 /* Write out data reduced by factor of initialReduction.  Also calculate and keep in memory
  * next reduction level.  This is more work than some ways, but it keeps us from having to
@@ -194,8 +220,10 @@
 
 *retDataStart = ftell(f);
 writeOne(f, initialReductionCount);
 boolean firstRow = TRUE;
+
+struct bbiSumOutStream *stream = bbiSumOutStreamOpen(itemsPerSlot, f, doCompress);
 for (;;)
     {
     /* Get next line of input if any. */
     char *row[5];
@@ -204,9 +232,9 @@
     /* Output last section and break if at end of file. */
     if (rowSize == 0 && sum != NULL)
 	{
 	bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize, 
-		&boundsPt, boundsEnd, usage->size, lm, f);
+		&boundsPt, boundsEnd, usage->size, lm, stream);
 	break;
 	}
 
     /* Parse out row. */
@@ -238,17 +266,17 @@
     if (differentString(chrom, usage->name))
         {
 	usage = usage->next;
 	bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize,
-		&boundsPt, boundsEnd, usage->size, lm, f);
+		&boundsPt, boundsEnd, usage->size, lm, stream);
 	sum = NULL;
 	}
 
     /* If start past existing block then output it. */
     else if (sum != NULL && sum->end <= start)
 	{
 	bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize, 
-		&boundsPt, boundsEnd, usage->size, lm, f);
+		&boundsPt, boundsEnd, usage->size, lm, stream);
 	sum = NULL;
 	}
 
     /* If don't have a summary we're working on now, make one. */
@@ -276,9 +304,9 @@
 	if (sum->maxVal < val) sum->maxVal = val;
 	sum->sumData += val * overlap;
 	sum->sumSquares += val*val * overlap;
 	bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize, 
-		&boundsPt, boundsEnd, usage->size, lm, f);
+		&boundsPt, boundsEnd, usage->size, lm, stream);
 	size -= overlap;
 
 	/* Move summary to next part. */
 	sum->start = start = sum->end;
@@ -295,8 +323,9 @@
     if (sum->maxVal < val) sum->maxVal = val;
     sum->sumData += val * size;
     sum->sumSquares += val*val * size;
     }
+bbiSumOutStreamClose(&stream);
 
 /* Write out 1st zoom index. */
 int indexOffset = *retIndexStart = ftell(f);
 assert(boundsPt == boundsEnd);
@@ -318,8 +347,9 @@
 verbose(2, "%d chroms in %s\n", chromSizesHash->elCount, chromSizes);
 int minDiff = 0, i;
 double aveSize = 0;
 bits64 bedCount = 0;
+bits32 uncompressBufSize = 0;
 struct bbiChromUsage *usageList = bbiChromUsageFromBedFile(lf, chromSizesHash, &minDiff, &aveSize, &bedCount);
 verboseTime(2, "pass1");
 verbose(2, "%d chroms in %s\n", slCount(usageList), inName);
 
@@ -361,10 +391,11 @@
 writeOne(f, sectionCount);
 struct bbiBoundsArray *boundsArray;
 AllocArray(boundsArray, sectionCount);
 lineFileRewind(lf);
+bits32 maxSectionSize = 0;
 writeSections(usageList, lf, itemsPerSlot, boundsArray, sectionCount, f,
-	resTryCount, resScales, resSizes);
+	resTryCount, resScales, resSizes, doCompress, &maxSectionSize);
 verboseTime(2, "pass2");
 
 /* Write out primary data index. */
 bits64 indexOffset = ftell(f);
@@ -389,8 +420,10 @@
     /* Figure out initialReduction for zoom. */
     for (resTry = 0; resTry < resTryCount; ++resTry)
 	{
 	bits64 reducedSize = resSizes[resTry] * sizeof(struct bbiSummaryOnDisk);
+	if (doCompress)
+	    reducedSize /= 2;	// Estimate!
 	if (reducedSize <= maxReducedSize)
 	    {
 	    initialReduction = resScales[resTry];
 	    initialReducedCount = resSizes[resTry];
@@ -406,9 +439,9 @@
 	int zoomIncrement = 4;
 	lineFileRewind(lf);
 	struct bbiSummary *rezoomedList = writeReducedOnceReturnReducedTwice(usageList, 
 		lf, initialReduction, initialReducedCount,
-		resIncrement, blockSize, itemsPerSlot, lm, 
+		resIncrement, blockSize, itemsPerSlot, doCompress, lm, 
 		f, &zoomDataOffsets[0], &zoomIndexOffsets[0], &totalSum);
 	verboseTime(2, "writeReducedOnceReturnReducedTwice");
 	zoomAmounts[0] = initialReduction;
 	zoomLevels = 1;
@@ -422,9 +455,9 @@
 	        break;
 	    zoomCount = rezoomCount;
 	    zoomDataOffsets[zoomLevels] = ftell(f);
 	    zoomIndexOffsets[zoomLevels] = bbiWriteSummaryAndIndex(rezoomedList, 
-	    	blockSize, itemsPerSlot, FALSE, f);
+	    	blockSize, itemsPerSlot, doCompress, f);
 	    zoomAmounts[zoomLevels] = reduction;
 	    ++zoomLevels;
 	    reduction *= zoomIncrement;
 	    rezoomedList = bbiSummarySimpleReduce(rezoomedList, reduction, lm);
@@ -434,14 +467,21 @@
 	}
 
     }
 
+/* Figure out buffer size needed for uncompression if need be. */
+if (doCompress)
+    {
+    int maxZoomUncompSize = itemsPerSlot * sizeof(struct bbiSummaryOnDisk);
+    uncompressBufSize = max(maxSectionSize, maxZoomUncompSize);
+    }
+
 /* Go back and rewrite header. */
 rewind(f);
 bits32 sig = bigWigSig;
 bits16 version = bbiCurrentVersion;
 bits16 summaryCount = zoomLevels;
-bits32 reserved16 = 0;
+bits16 reserved16 = 0;
 bits32 reserved32 = 0;
 bits64 reserved64 = 0;
 
 /* Write fixed header */
@@ -454,10 +494,12 @@
 writeOne(f, reserved16);	// fieldCount
 writeOne(f, reserved16);	// definedFieldCount
 writeOne(f, reserved64);	// autoSqlOffset
 writeOne(f, totalSummaryOffset);
-for (i=0; i<3; ++i)
+writeOne(f, uncompressBufSize);
+for (i=0; i<2; ++i)
     writeOne(f, reserved32);
+assert(ftell(f) == 64);
 
 /* Write summary headers with data. */
 verbose(2, "Writing %d levels of zoom\n", zoomLevels);
 for (i=0; i<zoomLevels; ++i)
@@ -490,8 +532,9 @@
 {
 optionInit(&argc, argv, options);
 blockSize = optionInt("blockSize", blockSize);
 itemsPerSlot = optionInt("itemsPerSlot", itemsPerSlot);
+doCompress = optionExists("compress");
 if (argc != 4)
     usage();
 bedGraphToBigWig(argv[1], argv[2], argv[3]);
 return 0;
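
On the reading side (not part of this patch), the uncompressBufSize written into the header lets a reader allocate one scratch buffer up front and inflate any section into it. A minimal sketch with zlib's uncompress(); readSection is hypothetical, and the real library goes through its own zlibFace and file-access layers.

/* Sketch of the matching read path, assuming plain stdio + zlib. */
#include <stdio.h>
#include <stdlib.h>
#include <zlib.h>

size_t readSection(FILE *f, size_t compSize, char *uncompressBuf,
	size_t uncompressBufSize)
/* Read one compressed section of compSize bytes from f and inflate it into
 * uncompressBuf (sized from the header's uncompressBufSize field).
 * Returns the number of uncompressed bytes produced. */
{
Bytef *compBuf = malloc(compSize);
if (fread(compBuf, compSize, 1, f) != 1)
    {
    fprintf(stderr, "short read on compressed section\n");
    exit(1);
    }
uLongf uncSize = uncompressBufSize;	/* in: buffer size, out: bytes produced */
if (uncompress((Bytef *)uncompressBuf, &uncSize, compBuf, compSize) != Z_OK)
    {
    fprintf(stderr, "zlib uncompress failed\n");
    exit(1);
    }
free(compBuf);
return uncSize;
}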