src/utils/bedToBigBed/bedToBigBed.c 1.18

1.18 2009/11/13 19:02:38 kent
Adding compression to bigBed. Improving bigWigInfo and bigBedInfo a little.
Index: src/utils/bedToBigBed/bedToBigBed.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/utils/bedToBigBed/bedToBigBed.c,v
retrieving revision 1.17
retrieving revision 1.18
diff -b -B -U 4 -r1.17 -r1.18
--- src/utils/bedToBigBed/bedToBigBed.c	12 Nov 2009 23:15:52 -0000	1.17
+++ src/utils/bedToBigBed/bedToBigBed.c	13 Nov 2009 19:02:38 -0000	1.18
@@ -2,13 +2,15 @@
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
+#include "dystring.h"
 #include "obscure.h"
 #include "asParse.h"
 #include "basicBed.h"
 #include "sig.h"
 #include "rangeTree.h"
+#include "zlibFace.h"
 #include "sqlNum.h"
 #include "bigBed.h"
 
 static char const rcsid[] = "$Id$";
@@ -16,8 +18,9 @@
 int blockSize = 256;
 int itemsPerSlot = 512;
 int bedFields = 0;
 char *as = NULL;
+static boolean doCompress = FALSE;
 
 void usage()
 /* Explain usage and exit. */
 {
@@ -36,8 +39,9 @@
   "   -bedFields=N - Number of fields that fit standard bed definition.  If undefined\n"
   "                  assumes all fields in bed are defined.\n"
   "   -as=fields.as - If have non-standard fields, it's great to put a definition\n"
   "                   of each field in a row in AutoSql format here.\n"
+  "   -compress - If set, use zlib compression.\n"
   , bbiCurrentVersion, blockSize, itemsPerSlot
   );
 }
 
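Note: with the new option a compressed bigBed is built as follows; the file
names here are placeholders:

    bedToBigBed -compress input.bed chrom.sizes output.bb
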
@@ -45,26 +49,30 @@
    {"blockSize", OPTION_INT},
    {"itemsPerSlot", OPTION_INT},
    {"bedFields", OPTION_INT},
    {"as", OPTION_STRING},
+   {"compress", OPTION_BOOLEAN},
    {NULL, 0},
 };
 
 void writeBlocks(struct bbiChromUsage *usageList, struct lineFile *lf, struct asObject *as, 
 	bits16 definedFieldCount, int itemsPerSlot, struct bbiBoundsArray *bounds, 
-	int sectionCount, FILE *f, int resTryCount, int resScales[], int resSizes[],
-	bits16 *retFieldCount, bits16 *retDefinedFieldCount)
+	int sectionCount, boolean doCompress, FILE *f, 
+	int resTryCount, int resScales[], int resSizes[],
+	bits16 *retFieldCount, bits16 *retDefinedFieldCount, bits32 *retMaxBlockSize)
 /* Read through lf, writing it to f, compressed if doCompress is set.  Save starting
  * points of blocks (every itemsPerSlot items) to bounds and max size to retMaxBlockSize. */
 {
+int maxBlockSize = 0;
 struct bbiChromUsage *usage = usageList;
 char *line, **row = NULL;
 int fieldCount = 0, fieldAlloc=0, lastField = 0;
 int itemIx = 0, sectionIx = 0;
 bits64 blockOffset = 0;
 int startPos = 0, endPos = 0;
 bits32 chromId = 0;
 boolean allocedAs = FALSE;
+struct dyString *stream = dyStringNew(0);
 
 /* Will keep track of some things that help us determine how much to reduce. */
 bits32 resEnds[resTryCount];
 int resTry;
@@ -142,8 +150,22 @@
 
     /* Check conditions that would end block and save block info and advance to next if need be. */
     if (atEnd || !sameChrom || itemIx >= itemsPerSlot)
         {
+	/* Save stream to file, compressing if need be. */
+	if (stream->stringSize > maxBlockSize)
+	    maxBlockSize = stream->stringSize;
+	if (doCompress)
+	    {
+	    size_t maxCompSize = zCompBufSize(stream->stringSize);
+	    char compBuf[maxCompSize];
+	    int compSize = zCompress(stream->string, stream->stringSize, compBuf, maxCompSize);
+	    mustWrite(f, compBuf, compSize);
+	    }
+	else
+	    mustWrite(f, stream->string, stream->stringSize);
+	dyStringClear(stream);
+
 	/* Save info on existing block. */
 	struct bbiBoundsArray *b = &bounds[sectionIx];
 	b->offset = blockOffset;
 	b->range.chromIx = chromId;
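
Note: zCompBufSize() and zCompress() come from the newly included zlibFace.h.
A minimal sketch of what such thin wrappers over stock zlib presumably look
like; the bodies below are an assumption for illustration, not the kent
library source:

    #include "common.h"
    #include <zlib.h>

    size_t zCompBufSize(size_t uncompressedSize)
    /* Return worst-case buffer size needed to compress this much data. */
    {
    return compressBound(uncompressedSize);
    }

    size_t zCompress(void *uncompressed, size_t uncompressedSize,
                     void *compBuf, size_t compBufSize)
    /* Compress uncompressed into compBuf, returning the compressed size. */
    {
    uLongf destLen = compBufSize;
    int err = compress2(compBuf, &destLen, uncompressed, uncompressedSize,
                        Z_DEFAULT_COMPRESSION);
    if (err != Z_OK)
        errAbort("zCompress: compress2 failed (%d)", err);
    return destLen;
    }
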
@@ -181,28 +203,26 @@
 	/* No need to update startPos since list is sorted. */
 	}
 
     /* Write out data. */
-    writeOne(f, chromId);
-    writeOne(f, start);
-    writeOne(f, end);
+    dyStringWriteOne(stream, chromId);
+    dyStringWriteOne(stream, start);
+    dyStringWriteOne(stream, end);
     if (fieldCount > 3)
         {
 	int i;
 	/* Write 3rd through next to last field and a tab separator. */
 	for (i=3; i<lastField; ++i)
 	    {
 	    char *s = row[i];
-	    int len = strlen(s);
-	    mustWrite(f, s, len);
-	    fputc('\t', f);
+	    dyStringAppend(stream, s);
+	    dyStringAppendC(stream, '\t');
 	    }
 	/* Write last field and terminal zero */
 	char *s = row[lastField];
-	int len = strlen(s);
-	mustWrite(f, s, len);
+	dyStringAppend(stream, s);
 	}
-    fputc(0, f);
+    dyStringAppendC(stream, 0);
 
     itemIx += 1;
 
     /* Do zoom counting. */
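
Note: switching from writeOne()/fputc() on the file to a dyString means each
block is now staged in memory and written in one shot, which is what lets the
code above compress a whole block at a time. dyStringWriteOne() presumably
just appends the raw bytes of a variable, along the lines of this assumed
macro (dystring.h's actual definition may differ):

    #define dyStringWriteOne(dy, var) \
        dyStringAppendN((dy), (char *)(&(var)), sizeof(var))
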
@@ -224,8 +244,9 @@
 assert(sectionIx == sectionCount);
 freez(&row);
 if (allocedAs)
     asObjectFreeList(&as);
+*retMaxBlockSize = maxBlockSize;
 }
 
 struct rbTree *rangeTreeForBedChrom(struct lineFile *lf, char *chrom)
 /* Read lines from bed file as long as they match chrom.  Return a rangeTree that
@@ -250,9 +271,9 @@
 }
 
 static struct bbiSummary *writeReducedOnceReturnReducedTwice(struct bbiChromUsage *usageList, 
 	int fieldCount, struct lineFile *lf, bits32 initialReduction, bits32 initialReductionCount, 
-	int zoomIncrement, int blockSize, int itemsPerSlot, 
+	int zoomIncrement, int blockSize, int itemsPerSlot, boolean doCompress,
 	struct lm *lm, FILE *f, bits64 *retDataStart, bits64 *retIndexStart,
 	struct bbiSummaryElement *totalSum)
 /* Write out data reduced by factor of initialReduction.  Also calculate and keep in memory
  * next reduction level.  This is more work than some ways, but it keeps us from having to
@@ -274,8 +295,9 @@
  *   2) Stream through the range tree, outputting the initial summary level and
  *      further reducing. 
  */
 boolean firstTime = TRUE;
+struct bbiSumOutStream *stream = bbiSumOutStreamOpen(itemsPerSlot, f, doCompress);
 for (usage = usageList; usage != NULL; usage = usage->next)
     {
     struct bbiSummary oneSummary, *sum = NULL;
     struct rbTree *rangeTree = rangeTreeForBedChrom(lf, usage->name);
@@ -309,9 +331,9 @@
 	/* If start past existing block then output it. */
 	if (sum != NULL && sum->end <= start)
 	    {
 	    bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize, 
-		&boundsPt, boundsEnd, usage->size, lm, f);
+		&boundsPt, boundsEnd, usage->size, lm, stream);
 	    sum = NULL;
 	    }
 	/* If don't have a summary we're working on now, make one. */
 	if (sum == NULL)
@@ -338,9 +360,9 @@
 	    if (sum->maxVal < val) sum->maxVal = val;
 	    sum->sumData += val * overlap;
 	    sum->sumSquares += val*val * overlap;
 	    bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize, 
-		    &boundsPt, boundsEnd, usage->size, lm, f);
+		    &boundsPt, boundsEnd, usage->size, lm, stream);
 	    size -= overlap;
 
 	    /* Move summary to next part. */
 	    sum->start = start = sum->end;
@@ -360,12 +382,13 @@
 	}
     if (sum != NULL)
 	{
 	bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize, 
-	    &boundsPt, boundsEnd, usage->size, lm, f);
+	    &boundsPt, boundsEnd, usage->size, lm, stream);
 	}
     rangeTreeFree(&rangeTree);
     }
+bbiSumOutStreamClose(&stream);
 
 /* Write out 1st zoom index. */
 int indexOffset = *retIndexStart = ftell(f);
 assert(boundsPt == boundsEnd);
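
Note: bbiSumOutStreamOpen/Close and the stream threaded through
bbiOutputOneSummaryFurtherReduce() give the zoom summaries the same
buffer-then-compress treatment as the primary data blocks. A sketch of the
flush step under that assumption; the struct layout and body here are
guesses, only the Open/Close and zlibFace names come from the library:

    struct bbiSumOutStream
    /* Buffers summary records so a whole slot can be compressed at once. */
        {
        struct bbiSummaryOnDisk *array;   /* Pending records. */
        int allocCount;                   /* Capacity, itemsPerSlot. */
        int elCount;                      /* Records currently buffered. */
        FILE *f;                          /* Destination file. */
        boolean doCompress;               /* Compress each flushed slot? */
        };

    static void bbiSumOutStreamFlush(struct bbiSumOutStream *stream)
    /* Write out buffered records, compressing if requested. */
    {
    if (stream->elCount > 0)
        {
        int size = stream->elCount * sizeof(stream->array[0]);
        if (stream->doCompress)
            {
            size_t maxCompSize = zCompBufSize(size);
            char compBuf[maxCompSize];
            int compSize = zCompress(stream->array, size, compBuf, maxCompSize);
            mustWrite(stream->f, compBuf, compSize);
            }
        else
            mustWrite(stream->f, stream->array, size);
        stream->elCount = 0;
        }
    }
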
@@ -386,8 +409,9 @@
 	int itemsPerSlot, /* Number of items in lowest level of tree.  64 is good. */
 	bits16 definedFieldCount,  /* Number of defined bed fields - 3-16 or so.  0 means all fields
 				    * are the defined bed ones. */
 	char *asFileName, /* If non-null points to a .as file that describes fields. */
+	boolean doCompress, /* If TRUE then compress data. */
 	char *outName)    /* BigBed output file name. */
 /* Convert tab-separated bed file to binary indexed, zoomed bigBed version. */
 {
 /* Set up timing measures. */
@@ -411,8 +435,9 @@
 /* Do first pass, mostly just scanning file and counting hits per chromosome. */
 int minDiff = 0;
 double aveSpan = 0;
 bits64 bedCount = 0;
+bits32 uncompressBufSize = 0;
 struct bbiChromUsage *usageList = bbiChromUsageFromBedFile(lf, chromSizesHash, &minDiff, &aveSpan, &bedCount);
 verboseTime(1, "pass1 - making usageList (%d chroms)", slCount(usageList));
 verbose(2, "%d chroms in %s. Average span of beds %f\n", slCount(usageList), inName, aveSpan);
 
@@ -466,10 +491,11 @@
 struct bbiBoundsArray *boundsArray;
 AllocArray(boundsArray, blockCount);
 lineFileRewind(lf);
 bits16 fieldCount=0;
-writeBlocks(usageList, lf, as, definedFieldCount, itemsPerSlot, boundsArray, blockCount, f,
-	resTryCount, resScales, resSizes, &fieldCount, &definedFieldCount);
+bits32 maxBlockSize = 0;
+writeBlocks(usageList, lf, as, definedFieldCount, itemsPerSlot, boundsArray, blockCount, doCompress,
+	f, resTryCount, resScales, resSizes, &fieldCount, &definedFieldCount, &maxBlockSize);
 verboseTime(1, "pass2 - checking and writing primary data (%lld records, %d fields)", 
 	(long long)bedCount, fieldCount);
 
 /* Write out primary data index. */
@@ -496,8 +522,10 @@
     /* Figure out initialReduction for zoom. */
     for (resTry = 0; resTry < resTryCount; ++resTry)
 	{
 	bits64 reducedSize = resSizes[resTry] * sizeof(struct bbiSummaryOnDisk);
+	if (doCompress)
+	    reducedSize /= 2;	// Estimate!
 	if (reducedSize <= maxReducedSize)
 	    {
 	    initialReduction = resScales[resTry];
 	    initialReducedCount = resSizes[resTry];
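
Note: the divide-by-two is only a heuristic for zlib's likely ratio on the
fixed-width summary records (32 bytes apiece, assuming the usual four bits32
counts plus four floats in struct bbiSummaryOnDisk). It merely biases which
initial zoom level is chosen, so a wrong guess costs file size, not
correctness.
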
@@ -514,9 +542,9 @@
 	int zoomIncrement = 4;
 	lineFileRewind(lf);
 	struct bbiSummary *rezoomedList = writeReducedOnceReturnReducedTwice(usageList, 
 		fieldCount, lf, initialReduction, initialReducedCount,
-		resIncrement, blockSize, itemsPerSlot, lm, 
+		resIncrement, blockSize, itemsPerSlot, doCompress, lm, 
 		f, &zoomDataOffsets[0], &zoomIndexOffsets[0], &totalSum);
 	verboseTime(1, "pass3 - writeReducedOnceReturnReducedTwice");
 	zoomAmounts[0] = initialReduction;
 	zoomLevels = 1;
@@ -530,9 +558,9 @@
 	        break;
 	    zoomCount = rezoomCount;
 	    zoomDataOffsets[zoomLevels] = ftell(f);
 	    zoomIndexOffsets[zoomLevels] = bbiWriteSummaryAndIndex(rezoomedList, 
-	    	blockSize, itemsPerSlot, FALSE, f);
+	    	blockSize, itemsPerSlot, doCompress, f);
 	    zoomAmounts[zoomLevels] = reduction;
 	    ++zoomLevels;
 	    reduction *= zoomIncrement;
 	    rezoomedList = bbiSummarySimpleReduce(rezoomedList, reduction, lm);
@@ -540,8 +568,16 @@
 	lmCleanup(&lm);
 	verboseTime(1, "further reductions");
 	}
     }
+
+/* Figure out buffer size needed for uncompression if need be. */
+if (doCompress)
+    {
+    int maxZoomUncompSize = itemsPerSlot * sizeof(struct bbiSummaryOnDisk);
+    uncompressBufSize = max(maxBlockSize, maxZoomUncompSize);
+    }
+
 /* Go back and rewrite header. */
 rewind(f);
 bits32 sig = bigBedSig;
 bits16 version = bbiCurrentVersion;
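
Note: maxBlockSize from pass2 and the largest possible zoom slot together
bound the scratch space any reader will need. A hypothetical reader-side
sketch of how the uncompressBufSize value computed above (and written into
the header just below) gets used; zUncompress is zlibFace's counterpart to
zCompress, the other names are illustrative:

    /* On opening the file, allocate one scratch buffer big enough for
     * any block (a zero value means the data are uncompressed). */
    char *uncompBuf = NULL;
    if (uncompressBufSize > 0)
        uncompBuf = needLargeMem(uncompressBufSize);

    /* Then each data block is inflated into it before parsing. */
    int uncSize = zUncompress(compressedBlock, compressedSize,
                              uncompBuf, uncompressBufSize);
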
@@ -559,10 +595,11 @@
 writeOne(f, fieldCount);
 writeOne(f, definedFieldCount);
 writeOne(f, asOffset);
 writeOne(f, totalSummaryOffset);
+writeOne(f, uncompressBufSize);
 int i;
-for (i=0; i<3; ++i)
+for (i=0; i<2; ++i)
     writeOne(f, reserved32);
 
 /* Write summary headers with data. */
 verbose(2, "Writing %d levels of zoom\n", zoomLevels);
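
Note: uncompressBufSize takes over one of the three formerly reserved 32-bit
words (hence the loop change from 3 to 2 above), so the header size is
unchanged, and files written without -compress store zero there.
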
@@ -599,9 +636,9 @@
 void bedToBigBed(char *inName, char *chromSizes, char *outName)
 /* bedToBigBed - Convert bed file to bigBed.. */
 {
 bbFileCreate(inName, chromSizes, blockSize, itemsPerSlot, bedFields, as, 
-	outName);
+	doCompress, outName);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
@@ -610,8 +647,9 @@
 blockSize = optionInt("blockSize", blockSize);
 itemsPerSlot = optionInt("itemsPerSlot", itemsPerSlot);
 bedFields = optionInt("bedFields", bedFields);
 as = optionVal("as", as);
+doCompress = optionExists("compress");
 if (argc != 4)
     usage();
 bedToBigBed(argv[1], argv[2], argv[3]);
 optionFree();