src/utils/bedToBigBed/bedToBigBed.c 1.15

1.15 2009/11/07 19:31:19 kent
Adding totalSummary to big files. Should be helpful in quickly calculating ranges to be displayed by default.
Index: src/utils/bedToBigBed/bedToBigBed.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/utils/bedToBigBed/bedToBigBed.c,v
retrieving revision 1.14
retrieving revision 1.15
diff -b -B -U 4 -r1.14 -r1.15
--- src/utils/bedToBigBed/bedToBigBed.c	6 Nov 2009 19:44:28 -0000	1.14
+++ src/utils/bedToBigBed/bedToBigBed.c	7 Nov 2009 19:31:19 -0000	1.15
@@ -251,9 +251,10 @@
 
 static struct bbiSummary *writeReducedOnceReturnReducedTwice(struct bbiChromUsage *usageList, 
 	int fieldCount, struct lineFile *lf, bits32 initialReduction, bits32 initialReductionCount, 
 	int zoomIncrement, int blockSize, int itemsPerSlot, 
-	struct lm *lm, FILE *f, bits64 *retDataStart, bits64 *retIndexStart)
+	struct lm *lm, FILE *f, bits64 *retDataStart, bits64 *retIndexStart,
+	struct bbiSummaryElement *totalSum)
 /* Write out data reduced by factor of initialReduction.  Also calculate and keep in memory
  * next reduction level.  This is more work than some ways, but it keeps us from having to
  * keep the first reduction entirely in memory. */
 {
@@ -272,18 +273,40 @@
  *      This also has the nice side effect of getting rid of overlaps.
  *   2) Stream through the range tree, outputting the initial summary level and
  *      further reducing. 
  */
+boolean firstTime = TRUE;
 for (usage = usageList; usage != NULL; usage = usage->next)
     {
     struct bbiSummary oneSummary, *sum = NULL;
     struct rbTree *rangeTree = rangeTreeForBedChrom(lf, usage->name);
     struct range *range, *rangeList = rangeTreeList(rangeTree);
     for (range = rangeList; range != NULL; range = range->next)
         {
+	/* Grab values we want from range. */
 	double val = ptToInt(range->val);
 	int start = range->start;
 	int end = range->end;
+	bits32 size = end - start;
+
+	/* Add to total summary. */
+	if (firstTime)
+	    {
+	    totalSum->validCount = size;
+	    totalSum->minVal = totalSum->maxVal = val;
+	    totalSum->sumData = val*size;
+	    totalSum->sumSquares = val*size*size;
+	    firstTime = FALSE;
+	    }
+	else
+	    {
+	    totalSum->validCount += size;
+	    if (val < totalSum->minVal) totalSum->minVal = val;
+	    if (val > totalSum->maxVal) totalSum->maxVal = val;
+	    totalSum->sumData += val*size;
+	    totalSum->sumSquares += val*size*size;
+	    }
+
 	/* If start past existing block then output it. */
 	if (sum != NULL && sum->end <= start)
 	    {
 	    bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize, 
@@ -305,18 +328,20 @@
 	/* Deal with case where might have to split an item between multiple summaries.  This
 	 * loop handles all but the final affected summary in that case. */
 	while (end > sum->end)
 	    {
-	    verbose(3, "Splitting size %d at %d\n", end - start, end);
 	    /* Fold in bits that overlap with existing summary and output. */
-	    bits32 overlap = rangeIntersection(start, end, sum->start, sum->end);
+	    int overlap = rangeIntersection(start, end, sum->start, sum->end);
+	    assert(overlap > 0);
+	    verbose(3, "Splitting size %d at %d, overlap %d\n", end - start, sum->end, overlap);
 	    sum->validCount += overlap;
 	    if (sum->minVal > val) sum->minVal = val;
 	    if (sum->maxVal < val) sum->maxVal = val;
 	    sum->sumData += val * overlap;
 	    sum->sumSquares += val*val * overlap;
 	    bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize, 
 		    &boundsPt, boundsEnd, usage->size, lm, f);
+	    size -= overlap;
 
 	    /* Move summary to next part. */
 	    sum->start = start = sum->end;
 	    sum->end = start + initialReduction;
@@ -326,9 +351,8 @@
 	    sum->validCount = 0;
 	    }
 
 	/* Add to summary. */
-	bits32 size = end - start;
 	sum->validCount += size;
 	if (sum->minVal > val) sum->minVal = val;
 	if (sum->maxVal < val) sum->maxVal = val;
 	sum->sumData += val * size;
@@ -388,9 +412,9 @@
 int minDiff = 0;
 double aveSpan = 0;
 bits64 bedCount = 0;
 struct bbiChromUsage *usageList = bbiChromUsageFromBedFile(lf, chromSizesHash, &minDiff, &aveSpan, &bedCount);
-verboseTime(1, "pass1 - making usageList");
+verboseTime(1, "pass1 - making usageList (%d chroms)", slCount(usageList));
 verbose(2, "%d chroms in %s. Average span of beds %f\n", slCount(usageList), inName, aveSpan);
 
 /* Open output file and write dummy header. */
 FILE *f = mustOpen(outName, "wb");
@@ -409,8 +433,14 @@
     carefulClose(&asFile);
     verbose(2, "%s has %d columns\n", asFileName, colCount);
     }
 
+/* Write out dummy total summary. */
+struct bbiSummaryElement totalSum;
+ZeroVar(&totalSum);
+bits64 totalSummaryOffset = ftell(f);
+bbiSummaryElementWrite(f, &totalSum);
+
 /* Write out chromosome/size database. */
 bits64 chromTreeOffset = ftell(f);
 bbiWriteChromInfo(usageList, blockSize, f);
 
@@ -438,9 +468,10 @@
 lineFileRewind(lf);
 bits16 fieldCount=0;
 writeBlocks(usageList, lf, as, definedFieldCount, itemsPerSlot, boundsArray, blockCount, f,
 	resTryCount, resScales, resSizes, &fieldCount, &definedFieldCount);
-verboseTime(1, "pass2 - checking and writing primary data");
+verboseTime(1, "pass2 - checking and writing primary data (%lld records, %d fields)", 
+	(long long)bedCount, fieldCount);
 
 /* Write out primary data index. */
 bits64 indexOffset = ftell(f);
 cirTreeFileBulkIndexToOpenFile(boundsArray, sizeof(boundsArray[0]), blockCount,
@@ -484,9 +515,9 @@
 	lineFileRewind(lf);
 	struct bbiSummary *rezoomedList = writeReducedOnceReturnReducedTwice(usageList, 
 		fieldCount, lf, initialReduction, initialReducedCount,
 		resIncrement, blockSize, itemsPerSlot, lm, 
-		f, &zoomDataOffsets[0], &zoomIndexOffsets[0]);
+		f, &zoomDataOffsets[0], &zoomIndexOffsets[0], &totalSum);
 	verboseTime(1, "pass3 - writeReducedOnceReturnReducedTwice");
 	zoomAmounts[0] = initialReduction;
 	zoomLevels = 1;
 
@@ -515,9 +546,9 @@
 bits32 sig = bigBedSig;
 bits16 version = bbiCurrentVersion;
 bits16 summaryCount = zoomLevels;
 bits32 reserved32 = 0;
-bits32 reserved64 = 0;
+bits64 reserved64 = 0;
 
 /* Write fixed header */
 writeOne(f, sig);
 writeOne(f, version);
@@ -527,10 +558,11 @@
 writeOne(f, indexOffset);
 writeOne(f, fieldCount);
 writeOne(f, definedFieldCount);
 writeOne(f, asOffset);
+writeOne(f, totalSummaryOffset);
 int i;
-for (i=0; i<5; ++i)
+for (i=0; i<3; ++i)
     writeOne(f, reserved32);
 
 /* Write summary headers with data. */
 verbose(2, "Writing %d levels of zoom\n", zoomLevels);
@@ -550,8 +582,12 @@
     writeOne(f, reserved64);
     writeOne(f, reserved64);
     }
 
+/* Write total summary. */
+fseek(f, totalSummaryOffset, SEEK_SET);
+bbiSummaryElementWrite(f, &totalSum);
+
 
 /* Clean up. */
 lineFileClose(&lf);
 carefulClose(&f);