src/utils/bedToBigBed/bedToBigBed.c 1.15
1.15 2009/11/07 19:31:19 kent
Adding totalSummary to big files. Should be helpful in quickly calculating ranges to be displayed by default.
Index: src/utils/bedToBigBed/bedToBigBed.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/utils/bedToBigBed/bedToBigBed.c,v
retrieving revision 1.14
retrieving revision 1.15
diff -b -B -U 4 -r1.14 -r1.15
--- src/utils/bedToBigBed/bedToBigBed.c 6 Nov 2009 19:44:28 -0000 1.14
+++ src/utils/bedToBigBed/bedToBigBed.c 7 Nov 2009 19:31:19 -0000 1.15
@@ -251,9 +251,10 @@
static struct bbiSummary *writeReducedOnceReturnReducedTwice(struct bbiChromUsage *usageList,
int fieldCount, struct lineFile *lf, bits32 initialReduction, bits32 initialReductionCount,
int zoomIncrement, int blockSize, int itemsPerSlot,
- struct lm *lm, FILE *f, bits64 *retDataStart, bits64 *retIndexStart)
+ struct lm *lm, FILE *f, bits64 *retDataStart, bits64 *retIndexStart,
+ struct bbiSummaryElement *totalSum)
/* Write out data reduced by factor of initialReduction. Also calculate and keep in memory
* next reduction level. This is more work than some ways, but it keeps us from having to
* keep the first reduction entirely in memory. */
{
@@ -272,18 +273,40 @@
* This also has the nice side effect of getting rid of overlaps.
* 2) Stream through the range tree, outputting the initial summary level and
* further reducing.
*/
+boolean firstTime = TRUE;
for (usage = usageList; usage != NULL; usage = usage->next)
{
struct bbiSummary oneSummary, *sum = NULL;
struct rbTree *rangeTree = rangeTreeForBedChrom(lf, usage->name);
struct range *range, *rangeList = rangeTreeList(rangeTree);
for (range = rangeList; range != NULL; range = range->next)
{
+ /* Grab values we want from range. */
double val = ptToInt(range->val);
int start = range->start;
int end = range->end;
+ bits32 size = end - start;
+
+ /* Add to total summary. */
+ if (firstTime)
+ {
+ totalSum->validCount = size;
+ totalSum->minVal = totalSum->maxVal = val;
+ totalSum->sumData = val*size;
+ totalSum->sumSquares = val*size*size;
+ firstTime = FALSE;
+ }
+ else
+ {
+ totalSum->validCount += size;
+ if (val < totalSum->minVal) totalSum->minVal = val;
+ if (val > totalSum->maxVal) totalSum->maxVal = val;
+ totalSum->sumData += val*size;
+ totalSum->sumSquares += val*size*size;
+ }
+
/* If start past existing block then output it. */
if (sum != NULL && sum->end <= start)
{
bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize,
@@ -305,18 +328,20 @@
/* Deal with case where might have to split an item between multiple summaries. This
* loop handles all but the final affected summary in that case. */
while (end > sum->end)
{
- verbose(3, "Splitting size %d at %d\n", end - start, end);
/* Fold in bits that overlap with existing summary and output. */
- bits32 overlap = rangeIntersection(start, end, sum->start, sum->end);
+ int overlap = rangeIntersection(start, end, sum->start, sum->end);
+ assert(overlap > 0);
+ verbose(3, "Splitting size %d at %d, overlap %d\n", end - start, sum->end, overlap);
sum->validCount += overlap;
if (sum->minVal > val) sum->minVal = val;
if (sum->maxVal < val) sum->maxVal = val;
sum->sumData += val * overlap;
sum->sumSquares += val*val * overlap;
bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize,
&boundsPt, boundsEnd, usage->size, lm, f);
+ size -= overlap;
/* Move summary to next part. */
sum->start = start = sum->end;
sum->end = start + initialReduction;
@@ -326,9 +351,8 @@
sum->validCount = 0;
}
/* Add to summary. */
- bits32 size = end - start;
sum->validCount += size;
if (sum->minVal > val) sum->minVal = val;
if (sum->maxVal < val) sum->maxVal = val;
sum->sumData += val * size;
@@ -388,9 +412,9 @@
int minDiff = 0;
double aveSpan = 0;
bits64 bedCount = 0;
struct bbiChromUsage *usageList = bbiChromUsageFromBedFile(lf, chromSizesHash, &minDiff, &aveSpan, &bedCount);
-verboseTime(1, "pass1 - making usageList");
+verboseTime(1, "pass1 - making usageList (%d chroms)", slCount(usageList));
verbose(2, "%d chroms in %s. Average span of beds %f\n", slCount(usageList), inName, aveSpan);
/* Open output file and write dummy header. */
FILE *f = mustOpen(outName, "wb");
@@ -409,8 +433,14 @@
carefulClose(&asFile);
verbose(2, "%s has %d columns\n", asFileName, colCount);
}
+/* Write out dummy total summary. */
+struct bbiSummaryElement totalSum;
+ZeroVar(&totalSum);
+bits64 totalSummaryOffset = ftell(f);
+bbiSummaryElementWrite(f, &totalSum);
+
/* Write out chromosome/size database. */
bits64 chromTreeOffset = ftell(f);
bbiWriteChromInfo(usageList, blockSize, f);
@@ -438,9 +468,10 @@
lineFileRewind(lf);
bits16 fieldCount=0;
writeBlocks(usageList, lf, as, definedFieldCount, itemsPerSlot, boundsArray, blockCount, f,
resTryCount, resScales, resSizes, &fieldCount, &definedFieldCount);
-verboseTime(1, "pass2 - checking and writing primary data");
+verboseTime(1, "pass2 - checking and writing primary data (%lld records, %d fields)",
+ (long long)bedCount, fieldCount);
/* Write out primary data index. */
bits64 indexOffset = ftell(f);
cirTreeFileBulkIndexToOpenFile(boundsArray, sizeof(boundsArray[0]), blockCount,
@@ -484,9 +515,9 @@
lineFileRewind(lf);
struct bbiSummary *rezoomedList = writeReducedOnceReturnReducedTwice(usageList,
fieldCount, lf, initialReduction, initialReducedCount,
resIncrement, blockSize, itemsPerSlot, lm,
- f, &zoomDataOffsets[0], &zoomIndexOffsets[0]);
+ f, &zoomDataOffsets[0], &zoomIndexOffsets[0], &totalSum);
verboseTime(1, "pass3 - writeReducedOnceReturnReducedTwice");
zoomAmounts[0] = initialReduction;
zoomLevels = 1;
@@ -515,9 +546,9 @@
bits32 sig = bigBedSig;
bits16 version = bbiCurrentVersion;
bits16 summaryCount = zoomLevels;
bits32 reserved32 = 0;
-bits32 reserved64 = 0;
+bits64 reserved64 = 0;
/* Write fixed header */
writeOne(f, sig);
writeOne(f, version);
@@ -527,10 +558,11 @@
writeOne(f, indexOffset);
writeOne(f, fieldCount);
writeOne(f, definedFieldCount);
writeOne(f, asOffset);
+writeOne(f, totalSummaryOffset);
int i;
-for (i=0; i<5; ++i)
+for (i=0; i<3; ++i)
writeOne(f, reserved32);
/* Write summary headers with data. */
verbose(2, "Writing %d levels of zoom\n", zoomLevels);
@@ -550,8 +582,12 @@
writeOne(f, reserved64);
writeOne(f, reserved64);
}
+/* Write total summary. */
+fseek(f, totalSummaryOffset, SEEK_SET);
+bbiSummaryElementWrite(f, &totalSum);
+
/* Clean up. */
lineFileClose(&lf);
carefulClose(&f);