src/lib/bigBed.c 1.15
1.15 2009/04/20 23:16:18 mikep
Index: src/lib/bigBed.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/lib/bigBed.c,v
retrieving revision 1.14
retrieving revision 1.15
diff -b -B -U 4 -r1.14 -r1.15
--- src/lib/bigBed.c 17 Apr 2009 23:23:23 -0000 1.14
+++ src/lib/bigBed.c 20 Apr 2009 23:16:18 -0000 1.15
@@ -16,8 +16,10 @@
#include "udc.h"
#include "bbiFile.h"
#include "bigBed.h"
+#define TIMING 0
+
struct bbiFile *bigBedFileOpen(char *fileName)
/* Open up big bed file. */
{
return bbiFileOpen(fileName, bigBedSig, "big bed");
@@ -262,8 +264,23 @@
char *asFileName, /* If non-null points to a .as file that describes fields. */
char *outName) /* BigBed output file name. */
/* Convert tab-separated bed file to binary indexed, zoomed bigBed version. */
{
+bigBedFileCreateDetailed(inName, FALSE, chromSizes, blockSize, itemsPerSlot, definedFieldCount, asFileName, outName);
+}
+
+void bigBedFileCreateDetailed(
+ char *inName, /* Input file in a tabular bed format <chrom><start><end> + whatever. */
+ boolean sorted, /* Input is already sorted */
+ char *chromSizes, /* Two column tab-separated file: <chromosome> <size>. */
+ int blockSize, /* Number of items to bundle in r-tree. 1024 is good. */
+ int itemsPerSlot, /* Number of items in lowest level of tree. 64 is good. */
+ bits16 definedFieldCount, /* Number of defined bed fields - 3-16 or so. 0 means all fields
+ * are the defined bed ones. */
+ char *asFileName, /* If non-null points to a .as file that describes fields. */
+ char *outName) /* BigBed output file name. */
+/* Convert tab-separated bed file to binary indexed, zoomed bigBed version. */
+{
/* This code needs to agree with code in two other places currently - bigWigFileCreate,
* and bbiFileOpen. I'm thinking of refactoring to share at least between
* bigBedFileCreate and bigWigFileCreate. It'd be great so it could be structured
* so that it could send the input in one chromosome at a time, and send in the zoom
@@ -285,52 +302,75 @@
bits64 reductionDataOffsets[10];
bits64 reductionIndexOffsets[10];
bits16 fieldCount;
+uglyTime(NULL); // initialize timing
+
/* Load up as object if defined in file. */
struct asObject *as = NULL;
if (asFileName != NULL)
{
/* Parse it and do sanity check. */
as = asParseFile(asFileName);
if (as->next != NULL)
errAbort("Can only handle .as files containing a single object.");
+ if (TIMING)
+ uglyTime("Parsed file %s", asFileName);
}
/* Load in chromosome sizes. */
struct hash *chromHash = bbiChromSizesFromFile(chromSizes);
verbose(1, "Read %d chromosomes and sizes from %s\n", chromHash->elCount, chromSizes);
+if (TIMING)
+ uglyTime("bbiChromSizesFromFile()");
/* Load and sort input file. */
bits64 fullSize;
struct ppBed *pb, *pbList = ppBedLoadAll(inName, chromHash, chromHash->lm, as,
definedFieldCount, &fullSize, &fieldCount);
if (definedFieldCount == 0)
definedFieldCount = fieldCount;
+if (TIMING)
+ uglyTime("ppBedLoadAll()");
bits64 pbCount = slCount(pbList);
+if (TIMING)
+ uglyTime("slCount()");
verbose(1, "Read %llu items from %s\n", pbCount, inName);
-slSort(&pbList, ppBedCmp);
+if (!sorted)
+ {
+ slSort(&pbList, ppBedCmp);
+ if (TIMING)
+ uglyTime("slSort()");
+ }
/* Make chromosomes. */
int chromCount, maxChromNameSize;
struct bbiChromInfo *chromInfoArray;
makeChromInfo(pbList, chromHash, &chromCount, &chromInfoArray, &maxChromNameSize);
verbose(1, "%d of %d chromosomes used (%4.2f%%)\n", chromCount, chromHash->elCount,
100.0*chromCount/chromHash->elCount);
+if (TIMING)
+ uglyTime("makeChromInfo()");
/* Calculate first meaningful reduction size and make first reduction. */
bits16 summaryCount = 0;
bits16 version = 1;
int initialReduction = ppBedAverageSize(pbList)*10;
+if (TIMING)
+ uglyTime("ppBedAverageSize()");
verbose(2, "averageBedSize=%d\n", initialReduction/10);
if (initialReduction > 10000)
initialReduction = 10000;
bits64 lastSummarySize = 0, summarySize;
struct bbiSummary *summaryList, *firstSummaryList;
for (;;)
{
summaryList = summaryOnDepth(pbList, chromInfoArray, initialReduction);
+ if (TIMING)
+ uglyTime("summaryOnDepth(initialReduction=%d)", initialReduction);
summarySize = bbiTotalSummarySize(summaryList);
+ if (TIMING)
+ uglyTime("bbiTotalSummarySize(summarySize=%d,fullSize=%ld,lastSummarySize=%ld)", summarySize, fullSize, lastSummarySize);
if (summarySize >= fullSize && summarySize != lastSummarySize)
{
initialReduction *= 2;
bbiSummaryFreeList(&summaryList);
@@ -343,8 +383,10 @@
reduceSummaries[0] = firstSummaryList = summaryList;
reductionAmounts[0] = initialReduction;
verbose(1, "Initial zoom reduction x%d data size %4.2f%%\n",
initialReduction, 100.0 * summarySize/fullSize);
+if (TIMING)
+ uglyTime("first summary list");
/* Now calculate up to 10 levels of further summary. */
bits64 reduction = initialReduction;
for (i=0; i<ArraySize(reduceSummaries)-1; i++)
@@ -353,9 +395,13 @@
if (reduction > 1000000000)
break;
summaryList = bbiReduceSummaryList(reduceSummaries[summaryCount-1], chromInfoArray,
reduction);
+ if (TIMING)
+ uglyTime("bbiReduceSummaryList(reduction=%ld)", reduction);
summarySize = bbiTotalSummarySize(summaryList);
+ if (TIMING)
+ uglyTime("bbiTotalSummarySize(summarySize=%ld,summaryCount=%ld,lastSummarySize=%ld)", summarySize, summaryCount, lastSummarySize);
if (summarySize != lastSummarySize)
{
reduceSummaries[summaryCount] = summaryList;
reductionAmounts[summaryCount] = reduction;
@@ -365,8 +411,10 @@
if (summaryItemCount <= chromCount)
break;
}
+if (TIMING)
+ uglyTime("reduce summaries");
/* Open output file and write out fixed size header, including some dummy values for
* offsets we'll fill in later. */
FILE *f = mustOpen(outName, "wb");
@@ -394,8 +442,10 @@
reductionDataOffsetPos[i] = ftell(f);
writeOne(f, reserved64); // Fill in with data offset later
writeOne(f, reserved64); // Fill in with index offset later
}
+if (TIMING)
+ uglyTime("write summary headers");
/* Optionally write out .as file. */
if (asFileName != NULL)
{
@@ -416,8 +466,10 @@
bptFileBulkIndexToOpenFile(chromInfoArray, sizeof(chromInfoArray[0]), chromCount, chromBlockSize,
bbiChromInfoKey, maxChromNameSize, bbiChromInfoVal,
sizeof(chromInfoArray[0].id) + sizeof(chromInfoArray[0].size),
f);
+if (TIMING)
+ uglyTime("write chromosome tree");
/* Write the main data. */
dataOffset = ftell(f);
writeOne(f, pbCount);
@@ -435,8 +487,10 @@
}
putc(0, f);
}
+if (TIMING)
+ uglyTime("write the main data");
/* Write the main index. */
indexOffset = ftell(f);
struct ppBed **pbArray;
AllocArray(pbArray, pbCount);
@@ -445,16 +499,20 @@
cirTreeFileBulkIndexToOpenFile(pbArray, sizeof(pbArray[0]), pbCount,
blockSize, itemsPerSlot, NULL, ppBedFetchKey, ppBedFetchOffset,
indexOffset, f);
freez(&pbArray);
+if (TIMING)
+ uglyTime("write the main index");
/* Write out summary sections. */
verbose(2, "bigBedFileCreate writing %d summaries\n", summaryCount);
for (i=0; i<summaryCount; ++i)
{
reductionDataOffsets[i] = ftell(f);
reductionIndexOffsets[i] = bbiWriteSummaryAndIndex(reduceSummaries[i], blockSize, itemsPerSlot, f);
}
+if (TIMING)
+ uglyTime("write summary sections");
/* Go back and fill in offsets properly in header. */
fseek(f, dataOffsetPos, SEEK_SET);
writeOne(f, dataOffset);
@@ -475,8 +533,10 @@
writeOne(f, reductionDataOffsets[i]);
writeOne(f, reductionIndexOffsets[i]);
}
+if (TIMING)
+ uglyTime("filled in header and done.");
carefulClose(&f);
freez(&chromInfoArray);
}