src/lib/bigBed.c 1.15

1.15 2009/04/20 23:16:18 mikep
Index: src/lib/bigBed.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/lib/bigBed.c,v
retrieving revision 1.14
retrieving revision 1.15
diff -b -B -U 4 -r1.14 -r1.15
--- src/lib/bigBed.c	17 Apr 2009 23:23:23 -0000	1.14
+++ src/lib/bigBed.c	20 Apr 2009 23:16:18 -0000	1.15
@@ -16,8 +16,10 @@
 #include "udc.h"
 #include "bbiFile.h"
 #include "bigBed.h"
 
+#define TIMING 0
+
 struct bbiFile *bigBedFileOpen(char *fileName)
 /* Open up big bed file. */
 {
 return bbiFileOpen(fileName, bigBedSig, "big bed");
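
The TIMING switch added above follows the common kent-tree pattern of instrumentation that is compiled out by default: uglyTime(NULL) resets the interval timer, and each later call prints its label together with the time elapsed since the previous call. A minimal sketch of the idiom, assuming uglyTime() is declared by the kent headers this file already pulls in; stepOne() and stepTwo() are placeholders, not functions from bigBed.c:

#define TIMING 1	/* flip to 0 and the compiler drops the timing branches */

static void stepOne() { /* placeholder for real work */ }
static void stepTwo() { /* placeholder for real work */ }

static void timedSteps()
/* Illustrate the if (TIMING) uglyTime() idiom used in bigBedFileCreateDetailed. */
{
uglyTime(NULL);			/* start the clock; prints nothing */
stepOne();
if (TIMING)
    uglyTime("stepOne()");	/* label plus time since the previous uglyTime() call */
stepTwo();
if (TIMING)
    uglyTime("stepTwo()");
}

Because TIMING is a compile-time constant, an optimizing compiler drops the if (TIMING) branches entirely in the default build.
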
@@ -262,8 +264,23 @@
 	char *asFileName, /* If non-null points to a .as file that describes fields. */
 	char *outName)    /* BigBed output file name. */
 /* Convert tab-separated bed file to binary indexed, zoomed bigBed version. */
 {
+bigBedFileCreateDetailed(inName, FALSE, chromSizes, blockSize, itemsPerSlot, definedFieldCount, asFileName, outName);
+}
+
+void bigBedFileCreateDetailed(
+	char *inName, 	  /* Input file in a tabular bed format <chrom><start><end> + whatever. */
+	boolean sorted,	  /* Input is already sorted */
+	char *chromSizes, /* Two column tab-separated file: <chromosome> <size>. */
+	int blockSize,	  /* Number of items to bundle in r-tree.  1024 is good. */
+	int itemsPerSlot, /* Number of items in lowest level of tree.  64 is good. */
+	bits16 definedFieldCount,  /* Number of defined bed fields - 3-16 or so.  0 means all fields
+				    * are the defined bed ones. */
+	char *asFileName, /* If non-null points to a .as file that describes fields. */
+	char *outName)    /* BigBed output file name. */
+/* Convert tab-separated bed file to binary indexed, zoomed bigBed version. */
+{
 /* This code needs to agree with code in two other places currently - bigWigFileCreate,
  * and bbiFileOpen.  I'm thinking of refactoring to share at least between
  * bigBedFileCreate and bigWigFileCreate.  It'd be great so it could be structured
  * so that it could send the input in one chromosome at a time, and send in the zoom
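
With this hunk bigBedFileCreate() becomes a thin wrapper that passes sorted=FALSE, while a caller whose input is already in order can go through bigBedFileCreateDetailed() with sorted=TRUE and skip the slSort() pass below. A hypothetical driver, assuming the matching prototype is exported from bigBed.h, and using the block and slot sizes the parameter comments call good:

#include "common.h"
#include "bigBed.h"

int main(int argc, char *argv[])
/* bedToBigBedSorted - hypothetical converter for bed input already in sorted order. */
{
if (argc != 4)
    errAbort("usage: bedToBigBedSorted in.sorted.bed chrom.sizes out.bb");
bigBedFileCreateDetailed(argv[1], TRUE, argv[2],
	1024,	/* blockSize - items to bundle per r-tree node */
	64,	/* itemsPerSlot - items in the lowest level of the tree */
	0,	/* definedFieldCount - 0 means all fields are standard bed fields */
	NULL,	/* no .as file describing extra fields */
	argv[3]);
return 0;
}
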
@@ -285,52 +302,75 @@
 bits64 reductionDataOffsets[10];
 bits64 reductionIndexOffsets[10];
 bits16 fieldCount;
 
+uglyTime(NULL); // initialize timing
+
 /* Load up as object if defined in file. */
 struct asObject *as = NULL;
 if (asFileName != NULL)
     {
     /* Parse it and do sanity check. */
     as = asParseFile(asFileName);
     if (as->next != NULL)
         errAbort("Can only handle .as files containing a single object.");
+    if (TIMING)
+	uglyTime("Parsed file %s", asFileName);
     }
 
 /* Load in chromosome sizes. */
 struct hash *chromHash = bbiChromSizesFromFile(chromSizes);
 verbose(1, "Read %d chromosomes and sizes from %s\n",  chromHash->elCount, chromSizes);
+if (TIMING)
+    uglyTime("bbiChromSizesFromFile()");
 
 /* Load and sort input file. */
 bits64 fullSize;
 struct ppBed *pb, *pbList = ppBedLoadAll(inName, chromHash, chromHash->lm, as, 
 	definedFieldCount, &fullSize, &fieldCount);
 if (definedFieldCount == 0)
     definedFieldCount = fieldCount;
+if (TIMING)
+    uglyTime("ppBedLoadAll()");
 bits64 pbCount = slCount(pbList);
+if (TIMING)
+    uglyTime("slCount()");
 verbose(1, "Read %llu items from %s\n", pbCount, inName);
-slSort(&pbList, ppBedCmp);
+if (!sorted)
+    {
+    slSort(&pbList, ppBedCmp);
+    if (TIMING)
+	uglyTime("slSort()");
+    }
 
 /* Make chromosomes. */
 int chromCount, maxChromNameSize;
 struct bbiChromInfo *chromInfoArray;
 makeChromInfo(pbList, chromHash, &chromCount, &chromInfoArray, &maxChromNameSize);
 verbose(1, "%d of %d chromosomes used (%4.2f%%)\n", chromCount, chromHash->elCount, 
 	100.0*chromCount/chromHash->elCount);
+if (TIMING)
+    uglyTime("makeChromInfo()");
 
 /* Calculate first meaningful reduction size and make first reduction. */
 bits16 summaryCount = 0;
 bits16 version = 1;
 int initialReduction = ppBedAverageSize(pbList)*10;
+if (TIMING)
+    uglyTime("ppBedAverageSize()");
 verbose(2, "averageBedSize=%d\n", initialReduction/10);
 if (initialReduction > 10000)
     initialReduction = 10000;
 bits64 lastSummarySize = 0, summarySize;
 struct bbiSummary *summaryList, *firstSummaryList;
 for (;;)
     {
     summaryList = summaryOnDepth(pbList, chromInfoArray, initialReduction);
+    if (TIMING)
+	uglyTime("summaryOnDepth(initialReduction=%d)", initialReduction);
     summarySize = bbiTotalSummarySize(summaryList);
+    if (TIMING)
+	uglyTime("bbiTotalSummarySize(summarySize=%llu,fullSize=%llu,lastSummarySize=%llu)", summarySize, fullSize, lastSummarySize);
     if (summarySize >= fullSize && summarySize != lastSummarySize)
         {
 	initialReduction *= 2;
 	bbiSummaryFreeList(&summaryList);
@@ -343,8 +383,10 @@
 reduceSummaries[0] = firstSummaryList = summaryList;
 reductionAmounts[0] = initialReduction;
 verbose(1, "Initial zoom reduction x%d data size %4.2f%%\n", 
 	initialReduction, 100.0 * summarySize/fullSize);
+if (TIMING)
+    uglyTime("first summary list");
 
 /* Now calculate up to 10 levels of further summary. */
 bits64 reduction = initialReduction;
 for (i=0; i<ArraySize(reduceSummaries)-1; i++)
@@ -353,9 +395,13 @@
     if (reduction > 1000000000)
         break;
     summaryList = bbiReduceSummaryList(reduceSummaries[summaryCount-1], chromInfoArray, 
     	reduction);
+    if (TIMING)
+	uglyTime("bbiReduceSummaryList(reduction=%llu)", reduction);
     summarySize = bbiTotalSummarySize(summaryList);
+    if (TIMING)
+	uglyTime("bbiTotalSummarySize(summarySize=%llu,summaryCount=%d,lastSummarySize=%llu)", summarySize, summaryCount, lastSummarySize);
     if (summarySize != lastSummarySize)
         {
  	reduceSummaries[summaryCount] = summaryList;
 	reductionAmounts[summaryCount] = reduction;
@@ -365,8 +411,10 @@
     if (summaryItemCount <= chromCount)
         break;
     }
 
+if (TIMING)
+    uglyTime("reduce summaries");
 
 /* Open output file and write out fixed size header, including some dummy values for
  * offsets we'll fill in later. */
 FILE *f = mustOpen(outName, "wb");
@@ -394,8 +442,10 @@
     reductionDataOffsetPos[i] = ftell(f);
     writeOne(f, reserved64);	// Fill in with data offset later
     writeOne(f, reserved64);	// Fill in with index offset later
     }
+if (TIMING)
+    uglyTime("write summary headers");
 
 /* Optionally write out .as file. */
 if (asFileName != NULL)
     {
@@ -416,8 +466,10 @@
 bptFileBulkIndexToOpenFile(chromInfoArray, sizeof(chromInfoArray[0]), chromCount, chromBlockSize,
     bbiChromInfoKey, maxChromNameSize, bbiChromInfoVal, 
     sizeof(chromInfoArray[0].id) + sizeof(chromInfoArray[0].size), 
     f);
+if (TIMING)
+    uglyTime("write chromosome tree");
 
 /* Write the main data. */
 dataOffset = ftell(f);
 writeOne(f, pbCount);
@@ -435,8 +487,10 @@
 	}
     putc(0, f);
     }
 
+if (TIMING)
+    uglyTime("write the main data");
 /* Write the main index. */
 indexOffset = ftell(f);
 struct ppBed **pbArray;
 AllocArray(pbArray, pbCount);
@@ -445,16 +499,20 @@
 cirTreeFileBulkIndexToOpenFile(pbArray, sizeof(pbArray[0]), pbCount,
     blockSize, itemsPerSlot, NULL, ppBedFetchKey, ppBedFetchOffset, 
     indexOffset, f);
 freez(&pbArray);
+if (TIMING)
+    uglyTime("write the main index");
 
 /* Write out summary sections. */
 verbose(2, "bigBedFileCreate writing %d summaries\n", summaryCount);
 for (i=0; i<summaryCount; ++i)
     {
     reductionDataOffsets[i] = ftell(f);
     reductionIndexOffsets[i] = bbiWriteSummaryAndIndex(reduceSummaries[i], blockSize, itemsPerSlot, f);
     }
+if (TIMING)
+    uglyTime("write summary sections");
 
 /* Go back and fill in offsets properly in header. */
 fseek(f, dataOffsetPos, SEEK_SET);
 writeOne(f, dataOffset);
@@ -475,8 +533,10 @@
     writeOne(f, reductionDataOffsets[i]);
     writeOne(f, reductionIndexOffsets[i]);
     }
 
+if (TIMING)
+    uglyTime("filled in header and done.");
 carefulClose(&f);
 freez(&chromInfoArray);
 }
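
The sorted=TRUE path trusts the caller completely: the expected order is whatever slSort(&pbList, ppBedCmp) would have produced, which for plain bed input should be the familiar chromosome-then-start order (the same order 'sort -k1,1 -k2,2n' yields). A rough standalone checker for that ordering; the program name, buffer sizes, and exit codes are illustrative only:

#include <stdio.h>
#include <string.h>

int main(int argc, char *argv[])
/* bedIsSorted - exit 0 if in.bed is in chromosome,start order, 1 if not. */
{
if (argc != 2)
    {
    fprintf(stderr, "usage: bedIsSorted in.bed\n");
    return 2;
    }
FILE *f = fopen(argv[1], "r");
if (f == NULL)
    {
    perror(argv[1]);
    return 2;
    }
char line[8192], prevChrom[512] = "";
long prevStart = -1;
while (fgets(line, sizeof line, f) != NULL)
    {
    char chrom[512];
    long start;
    if (sscanf(line, "%511s %ld", chrom, &start) != 2)
        continue;	/* skip blank or malformed lines */
    int cmp = strcmp(chrom, prevChrom);
    if (cmp < 0 || (cmp == 0 && start < prevStart))
        {
        fprintf(stderr, "out of order near %s %ld\n", chrom, start);
        fclose(f);
        return 1;
        }
    strcpy(prevChrom, chrom);
    prevStart = start;
    }
fclose(f);
return 0;
}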