57d153c3caf42b22dd4cff6138a54c1b8545333f kent Fri Mar 8 15:25:37 2013 -0800

Fixing a bug where sometimes zoom summaries would not be written out by bedGraphToBigWig. I'm seeing a lot of code that can be shared between bedGraphToBigWig and bedToBigBed. Refactored to share some now, will do more shortly.

diff --git src/utils/bedToBigBed/bedToBigBed.c src/utils/bedToBigBed/bedToBigBed.c
index 8a47dc7..e3b81f4 100644
--- src/utils/bedToBigBed/bedToBigBed.c
+++ src/utils/bedToBigBed/bedToBigBed.c
@@ -521,66 +521,67 @@
     {
     if (eim->chunkArrayArray != NULL)
         {
         int i;
         for (i=0; i<eim->indexCount; ++i)
             freeMem(eim->chunkArrayArray[i]);
         }
     freeMem(eim->indexFields);
     freeMem(eim->maxFieldSize);
     freeMem(eim->chunkArrayArray);
     freeMem(eim->fileOffsets);
     freez(pEim);
     }
 }
+
 void bbFileCreate(
     char *inName,     /* Input file in a tabular bed format <chrom><start><end> + whatever. */
     char *chromSizes, /* Two column tab-separated file: <chromosome> <size>. */
     int blockSize,    /* Number of items to bundle in r-tree.  1024 is good. */
     int itemsPerSlot, /* Number of items in lowest level of tree.  64 is good. */
     char *asText,     /* Field definitions in a string */
     struct asObject *as,  /* Field definitions parsed out */
     boolean doCompress,   /* If TRUE then compress data. */
     struct slName *extraIndexList,  /* List of extra indexes to add */
     char *outName)    /* BigBed output file name. */
 /* Convert tab-separated bed file to binary indexed, zoomed bigBed version. */
 {
 /* Set up timing measures. */
 verboseTimeInit();
 struct lineFile *lf = lineFileOpen(inName, TRUE);
 bits16 fieldCount = slCount(as->columnList);
 bits16 extraIndexCount = slCount(extraIndexList);
 struct bbExIndexMaker *eim = NULL;
 if (extraIndexList != NULL)
     eim = bbExIndexMakerNew(extraIndexList, as);
 
 /* Load in chromosome sizes. */
 struct hash *chromSizesHash = bbiChromSizesFromFile(chromSizes);
 verbose(2, "Read %d chromosomes and sizes from %s\n", chromSizesHash->elCount, chromSizes);
 
 /* Do first pass, mostly just scanning file and counting hits per chromosome. */
 int minDiff = 0;
-double aveSpan = 0;
+double aveSize = 0;
 bits64 bedCount = 0;
 bits32 uncompressBufSize = 0;
 struct bbiChromUsage *usageList = bbiChromUsageFromBedFile(lf, chromSizesHash, eim,
-    &minDiff, &aveSpan, &bedCount);
+    &minDiff, &aveSize, &bedCount);
 verboseTime(1, "pass1 - making usageList (%d chroms)", slCount(usageList));
-verbose(2, "%d chroms in %s. Average span of beds %f\n", slCount(usageList), inName, aveSpan);
+verbose(2, "%d chroms in %s. Average span of beds %f\n", slCount(usageList), inName, aveSize);
 
 /* Open output file and write dummy header. */
 FILE *f = mustOpen(outName, "wb");
 bbiWriteDummyHeader(f);
 bbiWriteDummyZooms(f);
 
 /* Write out autoSql string */
 bits64 asOffset = ftell(f);
 mustWrite(f, asText, strlen(asText) + 1);
 verbose(2, "as definition has %d columns\n", fieldCount);
 
 /* Write out dummy total summary. */
 struct bbiSummaryElement totalSum;
 ZeroVar(&totalSum);
 bits64 totalSummaryOffset = ftell(f);
@@ -595,51 +596,32 @@
 bits64 extraIndexListOffset = 0;
 bits64 extraIndexListEndOffset = 0;
 if (extraIndexList != NULL)
     {
     extraIndexListOffset = ftell(f);
     int extraIndexSize = 16 + 4*1;  // Fixed record size 16, plus 1 times field size of 4
     repeatCharOut(f, 0, extraIndexSize*extraIndexCount);
     extraIndexListEndOffset = ftell(f);
     }
 
 /* Write out chromosome/size database. */
 bits64 chromTreeOffset = ftell(f);
 bbiWriteChromInfo(usageList, blockSize, f);
 
 /* Set up to keep track of possible initial reduction levels. */
-int resTryCount = 10, resTry;
-int resIncrement = 4;
-int resScales[resTryCount], resSizes[resTryCount];
-int minZoom = 10;
-int res = aveSpan;
-if (res < minZoom)
-    res = minZoom;
-for (resTry = 0; resTry < resTryCount; ++resTry)
-    {
-    resSizes[resTry] = 0;
-    resScales[resTry] = res;
-    // if aveSpan is large, then the initial value of res is large,
-    // and we cannot do all 10 levels without overflowing res* integers and other related variables.
-    if (res > 1000000000)
-        {
-        resTryCount = resTry + 1;
-        verbose(2, "resTryCount reduced from 10 to %d\n", resTryCount);
-        break;
-        }
-    res *= resIncrement;
-    }
+int resScales[bbiMaxZoomLevels], resSizes[bbiMaxZoomLevels];
+int resTryCount = bbiCalcResScalesAndSizes(aveSize, resScales, resSizes);
 
 /* Write out primary full resolution data in sections, collect stats to use for reductions. */
 bits64 dataOffset = ftell(f);
 writeOne(f, bedCount);
 bits32 blockCount = bbiCountSectionsNeeded(usageList, itemsPerSlot);
 struct bbiBoundsArray *boundsArray;
 AllocArray(boundsArray, blockCount);
 lineFileRewind(lf);
 bits32 maxBlockSize = 0;
 if (eim)
     bbExIndexMakerAllocChunkArrays(eim, bedCount);
 writeBlocks(usageList, lf, as, itemsPerSlot, boundsArray, blockCount, doCompress, f,
     resTryCount, resScales, resSizes, eim, bedCount, fieldCount, &maxBlockSize);
 verboseTime(1, "pass2 - checking and writing primary data (%lld records, %d fields)",
     (long long)bedCount, fieldCount);
@@ -647,61 +629,72 @@
 /* Write out primary data index. */
 bits64 indexOffset = ftell(f);
 cirTreeFileBulkIndexToOpenFile(boundsArray, sizeof(boundsArray[0]), blockCount,
     blockSize, 1, NULL, bbiBoundsArrayFetchKey, bbiBoundsArrayFetchOffset,
     indexOffset, f);
 freez(&boundsArray);
 verboseTime(2, "index write");
 
 /* Declare arrays and vars that track the zoom levels we actually output. */
 bits32 zoomAmounts[bbiMaxZoomLevels];
 bits64 zoomDataOffsets[bbiMaxZoomLevels];
 bits64 zoomIndexOffsets[bbiMaxZoomLevels];
 int zoomLevels = 0;
 
 /* Write out first zoomed section while storing in memory next zoom level. */
-if (aveSpan > 0)
+/* This is just a block to make some variables more local. */
     {
+    assert(resTryCount > 0);
     bits64 dataSize = indexOffset - dataOffset;
     int maxReducedSize = dataSize/2;
     int initialReduction = 0, initialReducedCount = 0;
 
     /* Figure out initialReduction for zoom. */
+    int resTry;
     for (resTry = 0; resTry < resTryCount; ++resTry)
         {
         bits64 reducedSize = resSizes[resTry] * sizeof(struct bbiSummaryOnDisk);
         if (doCompress)
             reducedSize /= 2;   // Estimate!
         if (reducedSize <= maxReducedSize)
             {
             initialReduction = resScales[resTry];
             initialReducedCount = resSizes[resTry];
             break;
             }
         }
     verbose(2, "initialReduction %d, initialReducedCount = %d\n",
             initialReduction, initialReducedCount);
     verbose(2, "dataSize %llu, reducedSize %llu, resScales[0] = %d\n", dataSize,
         (bits64)(initialReducedCount*sizeof(struct bbiSummaryOnDisk)), resScales[0]);
 
-    if (initialReduction > 0)
+    /* Force there to always be at least one zoom.  It may waste a little space on small
+     * files, but it makes files more uniform, and avoids special case code for calculating
+     * overall file summary. */
+    if (initialReduction == 0)
+        {
+        initialReduction = resScales[0];
+        initialReducedCount = resSizes[0];
+        }
+
+    /* This is just a block to make some variables more local. */
         {
         struct lm *lm = lmInit(0);
-        int zoomIncrement = 4;
+        int zoomIncrement = bbiResIncrement;
         lineFileRewind(lf);
         struct bbiSummary *rezoomedList = writeReducedOnceReturnReducedTwice(usageList,
                 fieldCount, lf, initialReduction, initialReducedCount,
-                resIncrement, blockSize, itemsPerSlot, doCompress, lm,
+                zoomIncrement, blockSize, itemsPerSlot, doCompress, lm,
                 f, &zoomDataOffsets[0], &zoomIndexOffsets[0], &totalSum);
         verboseTime(1, "pass3 - writeReducedOnceReturnReducedTwice");
         zoomAmounts[0] = initialReduction;
         zoomLevels = 1;
 
         int zoomCount = initialReducedCount;
         int reduction = initialReduction * zoomIncrement;
         while (zoomLevels < bbiMaxZoomLevels)
             {
             int rezoomCount = slCount(rezoomedList);
             if (rezoomCount >= zoomCount)
                 break;
             zoomCount = rezoomCount;
             zoomDataOffsets[zoomLevels] = ftell(f);
             zoomIndexOffsets[zoomLevels] = bbiWriteSummaryAndIndex(rezoomedList,
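
The body of bbiCalcResScalesAndSizes() is not part of this diff; it presumably moved into the shared bbi library so that bedGraphToBigWig can call it too. Reconstructed directly from the loop deleted above, it likely looks something like the sketch below. This is a sketch only, not the verbatim library code: bbiMaxZoomLevels and bbiResIncrement are assumed to be library constants (taken here as 10 and 4, matching the old literals the diff removed), and the parameter types are guesses.

    /* Sketch reconstructed from the removed lines above; constants and types assumed. */
    #define bbiMaxZoomLevels 10   /* assumed value of the library constant */
    #define bbiResIncrement 4     /* assumed value of the library constant */

    int bbiCalcResScalesAndSizes(int aveSize,
        int resScales[bbiMaxZoomLevels], int resSizes[bbiMaxZoomLevels])
    /* Fill in resScales[] with the zoom amount for each level and zero out resSizes[].
     * Return the number of levels usable without integer overflow. */
    {
    int resTryCount = bbiMaxZoomLevels, resTry;
    int minZoom = 10;
    int res = aveSize;
    if (res < minZoom)
        res = minZoom;
    for (resTry = 0; resTry < resTryCount; ++resTry)
        {
        resSizes[resTry] = 0;
        resScales[resTry] = res;
        /* If aveSize is large the initial res is large, and we cannot do all
         * levels without overflowing res and related integer variables. */
        if (res > 1000000000)
            {
            resTryCount = resTry + 1;
            break;
            }
        res *= bbiResIncrement;
        }
    return resTryCount;
    }

Because res starts at no less than minZoom, resScales[0] is always filled in and the return value is always at least 1. That invariant is what lets the caller assert(resTryCount > 0) and fall back to resScales[0]/resSizes[0] when no reduction level fits under half the primary data size, so at least one zoom level is always written.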