57d153c3caf42b22dd4cff6138a54c1b8545333f kent Fri Mar 8 15:25:37 2013 -0800

Fixing a bug where sometimes zoom summaries would not be written out by bedGraphToBigWig. I'm seeing a lot of code that can be shared between bedGraphToBigWig and bedToBigBed. Refactored to share some now, will do more shortly.

diff --git src/utils/bedToBigBed/bedToBigBed.c src/utils/bedToBigBed/bedToBigBed.c
index 8a47dc7..e3b81f4 100644
--- src/utils/bedToBigBed/bedToBigBed.c
+++ src/utils/bedToBigBed/bedToBigBed.c
@@ -521,66 +521,67 @@
     {
     if (eim->chunkArrayArray != NULL)
         {
         int i;
         for (i=0; i<eim->indexCount; ++i)
             freeMem(eim->chunkArrayArray[i]);
         }
     freeMem(eim->indexFields);
     freeMem(eim->maxFieldSize);
     freeMem(eim->chunkArrayArray);
     freeMem(eim->fileOffsets);
     freez(pEim);
     }
 }
+
 void bbFileCreate(
     char *inName,     /* Input file in a tabular bed format <chrom><start><end> + whatever. */
     char *chromSizes, /* Two column tab-separated file: <chromosome> <size>. */
     int blockSize,    /* Number of items to bundle in r-tree.  1024 is good. */
     int itemsPerSlot, /* Number of items in lowest level of tree.  64 is good. */
     char *asText,     /* Field definitions in a string */
     struct asObject *as,  /* Field definitions parsed out */
     boolean doCompress,   /* If TRUE then compress data. */
     struct slName *extraIndexList,  /* List of extra indexes to add */
     char *outName)    /* BigBed output file name. */
 /* Convert tab-separated bed file to binary indexed, zoomed bigBed version. */
 {
 /* Set up timing measures. */
 verboseTimeInit();
 struct lineFile *lf = lineFileOpen(inName, TRUE);
 bits16 fieldCount = slCount(as->columnList);
 bits16 extraIndexCount = slCount(extraIndexList);
 struct bbExIndexMaker *eim = NULL;
 if (extraIndexList != NULL)
     eim = bbExIndexMakerNew(extraIndexList, as);
 
 /* Load in chromosome sizes. */
 struct hash *chromSizesHash = bbiChromSizesFromFile(chromSizes);
 verbose(2, "Read %d chromosomes and sizes from %s\n", chromSizesHash->elCount, chromSizes);
 
 /* Do first pass, mostly just scanning file and counting hits per chromosome. */
 int minDiff = 0;
-double aveSpan = 0;
+double aveSize = 0;
 bits64 bedCount = 0;
 bits32 uncompressBufSize = 0;
 struct bbiChromUsage *usageList = bbiChromUsageFromBedFile(lf, chromSizesHash, eim,
-    &minDiff, &aveSpan, &bedCount);
+    &minDiff, &aveSize, &bedCount);
 verboseTime(1, "pass1 - making usageList (%d chroms)", slCount(usageList));
-verbose(2, "%d chroms in %s. Average span of beds %f\n", slCount(usageList), inName, aveSpan);
+verbose(2, "%d chroms in %s. Average span of beds %f\n", slCount(usageList), inName, aveSize);
 
 /* Open output file and write dummy header. */
 FILE *f = mustOpen(outName, "wb");
 bbiWriteDummyHeader(f);
 bbiWriteDummyZooms(f);
 
 /* Write out autoSql string */
 bits64 asOffset = ftell(f);
 mustWrite(f, asText, strlen(asText) + 1);
 verbose(2, "as definition has %d columns\n", fieldCount);
 
 /* Write out dummy total summary. */
 struct bbiSummaryElement totalSum;
 ZeroVar(&totalSum);
 bits64 totalSummaryOffset = ftell(f);
@@ -595,51 +596,32 @@
 bits64 extraIndexListOffset = 0;
 bits64 extraIndexListEndOffset = 0;
 if (extraIndexList != NULL)
     {
     extraIndexListOffset = ftell(f);
     int extraIndexSize = 16 + 4*1;  // Fixed record size 16, plus 1 times field size of 4
     repeatCharOut(f, 0, extraIndexSize*extraIndexCount);
     extraIndexListEndOffset = ftell(f);
     }
 
 /* Write out chromosome/size database. */
 bits64 chromTreeOffset = ftell(f);
 bbiWriteChromInfo(usageList, blockSize, f);
 
 /* Set up to keep track of possible initial reduction levels. */
-int resTryCount = 10, resTry;
-int resIncrement = 4;
-int resScales[resTryCount], resSizes[resTryCount];
-int minZoom = 10;
-int res = aveSpan;
-if (res < minZoom)
-    res = minZoom;
-for (resTry = 0; resTry < resTryCount; ++resTry)
-    {
-    resSizes[resTry] = 0;
-    resScales[resTry] = res;
-    // if aveSpan is large, then the initial value of res is large,
-    // and we cannot do all 10 levels without overflowing res* integers and other related variables.
-    if (res > 1000000000)
-        {
-        resTryCount = resTry + 1;
-        verbose(2, "resTryCount reduced from 10 to %d\n", resTryCount);
-        break;
-        }
-    res *= resIncrement;
-    }
+int resScales[bbiMaxZoomLevels], resSizes[bbiMaxZoomLevels];
+int resTryCount = bbiCalcResScalesAndSizes(aveSize, resScales, resSizes);
 
 /* Write out primary full resolution data in sections, collect stats to use for reductions. */
 bits64 dataOffset = ftell(f);
 writeOne(f, bedCount);
 bits32 blockCount = bbiCountSectionsNeeded(usageList, itemsPerSlot);
 struct bbiBoundsArray *boundsArray;
 AllocArray(boundsArray, blockCount);
 lineFileRewind(lf);
 bits32 maxBlockSize = 0;
 if (eim)
     bbExIndexMakerAllocChunkArrays(eim, bedCount);
 writeBlocks(usageList, lf, as, itemsPerSlot, boundsArray, blockCount, doCompress, f,
     resTryCount, resScales, resSizes, eim, bedCount, fieldCount, &maxBlockSize);
 verboseTime(1, "pass2 - checking and writing primary data (%lld records, %d fields)",
     (long long)bedCount, fieldCount);
@@ -647,61 +629,72 @@
 /* Write out primary data index. */
 bits64 indexOffset = ftell(f);
 cirTreeFileBulkIndexToOpenFile(boundsArray, sizeof(boundsArray[0]), blockCount,
     blockSize, 1, NULL, bbiBoundsArrayFetchKey, bbiBoundsArrayFetchOffset,
     indexOffset, f);
 freez(&boundsArray);
 verboseTime(2, "index write");
 
 /* Declare arrays and vars that track the zoom levels we actually output. */
 bits32 zoomAmounts[bbiMaxZoomLevels];
 bits64 zoomDataOffsets[bbiMaxZoomLevels];
 bits64 zoomIndexOffsets[bbiMaxZoomLevels];
 int zoomLevels = 0;
 
 /* Write out first zoomed section while storing in memory next zoom level. */
-if (aveSpan > 0)
+/* This is just a block to make some variables more local. */
     {
+    assert(resTryCount > 0);
     bits64 dataSize = indexOffset - dataOffset;
     int maxReducedSize = dataSize/2;
     int initialReduction = 0, initialReducedCount = 0;
 
     /* Figure out initialReduction for zoom. */
+    int resTry;
     for (resTry = 0; resTry < resTryCount; ++resTry)
         {
         bits64 reducedSize = resSizes[resTry] * sizeof(struct bbiSummaryOnDisk);
         if (doCompress)
             reducedSize /= 2;   // Estimate!
         if (reducedSize <= maxReducedSize)
             {
             initialReduction = resScales[resTry];
             initialReducedCount = resSizes[resTry];
             break;
             }
         }
     verbose(2, "initialReduction %d, initialReducedCount = %d\n",
             initialReduction, initialReducedCount);
     verbose(2, "dataSize %llu, reducedSize %llu, resScales[0] = %d\n", dataSize,
         (bits64)(initialReducedCount*sizeof(struct bbiSummaryOnDisk)), resScales[0]);
 
-    if (initialReduction > 0)
+    /* Force there to always be at least one zoom.  It may waste a little space on small
+     * files, but it makes files more uniform, and avoids special case code for calculating
+     * overall file summary. */
+    if (initialReduction == 0)
+        {
+        initialReduction = resScales[0];
+        initialReducedCount = resSizes[0];
+        }
+
+    /* This is just a block to make some variables more local. */
         {
         struct lm *lm = lmInit(0);
-        int zoomIncrement = 4;
+        int zoomIncrement = bbiResIncrement;
         lineFileRewind(lf);
         struct bbiSummary *rezoomedList = writeReducedOnceReturnReducedTwice(usageList,
                 fieldCount, lf, initialReduction, initialReducedCount,
-                resIncrement, blockSize, itemsPerSlot, doCompress, lm,
+                zoomIncrement, blockSize, itemsPerSlot, doCompress, lm,
                 f, &zoomDataOffsets[0], &zoomIndexOffsets[0], &totalSum);
         verboseTime(1, "pass3 - writeReducedOnceReturnReducedTwice");
         zoomAmounts[0] = initialReduction;
         zoomLevels = 1;
 
         int zoomCount = initialReducedCount;
         int reduction = initialReduction * zoomIncrement;
         while (zoomLevels < bbiMaxZoomLevels)
             {
             int rezoomCount = slCount(rezoomedList);
             if (rezoomCount >= zoomCount)
                 break;
             zoomCount = rezoomCount;
             zoomDataOffsets[zoomLevels] = ftell(f);
             zoomIndexOffsets[zoomLevels] = bbiWriteSummaryAndIndex(rezoomedList,
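
The body of bbiCalcResScalesAndSizes() is not part of this diff; it presumably moved into the shared bbi library so that bedGraphToBigWig can call it too. Reconstructed directly from the loop deleted above, it likely looks something like the sketch below. This is a sketch only, not the verbatim library code: bbiMaxZoomLevels and bbiResIncrement are assumed to be library constants (taken here as 10 and 4, matching the old literals the diff removed), and the parameter types are guesses.

    /* Sketch reconstructed from the removed lines above; constants and types assumed. */
    #define bbiMaxZoomLevels 10   /* assumed value of the library constant */
    #define bbiResIncrement 4     /* assumed value of the library constant */

    int bbiCalcResScalesAndSizes(int aveSize,
        int resScales[bbiMaxZoomLevels], int resSizes[bbiMaxZoomLevels])
    /* Fill in resScales[] with the zoom amount for each level and zero out resSizes[].
     * Return the number of levels usable without integer overflow. */
    {
    int resTryCount = bbiMaxZoomLevels, resTry;
    int minZoom = 10;
    int res = aveSize;
    if (res < minZoom)
        res = minZoom;
    for (resTry = 0; resTry < resTryCount; ++resTry)
        {
        resSizes[resTry] = 0;
        resScales[resTry] = res;
        /* If aveSize is large the initial res is large, and we cannot do all
         * levels without overflowing res and related integer variables. */
        if (res > 1000000000)
            {
            resTryCount = resTry + 1;
            break;
            }
        res *= bbiResIncrement;
        }
    return resTryCount;
    }

Because res starts at no less than minZoom, resScales[0] is always filled in and the return value is always at least 1. That invariant is what lets the caller assert(resTryCount > 0) and fall back to resScales[0]/resSizes[0] when no reduction level fits under half the primary data size, so at least one zoom level is always written.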