e947f44bed3e61a500f59423d3c560eff9bcf270 galt Fri Dec 13 15:03:41 2013 -0800
Merging in bigWigCat shared branch with --squash. This is Daniel Zerbino's work.

diff --git src/lib/bwgCreate.c src/lib/bwgCreate.c
index af76cd7..7cf8e3f 100644
--- src/lib/bwgCreate.c
+++ src/lib/bwgCreate.c
@@ -614,30 +614,102 @@
 int i;
 for (i = 0, uniq = uniqList; i < chromCount; ++i, uniq = uniq->next)
     {
     chromArray[i].name = uniq->val;
     chromArray[i].id = i;
     chromArray[i].size = hashIntVal(chromSizeHash, uniq->val);
     }

 /* Clean up, set return values and go home. */
 slFreeList(&uniqList);
 *retChromCount = chromCount;
 *retChromArray = chromArray;
 *retMaxChromNameSize = maxChromNameSize;
 }

+static int bwgStrcmp (const void * A, const void * B) {
+    char * stringA = *((char **) A);
+    char * stringB = *((char **) B);
+    return strcmp(stringA, stringB);
+}
+
+void bwgMakeAllChromInfo(struct bwgSection *sectionList, struct hash *chromSizeHash,
+    int *retChromCount, struct bbiChromInfo **retChromArray,
+    int *retMaxChromNameSize)
+/* Fill in chromId field in sectionList.  Return array of chromosome name/ids.
+ * The chromSizeHash is keyed by name, and has int values. */
+{
+/* Build up list of unique chromosome names. */
+int maxChromNameSize = 0;
+
+/* Get list of values */
+int chromCount = chromSizeHash->elCount;
+char ** chromName, ** chromNames;
+AllocArray(chromNames, chromCount);
+chromName = chromNames;
+struct hashEl* el;
+struct hashCookie cookie = hashFirst(chromSizeHash);
+for (el = hashNext(&cookie); el; el = hashNext(&cookie)) {
+    *chromName = el->name;
+    if (strlen(el->name) > maxChromNameSize)
+        maxChromNameSize = strlen(el->name);
+    chromName++;
+}
+qsort(chromNames, chromCount, sizeof(char *), bwgStrcmp);
+
+/* Allocate and fill in results array. */
+struct bbiChromInfo *chromArray;
+AllocArray(chromArray, chromCount);
+int i;
+for (i = 0; i < chromCount; ++i)
+    {
+    chromArray[i].name = chromNames[i];
+    chromArray[i].id = i;
+    chromArray[i].size = hashIntVal(chromSizeHash, chromNames[i]);
+    }
+
+// Assign IDs to sections:
+struct bwgSection *section;
+char *name = "";
+bits32 chromId = 0;
+for (section = sectionList; section != NULL; section = section->next)
+    {
+    if (!sameString(section->chrom, name))
+        {
+        for (i = 0; i < chromCount; ++i)
+            {
+            if (sameString(section->chrom, chromArray[i].name))
+                {
+                section->chromId = i;
+                break;
+                }
+            }
+        if (i == chromCount)
+            errAbort("Could not find %s in list of chromosomes\n", section->chrom);
+        chromId = section->chromId;
+        name = section->chrom;
+        }
+    else
+        section->chromId = chromId;
+    }
+
+/* Clean up, set return values and go home. */
+*retChromCount = chromCount;
+*retChromArray = chromArray;
+*retMaxChromNameSize = maxChromNameSize;
+}
+
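A quick aside on the bwgStrcmp/qsort pairing above: it is the standard C idiom for ordering an array of strings, and the sorted position then doubles as the chromosome id. A minimal self-contained sketch of the same pattern, with made-up chromosome names (cmpStrings plays the role of bwgStrcmp):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* qsort comparator over an array of char*: each element is a pointer to a
 * string, so the void* arguments point at char* values. */
static int cmpStrings(const void *a, const void *b)
{
return strcmp(*(char * const *)a, *(char * const *)b);
}

int main(void)
{
/* Hypothetical chromosome names, deliberately unsorted. */
char *chromNames[] = {"chr10", "chr2", "chr1", "chrM"};
int chromCount = sizeof(chromNames) / sizeof(chromNames[0]);
qsort(chromNames, chromCount, sizeof(char *), cmpStrings);
for (int i = 0; i < chromCount; ++i)
    printf("%d\t%s\n", i, chromNames[i]);   /* id follows sorted position */
return 0;
}

The result is plain strcmp order, so "chr10" sorts before "chr2"; that lexicographic order is what the chromosome b-tree keys expect, not a natural numeric order.
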
 int bwgAverageResolution(struct bwgSection *sectionList)
 /* Return the average resolution seen in sectionList. */
 {
 if (sectionList == NULL)
     return 1;
 bits64 totalRes = 0;
 bits32 sectionCount = 0;
 struct bwgSection *section;
 int i;
 for (section = sectionList; section != NULL; section = section->next)
     {
     int sectionRes = 0;
     switch (section->type)
         {
         case bwgTypeBedGraph:
@@ -784,119 +856,152 @@
         case bwgTypeVariableStep:
             bwgReduceVariableStep(section, chromSize, reduction, &outList);
             break;
         case bwgTypeFixedStep:
             bwgReduceFixedStep(section, chromSize, reduction, &outList);
             break;
         default:
             internalErr();
             return 0;
         }
     }
 slReverse(&outList);
 return outList;
 }

-void bwgCreate(struct bwgSection *sectionList, struct hash *chromSizeHash,
-    int blockSize, int itemsPerSlot, boolean doCompress, char *fileName)
-/* Create a bigWig file out of a sorted sectionList. */
-{
-bits64 sectionCount = slCount(sectionList);
-FILE *f = mustOpen(fileName, "wb");
-bits32 sig = bigWigSig;
-bits16 version = bbiCurrentVersion;
-bits16 summaryCount = 0;
-bits16 reserved16 = 0;
-bits32 reserved32 = 0;
-bits64 reserved64 = 0;
-bits64 dataOffset = 0, dataOffsetPos;
-bits64 indexOffset = 0, indexOffsetPos;
-bits64 chromTreeOffset = 0, chromTreeOffsetPos;
-bits64 totalSummaryOffset = 0, totalSummaryOffsetPos;
-bits32 uncompressBufSize = 0;
-bits64 uncompressBufSizePos;
-struct bbiSummary *reduceSummaries[10];
-bits32 reductionAmounts[10];
-bits64 reductionDataOffsetPos[10];
-bits64 reductionDataOffsets[10];
-bits64 reductionIndexOffsets[10];
-int i;
-
-/* Figure out chromosome ID's. */
-struct bbiChromInfo *chromInfoArray;
-int chromCount, maxChromNameSize;
-bwgMakeChromInfo(sectionList, chromSizeHash, &chromCount, &chromInfoArray, &maxChromNameSize);
-
+static void bwgComputeDynamicSummaries(struct bwgSection *sectionList, struct bbiSummary ** reduceSummaries, bits16 * summaryCount, struct bbiChromInfo *chromInfoArray, int chromCount, bits32 * reductionAmounts, boolean doCompress) {
 /* Figure out initial summary level - starting with a summary 10 times the amount
  * of the smallest item.  See if summarized data is smaller than half input data, if
  * not bump up reduction by a factor of 2 until it is, or until further summarizing
  * yields no size reduction. */
+int i;
 int minRes = bwgAverageResolution(sectionList);
 int initialReduction = minRes*10;
 bits64 fullSize = bwgTotalSectionSize(sectionList);
-bits64 maxReducedSize = fullSize/2;
-struct bbiSummary *firstSummaryList = NULL, *summaryList = NULL;
 bits64 lastSummarySize = 0, summarySize;
+bits64 maxReducedSize = fullSize/2;
+struct bbiSummary *summaryList = NULL;
 for (;;)
     {
     summaryList = bwgReduceSectionList(sectionList, chromInfoArray, initialReduction);
     bits64 summarySize = bbiTotalSummarySize(summaryList);
     if (doCompress)
         {
         summarySize *= 2;   // Compensate for summary not compressing as well as primary data
         }
     if (summarySize >= maxReducedSize && summarySize != lastSummarySize)
         {
         /* Need to do more reduction.  First scale reduction by amount that it missed
          * being small enough last time, with an extra 10% for good measure.  Then
          * just to keep from spinning through loop too many times, make sure this is
          * at least 2x the previous reduction. */
         int nextReduction = 1.1 * initialReduction * summarySize / maxReducedSize;
         if (nextReduction < initialReduction*2)
             nextReduction = initialReduction*2;
         initialReduction = nextReduction;
         bbiSummaryFreeList(&summaryList);
         lastSummarySize = summarySize;
         }
     else
         break;
     }
-summaryCount = 1;
-reduceSummaries[0] = firstSummaryList = summaryList;
+*summaryCount = 1;
+reduceSummaries[0] = summaryList;
 reductionAmounts[0] = initialReduction;

 /* Now calculate up to 10 levels of further summary. */
 bits64 reduction = initialReduction;
-for (i=0; i<ArraySize(reduceSummaries)-1; i++)
+for (i=0; i<9; i++)
     {
     reduction *= 4;
     if (reduction > 1000000000)
         break;
-    summaryList = bbiReduceSummaryList(reduceSummaries[summaryCount-1], chromInfoArray,
+    summaryList = bbiReduceSummaryList(reduceSummaries[*summaryCount-1], chromInfoArray,
         reduction);
     summarySize = bbiTotalSummarySize(summaryList);
     if (summarySize != lastSummarySize)
         {
-        reduceSummaries[summaryCount] = summaryList;
-        reductionAmounts[summaryCount] = reduction;
-        ++summaryCount;
+        reduceSummaries[*summaryCount] = summaryList;
+        reductionAmounts[*summaryCount] = reduction;
+        ++(*summaryCount);
         }
     int summaryItemCount = slCount(summaryList);
     if (summaryItemCount <= chromCount)
         break;
     }
+}
+
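The sizing loop at the top of bwgComputeDynamicSummaries is easier to follow outside the diff. Here is a minimal standalone sketch of the same heuristic with made-up numbers; summarySizeAt() is a stand-in for the real bwgReduceSectionList + bbiTotalSummarySize pass, and the doCompress adjustment is omitted:

#include <stdio.h>

typedef unsigned long long bits64;

/* Stand-in for summarizing at a given reduction and measuring the result.
 * The shrink model is invented purely so the loop has something to chew on. */
static bits64 summarySizeAt(bits64 fullSize, int reduction, int minRes)
{
return fullSize * 10 / (reduction / minRes + 10);
}

int main(void)
{
int minRes = 50;                      /* hypothetical average item resolution */
bits64 fullSize = 10000000;           /* hypothetical size of the primary data */
int initialReduction = minRes * 10;   /* start at 10x the smallest item */
bits64 maxReducedSize = fullSize / 2; /* want the first zoom under half the input */
bits64 lastSummarySize = 0;
for (;;)
    {
    bits64 summarySize = summarySizeAt(fullSize, initialReduction, minRes);
    if (summarySize >= maxReducedSize && summarySize != lastSummarySize)
        {
        /* Scale by how far we missed, plus 10%, but always at least double. */
        int nextReduction = 1.1 * initialReduction * summarySize / maxReducedSize;
        if (nextReduction < initialReduction * 2)
            nextReduction = initialReduction * 2;
        initialReduction = nextReduction;
        lastSummarySize = summarySize;
        }
    else
        break;
    }
printf("first zoom reduction: %d\n", initialReduction);
return 0;
}
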
+static void bwgComputeFixedSummaries(struct bwgSection * sectionList, struct bbiSummary ** reduceSummaries, bits16 * summaryCount, struct bbiChromInfo *chromInfoArray, bits32 * reductionAmounts) {
+// Hack: pre-defining summary levels, based off Ensembl default zoom levels
+// The last two values of this array were extrapolated following Jim's formula
+int i;
+#define REDUCTION_COUNT 10
+bits32 presetReductions[REDUCTION_COUNT] = {30, 65, 130, 260, 450, 648, 950, 1296, 4800, 19200};
+
+bits64 reduction = reductionAmounts[0] = presetReductions[0];
+reduceSummaries[0] = bwgReduceSectionList(sectionList, chromInfoArray, presetReductions[0]);
+
+for (i=1; i<REDUCTION_COUNT; i++)
+    {
+    reduction = reductionAmounts[i] = presetReductions[i];
+    reduceSummaries[i] = bbiReduceSummaryList(reduceSummaries[i-1], chromInfoArray,
+        reduction);
+    }
+
+*summaryCount = REDUCTION_COUNT;
+}
+
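The two helpers above produce different zoom ladders: bwgComputeDynamicSummaries grows levels by a factor of 4 from a data-dependent starting point (stopping past 1,000,000,000 or once a level has no more items than chromosomes), while bwgComputeFixedSummaries always uses the preset list. A small self-contained comparison, with a hypothetical initial reduction of 500 standing in for whatever the dynamic path would choose:

#include <stdio.h>

int main(void)
{
/* Preset levels copied from bwgComputeFixedSummaries above. */
int preset[10] = {30, 65, 130, 260, 450, 648, 950, 1296, 4800, 19200};

/* Dynamic ladder: multiply by 4 per level, capped at 1,000,000,000. */
long long dynamic[10];
int dynamicCount = 1;
dynamic[0] = 500;    /* pretend the dynamic sizing loop settled on 500 */
while (dynamicCount < 10)
    {
    long long next = dynamic[dynamicCount-1] * 4;
    if (next > 1000000000)
        break;
    dynamic[dynamicCount++] = next;
    }

printf("level\tfixed\tdynamic\n");
for (int i = 0; i < 10; ++i)
    {
    printf("%d\t%d\t", i, preset[i]);
    if (i < dynamicCount)
        printf("%lld\n", dynamic[i]);
    else
        printf("-\n");
    }
return 0;
}
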
+void bwgCreate(struct bwgSection *sectionList, struct hash *chromSizeHash,
+    int blockSize, int itemsPerSlot, boolean doCompress, boolean keepAllChromosomes,
+    boolean fixedSummaries, char *fileName)
+/* Create a bigWig file out of a sorted sectionList. */
+{
+bits64 sectionCount = slCount(sectionList);
+FILE *f = mustOpen(fileName, "wb");
+bits32 sig = bigWigSig;
+bits16 version = bbiCurrentVersion;
+bits16 summaryCount = 0;
+bits16 reserved16 = 0;
+bits32 reserved32 = 0;
+bits64 reserved64 = 0;
+bits64 dataOffset = 0, dataOffsetPos;
+bits64 indexOffset = 0, indexOffsetPos;
+bits64 chromTreeOffset = 0, chromTreeOffsetPos;
+bits64 totalSummaryOffset = 0, totalSummaryOffsetPos;
+bits32 uncompressBufSize = 0;
+bits64 uncompressBufSizePos;
+struct bbiSummary *reduceSummaries[10];
+bits32 reductionAmounts[10];
+bits64 reductionDataOffsetPos[10];
+bits64 reductionDataOffsets[10];
+bits64 reductionIndexOffsets[10];
+int i;
+
+/* Figure out chromosome ID's. */
+struct bbiChromInfo *chromInfoArray;
+int chromCount, maxChromNameSize;
+if (keepAllChromosomes)
+    bwgMakeAllChromInfo(sectionList, chromSizeHash, &chromCount, &chromInfoArray, &maxChromNameSize);
+else
+    bwgMakeChromInfo(sectionList, chromSizeHash, &chromCount, &chromInfoArray, &maxChromNameSize);
+
+if (fixedSummaries)
+    bwgComputeFixedSummaries(sectionList, reduceSummaries, &summaryCount, chromInfoArray, reductionAmounts);
+else
+    bwgComputeDynamicSummaries(sectionList, reduceSummaries, &summaryCount, chromInfoArray, chromCount, reductionAmounts, doCompress);
+
 /* Write fixed header. */
 writeOne(f, sig);
 writeOne(f, version);
 writeOne(f, summaryCount);
 chromTreeOffsetPos = ftell(f);
 writeOne(f, chromTreeOffset);
 dataOffsetPos = ftell(f);
 writeOne(f, dataOffset);
 indexOffsetPos = ftell(f);
 writeOne(f, indexOffset);
 writeOne(f, reserved16);  /* fieldCount */
 writeOne(f, reserved16);  /* definedFieldCount */
 writeOne(f, reserved64);  /* autoSqlOffset. */
 totalSummaryOffsetPos = ftell(f);
 writeOne(f, totalSummaryOffset);
@@ -950,31 +1055,31 @@
 cirTreeFileBulkIndexToOpenFile(sectionArray, sizeof(sectionArray[0]), sectionCount,
     blockSize, 1, NULL, bwgSectionFetchKey, bwgSectionFetchOffset,
     indexOffset, f);
 freez(&sectionArray);

 /* Write out summary sections. */
 verbose(2, "bwgCreate writing %d summaries\n", summaryCount);
 for (i=0; i<summaryCount; ++i)
     {
     reductionDataOffsets[i] = ftell(f);
     reductionIndexOffsets[i] = bbiWriteSummaryAndIndex(reduceSummaries[i], blockSize, itemsPerSlot, doCompress, f);
     verbose(3, "wrote %d of data, %d of index on level %d\n",
         (int)(reductionIndexOffsets[i] - reductionDataOffsets[i]),
         (int)(ftell(f) - reductionIndexOffsets[i]), i);
     }

 /* Calculate summary */
-struct bbiSummary *sum = firstSummaryList;
+struct bbiSummary *sum = reduceSummaries[0];
 if (sum != NULL)
     {
     totalSum.validCount = sum->validCount;
     totalSum.minVal = sum->minVal;
     totalSum.maxVal = sum->maxVal;
     totalSum.sumData = sum->sumData;
     totalSum.sumSquares = sum->sumSquares;
     for (sum = sum->next; sum != NULL; sum = sum->next)
         {
         totalSum.validCount += sum->validCount;
         if (sum->minVal < totalSum.minVal) totalSum.minVal = sum->minVal;
         if (sum->maxVal > totalSum.maxVal) totalSum.maxVal = sum->maxVal;
         totalSum.sumData += sum->sumData;
         totalSum.sumSquares += sum->sumSquares;
         }
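The fixed-header block above leans on a pattern that recurs throughout this writer: write a zero placeholder for an offset that is not known yet, remember its position with ftell(), and fseek() back to patch it once the real value exists (the later patching step is outside these hunks, but it is what the *Pos variables are for). A minimal sketch of the pattern in isolation; the file name and field are made up:

#include <stdio.h>

int main(void)
{
FILE *f = fopen("demo.bin", "w+b");
if (f == NULL)
    {
    perror("demo.bin");
    return 1;
    }

/* Write a zero placeholder for an offset we do not know yet,
 * remembering where the placeholder lives in the file. */
unsigned long long dataOffset = 0;
long dataOffsetPos = ftell(f);
fwrite(&dataOffset, sizeof(dataOffset), 1, f);

/* ... write the rest of the header, then the data section ... */
long dataStart = ftell(f);
const char payload[] = "payload";
fwrite(payload, sizeof(payload), 1, f);

/* Seek back and patch the placeholder with the real offset. */
dataOffset = (unsigned long long)dataStart;
fseek(f, dataOffsetPos, SEEK_SET);
fwrite(&dataOffset, sizeof(dataOffset), 1, f);

fclose(f);
printf("data section starts at offset %llu\n", dataOffset);
return 0;
}
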
@@ -1080,34 +1185,36 @@
                 }
             }
         }
     }
 return sectionList;
 }

 void bigWigFileCreate(
     char *inName,           /* Input file in ascii wiggle format. */
     char *chromSizes,       /* Two column tab-separated file: <chromosome> <size>. */
     int blockSize,          /* Number of items to bundle in r-tree.  1024 is good. */
     int itemsPerSlot,       /* Number of items in lowest level of tree.  512 is good. */
     boolean clipDontDie,    /* If TRUE then clip items off end of chrom rather than dying. */
     boolean compress,       /* If TRUE then compress data. */
+    boolean keepAllChromosomes,  /* If TRUE then store all chromosomes in chromosomal b-tree. */
+    boolean fixedSummaries,      /* If TRUE then impose fixed summary levels. */
     char *outName)
 /* Convert ascii format wig file (in fixedStep, variableStep or bedGraph format)
  * to binary big wig format. */
 {
 /* This code needs to agree with code in two other places currently - bigBedFileCreate,
  * and bbiFileOpen.  I'm thinking of refactoring to share at least between
  * bigBedFileCreate and bigWigFileCreate.  It'd be great if it could be structured
  * so that it could send the input in one chromosome at a time, and send in the zoom
  * stuff only after all the chromosomes are done.  This'd potentially reduce the memory
  * footprint by a factor of 2 or 4.  Still, for now it works. -JK */
 struct hash *chromSizeHash = bbiChromSizesFromFile(chromSizes);
 struct lm *lm = lmInit(0);
 struct bwgSection *sectionList = bwgParseWig(inName, clipDontDie, chromSizeHash, itemsPerSlot, lm);
 if (sectionList == NULL)
     errAbort("%s is empty of data", inName);
-bwgCreate(sectionList, chromSizeHash, blockSize, itemsPerSlot, compress, outName);
+bwgCreate(sectionList, chromSizeHash, blockSize, itemsPerSlot, compress, keepAllChromosomes, fixedSummaries, outName);
 lmCleanup(&lm);
 }
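For anyone wiring up the new parameters, here is a hypothetical call site exercising the two flags this commit adds. The signature is the one declared above; the file names, tuning values, and the helper function itself are placeholders, and it assumes bigWigFileCreate's prototype is visible through the usual kent includes (common.h and bigWig.h here):

#include "common.h"
#include "bigWig.h"

/* Hypothetical driver: build out.bw from in.wig and chrom.sizes, keeping every
 * chromosome from chrom.sizes in the b-tree and using the preset summary levels,
 * the combination a bigWigCat-friendly input would likely want. */
void makeCatFriendlyBigWig()
{
bigWigFileCreate("in.wig", "chrom.sizes",
    1024,     /* blockSize: items to bundle per r-tree node */
    512,      /* itemsPerSlot: items in lowest level of tree */
    TRUE,     /* clipDontDie */
    TRUE,     /* compress */
    TRUE,     /* keepAllChromosomes: store all chromosomes in the b-tree */
    TRUE,     /* fixedSummaries: impose the fixed summary levels */
    "out.bw");
}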