12d0f12bd304787c52cab0780e367d36b020f84e kent Tue Feb 26 12:11:18 2013 -0800
Adding name index to bigBed files. The write side I _think_ is working. Still developing read side.

diff --git src/utils/bedToBigBed/bedToBigBed.c src/utils/bedToBigBed/bedToBigBed.c
index 54bbfa9..06e66b5 100644
--- src/utils/bedToBigBed/bedToBigBed.c
+++ src/utils/bedToBigBed/bedToBigBed.c
@@ -1,733 +1,816 @@
/* bedToBigBed - Convert bed to bigBed. */
#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "options.h"
#include "dystring.h"
#include "obscure.h"
#include "asParse.h"
#include "basicBed.h"
#include "sig.h"
#include "rangeTree.h"
#include "zlibFace.h"
#include "sqlNum.h"
+#include "bPlusTree.h"
#include "bigBed.h"

char *version = "2.3";

int blockSize = 256;
int itemsPerSlot = 512;
+boolean doNameIndex = FALSE;
int bedN = 0;	/* number of standard bed fields */
int bedP = 0;	/* number of bed plus fields */
char *as = NULL;
static boolean doCompress = FALSE;
static boolean tabSep = FALSE;

void usage()
/* Explain usage and exit. */
{
errAbort(
  "bedToBigBed v. %s - Convert bed file to bigBed. (BigBed version: %d)\n"
  "usage:\n"
  "   bedToBigBed in.bed chrom.sizes out.bb\n"
  "Where in.bed is in one of the ASCII bed formats, but not including track lines,\n"
  "and chrom.sizes is two column: <chromosome name> <size in bases>\n"
  "and out.bb is the output indexed big bed file.\n"
  "Use the script fetchChromSizes to obtain the actual chrom.sizes information\n"
  "from UCSC; please do not make up chrom sizes from your own information.\n"
  "The in.bed file must be sorted by chromosome,start.\n"
  "To sort a bed file, use the unix sort command:\n"
  "   sort -k1,1 -k2,2n unsorted.bed > sorted.bed\n"
  "\n"
  "options:\n"
  "   -type=bedN[+[P]] :\n"
  "                      N is between 3 and 15,\n"
  "                      optional (+) if extra \"bedPlus\" fields,\n"
  "                      optional P specifies the number of extra fields. Not required, but preferred.\n"
  "                      Examples: -type=bed6 or -type=bed6+ or -type=bed6+3\n"
  "                      (see http://genome.ucsc.edu/FAQ/FAQformat.html#format1)\n"
  "   -as=fields.as - If you have non-standard \"bedPlus\" fields, it's great to put a definition\n"
  "                   of each field in a row in AutoSql format here.\n"
  "   -blockSize=N - Number of items to bundle in r-tree.  Default %d\n"
  "   -itemsPerSlot=N - Number of data points bundled at lowest level. Default %d\n"
  "   -unc - If set, do not use compression.\n"
  "   -tab - If set, expect fields to be tab separated, normally\n"
  "          expects white space separator.\n"
+ "   -nameIndex - If set, make an index of the name field\n"
  , version, bbiCurrentVersion, blockSize, itemsPerSlot
  );
}

static struct optionSpec options[] = {
   {"blockSize", OPTION_INT},
   {"itemsPerSlot", OPTION_INT},
   {"type", OPTION_STRING},
   {"as", OPTION_STRING},
   {"unc", OPTION_BOOLEAN},
   {"tab", OPTION_BOOLEAN},
+  {"nameIndex", OPTION_BOOLEAN},
   {NULL, 0},
};

-void writeBlocks(struct bbiChromUsage *usageList, struct lineFile *lf, struct asObject *as,
+struct bbNamedFileChunk
+/* A name associated with an offset into a possibly large file. */
+    {
+    char *name;
+    bits64 offset;
+    bits64 size;
+    };
+
+int bbNamedFileChunkCmpByName(const void *va, const void *vb)
+/* Compare two named offset objects to facilitate qsorting by name. */
+{
+const struct bbNamedFileChunk *a = va, *b = vb;
+return strcmp(a->name, b->name);
+}
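/* Since bbNamedFileChunkCmpByName is plain strcmp on the name field, a single
 * qsort over the chunk array puts items into the lexical order a B+ tree needs
 * before bulk indexing.  A minimal stand-alone sketch of that ordering (plain C
 * library only; uint64_t stands in for kent's bits64 typedef): */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

struct namedChunk { const char *name; uint64_t offset, size; };

static int cmpByName(const void *va, const void *vb)
/* Same shape as bbNamedFileChunkCmpByName: strcmp on the name field. */
{
const struct namedChunk *a = va, *b = vb;
return strcmp(a->name, b->name);
}

int main(void)
{
struct namedChunk chunks[] = {{"geneB", 1024, 300}, {"geneA", 0, 512}, {"geneC", 2048, 128}};
int i, n = sizeof(chunks)/sizeof(chunks[0]);
qsort(chunks, n, sizeof(chunks[0]), cmpByName);
for (i = 0; i < n; ++i)		/* prints geneA, geneB, geneC */
    printf("%s\t%llu\t%llu\n", chunks[i].name,
	(unsigned long long)chunks[i].offset, (unsigned long long)chunks[i].size);
return 0;
}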
+
+static int maxBedNameSize;
+
+void bbNamedFileChunkKey(const void *va, char *keyBuf)
+/* Copy name to keyBuf for bPlusTree maker */
+{
+const struct bbNamedFileChunk *item = va;
+strncpy(keyBuf, item->name, maxBedNameSize);
+}
+
+void *bbNamedFileChunkVal(const void *va)
+/* Return pointer to val for bPlusTree maker. */
+{
+const struct bbNamedFileChunk *item = va;
+return (void *)&item->offset;
+}
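/* The two callbacks above define how the B+ tree maker sees each item: the key
 * is a fixed-width buffer filled from the name (strncpy zero-pads names shorter
 * than the key width), and the value is the 16 bytes starting at the offset
 * field, which works because offset and size are adjacent bits64 members with
 * no padding between them.  A stand-alone sketch of that contract (the keySize
 * of 8 is a made-up example; the real code passes maxNameSize measured in
 * pass 1): */

#include <stdio.h>
#include <string.h>
#include <stdint.h>

enum {keySize = 8};	/* hypothetical fixed key width */

struct item { const char *name; uint64_t offset, size; };

static void itemKey(const void *va, char *keyBuf)
/* Fill the fixed-width key; strncpy zero-pads names shorter than keySize. */
{
const struct item *it = va;
strncpy(keyBuf, it->name, keySize);
}

static void *itemVal(const void *va)
/* Point at the 16 bytes of value, relying on offset and size being adjacent
 * 8-byte-aligned members. */
{
const struct item *it = va;
return (void *)&it->offset;
}

int main(void)
{
struct item it = {"chr1gene", 4096, 200};
char keyBuf[keySize+1];
memset(keyBuf, 0, sizeof keyBuf);
itemKey(&it, keyBuf);
uint64_t *val = itemVal(&it);
printf("key='%s' offset=%llu size=%llu\n", keyBuf,
    (unsigned long long)val[0], (unsigned long long)val[1]);
return 0;
}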
+
+static void writeBlocks(struct bbiChromUsage *usageList, struct lineFile *lf, struct asObject *as,
	int itemsPerSlot, struct bbiBoundsArray *bounds, int sectionCount, boolean doCompress,
	FILE *f, int resTryCount, int resScales[], int resSizes[],
+	struct bbNamedFileChunk *namedChunks, int bedCount,
	bits16 *retFieldCount, bits32 *retMaxBlockSize)
/* Read through lf, writing it to f.  Save starting points of blocks (every itemsPerSlot)
 * to boundsArray */
{
int maxBlockSize = 0;
struct bbiChromUsage *usage = usageList;
char *line, *row[256];	// limit of 256 columns is arbitrary, but useful to catch pathological input
int fieldCount = 0, lastField = 0;
int itemIx = 0, sectionIx = 0;
-bits64 blockOffset = 0;
+bits64 blockStartOffset = 0, blockEndOffset = 0;
int startPos = 0, endPos = 0;
bits32 chromId = 0;
boolean allocedAs = FALSE;
struct dyString *stream = dyStringNew(0);

/* Will keep track of some things that help us determine how much to reduce. */
bits32 resEnds[resTryCount];
int resTry;
for (resTry = 0; resTry < resTryCount; ++resTry)
    resEnds[resTry] = 0;
boolean atEnd = FALSE, sameChrom = FALSE;
bits32 start = 0, end = 0;
char *chrom = NULL;
struct bed *bed;
AllocVar(bed);

+/* Help keep track of which beds are in current chunk so as to write out
+ * namedChunks. */
+int sectionStartIx = 0, sectionEndIx = 0;
+
for (;;)
    {
    /* Get next line of input if any. */
    if (lineFileNextReal(lf, &line))
	{
	/* First time through figure out the field count and if not set, the bedN. */
	if (fieldCount == 0)
	    {
	    if (as == NULL)
		{
		if (tabSep)
		    fieldCount = chopByChar(line, '\t', NULL, 256);	// Do not use chopString, see GOTCHA
		else
		    fieldCount = chopByWhite(line, NULL, 0);
		if (bedN == 0)
		    bedN = fieldCount;
		if (bedN > 15)
		    {
		    bedN = 15;
		    bedP = fieldCount - bedN;
		    }
		char *asText = bedAsDef(bedN, fieldCount);
		as = asParseText(asText);
		allocedAs = TRUE;
		freeMem(asText);
		}
	    else
		{
		fieldCount = slCount(as->columnList);
		// if the .as is specified, the -type must be also so that the number of standard BED columns is known. bedN will be >0.
		asCompareObjAgainstStandardBed(as, bedN, TRUE);	// abort if bedN columns are not standard
		}
	    if (fieldCount > ArraySize(row))
		errAbort("Too many fields [%d], current maximum fields limit is %lu",
		    fieldCount, (unsigned long)ArraySize(row));
	    lastField = fieldCount - 1;
	    *retFieldCount = fieldCount;
	    if (bedP == -1)	// user did not specify how many plus columns there are.
		{
		bedP = fieldCount - bedN;
		if (bedP < 1)
		    errAbort("fieldCount input (%d) did not match the specification (%s)\n",
			fieldCount, optionVal("type", ""));
		}
	    if (fieldCount != bedN + bedP)
		errAbort("fieldCount input (%d) did not match the specification (%s)\n",
		    fieldCount, optionVal("type", ""));
+
+	    if (namedChunks != NULL && fieldCount < 4)
+		errAbort("No name field in bed, can't index.");
	    }

	/* Chop up line and make sure the word count is right. */
	int wordCount;
	if (tabSep)
	    wordCount = chopTabs(line, row);
	else
	    wordCount = chopLine(line, row);
	lineFileExpectWords(lf, fieldCount, wordCount);

	loadAndValidateBed(row, bedN, fieldCount, lf, bed, as, FALSE);

	chrom = bed->chrom;
	start = bed->chromStart;
	end = bed->chromEnd;
	sameChrom = sameString(chrom, usage->name);
	}
    else	/* No next line */
	{
	atEnd = TRUE;
	}

    /* Check conditions that would end block and save block info and advance to next if need be. */
    if (atEnd || !sameChrom || itemIx >= itemsPerSlot)
	{
	/* Save stream to file, compressing if need be. */
	if (stream->stringSize > maxBlockSize)
	    maxBlockSize = stream->stringSize;
	if (doCompress)
	    {
	    size_t maxCompSize = zCompBufSize(stream->stringSize);

	    // keep around an area of scratch memory
	    static int compBufSize = 0;
	    static char *compBuf = NULL;
	    // check to see if buffer needed for compression is big enough
	    if (compBufSize < maxCompSize)
		{
		// free up the old not-big-enough piece
		freez(&compBuf);	// freez knows about NULL
		// get new scratch area
		compBufSize = maxCompSize;
		compBuf = needLargeMem(compBufSize);
		}
	    int compSize = zCompress(stream->string, stream->stringSize, compBuf, maxCompSize);
	    mustWrite(f, compBuf, compSize);
	    }
	else
	    mustWrite(f, stream->string, stream->stringSize);
	dyStringClear(stream);

+	/* Save block offset and size for all named chunks in this section. */
+	if (namedChunks != NULL)
+	    {
+	    blockEndOffset = ftell(f);
+	    int i;
+	    for (i=sectionStartIx; i<sectionEndIx; ++i)
+		{
+		struct bbNamedFileChunk *chunk = namedChunks + i;
+		chunk->offset = blockStartOffset;
+		chunk->size = blockEndOffset - blockStartOffset;
+		}
+	    sectionStartIx = sectionEndIx;
+	    }
+
	/* Save info on existing block. */
	struct bbiBoundsArray *b = &bounds[sectionIx];
-	b->offset = blockOffset;
+	b->offset = blockStartOffset;
	b->range.chromIx = chromId;
	b->range.start = startPos;
	b->range.end = endPos;
	++sectionIx;
	itemIx = 0;

	if (atEnd)
	    break;
	}

    /* Advance to next chromosome if need be and get chromosome id. */
    if (!sameChrom)
	{
	usage = usage->next;
	assert(usage != NULL);
	assert(sameString(chrom, usage->name));
	for (resTry = 0; resTry < resTryCount; ++resTry)
	    resEnds[resTry] = 0;
	}
    chromId = usage->id;

    /* At start of block we save a lot of info. */
    if (itemIx == 0)
	{
-	blockOffset = ftell(f);
+	blockStartOffset = ftell(f);
	startPos = start;
	endPos = end;
	}
    /* Otherwise just update end. */
	{
	if (endPos < end)
	    endPos = end;
	/* No need to update startPos since list is sorted. */
	}

+    /* Save name into namedChunks if need be. */
+    if (namedChunks != NULL)
+	{
+	namedChunks[sectionEndIx].name = cloneString(bed->name);
+	sectionEndIx += 1;
+	}
+
    /* Write out data. */
    dyStringWriteOne(stream, chromId);
    dyStringWriteOne(stream, start);
    dyStringWriteOne(stream, end);
    if (fieldCount > 3)
	{
	int i;
	/* Write 3rd through next to last field and a tab separator. */
	for (i=3; i<lastField; ++i)
	    {
	    char *s = row[i];
	    dyStringAppend(stream, s);
	    dyStringAppendC(stream, '\t');
	    }
	/* Write last field and terminal zero */
	char *s = row[lastField];
	dyStringAppend(stream, s);
	}
    dyStringAppendC(stream, 0);
    itemIx += 1;

    /* Do zoom counting. */
    for (resTry = 0; resTry < resTryCount; ++resTry)
	{
	bits32 resEnd = resEnds[resTry];
	if (start >= resEnd)
	    {
	    resSizes[resTry] += 1;
	    resEnds[resTry] = resEnd = start + resScales[resTry];
	    }
	while (end > resEnd)
	    {
	    resSizes[resTry] += 1;
	    resEnds[resTry] = resEnd = resEnd + resScales[resTry];
	    }
	}
    }
assert(sectionIx == sectionCount);
freez(&bed);
if (allocedAs)
    asObjectFreeList(&as);
*retMaxBlockSize = maxBlockSize;
}
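/* The sectionStartIx/sectionEndIx bookkeeping above means a named chunk does
 * not point at an individual bed but at the (possibly compressed) block that
 * holds it: every name collected while a block fills gets stamped with the same
 * block start offset and size once the block is flushed, presumably so the
 * still-in-development read side can decompress one block and scan it for the
 * exact item.  A stand-alone sketch of that stamping with made-up file offsets
 * and block sizes: */

#include <stdio.h>
#include <stdint.h>

struct chunk { const char *name; uint64_t offset, size; };

int main(void)
{
struct chunk chunks[5] = {{"a"}, {"b"}, {"c"}, {"d"}, {"e"}};
int itemsPerSlot = 2, sectionStartIx, i;
uint64_t blockStart = 100;	/* pretend ftell() after header */
for (sectionStartIx = 0; sectionStartIx < 5; sectionStartIx += itemsPerSlot)
    {
    int sectionEndIx = sectionStartIx + itemsPerSlot;
    if (sectionEndIx > 5)
	sectionEndIx = 5;
    uint64_t blockEnd = blockStart + 37;	/* pretend compressed block size */
    for (i = sectionStartIx; i < sectionEndIx; ++i)
	{
	chunks[i].offset = blockStart;
	chunks[i].size = blockEnd - blockStart;
	}
    blockStart = blockEnd;
    }
for (i = 0; i < 5; ++i)		/* a,b share a block; c,d share; e is alone */
    printf("%s\t%llu\t%llu\n", chunks[i].name,
	(unsigned long long)chunks[i].offset, (unsigned long long)chunks[i].size);
return 0;
}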

struct rbTree *rangeTreeForBedChrom(struct lineFile *lf, char *chrom)
/* Read lines from bed file as long as they match chrom.  Return a rangeTree that
 * corresponds to the coverage. */
{
struct rbTree *tree = rangeTreeNew();
char *line;
while (lineFileNextReal(lf, &line))
    {
    if (!startsWithWord(chrom, line))
	{
	lineFileReuse(lf);
	break;
	}
    char *row[3];
    chopLine(line, row);
    unsigned start = sqlUnsigned(row[1]);
    unsigned end = sqlUnsigned(row[2]);
    rangeTreeAddToCoverageDepth(tree, start, end);
    }
return tree;
}
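/* rangeTreeAddToCoverageDepth builds disjoint ranges labeled with how many beds
 * cover them, which is what the first zoom level summarizes.  The red-black
 * tree itself is a kent library detail; the effect can be sketched stand-alone
 * with a simple sweep over interval start/end events: */

#include <stdio.h>
#include <stdlib.h>

struct event { int pos; int delta; };

static int cmpEvent(const void *va, const void *vb)
/* Sort events by position. */
{
const struct event *a = va, *b = vb;
return a->pos - b->pos;
}

int main(void)
{
/* Three beds on one chrom: [10,50) [30,70) [60,90) */
struct event ev[] = {{10,1},{50,-1},{30,1},{70,-1},{60,1},{90,-1}};
int n = sizeof(ev)/sizeof(ev[0]);
qsort(ev, n, sizeof(ev[0]), cmpEvent);
int depth = 0, lastPos = 0, i;
for (i = 0; i < n; ++i)		/* prints depths 1,2,1,2,1 over the five ranges */
    {
    if (depth > 0 && ev[i].pos > lastPos)
	printf("%d\t%d\tdepth %d\n", lastPos, ev[i].pos, depth);
    depth += ev[i].delta;
    lastPos = ev[i].pos;
    }
return 0;
}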

static struct bbiSummary *writeReducedOnceReturnReducedTwice(struct bbiChromUsage *usageList,
	int fieldCount, struct lineFile *lf, bits32 initialReduction, bits32 initialReductionCount,
	int zoomIncrement, int blockSize, int itemsPerSlot, boolean doCompress,
	struct lm *lm, FILE *f, bits64 *retDataStart, bits64 *retIndexStart,
	struct bbiSummaryElement *totalSum)
/* Write out data reduced by factor of initialReduction.  Also calculate and keep in memory
 * next reduction level.  This is more work than some approaches, but it keeps us from having
 * to keep the first reduction entirely in memory. */
{
struct bbiSummary *twiceReducedList = NULL;
bits32 doubleReductionSize = initialReduction * zoomIncrement;
struct bbiChromUsage *usage = usageList;
struct bbiBoundsArray *boundsArray, *boundsPt, *boundsEnd;
boundsPt = AllocArray(boundsArray, initialReductionCount);
boundsEnd = boundsPt + initialReductionCount;

*retDataStart = ftell(f);
writeOne(f, initialReductionCount);

/* This gets a little complicated I'm afraid.  The strategy is to:
 *   1) Build up a range tree that represents coverage depth on that chromosome
 *      This also has the nice side effect of getting rid of overlaps.
 *   2) Stream through the range tree, outputting the initial summary level and
 *      further reducing. */
boolean firstTime = TRUE;
struct bbiSumOutStream *stream = bbiSumOutStreamOpen(itemsPerSlot, f, doCompress);
for (usage = usageList; usage != NULL; usage = usage->next)
    {
    struct bbiSummary oneSummary, *sum = NULL;
    struct rbTree *rangeTree = rangeTreeForBedChrom(lf, usage->name);
    struct range *range, *rangeList = rangeTreeList(rangeTree);
    for (range = rangeList; range != NULL; range = range->next)
	{
	/* Grab values we want from range. */
	double val = ptToInt(range->val);
	int start = range->start;
	int end = range->end;
	bits32 size = end - start;

	/* Add to total summary. */
	if (firstTime)
	    {
	    totalSum->validCount = size;
	    totalSum->minVal = totalSum->maxVal = val;
	    totalSum->sumData = val*size;
	    totalSum->sumSquares = val*val*size;
	    firstTime = FALSE;
	    }
	else
	    {
	    totalSum->validCount += size;
	    if (val < totalSum->minVal) totalSum->minVal = val;
	    if (val > totalSum->maxVal) totalSum->maxVal = val;
	    totalSum->sumData += val*size;
	    totalSum->sumSquares += val*val*size;
	    }

	/* If start past existing block then output it. */
	if (sum != NULL && sum->end <= start)
	    {
	    bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize,
		&boundsPt, boundsEnd, usage->size, lm, stream);
	    sum = NULL;
	    }

	/* If we don't have a summary we're working on now, make one. */
	if (sum == NULL)
	    {
	    oneSummary.chromId = usage->id;
	    oneSummary.start = start;
	    oneSummary.end = start + initialReduction;
	    if (oneSummary.end > usage->size) oneSummary.end = usage->size;
	    oneSummary.minVal = oneSummary.maxVal = val;
	    oneSummary.sumData = oneSummary.sumSquares = 0.0;
	    oneSummary.validCount = 0;
	    sum = &oneSummary;
	    }

	/* Deal with case where we might have to split an item between multiple summaries.  This
	 * loop handles all but the final affected summary in that case. */
	while (end > sum->end)
	    {
	    /* Fold in bits that overlap with existing summary and output. */
	    int overlap = rangeIntersection(start, end, sum->start, sum->end);
	    assert(overlap > 0);
	    verbose(3, "Splitting size %d at %d, overlap %d\n", end - start, sum->end, overlap);
	    sum->validCount += overlap;
	    if (sum->minVal > val) sum->minVal = val;
	    if (sum->maxVal < val) sum->maxVal = val;
	    sum->sumData += val * overlap;
	    sum->sumSquares += val*val * overlap;
	    bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize,
		&boundsPt, boundsEnd, usage->size, lm, stream);
	    size -= overlap;

	    /* Move summary to next part. */
	    sum->start = start = sum->end;
	    sum->end = start + initialReduction;
	    if (sum->end > usage->size) sum->end = usage->size;
	    sum->minVal = sum->maxVal = val;
	    sum->sumData = sum->sumSquares = 0.0;
	    sum->validCount = 0;
	    }

	/* Add to summary. */
	sum->validCount += size;
	if (sum->minVal > val) sum->minVal = val;
	if (sum->maxVal < val) sum->maxVal = val;
	sum->sumData += val * size;
	sum->sumSquares += val*val * size;
	}
    if (sum != NULL)
	{
	bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize,
	    &boundsPt, boundsEnd, usage->size, lm, stream);
	}
    rangeTreeFree(&rangeTree);
    }
bbiSumOutStreamClose(&stream);

/* Write out 1st zoom index. */
int indexOffset = *retIndexStart = ftell(f);
assert(boundsPt == boundsEnd);
cirTreeFileBulkIndexToOpenFile(boundsArray, sizeof(boundsArray[0]), initialReductionCount,
    blockSize, itemsPerSlot, NULL, bbiBoundsArrayFetchKey, bbiBoundsArrayFetchOffset,
    indexOffset, f);
freez(&boundsArray);

slReverse(&twiceReducedList);
return twiceReducedList;
}
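/* Each bbiSummary above carries validCount, min, max, sumData and sumSquares,
 * so any zoom level can later report a mean (sumData/validCount) and a spread
 * without revisiting primary data.  A stand-alone sketch folding a few
 * (size, value) coverage runs into one summary element (the spread here is
 * the population standard deviation; reader code may choose a different
 * formula): */

#include <math.h>
#include <stdio.h>

struct summary { long validCount; double minVal, maxVal, sumData, sumSquares; };

static void addRun(struct summary *sum, long size, double val)
/* Fold size bases of coverage depth val into the running summary. */
{
if (sum->validCount == 0)
    sum->minVal = sum->maxVal = val;
sum->validCount += size;
if (val < sum->minVal) sum->minVal = val;
if (val > sum->maxVal) sum->maxVal = val;
sum->sumData += val * size;
sum->sumSquares += val * val * size;
}

int main(void)
{
struct summary sum = {0};
addRun(&sum, 20, 1.0);	/* 20 bases at depth 1 */
addRun(&sum, 20, 2.0);	/* 20 bases at depth 2 */
addRun(&sum, 10, 1.0);	/* 10 bases at depth 1 */
double mean = sum.sumData / sum.validCount;
double var = sum.sumSquares / sum.validCount - mean * mean;
printf("mean %g  min %g  max %g  sd %g\n", mean, sum.minVal, sum.maxVal, sqrt(var));
return 0;
}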

void bbFileCreate(
	char *inName,	  /* Input file in a tabular bed format <chrom><start><end> + whatever. */
	char *chromSizes, /* Two column tab-separated file: <chromosome> <size>. */
	int blockSize,	  /* Number of items to bundle in r-tree.  1024 is good. */
	int itemsPerSlot, /* Number of items in lowest level of tree.  64 is good. */
	char *asFileName, /* If non-null points to a .as file that describes fields. */
	boolean doCompress,	/* If TRUE then compress data. */
+	boolean doNameIndex,	/* If TRUE then index names as well. */
	char *outName)	  /* BigBed output file name. */
/* Convert tab-separated bed file to binary indexed, zoomed bigBed version. */
{
/* Set up timing measures. */
verboseTimeInit();
struct lineFile *lf = lineFileOpen(inName, TRUE);

/* Load up as object if defined in file. */
struct asObject *as = NULL;
if (asFileName != NULL)
    {
    /* Parse it and do sanity check. */
    as = asParseFile(asFileName);
    if (as->next != NULL)
	errAbort("Can only handle .as files containing a single object.");
    }

/* Load in chromosome sizes. */
struct hash *chromSizesHash = bbiChromSizesFromFile(chromSizes);
verbose(2, "Read %d chromosomes and sizes from %s\n", chromSizesHash->elCount, chromSizes);

/* Do first pass, mostly just scanning file and counting hits per chromosome. */
int minDiff = 0;
double aveSpan = 0;
bits64 bedCount = 0;
bits32 uncompressBufSize = 0;
-struct bbiChromUsage *usageList = bbiChromUsageFromBedFile(lf, chromSizesHash, &minDiff, &aveSpan, &bedCount);
+int maxNameSize = 0;
+struct bbiChromUsage *usageList = bbiChromUsageFromBedFile(lf, chromSizesHash, &minDiff, &aveSpan, &bedCount, &maxNameSize);
verboseTime(1, "pass1 - making usageList (%d chroms)", slCount(usageList));
verbose(2, "%d chroms in %s. Average span of beds %f\n", slCount(usageList), inName, aveSpan);

/* Open output file and write dummy header. */
FILE *f = mustOpen(outName, "wb");
bbiWriteDummyHeader(f);
bbiWriteDummyZooms(f);

/* Write out asFile if any */
bits64 asOffset = 0;
if (asFileName != NULL)
    {
    int colCount = slCount(as->columnList);
    asOffset = ftell(f);
    FILE *asFile = mustOpen(asFileName, "r");
    copyOpenFile(asFile, f);
    fputc(0, f);
    carefulClose(&asFile);
    verbose(2, "%s has %d columns\n", asFileName, colCount);
    }

/* Write out dummy total summary. */
struct bbiSummaryElement totalSum;
ZeroVar(&totalSum);
bits64 totalSummaryOffset = ftell(f);
bbiSummaryElementWrite(f, &totalSum);

/* Write out chromosome/size database. */
bits64 chromTreeOffset = ftell(f);
bbiWriteChromInfo(usageList, blockSize, f);

/* Set up to keep track of possible initial reduction levels. */
int resTryCount = 10, resTry;
int resIncrement = 4;
int resScales[resTryCount], resSizes[resTryCount];
int minZoom = 10;
int res = aveSpan;
if (res < minZoom)
    res = minZoom;
for (resTry = 0; resTry < resTryCount; ++resTry)
    {
    resSizes[resTry] = 0;
    resScales[resTry] = res;
    // if aveSpan is large, then the initial value of res is large,
    // and we cannot do all 10 levels without overflowing res* integers and other related variables.
    if (res > 1000000000)
	{
	resTryCount = resTry + 1;
	verbose(2, "resTryCount reduced from 10 to %d\n", resTryCount);
	break;
	}
    res *= resIncrement;
    }

/* Write out primary full resolution data in sections, collect stats to use for reductions. */
bits64 dataOffset = ftell(f);
writeOne(f, bedCount);
bits32 blockCount = bbiCountSectionsNeeded(usageList, itemsPerSlot);
struct bbiBoundsArray *boundsArray;
AllocArray(boundsArray, blockCount);
lineFileRewind(lf);
bits16 fieldCount = 0;
bits32 maxBlockSize = 0;
+struct bbNamedFileChunk *namedChunks = NULL;
+if (doNameIndex)
+    AllocArray(namedChunks, bedCount);
writeBlocks(usageList, lf, as, itemsPerSlot, boundsArray, blockCount, doCompress,
-	f, resTryCount, resScales, resSizes, &fieldCount, &maxBlockSize);
+	f, resTryCount, resScales, resSizes, namedChunks, bedCount, &fieldCount, &maxBlockSize);
verboseTime(1, "pass2 - checking and writing primary data (%lld records, %d fields)",
	(long long)bedCount, fieldCount);

/* Write out primary data index. */
bits64 indexOffset = ftell(f);
cirTreeFileBulkIndexToOpenFile(boundsArray, sizeof(boundsArray[0]), blockCount,
    blockSize, 1, NULL, bbiBoundsArrayFetchKey, bbiBoundsArrayFetchOffset,
    indexOffset, f);
freez(&boundsArray);
verboseTime(1, "index write");

/* Declare arrays and vars that track the zoom levels we actually output. */
bits32 zoomAmounts[bbiMaxZoomLevels];
bits64 zoomDataOffsets[bbiMaxZoomLevels];
bits64 zoomIndexOffsets[bbiMaxZoomLevels];
int zoomLevels = 0;

/* Write out first zoomed section while storing in memory next zoom level. */
if (aveSpan > 0)
    {
    bits64 dataSize = indexOffset - dataOffset;
    int maxReducedSize = dataSize/2;
    int initialReduction = 0, initialReducedCount = 0;

    /* Figure out initialReduction for zoom. */
    for (resTry = 0; resTry < resTryCount; ++resTry)
	{
	bits64 reducedSize = resSizes[resTry] * sizeof(struct bbiSummaryOnDisk);
	if (doCompress)
	    reducedSize /= 2;	// Estimate!
	if (reducedSize <= maxReducedSize)
	    {
	    initialReduction = resScales[resTry];
	    initialReducedCount = resSizes[resTry];
	    break;
	    }
	}
    verbose(2, "initialReduction %d, initialReducedCount = %d\n",
	initialReduction, initialReducedCount);
    verbose(2, "dataSize %llu, reducedSize %llu, resScales[0] = %d\n",
	dataSize, (bits64)(initialReducedCount*sizeof(struct bbiSummaryOnDisk)), resScales[0]);

    if (initialReduction > 0)
	{
	struct lm *lm = lmInit(0);
	int zoomIncrement = 4;
	lineFileRewind(lf);
	struct bbiSummary *rezoomedList = writeReducedOnceReturnReducedTwice(usageList,
		fieldCount, lf, initialReduction, initialReducedCount,
		resIncrement, blockSize, itemsPerSlot, doCompress, lm,
		f, &zoomDataOffsets[0], &zoomIndexOffsets[0], &totalSum);
	verboseTime(1, "pass3 - writeReducedOnceReturnReducedTwice");
	zoomAmounts[0] = initialReduction;
	zoomLevels = 1;

	int zoomCount = initialReducedCount;
	int reduction = initialReduction * zoomIncrement;
	while (zoomLevels < bbiMaxZoomLevels)
	    {
	    int rezoomCount = slCount(rezoomedList);
	    if (rezoomCount >= zoomCount)
		break;
	    zoomCount = rezoomCount;
	    zoomDataOffsets[zoomLevels] = ftell(f);
	    zoomIndexOffsets[zoomLevels] = bbiWriteSummaryAndIndex(rezoomedList,
		blockSize, itemsPerSlot, doCompress, f);
	    zoomAmounts[zoomLevels] = reduction;
	    ++zoomLevels;
	    reduction *= zoomIncrement;
	    rezoomedList = bbiSummarySimpleReduce(rezoomedList, reduction, lm);
	    }
	lmCleanup(&lm);
	verboseTime(1, "further reductions");
	}
    }

+/* Write out name index if need be. */
+bits64 nameIndexOffset = 0;
+if (doNameIndex)
+    {
+    qsort(namedChunks, bedCount, sizeof(namedChunks[0]), bbNamedFileChunkCmpByName);
+    nameIndexOffset = ftell(f);
+    maxBedNameSize = maxNameSize;
+    bptFileBulkIndexToOpenFile(namedChunks, sizeof(namedChunks[0]), bedCount,
+	blockSize, bbNamedFileChunkKey, maxNameSize, bbNamedFileChunkVal,
+	sizeof(bits64) + sizeof(bits64), f);
+    verboseTime(1, "Sorting and writing name index");
+    }
+
/* Figure out buffer size needed for uncompression if need be. */
if (doCompress)
    {
    int maxZoomUncompSize = itemsPerSlot * sizeof(struct bbiSummaryOnDisk);
    uncompressBufSize = max(maxBlockSize, maxZoomUncompSize);
    }

/* Go back and rewrite header. */
rewind(f);
bits32 sig = bigBedSig;
bits16 version = bbiCurrentVersion;
bits16 summaryCount = zoomLevels;
bits32 reserved32 = 0;
bits64 reserved64 = 0;
bits16 definedFieldCount = bedN;

/* Write fixed header */
writeOne(f, sig);
writeOne(f, version);
writeOne(f, summaryCount);
writeOne(f, chromTreeOffset);
writeOne(f, dataOffset);
writeOne(f, indexOffset);
writeOne(f, fieldCount);
writeOne(f, definedFieldCount);
writeOne(f, asOffset);
writeOne(f, totalSummaryOffset);
writeOne(f, uncompressBufSize);
-int i;
-for (i=0; i<2; ++i)
-    writeOne(f, reserved32);
+writeOne(f, nameIndexOffset);
+assert(ftell(f) == 64);

/* Write summary headers with data. */
+int i;
verbose(2, "Writing %d levels of zoom\n", zoomLevels);
for (i=0; i<zoomLevels; ++i)
    {
    verbose(3, "zoomAmounts[%d] = %d\n", i, (int)zoomAmounts[i]);
    writeOne(f, zoomAmounts[i]);
    writeOne(f, reserved32);
    writeOne(f, zoomDataOffsets[i]);
    writeOne(f, zoomIndexOffsets[i]);
    }
/* Write rest of summary headers with no data. */
for (i=zoomLevels; i<bbiMaxZoomLevels; ++i)
    {
    writeOne(f, reserved32);
    writeOne(f, reserved32);
    writeOne(f, reserved64);
    writeOne(f, reserved64);
    }

/* Write total summary. */
fseek(f, totalSummaryOffset, SEEK_SET);
bbiSummaryElementWrite(f, &totalSum);

/* Write end signature. */
fseek(f, 0L, SEEK_END);
writeOne(f, sig);

/* Clean up. */
lineFileClose(&lf);
carefulClose(&f);
freeHash(&chromSizesHash);
bbiChromUsageFreeList(&usageList);
asObjectFreeList(&as);
}
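/* The fixed header rewrite above spends the last 8 reserved bytes (formerly
 * two bits32 zeros) on nameIndexOffset, keeping the header at exactly 64
 * bytes, which the new assert checks.  A stand-alone sketch of the same field
 * sequence and size arithmetic (field values are placeholders; 0x8789F2EB is
 * bigBedSig from sig.h, and version 4 matches bbiCurrentVersion at the time of
 * this commit): */

#include <assert.h>
#include <stdio.h>
#include <stdint.h>

int main(void)
{
FILE *f = tmpfile();
if (f == NULL)
    return 1;
uint32_t sig = 0x8789F2EB;	/* bigBedSig */
uint16_t version = 4, summaryCount = 0, fieldCount = 12, definedFieldCount = 12;
uint64_t chromTreeOffset = 0, dataOffset = 0, indexOffset = 0;
uint64_t asOffset = 0, totalSummaryOffset = 0, nameIndexOffset = 0;
uint32_t uncompressBufSize = 0;
fwrite(&sig, sizeof sig, 1, f);
fwrite(&version, sizeof version, 1, f);
fwrite(&summaryCount, sizeof summaryCount, 1, f);
fwrite(&chromTreeOffset, sizeof chromTreeOffset, 1, f);
fwrite(&dataOffset, sizeof dataOffset, 1, f);
fwrite(&indexOffset, sizeof indexOffset, 1, f);
fwrite(&fieldCount, sizeof fieldCount, 1, f);
fwrite(&definedFieldCount, sizeof definedFieldCount, 1, f);
fwrite(&asOffset, sizeof asOffset, 1, f);
fwrite(&totalSummaryOffset, sizeof totalSummaryOffset, 1, f);
fwrite(&uncompressBufSize, sizeof uncompressBufSize, 1, f);
fwrite(&nameIndexOffset, sizeof nameIndexOffset, 1, f);
assert(ftell(f) == 64);	/* 4+2+2+8+8+8+2+2+8+8+4+8 */
fclose(f);
return 0;
}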

void bedToBigBed(char *inName, char *chromSizes, char *outName)
/* bedToBigBed - Convert bed file to bigBed. */
{
bbFileCreate(inName, chromSizes, blockSize, itemsPerSlot, as,
-	doCompress, outName);
+	doCompress, doNameIndex, outName);
}

int main(int argc, char *argv[])
/* Process command line. */
{
optionInit(&argc, argv, options);
blockSize = optionInt("blockSize", blockSize);
itemsPerSlot = optionInt("itemsPerSlot", itemsPerSlot);
as = optionVal("as", as);
doCompress = !optionExists("unc");
+doNameIndex = optionExists("nameIndex");
tabSep = optionExists("tab");
if (argc != 4)
    usage();
if (optionExists("type"))
    {
    // parse type
    char *btype = cloneString(optionVal("type", ""));
    char *plus = strchr(btype, '+');
    if (plus)
	{
	*plus++ = 0;
	if (isdigit(*plus))
	    bedP = sqlUnsigned(plus);
	else
	    bedP = -1;
	}
    if (!startsWith("bed", btype))
	errAbort("type must begin with \"bed\"");
    btype += 3;
    bedN = sqlUnsigned(btype);
    if (bedN < 3)
	errAbort("Bed must be 3 or higher, found %d\n", bedN);
    if (bedN > 15)
	errAbort("Bed must be 15 or lower, found %d\n", bedN);
    }
else
    {
    if (as)
	errAbort("If you specify the .as file, you must specify the -type as well so that the number of standard BED columns is known.");
    }
bedToBigBed(argv[1], argv[2], argv[3]);
optionFree();
if (verboseLevel() > 1)
    printVmPeak();
return 0;
}
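/* The -type parsing in main() splits "bedN[+[P]]" into bedN (standard fields)
 * and bedP (extra fields, with -1 meaning "count unspecified").  A stand-alone
 * sketch of the same split (parseType is a hypothetical helper; the real code
 * works in place on the option string with kent's sqlUnsigned): */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

static int parseType(const char *type, int *retN, int *retP)
/* Parse strings like "bed6", "bed6+", "bed6+3".  Returns 0 on success. */
{
if (strncmp(type, "bed", 3) != 0)
    return -1;
char buf[64];
strncpy(buf, type + 3, sizeof buf - 1);
buf[sizeof buf - 1] = 0;
char *plus = strchr(buf, '+');
*retP = 0;
if (plus != NULL)
    {
    *plus++ = 0;
    *retP = isdigit((unsigned char)*plus) ? atoi(plus) : -1;
    }
*retN = atoi(buf);
return (*retN >= 3 && *retN <= 15) ? 0 : -1;
}

int main(void)
{
const char *tests[] = {"bed6", "bed6+", "bed6+3", "bed12"};
int i;
for (i = 0; i < 4; ++i)
    {
    int n, p;
    if (parseType(tests[i], &n, &p) == 0)	/* bedP of -1 means "+ with no count" */
	printf("%s -> bedN=%d bedP=%d\n", tests[i], n, p);
    }
return 0;
}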