12d0f12bd304787c52cab0780e367d36b020f84e kent Tue Feb 26 12:11:18 2013 -0800 Adding name index to bigBed files. The write side I _think_ is working. Still developing read side. diff --git src/utils/bedToBigBed/bedToBigBed.c src/utils/bedToBigBed/bedToBigBed.c index 54bbfa9..06e66b5 100644 --- src/utils/bedToBigBed/bedToBigBed.c +++ src/utils/bedToBigBed/bedToBigBed.c @@ -1,34 +1,36 @@ /* bedToBigBed - Convert bed to bigBed.. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "dystring.h" #include "obscure.h" #include "asParse.h" #include "basicBed.h" #include "sig.h" #include "rangeTree.h" #include "zlibFace.h" #include "sqlNum.h" +#include "bPlusTree.h" #include "bigBed.h" char *version = "2.3"; int blockSize = 256; int itemsPerSlot = 512; +boolean doNameIndex = FALSE; int bedN = 0; /* number of standard bed fields */ int bedP = 0; /* number of bed plus fields */ char *as = NULL; static boolean doCompress = FALSE; static boolean tabSep = FALSE; void usage() /* Explain usage and exit. */ { errAbort( "bedToBigBed v. %s - Convert bed file to bigBed. (BigBed version: %d)\n" "usage:\n" " bedToBigBed in.bed chrom.sizes out.bb\n" "Where in.bed is in one of the ascii bed formats, but not including track lines\n" "and chrom.sizes is two column: <chromosome name> <size in bases>\n" @@ -41,74 +43,112 @@ "\n" "options:\n" " -type=bedN[+[P]] : \n" " N is between 3 and 15, \n" " optional (+) if extra \"bedPlus\" fields, \n" " optional P specifies the number of extra fields. Not required, but preferred.\n" " Examples: -type=bed6 or -type=bed6+ or -type=bed6+3 \n" " (see http://genome.ucsc.edu/FAQ/FAQformat.html#format1)\n" " -as=fields.as - If you have non-standard \"bedPlus\" fields, it's great to put a definition\n" " of each field in a row in AutoSql format here.\n" " -blockSize=N - Number of items to bundle in r-tree. Default %d\n" " -itemsPerSlot=N - Number of data points bundled at lowest level. Default %d\n" " -unc - If set, do not use compression.\n" " -tab - If set, expect fields to be tab separated, normally\n" " expects white space separator.\n" + " -nameIndex - If set, make an index of the name field\n" , version, bbiCurrentVersion, blockSize, itemsPerSlot ); } static struct optionSpec options[] = { {"blockSize", OPTION_INT}, {"itemsPerSlot", OPTION_INT}, {"type", OPTION_STRING}, {"as", OPTION_STRING}, {"unc", OPTION_BOOLEAN}, {"tab", OPTION_BOOLEAN}, + {"nameIndex", OPTION_BOOLEAN}, {NULL, 0}, }; -void writeBlocks(struct bbiChromUsage *usageList, struct lineFile *lf, struct asObject *as, +struct bbNamedFileChunk +/* A name associated with an offset into a possibly large file. */ + { + char *name; + bits64 offset; + bits64 size; + }; + +int bbNamedFileChunkCmpByName(const void *va, const void *vb) +/* Compare two named offset object to facilitate qsorting by name. */ +{ +const struct bbNamedFileChunk *a = va, *b = vb; +return strcmp(a->name, b->name); +} + +static int maxBedNameSize; + +void bbNamedFileChunkKey(const void *va, char *keyBuf) +/* Copy name to keyBuf for bPlusTree maker */ +{ +const struct bbNamedFileChunk *item = va; +strncpy(keyBuf,item->name, maxBedNameSize); +} + +void *bbNamedFileChunkVal(const void *va) +/* Return pointer to val for bPlusTree maker. */ +{ +const struct bbNamedFileChunk *item = va; +return (void *)&item->offset; +} + +static void writeBlocks(struct bbiChromUsage *usageList, struct lineFile *lf, struct asObject *as, int itemsPerSlot, struct bbiBoundsArray *bounds, int sectionCount, boolean doCompress, FILE *f, int resTryCount, int resScales[], int resSizes[], + struct bbNamedFileChunk *namedChunks, int bedCount, bits16 *retFieldCount, bits32 *retMaxBlockSize) /* Read through lf, writing it in f. Save starting points of blocks (every itemsPerSlot) * to boundsArray */ { int maxBlockSize = 0; struct bbiChromUsage *usage = usageList; char *line, *row[256]; // limit of 256 columns is arbitrary, but useful to catch pathological input int fieldCount = 0, lastField = 0; int itemIx = 0, sectionIx = 0; -bits64 blockOffset = 0; +bits64 blockStartOffset = 0, blockEndOffset = 0; int startPos = 0, endPos = 0; bits32 chromId = 0; boolean allocedAs = FALSE; struct dyString *stream = dyStringNew(0); /* Will keep track of some things that help us determine how much to reduce. */ bits32 resEnds[resTryCount]; int resTry; for (resTry = 0; resTry < resTryCount; ++resTry) resEnds[resTry] = 0; boolean atEnd = FALSE, sameChrom = FALSE; bits32 start = 0, end = 0; char *chrom = NULL; struct bed *bed; AllocVar(bed); +/* Help keep track of which beds are in current chunk so as to write out + * namedChunks. */ +int sectionStartIx = 0, sectionEndIx = 0; + for (;;) { /* Get next line of input if any. */ if (lineFileNextReal(lf, &line)) { /* First time through figure out the field count and if not set, the bedN. */ if (fieldCount == 0) { if (as == NULL) { if (tabSep) fieldCount = chopByChar(line, '\t', NULL, 256); // Do not use chopString, see GOTCHA else fieldCount = chopByWhite(line, NULL, 0); if (bedN == 0) @@ -132,30 +172,33 @@ if (fieldCount > ArraySize(row)) errAbort("Too many fields [%d], current maximum fields limit is %lu", fieldCount, (unsigned long)ArraySize(row)); lastField = fieldCount - 1; *retFieldCount = fieldCount; if (bedP == -1) // user did not specify how many plus columns there are. { bedP = fieldCount - bedN; if (bedP < 1) errAbort("fieldCount input (%d) did not match the specification (%s)\n" , fieldCount, optionVal("type", "")); } if (fieldCount != bedN + bedP) errAbort("fieldCount input (%d) did not match the specification (%s)\n" , fieldCount, optionVal("type", "")); + + if (namedChunks != NULL && fieldCount < 4) + errAbort("No name field in bed, can't index."); } /* Chop up line and make sure the word count is right. */ int wordCount; if (tabSep) wordCount = chopTabs(line, row); else wordCount = chopLine(line, row); lineFileExpectWords(lf, fieldCount, wordCount); loadAndValidateBed(row, bedN, fieldCount, lf, bed, as, FALSE); chrom = bed->chrom; start = bed->chromStart; end = bed->chromEnd; @@ -187,68 +230,89 @@ // free up the old not-big-enough piece freez(&compBuf); // freez knows bout NULL // get new scratch area compBufSize = maxCompSize; compBuf = needLargeMem(compBufSize); } int compSize = zCompress(stream->string, stream->stringSize, compBuf, maxCompSize); mustWrite(f, compBuf, compSize); } else mustWrite(f, stream->string, stream->stringSize); dyStringClear(stream); + /* Save block offset and size for all named chunks in this section. */ + if (namedChunks != NULL) + { + blockEndOffset = ftell(f); + int i; + for (i=sectionStartIx; i<sectionEndIx; ++i) + { + struct bbNamedFileChunk *chunk = namedChunks + i; + chunk->offset = blockStartOffset; + chunk->size = blockEndOffset - blockStartOffset; + } + sectionStartIx = sectionEndIx; + } + /* Save info on existing block. */ struct bbiBoundsArray *b = &bounds[sectionIx]; - b->offset = blockOffset; + b->offset = blockStartOffset; b->range.chromIx = chromId; b->range.start = startPos; b->range.end = endPos; ++sectionIx; itemIx = 0; if (atEnd) break; } /* Advance to next chromosome if need be and get chromosome id. */ if (!sameChrom) { usage = usage->next; assert(usage != NULL); assert(sameString(chrom, usage->name)); for (resTry = 0; resTry < resTryCount; ++resTry) resEnds[resTry] = 0; } chromId = usage->id; /* At start of block we save a lot of info. */ if (itemIx == 0) { - blockOffset = ftell(f); + blockStartOffset = ftell(f); startPos = start; endPos = end; } /* Otherwise just update end. */ { if (endPos < end) endPos = end; /* No need to update startPos since list is sorted. */ } + /* Save name into namedOffset if need be. */ + if (namedChunks != NULL) + { + namedChunks[sectionEndIx].name = cloneString(bed->name); + sectionEndIx += 1; + } + /* Write out data. */ dyStringWriteOne(stream, chromId); dyStringWriteOne(stream, start); dyStringWriteOne(stream, end); if (fieldCount > 3) { int i; /* Write 3rd through next to last field and a tab separator. */ for (i=3; i<lastField; ++i) { char *s = row[i]; dyStringAppend(stream, s); dyStringAppendC(stream, '\t'); } /* Write last field and terminal zero */ @@ -431,57 +495,59 @@ indexOffset, f); freez(&boundsArray); slReverse(&twiceReducedList); return twiceReducedList; } void bbFileCreate( char *inName, /* Input file in a tabular bed format <chrom><start><end> + whatever. */ char *chromSizes, /* Two column tab-separated file: <chromosome> <size>. */ int blockSize, /* Number of items to bundle in r-tree. 1024 is good. */ int itemsPerSlot, /* Number of items in lowest level of tree. 64 is good. */ char *asFileName, /* If non-null points to a .as file that describes fields. */ boolean doCompress, /* If TRUE then compress data. */ + boolean doNameIndex, /* If TRUE then index names as well. */ char *outName) /* BigBed output file name. */ /* Convert tab-separated bed file to binary indexed, zoomed bigBed version. */ { /* Set up timing measures. */ verboseTimeInit(); struct lineFile *lf = lineFileOpen(inName, TRUE); /* Load up as object if defined in file. */ struct asObject *as = NULL; if (asFileName != NULL) { /* Parse it and do sanity check. */ as = asParseFile(asFileName); if (as->next != NULL) errAbort("Can only handle .as files containing a single object."); } /* Load in chromosome sizes. */ struct hash *chromSizesHash = bbiChromSizesFromFile(chromSizes); verbose(2, "Read %d chromosomes and sizes from %s\n", chromSizesHash->elCount, chromSizes); /* Do first pass, mostly just scanning file and counting hits per chromosome. */ int minDiff = 0; double aveSpan = 0; bits64 bedCount = 0; bits32 uncompressBufSize = 0; -struct bbiChromUsage *usageList = bbiChromUsageFromBedFile(lf, chromSizesHash, &minDiff, &aveSpan, &bedCount); +int maxNameSize = 0; +struct bbiChromUsage *usageList = bbiChromUsageFromBedFile(lf, chromSizesHash, &minDiff, &aveSpan, &bedCount, &maxNameSize); verboseTime(1, "pass1 - making usageList (%d chroms)", slCount(usageList)); verbose(2, "%d chroms in %s. Average span of beds %f\n", slCount(usageList), inName, aveSpan); /* Open output file and write dummy header. */ FILE *f = mustOpen(outName, "wb"); bbiWriteDummyHeader(f); bbiWriteDummyZooms(f); /* Write out asFile if any */ bits64 asOffset = 0; if (asFileName != NULL) { int colCount = slCount(as->columnList); asOffset = ftell(f); FILE *asFile = mustOpen(asFileName, "r"); @@ -521,32 +587,35 @@ verbose(2, "resTryCount reduced from 10 to %d\n", resTryCount); break; } res *= resIncrement; } /* Write out primary full resolution data in sections, collect stats to use for reductions. */ bits64 dataOffset = ftell(f); writeOne(f, bedCount); bits32 blockCount = bbiCountSectionsNeeded(usageList, itemsPerSlot); struct bbiBoundsArray *boundsArray; AllocArray(boundsArray, blockCount); lineFileRewind(lf); bits16 fieldCount=0; bits32 maxBlockSize = 0; +struct bbNamedFileChunk *namedChunks = NULL; +if (doNameIndex) + AllocArray(namedChunks, bedCount); writeBlocks(usageList, lf, as, itemsPerSlot, boundsArray, blockCount, doCompress, - f, resTryCount, resScales, resSizes, &fieldCount, &maxBlockSize); + f, resTryCount, resScales, resSizes, namedChunks, bedCount, &fieldCount, &maxBlockSize); verboseTime(1, "pass2 - checking and writing primary data (%lld records, %d fields)", (long long)bedCount, fieldCount); /* Write out primary data index. */ bits64 indexOffset = ftell(f); cirTreeFileBulkIndexToOpenFile(boundsArray, sizeof(boundsArray[0]), blockCount, blockSize, 1, NULL, bbiBoundsArrayFetchKey, bbiBoundsArrayFetchOffset, indexOffset, f); freez(&boundsArray); verboseTime(1, "index write"); /* Declare arrays and vars that track the zoom levels we actually output. */ bits32 zoomAmounts[bbiMaxZoomLevels]; bits64 zoomDataOffsets[bbiMaxZoomLevels]; bits64 zoomIndexOffsets[bbiMaxZoomLevels]; @@ -598,64 +667,77 @@ break; zoomCount = rezoomCount; zoomDataOffsets[zoomLevels] = ftell(f); zoomIndexOffsets[zoomLevels] = bbiWriteSummaryAndIndex(rezoomedList, blockSize, itemsPerSlot, doCompress, f); zoomAmounts[zoomLevels] = reduction; ++zoomLevels; reduction *= zoomIncrement; rezoomedList = bbiSummarySimpleReduce(rezoomedList, reduction, lm); } lmCleanup(&lm); verboseTime(1, "further reductions"); } } +/* Write out name index if need be. */ +bits64 nameIndexOffset = 0; +if (doNameIndex) + { + qsort(namedChunks, bedCount, sizeof(namedChunks[0]), bbNamedFileChunkCmpByName); + nameIndexOffset = ftell(f); + maxBedNameSize = maxNameSize; + bptFileBulkIndexToOpenFile(namedChunks, sizeof(namedChunks[0]), bedCount, + blockSize, bbNamedFileChunkKey, maxNameSize, bbNamedFileChunkVal, + sizeof(bits64) + sizeof(bits64), f); + verboseTime(1, "Sorting and writing name index"); + } + /* Figure out buffer size needed for uncompression if need be. */ if (doCompress) { int maxZoomUncompSize = itemsPerSlot * sizeof(struct bbiSummaryOnDisk); uncompressBufSize = max(maxBlockSize, maxZoomUncompSize); } /* Go back and rewrite header. */ rewind(f); bits32 sig = bigBedSig; bits16 version = bbiCurrentVersion; bits16 summaryCount = zoomLevels; bits32 reserved32 = 0; bits64 reserved64 = 0; bits16 definedFieldCount = bedN; /* Write fixed header */ writeOne(f, sig); writeOne(f, version); writeOne(f, summaryCount); writeOne(f, chromTreeOffset); writeOne(f, dataOffset); writeOne(f, indexOffset); writeOne(f, fieldCount); writeOne(f, definedFieldCount); writeOne(f, asOffset); writeOne(f, totalSummaryOffset); writeOne(f, uncompressBufSize); -int i; -for (i=0; i<2; ++i) - writeOne(f, reserved32); +writeOne(f, nameIndexOffset); +assert(ftell(f) == 64); /* Write summary headers with data. */ +int i; verbose(2, "Writing %d levels of zoom\n", zoomLevels); for (i=0; i<zoomLevels; ++i) { verbose(3, "zoomAmounts[%d] = %d\n", i, (int)zoomAmounts[i]); writeOne(f, zoomAmounts[i]); writeOne(f, reserved32); writeOne(f, zoomDataOffsets[i]); writeOne(f, zoomIndexOffsets[i]); } /* Write rest of summary headers with no data. */ for (i=zoomLevels; i<bbiMaxZoomLevels; ++i) { writeOne(f, reserved32); writeOne(f, reserved32); writeOne(f, reserved64); @@ -671,41 +753,42 @@ writeOne(f, sig); /* Clean up. */ lineFileClose(&lf); carefulClose(&f); freeHash(&chromSizesHash); bbiChromUsageFreeList(&usageList); asObjectFreeList(&as); } void bedToBigBed(char *inName, char *chromSizes, char *outName) /* bedToBigBed - Convert bed file to bigBed.. */ { bbFileCreate(inName, chromSizes, blockSize, itemsPerSlot, as, - doCompress, outName); + doCompress, doNameIndex, outName); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); blockSize = optionInt("blockSize", blockSize); itemsPerSlot = optionInt("itemsPerSlot", itemsPerSlot); as = optionVal("as", as); doCompress = !optionExists("unc"); +doNameIndex = optionExists("nameIndex"); tabSep = optionExists("tab"); if (argc != 4) usage(); if (optionExists("type")) { // parse type char *btype = cloneString(optionVal("type", "")); char *plus = strchr(btype, '+'); if (plus) { *plus++ = 0; if (isdigit(*plus)) bedP = sqlUnsigned(plus); else bedP = -1;