12d0f12bd304787c52cab0780e367d36b020f84e kent Tue Feb 26 12:11:18 2013 -0800 Adding name index to bigBed files. The write side I _think_ is working. Still developing read side. diff --git src/inc/bbiFile.h src/inc/bbiFile.h index 949624d..2d31caa 100644 --- src/inc/bbiFile.h +++ src/inc/bbiFile.h @@ -8,61 +8,62 @@ #include "localmem.h" /* bigWig/bigBed file structure: * fixedWidthHeader * magic# 4 bytes * version 2 bytes * zoomLevels 2 bytes * chromosomeTreeOffset 8 bytes * fullDataOffset 8 bytes * fullIndexOffset 8 bytes * fieldCount 2 bytes (for bigWig 0) * definedFieldCount 2 bytes (for bigWig 0) * autoSqlOffset 8 bytes (for bigWig 0) (0 if no autoSql information) * totalSummaryOffset 8 bytes (0 in earlier versions of file lacking totalSummary) * uncompressBufSize 4 bytes (Size of uncompression buffer. 0 if uncompressed.) - * reserved 8 bytes (0 for now) + * nameIndexOffset 8 bytes (Offset to name index, 0 if no such index) * zoomHeaders there are zoomLevels number of these * reductionLevel 4 bytes * reserved 4 bytes * dataOffset 8 bytes * indexOffset 8 bytes * autoSql string (zero terminated - only present if autoSqlOffset non-zero) * totalSummary - summary of all data in file - only present if totalSummaryOffset non-zero * basesCovered 8 bytes * minVal 8 bytes float (for bigBed minimum depth of coverage) * maxVal 8 bytes float (for bigBed maximum depth of coverage) * sumData 8 bytes float (for bigBed sum of coverage) * sumSquared 8 bytes float (for bigBed sum of coverage squared) * chromosome b+ tree bPlusTree index * full data * sectionCount 8 bytes (item count for bigBeds) * section data section count sections, of three types (bed data for bigBeds) * full index cirTree index * zoom info one of these for each zoom level * zoom data * zoomCount 4 bytes * zoom data there are zoomCount of these items * chromId 4 bytes * chromStart 4 bytes * chromEnd 4 bytes * validCount 4 bytes * minVal 4 bytes float * maxVal 4 bytes float * sumData 4 bytes float * sumSquares 4 
bytes float * zoom index cirTree index + * name index [optional] bPlusTree index * magic# 4 bytes - same as magic number at start of header */ #ifndef CIRTREE_H #include "cirTree.h" #endif #define bbiCurrentVersion 4 /* Version history (of file format, not utilities - corresponds to version field in header) * 1 - Initial release * 1 - Unfortunately when attempting a transparent change to encoders, made the sectionCount * field inconsistent, sometimes not present, sometimes 32 bits. Since offset positions * in index were still accurate this did not break most applications, but it did show * up in the summary section of the Table Browser. * 2 - Made sectionCount consistently 64 bits. Also fixed missing zoomCount in first level of @@ -94,32 +95,34 @@ char *fileName; /* Name of file - for better error reporting. */ struct udcFile *udc; /* Open UDC file handle. */ bits32 typeSig; /* bigBedSig or bigWigSig for now. */ boolean isSwapped; /* If TRUE need to byte swap everything. */ struct bptFile *chromBpt; /* Index of chromosomes. */ bits16 version; /* Version number - initially 1. */ bits16 zoomLevels; /* Number of zoom levels. */ bits64 chromTreeOffset; /* Offset to chromosome index. */ bits64 unzoomedDataOffset; /* Start of unzoomed data. */ bits64 unzoomedIndexOffset; /* Start of unzoomed index. */ bits16 fieldCount; /* Number of columns in bed version. */ bits16 definedFieldCount; /* Number of columns using bed standard definitions. */ bits64 asOffset; /* Offset to embedded null-terminated AutoSQL file. */ bits64 totalSummaryOffset; /* Offset to total summary information if any. (On older files have to calculate) */ bits32 uncompressBufSize; /* Size of uncompression buffer, 0 if uncompressed */ + bits64 nameIndexOffset; /* Start of name index or zero if none. */ struct cirTreeFile *unzoomedCir; /* Unzoomed data index in memory - may be NULL. */ struct bbiZoomLevel *levelList; /* List of zoom levels. 
*/ + struct bptFile *nameBpt; /* Index of names, may be NULL */ }; struct bbiFile *bbiFileOpen(char *fileName, bits32 sig, char *typeName); /* Open up big wig or big bed file. */ void bbiFileClose(struct bbiFile **pBwf); /* Close down a big wig/big bed file. */ struct fileOffsetSize *bbiOverlappingBlocks(struct bbiFile *bbi, struct cirTreeFile *ctf, char *chrom, bits32 start, bits32 end, bits32 *retChromId); /* Fetch list of file blocks that contain items overlapping chromosome range. */ struct bbiChromIdSize /* We store an id/size pair in chromBpt bPlusTree */ @@ -315,33 +318,33 @@ void bbiWriteFloat(FILE *f, float val); /* Write out floating point val to file. Mostly to convert from double... */ struct hash *bbiChromSizesFromFile(char *fileName); /* Read two column file into hash keyed by chrom. */ bits64 bbiTotalSummarySize(struct bbiSummary *list); /* Return size on disk of all summaries. */ void bbiChromUsageFree(struct bbiChromUsage **pUsage); /* free a single bbiChromUsage structure */ void bbiChromUsageFreeList(struct bbiChromUsage **pList); /* free a list of bbiChromUsage structures */ -struct bbiChromUsage *bbiChromUsageFromBedFile(struct lineFile *lf, - struct hash *chromSizesHash, int *retMinDiff, double *retAveSize, bits64 *retBedCount); -/* Go through bed file and collect chromosomes and statistics. Free with bbiChromUsageFreeList */ +struct bbiChromUsage *bbiChromUsageFromBedFile(struct lineFile *lf, struct hash *chromSizesHash, + int *retMinDiff, double *retAveSize, bits64 *retBedCount, int *retMaxNameSize); +/* Go through bed file and collect chromosomes and statistics. Free with bbiChromUsageFreeList. NOTE(review): retMaxNameSize is new and undescribed here - presumably receives the size of the longest name seen; confirm against the implementation. */ int bbiCountSectionsNeeded(struct bbiChromUsage *usageList, int itemsPerSlot); /* Count up number of sections needed for data. 
*/ void bbiAddToSummary(bits32 chromId, bits32 chromSize, bits32 start, bits32 end, bits32 validCount, double minVal, double maxVal, double sumData, double sumSquares, int reduction, struct bbiSummary **pOutList); /* Add data range to summary - putting it onto top of list if possible, otherwise * expanding list. */ void bbiAddRangeToSummary(bits32 chromId, bits32 chromSize, bits32 start, bits32 end, double val, int reduction, struct bbiSummary **pOutList); /* Add chromosome range to summary - putting it onto top of list if possible, otherwise * expanding list. */