cb217f20b4a701edfa88f4db623266a9bf25202e kent Mon Mar 4 22:58:07 2013 -0800 Some more steps towards supporting multiple extra indexes in bigBed files. diff --git src/inc/bbiFile.h src/inc/bbiFile.h index 22d585a..fbbc150 100644 --- src/inc/bbiFile.h +++ src/inc/bbiFile.h @@ -24,31 +24,31 @@ * zoomHeaders there are zoomLevels number of these * reductionLevel 4 bytes * reserved 4 bytes * dataOffset 8 bytes * indexOffset 8 bytes * autoSql string (zero terminated - only present if autoSqlOffset non-zero) * totalSummary - summary of all data in file - only present if totalSummaryOffset non-zero * basesCovered 8 bytes * minVal 8 bytes float (for bigBed minimum depth of coverage) * maxVal 8 bytes float (for bigBed maximum depth of coverage) * sumData 8 bytes float (for bigBed sum of coverage) * sumSquared 8 bytes float (for bigBed sum of coverage squared) * extendedHeader * extensionSize 2 size of extended header in bytes - currently 64 * extraIndexCount 2 number of extra fields we will be indexing - * extraIndexOffset 8 Offset to list of non-chrom/start/end indexes + * extraIndexListOffset 8 Offset to list of non-chrom/start/end indexes * reserved 48 All zeroes for now * extraIndexList - one of these for each extraIndex * type 2 Type of index. Always 0 for bPlusTree now * fieldCount 2 Number of fields used in this index. Always 1 for now * reserved 12 All zeroes for now * fieldList - one of these for each field being used in _this_ index * fieldId 2 index of field within record * reserved 2 All zeroes for now * chromosome b+ tree bPlusTree index * full data * sectionCount 8 bytes (item count for bigBeds) * section data section count sections, of three types (bed data for bigBeds) * full index cirTree index * zoom info one of these for each zoom level * zoom data @@ -105,36 +105,43 @@ { struct bbiFile *next; /* Next in list. */ char *fileName; /* Name of file - for better error reporting. */ struct udcFile *udc; /* Open UDC file handle. */ bits32 typeSig; /* bigBedSig or bigWigSig for now. */ boolean isSwapped; /* If TRUE need to byte swap everything. */ struct bptFile *chromBpt; /* Index of chromosomes. */ bits16 version; /* Version number - initially 1. */ bits16 zoomLevels; /* Number of zoom levels. */ bits64 chromTreeOffset; /* Offset to chromosome index. */ bits64 unzoomedDataOffset; /* Start of unzoomed data. */ bits64 unzoomedIndexOffset; /* Start of unzoomed index. */ bits16 fieldCount; /* Number of columns in bed version. */ bits16 definedFieldCount; /* Number of columns using bed standard definitions. */ bits64 asOffset; /* Offset to embedded null-terminated AutoSQL file. */ - bits64 totalSummaryOffset; /* Offset to total summary information if any. (On older files have to calculate) */ + bits64 totalSummaryOffset; /* Offset to total summary information if any. + (On older files have to calculate) */ bits32 uncompressBufSize; /* Size of uncompression buffer, 0 if uncompressed */ - bits64 nameIndexOffset; /* Start of name index or zero if none. */ + bits64 extensionOffset; /* Start of header extension block or 0 if none. */ struct cirTreeFile *unzoomedCir; /* Unzoomed data index in memory - may be NULL. */ struct bbiZoomLevel *levelList; /* List of zoom levels. */ - struct bptFile *nameBpt; /* Index of names, may be NULL */ + + /* Fields based on extension block. */ + bits16 extensionSize; /* Size of extension block */ + bits16 extraIndexCount; /* Number of extra indexes (on fields other than chrom,start,end */ + bits64 extraIndexListOffset; /* Offset to list of extra indexes */ + + struct bptFile *nameBpt; /* Index of names, may be NULL */ // uglyf - remove }; struct bbiFile *bbiFileOpen(char *fileName, bits32 sig, char *typeName); /* Open up big wig or big bed file. */ void bbiFileClose(struct bbiFile **pBwf); /* Close down a big wig/big bed file. */ struct fileOffsetSize *bbiOverlappingBlocks(struct bbiFile *bbi, struct cirTreeFile *ctf, char *chrom, bits32 start, bits32 end, bits32 *retChromId); /* Fetch list of file blocks that contain items overlapping chromosome range. */ struct bbiChromIdSize /* We store an id/size pair in chromBpt bPlusTree */ @@ -338,30 +345,42 @@ void bbiWriteFloat(FILE *f, float val); /* Write out floating point val to file. Mostly to convert from double... */ struct hash *bbiChromSizesFromFile(char *fileName); /* Read two column file into hash keyed by chrom. */ bits64 bbiTotalSummarySize(struct bbiSummary *list); /* Return size on disk of all summaries. */ void bbiChromUsageFree(struct bbiChromUsage **pUsage); /* free a single bbiChromUsage structure */ void bbiChromUsageFreeList(struct bbiChromUsage **pList); /* free a list of bbiChromUsage structures */ +struct bbExIndexMaker +/* A helper structure to make indexes beyond primary one */ + { + bits16 indexCount; /* Number of extra indexes. */ + /* Kind of wish next four fields, all of which are arrays indexed + * by the same thing, were a single array of a structure instead. */ + bits16 *indexFields; /* array of field ids, one for each extra index. */ + int *maxFieldSize; /* array of maximum sizes seen for this field. */ + struct bbNamedFileChunk **chunkArrayArray; /* where we keep name/start/size triples */ + bits64 *fileOffsets; /* array of file offsets where indexes starts. */ + int recordCount; /* number of records in file. */ + }; struct bbiChromUsage *bbiChromUsageFromBedFile(struct lineFile *lf, struct hash *chromSizesHash, struct bbExIndexMaker *eim, int *retMinDiff, double *retAveSize, bits64 *retBedCount); /* Go through bed file and collect chromosomes and statistics. If eim parameter is non-NULL * collect max field sizes there too. */ int bbiCountSectionsNeeded(struct bbiChromUsage *usageList, int itemsPerSlot); /* Count up number of sections needed for data. */ void bbiAddToSummary(bits32 chromId, bits32 chromSize, bits32 start, bits32 end, bits32 validCount, double minVal, double maxVal, double sumData, double sumSquares, int reduction, struct bbiSummary **pOutList); /* Add data range to summary - putting it onto top of list if possible, otherwise * expanding list. */