cfa3310ec7d0c3ef45a3f647c7e7164453d9d4e0 kent Tue Mar 5 01:01:42 2013 -0800 A little more progress on multiple extra index change. First time reader and writer have both worked in any small way together. diff --git src/lib/bigBed.c src/lib/bigBed.c index a663195..1108797 100644 --- src/lib/bigBed.c +++ src/lib/bigBed.c @@ -190,44 +190,44 @@ summarySize, summary); } boolean bigBedSummaryArray(struct bbiFile *bbi, char *chrom, bits32 start, bits32 end, enum bbiSummaryType summaryType, int summarySize, double *summaryValues) /* Fill in summaryValues with data from indicated chromosome range in bigBed file. * Be sure to initialize summaryValues to a default value, which will not be touched * for regions without data in file. (Generally you want the default value to either * be 0.0 or nan("") depending on the application.) Returns FALSE if no data * at that position. */ { return bbiSummaryArray(bbi, chrom, start, end, bigBedCoverageIntervals, summaryType, summarySize, summaryValues); } +#ifdef OLD void bigBedAttachNameIndex(struct bbiFile *bbi) /* Attach name index part of bbiFile to bbi */ { -#ifdef OLD if (bbi->nameBpt == NULL) { if (bbi->nameIndexOffset == 0) errAbort("%s has no name index", bbi->fileName); udcSeek(bbi->udc, bbi->nameIndexOffset); bbi->nameBpt = bptFileAttach(bbi->fileName, bbi->udc); } -#endif /* OLD */ uglyAbort("bigBedAttachNameIndex() - no can do"); } +#endif /* OLD */ struct offsetSize /* Simple file offset and file size. */ { bits64 offset; bits64 size; }; static int cmpOffsetSizeRef(const void *va, const void *vb) /* Compare to sort slRef pointing to offsetSize. Sort is kind of hokey, * but guarantees all items that are the same will be next to each other * at least, which is all we care about. 
*/ { const struct slRef *a = *((struct slRef **)va); const struct slRef *b = *((struct slRef **)vb); @@ -259,52 +259,53 @@ fos->size = byteSwap64(lastOffsetSize.size); } else { fos->offset = lastOffsetSize.offset; fos->size = lastOffsetSize.size; } slAddHead(&fosList, fos); } } slReverse(&fosList); return fosList; } -static struct fileOffsetSize *bigBedChunksMatchingName(struct bbiFile *bbi, char *name) +static struct fileOffsetSize *bigBedChunksMatchingName(struct bbiFile *bbi, + struct bptFile *index, char *name) /* Get list of file chunks that match name. Can slFreeList this when done. */ { -struct slRef *blockList = bptFileFindMultiple(bbi->nameBpt, +struct slRef *blockList = bptFileFindMultiple(index, name, strlen(name), sizeof(struct offsetSize)); struct fileOffsetSize *fosList = fosFromRedundantBlockList(&blockList, bbi->isSwapped); slRefFreeListAndVals(&blockList); return fosList; } static struct fileOffsetSize *bigBedChunksMatchingNames(struct bbiFile *bbi, - char **names, int nameCount) + struct bptFile *index, char **names, int nameCount) /* Get list of file chunks that match any of the names. Can slFreeList this when done. */ { /* Go through all names and make a blockList that includes all blocks with any hit to any name. * Many of these blocks will occur multiple times. */ struct slRef *blockList = NULL; int nameIx; for (nameIx = 0; nameIx < nameCount; ++nameIx) { char *name = names[nameIx]; - struct slRef *oneList = bptFileFindMultiple(bbi->nameBpt, + struct slRef *oneList = bptFileFindMultiple(index, name, strlen(name), sizeof(struct offsetSize)); blockList = slCat(oneList, blockList); } /* Create nonredundant list of blocks. */ struct fileOffsetSize *fosList = fosFromRedundantBlockList(&blockList, bbi->isSwapped); /* Clean up and resturn result. */ slRefFreeListAndVals(&blockList); return fosList; } typedef boolean (*BbFirstWordMatch)(char *line, void *target); /* A function that returns TRUE if first word in tab-separated line matches target. 
*/ @@ -392,48 +393,49 @@ interval->rest = cloneString(dy->string); interval->chromId = chromIx; slAddHead(&intervalList, interval); } } /* Clean up temporary buffers. */ dyStringFree(&dy); freez(&uncompressedData); freez(&rawData); } slReverse(&intervalList); return intervalList; } -struct bigBedInterval *bigBedNameQuery(struct bbiFile *bbi, char *name, struct lm *lm) + + +struct bigBedInterval *bigBedNameQuery(struct bbiFile *bbi, struct bptFile *index, + char *name, struct lm *lm) /* Return list of intervals matching file. These intervals will be allocated out of lm. */ { -bigBedAttachNameIndex(bbi); -struct fileOffsetSize *fosList = bigBedChunksMatchingName(bbi, name); +struct fileOffsetSize *fosList = bigBedChunksMatchingName(bbi, index, name); struct bigBedInterval *intervalList = bigBedIntervalsMatchingName(bbi, fosList, bbFirstWordMatchesName, name, lm); slFreeList(&fosList); return intervalList; } -struct bigBedInterval *bigBedMultiNameQuery(struct bbiFile *bbi, char **names, - int nameCount, struct lm *lm) +struct bigBedInterval *bigBedMultiNameQuery(struct bbiFile *bbi, struct bptFile *index, + char **names, int nameCount, struct lm *lm) /* Fetch all records matching any of the names. Return list is allocated out of lm. */ { /* Set up name index and get list of chunks that match any of our names. */ -bigBedAttachNameIndex(bbi); -struct fileOffsetSize *fosList = bigBedChunksMatchingNames(bbi, names, nameCount); +struct fileOffsetSize *fosList = bigBedChunksMatchingNames(bbi, index, names, nameCount); /* Create hash of all names. */ struct hash *hash = newHash(0); int nameIx; for (nameIx=0; nameIx < nameCount; ++nameIx) hashAdd(hash, names[nameIx], NULL); /* Get intervals where name matches hash target. */ struct bigBedInterval *intervalList = bigBedIntervalsMatchingName(bbi, fosList, bbFirstWordIsInHash, hash, lm); /* Clean up and return results. 
*/ slFreeList(&fosList); hashFree(&hash); @@ -473,44 +475,50 @@ } char *bigBedAutoSqlText(struct bbiFile *bbi) /* Get autoSql text if any associated with file. Do a freeMem of this when done. */ { if (bbi->asOffset == 0) return NULL; struct udcFile *f = bbi->udc; udcSeek(f, bbi->asOffset); return udcReadStringAndZero(f); } struct asObject *bigBedAs(struct bbiFile *bbi) /* Get autoSql object definition if any associated with file. */ { +struct asObject *as = bbi->cachedAs; +if (as != NULL) + return as; if (bbi->asOffset == 0) return NULL; char *asText = bigBedAutoSqlText(bbi); -struct asObject *as = asParseText(asText); +bbi->cachedAs = as = asParseText(asText); freeMem(asText); return as; } struct asObject *bigBedAsOrDefault(struct bbiFile *bbi) // Get asObject associated with bigBed - if none exists in file make it up from field counts. { -struct asObject *as = bigBedAs(bbi); +struct asObject *as = bbi->cachedAs; +if (as != NULL) + return as; +as = bigBedAs(bbi); if (as == NULL) - as = asParseText(bedAsDef(bbi->definedFieldCount, bbi->fieldCount)); + bbi->cachedAs = as = asParseText(bedAsDef(bbi->definedFieldCount, bbi->fieldCount)); return as; } struct asObject *bigBedFileAsObjOrDefault(char *fileName) // Get asObject associated with bigBed file, or the default. { struct bbiFile *bbi = bigBedFileOpen(fileName); if (bbi) { struct asObject *as = bigBedAsOrDefault(bbi); bbiFileClose(&bbi); return as; } return NULL; } @@ -531,59 +539,105 @@ /* See if we have any extra indexes, and if so seek to there. */ bits64 offset = bbi->extraIndexListOffset; if (offset == 0) return NULL; udcSeek(udc, offset); /* Construct list of field that are being indexed. List is list of * field numbers within asObj. 
*/ int i; struct slInt *intList = NULL, *intEl; for (i=0; i<bbi->extraIndexCount; ++i) { bits16 type,fieldCount; type = udcReadBits16(udc, isSwapped); fieldCount = udcReadBits16(udc, isSwapped); - udcSeekCur(udc, 12); // skip over reserved bits + udcSeekCur(udc, sizeof(bits64)); // skip over fileOffset + udcSeekCur(udc, 4); // skip over reserved bits if (fieldCount == 1) { bits16 fieldId = udcReadBits16(udc, isSwapped); udcSeekCur(udc, 2); // skip over reserved bits intEl = slIntNew(fieldId); slAddHead(&intList, intEl); } else { warn("Not yet understanding indexes on multiple fields at once."); internalErr(); } } /* Now have to make an asObject to find out name that corresponds to this field. */ struct asObject *as = bigBedAsOrDefault(bbi); /* Make list of field names out of list of field numbers */ struct slName *nameList = NULL; for (intEl = intList; intEl != NULL; intEl = intEl->next) { struct asColumn *col = slElementFromIx(as->columnList, intEl->val); if (col == NULL) { warn("Inconsistent bigBed file %s", bbi->fileName); internalErr(); } slNameAddHead(&nameList, col->name); } -asObjectFree(&as); return nameList; } struct bptFile *bigBedOpenExtraIndex(struct bbiFile *bbi, char *fieldName) /* Return index associated with fieldName. Aborts if no such index. */ { -uglyAbort("Coming soon."); +struct udcFile *udc = bbi->udc; +boolean isSwapped = bbi->isSwapped; +struct asObject *as = bigBedAsOrDefault(bbi); +struct asColumn *col = asColumnFind(as, fieldName); +if (col == NULL) + errAbort("No field %s in %s", fieldName, bbi->fileName); +int colIx = slIxFromElement(as->columnList, col); + +/* See if we have any extra indexes, and if so seek to there. 
*/ +bits64 offset = bbi->extraIndexListOffset; +if (offset == 0) + errAbort("%s has no indexes", bbi->fileName); +udcSeek(udc, offset); + +/* Go through each extra index and see if it's a match */ +int i; +for (i=0; i<bbi->extraIndexCount; ++i) + { + bits16 type = udcReadBits16(udc, isSwapped); + bits16 fieldCount = udcReadBits16(udc, isSwapped); + bits64 fileOffset = udcReadBits64(udc, isSwapped); + udcSeekCur(udc, 4); // skip over reserved bits + + if (type != 0) + { + warn("Don't understand type %d", type); + internalErr(); + } + if (fieldCount == 1) + { + bits16 fieldId = udcReadBits16(udc, isSwapped); + udcSeekCur(udc, 2); // skip over reserved bits + if (fieldId == colIx) + { + udcSeek(udc, fileOffset); + struct bptFile *bpt = bptFileAttach(bbi->fileName, udc); + return bpt; + } + } + else + { + warn("Not yet understanding indexes on multiple fields at once."); + internalErr(); + } + } + +errAbort("%s is not indexed in %s", fieldName, bbi->fileName); return NULL; }