efd363d44327be79c6b602c165cae2bf19242461 kent Wed Feb 27 22:08:39 2013 -0800 Adding routine to do relatively efficient finding of multiple names in a bigBed name index, and giving bigBedNamedItems a command line option to do this. diff --git src/lib/bigBed.c src/lib/bigBed.c index 246690c..07f225c 100644 --- src/lib/bigBed.c +++ src/lib/bigBed.c @@ -219,71 +219,139 @@ { bits64 offset; bits64 size; }; static int cmpOffsetSizeRef(const void *va, const void *vb) /* Compare to sort slRef pointing to offsetSize. Sort is kind of hokey, * but guarantees all items that are the same will be next to each other * at least, which is all we care about. */ { const struct slRef *a = *((struct slRef **)va); const struct slRef *b = *((struct slRef **)vb); return memcmp(a->val, b->val, sizeof(struct offsetSize)); } -static struct fileOffsetSize *bigBedChunksMatchingName(struct bbiFile *bbi, char *name) -/* Get list of file chunks that match name. Can slFreeList this when done. */ +static struct fileOffsetSize *fosFromRedundantBlockList(struct slRef **pBlockList, + boolean isSwapped) +/* Convert from list of references to offsetSize format to list of fileOffsetSize + * format, while removing redundancy. Sorts *pBlockList as a side effect. */ { -bigBedAttachNameIndex(bbi); -struct slRef *blockList = bptFileFindMultiple(bbi->nameBpt, name, strlen(name), sizeof(struct offsetSize)); -slSort(&blockList, cmpOffsetSizeRef); +/* Sort input so it it easy to uniquify. */ +slSort(pBlockList, cmpOffsetSizeRef); +struct slRef *blockList = *pBlockList; +/* Make new fileOffsetSize for each unique offsetSize. */ struct fileOffsetSize *fosList = NULL, *fos; struct offsetSize lastOffsetSize = {0,0}; struct slRef *blockRef; for (blockRef = blockList; blockRef != NULL; blockRef = blockRef->next) { if (memcmp(&lastOffsetSize, blockRef->val, sizeof(lastOffsetSize)) != 0) { memcpy(&lastOffsetSize, blockRef->val, sizeof(lastOffsetSize)); AllocVar(fos); - if (bbi->isSwapped) + if (isSwapped) { fos->offset = byteSwap64(lastOffsetSize.offset); fos->size = byteSwap64(lastOffsetSize.size); } else { fos->offset = lastOffsetSize.offset; fos->size = lastOffsetSize.size; } slAddHead(&fosList, fos); } } -slRefFreeListAndVals(&blockList); slReverse(&fosList); return fosList; } -struct bigBedInterval *bigBedNameQuery(struct bbiFile *bbi, char *name, struct lm *lm) -/* Return list of intervals matching file. These intervals will be allocated out of lm. */ + +static struct fileOffsetSize *bigBedChunksMatchingName(struct bbiFile *bbi, char *name) +/* Get list of file chunks that match name. Can slFreeList this when done. */ +{ +struct slRef *blockList = bptFileFindMultiple(bbi->nameBpt, + name, strlen(name), sizeof(struct offsetSize)); +struct fileOffsetSize *fosList = fosFromRedundantBlockList(&blockList, bbi->isSwapped); +slRefFreeListAndVals(&blockList); +return fosList; +} + +static struct fileOffsetSize *bigBedChunksMatchingNames(struct bbiFile *bbi, + char **names, int nameCount) +/* Get list of file chunks that match any of the names. Can slFreeList this when done. */ +{ +/* Go through all names and make a blockList that includes all blocks with any hit to any name. + * Many of these blocks will occur multiple times. */ +struct slRef *blockList = NULL; +int nameIx; +for (nameIx = 0; nameIx < nameCount; ++nameIx) + { + char *name = names[nameIx]; + struct slRef *oneList = bptFileFindMultiple(bbi->nameBpt, + name, strlen(name), sizeof(struct offsetSize)); + blockList = slCat(oneList, blockList); + } + +/* Create nonredundant list of blocks. */ +struct fileOffsetSize *fosList = fosFromRedundantBlockList(&blockList, bbi->isSwapped); + +/* Clean up and resturn result. */ +slRefFreeListAndVals(&blockList); +return fosList; +} + +typedef boolean (*BbFirstWordMatch)(char *line, void *target); +/* A function that returns TRUE if first word in tab-separated line matches target. */ + + +static boolean bbFirstWordMatchesName(char *line, void *target) +/* Return true if first word of line is same as target, which is just a string. */ +{ +char *name = target; +return startsWithWordByDelimiter(name, '\t', line); +} + +static boolean bbFirstWordIsInHash(char *line, void *target) +/* Return true if first word of line is same as target, which is just a string. */ +{ +/* Isolate out first word. */ +int firstWordSize; +char *tab = strchr(line, '\t'); +if (tab == NULL) + firstWordSize = strlen(line); +else + firstWordSize = tab - line; +char firstWord[firstWordSize+1]; +memcpy(firstWord, line, firstWordSize); +firstWord[firstWordSize] = 0; + +/* Return boolean value that reflects whether we found it in hash */ +struct hash *hash = target; +return hashLookup(hash, firstWord) != NULL; +} + +static struct bigBedInterval *bigBedIntervalsMatchingName(struct bbiFile *bbi, + struct fileOffsetSize *fosList, BbFirstWordMatch matcher, void *target, struct lm *lm) +/* Return list of intervals inside of sectors of bbiFile defined by fosList where the name + * matches target somehow. */ { -bigBedAttachNameIndex(bbi); -boolean isSwapped = bbi->isSwapped; -struct fileOffsetSize *fos, *fosList = bigBedChunksMatchingName(bbi, name); struct bigBedInterval *interval, *intervalList = NULL; +struct fileOffsetSize *fos; +boolean isSwapped = bbi->isSwapped; for (fos = fosList; fos != NULL; fos = fos->next) { /* Read in raw data */ udcSeek(bbi->udc, fos->offset); char *rawData = needLargeMem(fos->size); udcRead(bbi->udc, rawData, fos->size); /* Optionally uncompress data, and set data pointer to uncompressed version. */ char *uncompressedData = NULL; char *data = NULL; int dataSize = 0; if (bbi->uncompressBufSize > 0) { data = uncompressedData = needLargeMem(bbi->uncompressBufSize); dataSize = zUncompress(rawData, fos->size, uncompressedData, bbi->uncompressBufSize); @@ -301,51 +369,86 @@ /* Read next record into local variables. */ while (blockPt < blockEnd) { bits32 chromIx = memReadBits32(&blockPt, isSwapped); bits32 s = memReadBits32(&blockPt, isSwapped); bits32 e = memReadBits32(&blockPt, isSwapped); int c; dyStringClear(dy); while ((c = *blockPt++) >= 0) { if (c == 0) break; dyStringAppendC(dy, c); } - if (startsWithWordByDelimiter(name, '\t', dy->string)) + if ((*matcher)(dy->string, target)) { lmAllocVar(lm, interval); interval->start = s; interval->end = e; interval->rest = cloneString(dy->string); interval->chromId = chromIx; slAddHead(&intervalList, interval); } } /* Clean up temporary buffers. */ dyStringFree(&dy); freez(&uncompressedData); freez(&rawData); } -slFreeList(&fosList); slReverse(&intervalList); return intervalList; } +struct bigBedInterval *bigBedNameQuery(struct bbiFile *bbi, char *name, struct lm *lm) +/* Return list of intervals matching file. These intervals will be allocated out of lm. */ +{ +bigBedAttachNameIndex(bbi); +struct fileOffsetSize *fosList = bigBedChunksMatchingName(bbi, name); +struct bigBedInterval *intervalList = bigBedIntervalsMatchingName(bbi, fosList, + bbFirstWordMatchesName, name, lm); +slFreeList(&fosList); +return intervalList; +} + +struct bigBedInterval *bigBedMultiNameQuery(struct bbiFile *bbi, char **names, + int nameCount, struct lm *lm) +/* Fetch all records matching any of the names. Return list is allocated out of lm. */ +{ +/* Set up name index and get list of chunks that match any of our names. */ +bigBedAttachNameIndex(bbi); +struct fileOffsetSize *fosList = bigBedChunksMatchingNames(bbi, names, nameCount); + +/* Create hash of all names. */ +struct hash *hash = newHash(0); +int nameIx; +for (nameIx=0; nameIx < nameCount; ++nameIx) + hashAdd(hash, names[nameIx], NULL); + + +/* Get intervals where name matches hash target. */ +struct bigBedInterval *intervalList = bigBedIntervalsMatchingName(bbi, fosList, + bbFirstWordIsInHash, hash, lm); + +/* Clean up and return results. */ +slFreeList(&fosList); +hashFree(&hash); +return intervalList; +} + void bigBedIntervalListToBedFile(struct bbiFile *bbi, struct bigBedInterval *intervalList, FILE *f) /* Write out big bed interval list to bed file, looking up chromosome. */ { char chromName[bbi->chromBpt->keySize+1]; int lastChromId = -1; struct bigBedInterval *interval; for (interval = intervalList; interval != NULL; interval = interval->next) { bbiCachedChromLookup(bbi, interval->chromId, lastChromId, chromName, sizeof(chromName)); lastChromId = interval->chromId; fprintf(f, "%s\t%u\t%u\t%s\n", chromName, interval->start, interval->end, interval->rest); } } int bigBedIntervalToRowLookupChrom(struct bigBedInterval *interval,