d10159e362021b3fbbcf57d04526c756004fdb73 kent Tue Feb 26 22:53:33 2013 -0800 Making it so that multiple items can have the same name, and all such items are found. diff --git src/utils/bigBedNamedItems/bigBedNamedItems.c src/utils/bigBedNamedItems/bigBedNamedItems.c index 6829294..a16e7a7 100644 --- src/utils/bigBedNamedItems/bigBedNamedItems.c +++ src/utils/bigBedNamedItems/bigBedNamedItems.c @@ -1,130 +1,176 @@ /* bigBedNamedItems - Extract item(s) of given name(s) from bigBed. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "localmem.h" #include "udc.h" #include "bPlusTree.h" #include "bigBed.h" #include "obscure.h" #include "zlibFace.h" void usage() /* Explain usage and exit. */ { errAbort( "bigBedNamedItems - Extract item of given name from bigBed\n" "usage:\n" " bigBedNamedItems file.bb name output.bed\n" "options:\n" " -xxx=XXX\n" ); } /* Command line validation table. */ static struct optionSpec options[] = { {NULL, 0}, }; void bigBedAttachNameIndex(struct bbiFile *bbi) /* Attach name index part of bbiFile to bbi */ { if (bbi->nameBpt == NULL) { if (bbi->nameIndexOffset == 0) errAbort("%s has no name index", bbi->fileName); udcSeek(bbi->udc, bbi->nameIndexOffset); bbi->nameBpt = bptFileAttach(bbi->fileName, bbi->udc); } } -boolean bigBedNameQuery(struct bbiFile *bbi, char *name, FILE *f) -/* Write item matching name to file. Return TRUE if anything written. */ +struct offsetSize +/* Simple file offset and file size. */ + { + bits64 offset; + bits64 size; + }; + +int cmpOffsetSizeRef(const void *va, const void *vb) +/* Compare to sort slRef pointing to offsetSize. Sort is kind of hokey, + * but guarantees all items that are the same will be next to each other + * at least, which is all we care about. 
*/ +{ +const struct slRef *a = *((struct slRef **)va); +const struct slRef *b = *((struct slRef **)vb); +return memcmp(a->val, b->val, sizeof(struct offsetSize)); +} + +struct fileOffsetSize *bigBedChunksMatchingName(struct bbiFile *bbi, char *name) +/* Get list of file chunks that match name. Can slFreeList this when done. */ { bigBedAttachNameIndex(bbi); -boolean isSwapped = bbi->isSwapped; -struct offsetSize {bits64 offset; bits64 size;} block; -boolean didWrite = FALSE; -if (bptFileFind(bbi->nameBpt, name, strlen(name), &block, sizeof(block))) +struct slRef *blockList = bptFileFindMultiple(bbi->nameBpt, name, strlen(name), sizeof(struct offsetSize)); +slSort(&blockList, cmpOffsetSizeRef); + +struct fileOffsetSize *fosList = NULL, *fos; +struct offsetSize lastOffsetSize = {0,0}; +struct slRef *blockRef; +for (blockRef = blockList; blockRef != NULL; blockRef = blockRef->next) + { + if (memcmp(&lastOffsetSize, blockRef->val, sizeof(lastOffsetSize)) != 0) { + memcpy(&lastOffsetSize, blockRef->val, sizeof(lastOffsetSize)); + AllocVar(fos); if (bbi->isSwapped) { - block.offset = byteSwap64(block.offset); - block.size = byteSwap64(block.size); + fos->offset = byteSwap64(lastOffsetSize.offset); + fos->size = byteSwap64(lastOffsetSize.size); + } + else + { + fos->offset = lastOffsetSize.offset; + fos->size = lastOffsetSize.size; + } + slAddHead(&fosList, fos); + } + } +slRefFreeListAndVals(&blockList); +slReverse(&fosList); +return fosList; } +boolean bigBedNameQuery(struct bbiFile *bbi, char *name, FILE *f) +/* Write item matching name to file. Return TRUE if anything written. 
*/ +{ +bigBedAttachNameIndex(bbi); +boolean isSwapped = bbi->isSwapped; +struct fileOffsetSize *fos, *fosList = bigBedChunksMatchingName(bbi, name); +boolean didWrite = FALSE; +for (fos = fosList; fos != NULL; fos = fos->next) + { /* Read in raw data */ - udcSeek(bbi->udc, block.offset); - char *rawData = needLargeMem(block.size); - udcRead(bbi->udc, rawData, block.size); + udcSeek(bbi->udc, fos->offset); + char *rawData = needLargeMem(fos->size); + udcRead(bbi->udc, rawData, fos->size); /* Optionally uncompress data, and set data pointer to uncompressed version. */ char *uncompressedData = NULL; char *data = NULL; int dataSize = 0; if (bbi->uncompressBufSize > 0) { data = uncompressedData = needLargeMem(bbi->uncompressBufSize); - dataSize = zUncompress(rawData, block.size, uncompressedData, bbi->uncompressBufSize); + dataSize = zUncompress(rawData, fos->size, uncompressedData, bbi->uncompressBufSize); } else { data = rawData; - dataSize = block.size; + dataSize = fos->size; } /* Set up for "memRead" routines to more or less treat memory block like file */ char *blockPt = data, *blockEnd = data + dataSize; struct dyString *dy = dyStringNew(32); // Keep bits outside of chrom/start/end here /* Read next record into local variables. */ while (blockPt < blockEnd) { bits32 chromIx = memReadBits32(&blockPt, isSwapped); bits32 s = memReadBits32(&blockPt, isSwapped); bits32 e = memReadBits32(&blockPt, isSwapped); int c; dyStringClear(dy); while ((c = *blockPt++) >= 0) { if (c == 0) break; dyStringAppendC(dy, c); } if (startsWithWordByDelimiter(name, '\t', dy->string)) { char chromName[bbi->chromBpt->keySize+1]; bptStringKeyAtPos(bbi->chromBpt, chromIx, chromName, sizeof(chromName)); fprintf(f, "%s\t%u\t%u\t%s\n", chromName, s, e, dy->string); didWrite = TRUE; } } /* Clean up temporary buffers. 
*/ dyStringFree(&dy); freez(&uncompressedData); freez(&rawData); } +slFreeList(&fosList); return didWrite; } void bigBedNamedItems(char *bigBedFile, char *name, char *outFile) /* bigBedNamedItems - Extract item(s) of given name(s) from bigBed. */ { struct bbiFile *bbi = bigBedFileOpen(bigBedFile); FILE *f = mustOpen(outFile, "w"); bigBedNameQuery(bbi, name, f); carefulClose(&f); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 4) usage(); bigBedNamedItems(argv[1], argv[2], argv[3]); return 0; }