c442d3afa2063b13c887983131439855665a79a0 kent Wed Feb 27 10:39:02 2013 -0800 Moving general purpose routines from bigBedNamedItems to appropriate library. diff --git src/lib/bigBed.c src/lib/bigBed.c index 480e797..246690c 100644 --- src/lib/bigBed.c +++ src/lib/bigBed.c @@ -190,30 +190,194 @@ summarySize, summary); } boolean bigBedSummaryArray(struct bbiFile *bbi, char *chrom, bits32 start, bits32 end, enum bbiSummaryType summaryType, int summarySize, double *summaryValues) /* Fill in summaryValues with data from indicated chromosome range in bigBed file. * Be sure to initialize summaryValues to a default value, which will not be touched * for regions without data in file. (Generally you want the default value to either * be 0.0 or nan("") depending on the application.) Returns FALSE if no data * at that position. */ { return bbiSummaryArray(bbi, chrom, start, end, bigBedCoverageIntervals, summaryType, summarySize, summaryValues); } +void bigBedAttachNameIndex(struct bbiFile *bbi) +/* Attach name index part of bbiFile to bbi */ +{ +if (bbi->nameBpt == NULL) + { + if (bbi->nameIndexOffset == 0) + errAbort("%s has no name index", bbi->fileName); + udcSeek(bbi->udc, bbi->nameIndexOffset); + bbi->nameBpt = bptFileAttach(bbi->fileName, bbi->udc); + } +} + +struct offsetSize +/* Simple file offset and file size. */ + { + bits64 offset; + bits64 size; + }; + +static int cmpOffsetSizeRef(const void *va, const void *vb) +/* Compare to sort slRef pointing to offsetSize. Sort is kind of hokey, + * but guarantees all items that are the same will be next to each other + * at least, which is all we care about. */ +{ +const struct slRef *a = *((struct slRef **)va); +const struct slRef *b = *((struct slRef **)vb); +return memcmp(a->val, b->val, sizeof(struct offsetSize)); +} + +static struct fileOffsetSize *bigBedChunksMatchingName(struct bbiFile *bbi, char *name) +/* Get list of file chunks that match name. Can slFreeList this when done. */ +{ +bigBedAttachNameIndex(bbi); +struct slRef *blockList = bptFileFindMultiple(bbi->nameBpt, name, strlen(name), sizeof(struct offsetSize)); +slSort(&blockList, cmpOffsetSizeRef); + +struct fileOffsetSize *fosList = NULL, *fos; +struct offsetSize lastOffsetSize = {0,0}; +struct slRef *blockRef; +for (blockRef = blockList; blockRef != NULL; blockRef = blockRef->next) + { + if (memcmp(&lastOffsetSize, blockRef->val, sizeof(lastOffsetSize)) != 0) + { + memcpy(&lastOffsetSize, blockRef->val, sizeof(lastOffsetSize)); + AllocVar(fos); + if (bbi->isSwapped) + { + fos->offset = byteSwap64(lastOffsetSize.offset); + fos->size = byteSwap64(lastOffsetSize.size); + } + else + { + fos->offset = lastOffsetSize.offset; + fos->size = lastOffsetSize.size; + } + slAddHead(&fosList, fos); + } + } +slRefFreeListAndVals(&blockList); +slReverse(&fosList); +return fosList; +} + +struct bigBedInterval *bigBedNameQuery(struct bbiFile *bbi, char *name, struct lm *lm) +/* Return list of intervals matching file. These intervals will be allocated out of lm. */ +{ +bigBedAttachNameIndex(bbi); +boolean isSwapped = bbi->isSwapped; +struct fileOffsetSize *fos, *fosList = bigBedChunksMatchingName(bbi, name); +struct bigBedInterval *interval, *intervalList = NULL; +for (fos = fosList; fos != NULL; fos = fos->next) + { + /* Read in raw data */ + udcSeek(bbi->udc, fos->offset); + char *rawData = needLargeMem(fos->size); + udcRead(bbi->udc, rawData, fos->size); + + /* Optionally uncompress data, and set data pointer to uncompressed version. */ + char *uncompressedData = NULL; + char *data = NULL; + int dataSize = 0; + if (bbi->uncompressBufSize > 0) + { + data = uncompressedData = needLargeMem(bbi->uncompressBufSize); + dataSize = zUncompress(rawData, fos->size, uncompressedData, bbi->uncompressBufSize); + } + else + { + data = rawData; + dataSize = fos->size; + } + + /* Set up for "memRead" routines to more or less treat memory block like file */ + char *blockPt = data, *blockEnd = data + dataSize; + struct dyString *dy = dyStringNew(32); // Keep bits outside of chrom/start/end here + + + /* Read next record into local variables. */ + while (blockPt < blockEnd) + { + bits32 chromIx = memReadBits32(&blockPt, isSwapped); + bits32 s = memReadBits32(&blockPt, isSwapped); + bits32 e = memReadBits32(&blockPt, isSwapped); + int c; + dyStringClear(dy); + while ((c = *blockPt++) >= 0) + { + if (c == 0) + break; + dyStringAppendC(dy, c); + } + if (startsWithWordByDelimiter(name, '\t', dy->string)) + { + lmAllocVar(lm, interval); + interval->start = s; + interval->end = e; + interval->rest = cloneString(dy->string); + interval->chromId = chromIx; + slAddHead(&intervalList, interval); + } + } + + /* Clean up temporary buffers. */ + dyStringFree(&dy); + freez(&uncompressedData); + freez(&rawData); + } +slFreeList(&fosList); +slReverse(&intervalList); +return intervalList; +} + +void bigBedIntervalListToBedFile(struct bbiFile *bbi, struct bigBedInterval *intervalList, FILE *f) +/* Write out big bed interval list to bed file, looking up chromosome. */ +{ +char chromName[bbi->chromBpt->keySize+1]; +int lastChromId = -1; +struct bigBedInterval *interval; +for (interval = intervalList; interval != NULL; interval = interval->next) + { + bbiCachedChromLookup(bbi, interval->chromId, lastChromId, chromName, sizeof(chromName)); + lastChromId = interval->chromId; + fprintf(f, "%s\t%u\t%u\t%s\n", chromName, interval->start, interval->end, interval->rest); + } +} + +int bigBedIntervalToRowLookupChrom(struct bigBedInterval *interval, + struct bigBedInterval *prevInterval, struct bbiFile *bbi, + char *chromBuf, int chromBufSize, char *startBuf, char *endBuf, char **row, int rowSize) +/* Convert bigBedInterval to array of chars equivalend to what you'd get by parsing the + * bed file. If you already know what chromosome the interval is on use the simpler + * bigBedIntervalToRow. This one will look up the chromosome based on the chromId field + * of the interval, which is relatively time consuming. To avoid doing this unnecessarily + * pass in a non-NULL prevInterval, and if the chromId is the same on prevInterval as this, + * it will avoid the lookup. The chromBufSize should be at greater or equal to + * bbi->chromBpt->keySize+1. The startBuf and endBuf are used to hold the ascii representation of + * start and end, and should be 16 bytes. Note that the interval->rest string will have zeroes + * inserted as a side effect. Returns number of fields in row. */ +{ +int lastChromId = (prevInterval == NULL ? -1 : prevInterval->chromId); +bbiCachedChromLookup(bbi, interval->chromId, lastChromId, chromBuf, chromBufSize); +return bigBedIntervalToRow(interval, chromBuf, startBuf, endBuf, row, rowSize); +} + char *bigBedAutoSqlText(struct bbiFile *bbi) /* Get autoSql text if any associated with file. Do a freeMem of this when done. */ { if (bbi->asOffset == 0) return NULL; struct udcFile *f = bbi->udc; udcSeek(f, bbi->asOffset); return udcReadStringAndZero(f); } struct asObject *bigBedAs(struct bbiFile *bbi) /* Get autoSql object definition if any associated with file. */ { if (bbi->asOffset == 0) return NULL;