src/lib/bigBed.c 1.10
1.10 2009/02/10 22:06:09 kent
Adding fieldCount and definedFieldCount to bbiFile header for bigBeds.
Index: src/lib/bigBed.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/lib/bigBed.c,v
retrieving revision 1.9
retrieving revision 1.10
diff -b -B -U 1000000 -r1.9 -r1.10
--- src/lib/bigBed.c 9 Feb 2009 02:51:42 -0000 1.9
+++ src/lib/bigBed.c 10 Feb 2009 22:06:09 -0000 1.10
@@ -1,521 +1,532 @@
/* bigBed - interface to binary file with bed-style values (that is a bunch of
* possibly overlapping regions. */
#include "common.h"
#include "hash.h"
#include "linefile.h"
#include "localmem.h"
#include "obscure.h"
#include "dystring.h"
#include "rangeTree.h"
#include "cirTree.h"
#include "bPlusTree.h"
#include "sig.h"
#include "udc.h"
#include "bbiFile.h"
#include "bigBed.h"
struct bbiFile *bigBedFileOpen(char *fileName)
/* Open up big bed file. */
{
return bbiFileOpen(fileName, bigBedSig, "big bed");
}
struct ppBed
/* A partially parsed out bed record plus some extra fields. */
{
struct ppBed *next; /* Next in list. */
char *chrom; /* Chromosome name (not allocated here) */
bits32 start, end; /* Range inside chromosome - half open zero based. */
char *rest; /* The rest of the bed. */
bits64 fileOffset; /* File offset. */
bits32 chromId; /* Chromosome ID. */
};
static int ppBedCmp(const void *va, const void *vb)
/* Compare to sort based on chrom,start,end. */
{
const struct ppBed *a = *((struct ppBed **)va);
const struct ppBed *b = *((struct ppBed **)vb);
int dif;
dif = strcmp(a->chrom, b->chrom);
if (dif == 0)
{
dif = a->start - b->start;
if (dif == 0)
dif = a->end - b->end;
}
return dif;
}
static struct cirTreeRange ppBedFetchKey(const void *va, void *context)
/* Fetch ppBed key for r-tree */
{
struct cirTreeRange res;
const struct ppBed *a = *((struct ppBed **)va);
res.chromIx = a->chromId;
res.start = a->start;
res.end = a->end;
return res;
}
static bits64 ppBedFetchOffset(const void *va, void *context)
/* Fetch ppBed file offset for r-tree */
{
const struct ppBed *a = *((struct ppBed **)va);
return a->fileOffset;
}
static struct ppBed *ppBedLoadAll(char *fileName, struct hash *chromHash, struct lm *lm,
- bits64 *retDiskSize)
+ bits64 *retDiskSize, bits16 *retFieldCount)
/* Read bed file and return it as list of ppBeds. The whole thing will
* be allocated in the passed in lm - don't ppBedFreeList or slFree
* list! */
{
struct ppBed *pbList = NULL, *pb;
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *line;
bits64 diskSize = 0;
+int fieldCount = 0;
while (lineFileNextReal(lf, &line))
{
+ if (fieldCount == 0)
+ fieldCount = chopByWhite(line, NULL, 0);
int i;
char *words[3];
for (i=0; i<3; ++i)
words[i] = nextWord(&line);
lmAllocVar(lm, pb);
char *chrom = words[0];
struct hashEl *hel = hashLookup(chromHash, chrom);
if (hel == NULL)
errAbort("%s is not in chrom.sizes line %d of %s", chrom, lf->lineIx, lf->fileName);
pb->chrom = hel->name;
pb->start = lineFileNeedNum(lf, words, 1);
pb->end = lineFileNeedNum(lf, words, 2);
int len = 0;
line = skipLeadingSpaces(line);
if (line != NULL)
{
len = strlen(line);
pb->rest = lmCloneString(lm, skipLeadingSpaces(line));
}
diskSize += len + 1 + 3*sizeof(bits32);
slAddHead(&pbList, pb);
}
slReverse(&pbList);
*retDiskSize = diskSize;
+*retFieldCount = fieldCount;
return pbList;
}
static double ppBedAverageSize(struct ppBed *pbList)
/* Return size of average bed in list. */
{
struct ppBed *pb;
double total = 0.0;
long count = 0;
for (pb = pbList; pb != NULL; pb = pb->next)
{
total += pb->end - pb->start;
count += 1;
}
return total/count;
}
static void makeChromInfo(struct ppBed *pbList, struct hash *chromSizeHash,
int *retChromCount, struct bbiChromInfo **retChromArray,
int *retMaxChromNameSize)
/* Fill in chromId field in pbList. Return array of chromosome name/ids.
* The chromSizeHash is keyed by name, and has int values. */
{
/* Build up list of unique chromosome names. */
struct ppBed *pb;
char *chromName = "";
int chromCount = 0;
int maxChromNameSize = 0;
struct slRef *uniq, *uniqList = NULL;
for (pb = pbList; pb != NULL; pb = pb->next)
{
if (!sameString(pb->chrom, chromName))
{
chromName = pb->chrom;
refAdd(&uniqList, chromName);
++chromCount;
int len = strlen(chromName);
if (len > maxChromNameSize)
maxChromNameSize = len;
}
pb->chromId = chromCount-1;
}
slReverse(&uniqList);
/* Allocate and fill in results array. */
struct bbiChromInfo *chromArray;
AllocArray(chromArray, chromCount);
int i;
for (i = 0, uniq = uniqList; i < chromCount; ++i, uniq = uniq->next)
{
chromArray[i].name = uniq->val;
chromArray[i].id = i;
chromArray[i].size = hashIntVal(chromSizeHash, uniq->val);
}
/* Clean up, set return values and go home. */
slFreeList(&uniqList);
*retChromCount = chromCount;
*retChromArray = chromArray;
*retMaxChromNameSize = maxChromNameSize;
}
static struct bbiSummary *summaryOnDepth(struct ppBed *pbList, struct bbiChromInfo *chromInfoArray,
int reduction)
/* Produce a summary based on depth of coverage. */
{
struct bbiSummary *outList = NULL;
struct ppBed *chromStart, *chromEnd;
for (chromStart = pbList; chromStart != NULL; chromStart = chromEnd)
{
/* Figure out where end of data on this chromosome is. */
bits32 chromId = chromStart->chromId;
for (chromEnd = chromStart; chromEnd != NULL; chromEnd = chromEnd->next)
if (chromEnd->chromId != chromId)
break;
/* Get a range tree */
struct rbTree *chromRanges = rangeTreeNew();
struct ppBed *pb;
for (pb = chromStart; pb != chromEnd; pb = pb->next)
rangeTreeAddToCoverageDepth(chromRanges, pb->start, pb->end);
struct range *range, *rangeList = rangeTreeList(chromRanges);
verbose(3, "Got %d ranges on %d\n", slCount(rangeList), chromId);
for (range = rangeList; range != NULL; range = range->next)
{
bbiAddRangeToSummary(chromId, chromInfoArray[chromId].size, range->start, range->end,
ptToInt(range->val), reduction, &outList);
}
}
slReverse(&outList);
return outList;
}
void bigBedFileCreate(
char *inName, /* Input file in a tabular bed format <chrom><start><end> + whatever. */
char *chromSizes, /* Two column tab-separated file: <chromosome> <size>. */
int blockSize, /* Number of items to bundle in r-tree. 1024 is good. */
int itemsPerSlot, /* Number of items in lowest level of tree. 64 is good. */
+ bits16 definedFieldCount, /* Number of defined bed fields - 3-16 or so. 0 means all fields
+ * are the defined bed ones. */
char *outName) /* BigBed output file name. */
/* Convert tab-separated bed file to binary indexed, zoomed bigBed version. */
{
/* This code needs to agree with code in two other places currently - bigWigFileCreate,
* and bbiFileOpen. I'm thinking of refactoring to share at least between
* bigBedFileCreate and bigWigFileCreate. It'd be great so it could be structured
* so that it could send the input in one chromosome at a time, and send in the zoom
* stuff only after all the chromosomes are done. This'd potentially reduce the memory
* footprint by a factor of 2 or 4. Still, for now it works. -JK */
/* Declare a bunch of variables. */
int i;
bits32 reserved32 = 0;
bits64 reserved64 = 0;
bits64 dataOffset = 0, dataOffsetPos;
bits64 indexOffset = 0, indexOffsetPos;
bits64 chromTreeOffset = 0, chromTreeOffsetPos;
bits32 sig = bigBedSig;
struct bbiSummary *reduceSummaries[10];
bits32 reductionAmounts[10];
bits64 reductionDataOffsetPos[10];
bits64 reductionDataOffsets[10];
bits64 reductionIndexOffsets[10];
+bits16 fieldCount;
/* Load in chromosome sizes. */
struct hash *chromHash = bbiChromSizesFromFile(chromSizes);
verbose(1, "Read %d chromosomes and sizes from %s\n", chromHash->elCount, chromSizes);
/* Load and sort input file. */
bits64 fullSize;
-struct ppBed *pb, *pbList = ppBedLoadAll(inName, chromHash, chromHash->lm, &fullSize);
+struct ppBed *pb, *pbList = ppBedLoadAll(inName, chromHash, chromHash->lm, &fullSize, &fieldCount);
+if (definedFieldCount == 0)
+ definedFieldCount = fieldCount;
bits64 pbCount = slCount(pbList);
verbose(1, "Read %llu items from %s\n", pbCount, inName);
slSort(&pbList, ppBedCmp);
/* Make chromosomes. */
int chromCount, maxChromNameSize;
struct bbiChromInfo *chromInfoArray;
makeChromInfo(pbList, chromHash, &chromCount, &chromInfoArray, &maxChromNameSize);
verbose(1, "%d of %d chromosomes used (%4.2f%%)\n", chromCount, chromHash->elCount,
100.0*chromCount/chromHash->elCount);
/* Calculate first meaningful reduction size and make first reduction. */
bits16 summaryCount = 0;
bits16 version = 1;
int initialReduction = ppBedAverageSize(pbList)*10;
verbose(2, "averageBedSize=%d\n", initialReduction/10);
if (initialReduction > 10000)
initialReduction = 10000;
bits64 lastSummarySize = 0, summarySize;
struct bbiSummary *summaryList, *firstSummaryList;
for (;;)
{
summaryList = summaryOnDepth(pbList, chromInfoArray, initialReduction);
summarySize = bbiTotalSummarySize(summaryList);
if (summarySize >= fullSize && summarySize != lastSummarySize)
{
initialReduction *= 2;
bbiSummaryFreeList(&summaryList);
lastSummarySize = summarySize;
}
else
break;
}
summaryCount = 1;
reduceSummaries[0] = firstSummaryList = summaryList;
reductionAmounts[0] = initialReduction;
verbose(1, "Initial zoom reduction x%d data size %4.2f%%\n",
initialReduction, 100.0 * summarySize/fullSize);
/* Now calculate up to 10 levels of further summary. */
bits64 reduction = initialReduction;
for (i=0; i<ArraySize(reduceSummaries)-1; i++)
{
reduction *= 10;
if (reduction > 1000000000)
break;
summaryList = bbiReduceSummaryList(reduceSummaries[summaryCount-1], chromInfoArray,
reduction);
summarySize = bbiTotalSummarySize(summaryList);
if (summarySize != lastSummarySize)
{
reduceSummaries[summaryCount] = summaryList;
reductionAmounts[summaryCount] = reduction;
++summaryCount;
}
int summaryItemCount = slCount(summaryList);
if (summaryItemCount <= chromCount)
break;
}
/* Open output file and write out fixed size header, including some dummy values for
* offsets we'll fill in later. */
FILE *f = mustOpen(outName, "wb");
writeOne(f, sig);
writeOne(f, version);
writeOne(f, summaryCount);
chromTreeOffsetPos = ftell(f);
writeOne(f, chromTreeOffset);
dataOffsetPos = ftell(f);
writeOne(f, dataOffset);
indexOffsetPos = ftell(f);
writeOne(f, indexOffset);
-for (i=0; i<8; ++i)
+writeOne(f, fieldCount);
+writeOne(f, definedFieldCount);
+for (i=0; i<7; ++i)
writeOne(f, reserved32);
/* Write summary headers */
for (i=0; i<summaryCount; ++i)
{
writeOne(f, reductionAmounts[i]);
writeOne(f, reserved32);
reductionDataOffsetPos[i] = ftell(f);
writeOne(f, reserved64); // Fill in with data offset later
writeOne(f, reserved64); // Fill in with index offset later
}
/* Write chromosome bPlusTree */
chromTreeOffset = ftell(f);
int chromBlockSize = min(blockSize, chromCount);
bptFileBulkIndexToOpenFile(chromInfoArray, sizeof(chromInfoArray[0]), chromCount, chromBlockSize,
bbiChromInfoKey, maxChromNameSize, bbiChromInfoVal,
sizeof(chromInfoArray[0].id) + sizeof(chromInfoArray[0].size),
f);
/* Write the main data. */
dataOffset = ftell(f);
writeOne(f, pbCount);
for (pb = pbList; pb != NULL; pb = pb->next)
{
pb->fileOffset = ftell(f);
writeOne(f, pb->chromId);
writeOne(f, pb->start);
writeOne(f, pb->end);
if (pb->rest != NULL)
{
int len = 0;
len = strlen(pb->rest);
mustWrite(f, pb->rest, len);
}
putc(0, f);
}
/* Write the main index. */
indexOffset = ftell(f);
struct ppBed **pbArray;
AllocArray(pbArray, pbCount);
for (pb = pbList, i=0; pb != NULL; pb = pb->next, ++i)
pbArray[i] = pb;
cirTreeFileBulkIndexToOpenFile(pbArray, sizeof(pbArray[0]), pbCount,
blockSize, itemsPerSlot, NULL, ppBedFetchKey, ppBedFetchOffset,
indexOffset, f);
freez(&pbArray);
/* Write out summary sections. */
verbose(2, "bigBedFileCreate writing %d summaries\n", summaryCount);
for (i=0; i<summaryCount; ++i)
{
reductionDataOffsets[i] = ftell(f);
reductionIndexOffsets[i] = bbiWriteSummaryAndIndex(reduceSummaries[i], blockSize, itemsPerSlot, f);
}
/* Go back and fill in offsets properly in header. */
fseek(f, dataOffsetPos, SEEK_SET);
writeOne(f, dataOffset);
fseek(f, indexOffsetPos, SEEK_SET);
writeOne(f, indexOffset);
fseek(f, chromTreeOffsetPos, SEEK_SET);
writeOne(f, chromTreeOffset);
/* Also fill in offsets in zoom headers. */
for (i=0; i<summaryCount; ++i)
{
fseek(f, reductionDataOffsetPos[i], SEEK_SET);
writeOne(f, reductionDataOffsets[i]);
writeOne(f, reductionIndexOffsets[i]);
}
carefulClose(&f);
freez(&chromInfoArray);
}
struct bigBedInterval *bigBedIntervalQuery(struct bbiFile *bbi, char *chrom,
bits32 start, bits32 end, struct lm *lm)
/* Get data for interval. Return list allocated out of lm. */
{
bbiAttachUnzoomedCir(bbi);
struct bigBedInterval *el, *list = NULL;
bits32 chromId;
struct fileOffsetSize *blockList = bbiOverlappingBlocks(bbi, bbi->unzoomedCir,
chrom, start, end, &chromId);
struct fileOffsetSize *block;
struct udcFile *f = bbi->udc;
boolean isSwapped = bbi->isSwapped;
struct dyString *dy = dyStringNew(32);
for (block = blockList; block != NULL; block = block->next)
{
bits64 endPos = block->offset + block->size;
udcSeek(f, block->offset);
while (udcTell(f) < endPos)
{
/* Read next record into local variables. */
bits32 chr = udcReadBits32(f, isSwapped); // Read and discard chromId
bits32 s = udcReadBits32(f, isSwapped);
bits32 e = udcReadBits32(f, isSwapped);
int c;
dyStringClear(dy);
while ((c = udcGetChar(f)) >= 0)
{
if (c == 0)
break;
dyStringAppendC(dy, c);
}
/* If we're actually in range then copy it into a new element and add to list. */
if (chr == chromId && rangeIntersection(s, e, start, end) > 0)
{
lmAllocVar(lm, el);
el->start = s;
el->end = e;
if (dy->stringSize > 0)
el->rest = lmCloneString(lm, dy->string);
slAddHead(&list, el);
}
}
}
slFreeList(&blockList);
slReverse(&list);
return list;
}
int bigBedIntervalToRow(struct bigBedInterval *interval, char *chrom, char *startBuf, char *endBuf,
char **row, int rowSize)
/* Convert bigBedInterval into an array of chars equivalent to what you'd get by
* parsing the bed file. The startBuf and endBuf are used to hold the ascii representation of
* start and end. Note that the interval->rest string will have zeroes inserted as a side effect.
*/
{
int fieldCount = 3;
sprintf(startBuf, "%u", interval->start);
sprintf(endBuf, "%u", interval->end);
row[0] = chrom;
row[1] = startBuf;
row[2] = endBuf;
if (!isEmpty(interval->rest))
{
int wordCount = chopByWhite(interval->rest, row+3, rowSize-3);
fieldCount += wordCount;
}
return fieldCount;
}
struct bbiInterval *bigBedCoverageIntervals(struct bbiFile *bbi,
char *chrom, bits32 start, bits32 end, struct lm *lm)
/* Return intervals where the val is the depth of coverage. */
{
/* Get list of overlapping intervals */
struct bigBedInterval *bi, *biList = bigBedIntervalQuery(bbi, chrom, start, end, lm);
if (biList == NULL)
return NULL;
/* Make a range tree that collects coverage. */
struct rbTree *rangeTree = rangeTreeNew();
for (bi = biList; bi != NULL; bi = bi->next)
rangeTreeAddToCoverageDepth(rangeTree, bi->start, bi->end);
struct range *range, *rangeList = rangeTreeList(rangeTree);
/* Convert rangeList to bbiInterval list. */
struct bbiInterval *bwi, *bwiList = NULL;
for (range = rangeList; range != NULL; range = range->next)
{
lmAllocVar(lm, bwi);
bwi->start = range->start;
if (bwi->start < start)
bwi->start = start;
bwi->end = range->end;
if (bwi->end > end)
bwi->end = end;
bwi->val = ptToInt(range->val);
slAddHead(&bwiList, bwi);
}
slReverse(&bwiList);
/* Clean up and go home. */
rangeTreeFree(&rangeTree);
return bwiList;
}
boolean bigBedSummaryArrayExtended(char *fileName, char *chrom, bits32 start, bits32 end,
int summarySize, struct bbiSummaryElement *summary)
/* Get extended summary information for summarySize evenely spaced elements into
* the summary array. */
{
struct bbiFile *bbi = bigBedFileOpen(fileName);
boolean ret = bbiSummaryArrayExtended(bbi, chrom, start, end, bigBedCoverageIntervals,
summarySize, summary);
bbiFileClose(&bbi);
return ret;
}
boolean bigBedSummaryArray(char *fileName, char *chrom, bits32 start, bits32 end,
enum bbiSummaryType summaryType, int summarySize, double *summaryValues)
/* Fill in summaryValues with data from indicated chromosome range in bigBed file.
* Be sure to initialize summaryValues to a default value, which will not be touched
* for regions without data in file. (Generally you want the default value to either
* be 0.0 or nan("") depending on the application.) Returns FALSE if no data
* at that position. */
{
struct bbiFile *bbi = bigBedFileOpen(fileName);
boolean ret = bbiSummaryArray(bbi, chrom, start, end, bigBedCoverageIntervals,
summaryType, summarySize, summaryValues);
bbiFileClose(&bbi);
return ret;
}