src/lib/bwgQuery.c 1.23

1.23 2009/11/12 23:15:52 kent
First cut of compressed bigWig/bigBed stuff. So far read side should be complete including Genome Browser, Table Browser, wigToBigWig and bigWig utility functions. Still to do bedGraphToBigWig and bedToBigBed.
Index: src/lib/bwgQuery.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/lib/bwgQuery.c,v
retrieving revision 1.22
retrieving revision 1.23
diff -b -B -U 4 -r1.22 -r1.23
--- src/lib/bwgQuery.c	10 Nov 2009 05:46:05 -0000	1.22
+++ src/lib/bwgQuery.c	12 Nov 2009 23:15:52 -0000	1.23
@@ -13,8 +13,9 @@
 #include "bPlusTree.h"
 #include "cirTree.h"
 #include "rangeTree.h"
 #include "udc.h"
+#include "zlibFace.h"
 #include "bbiFile.h"
 #include "bwgInternal.h"
 #include "bigWig.h"
 #include "bigBed.h"
@@ -38,9 +39,10 @@
     UBYTE reserved;	/* Always zero for now. */
     bits16 itemCount;	/* Number of items in block. */
     };
 
-void bwgSectionHeadRead(struct bbiFile *bwf, struct bwgSectionHead *head)
+#ifdef OLD
+static void bwgSectionHeadRead(struct bbiFile *bwf, struct bwgSectionHead *head)
 /* Read section header. */
 {
 struct udcFile *udc = bwf->udc;
 boolean isSwapped = bwf->isSwapped;
@@ -52,10 +54,11 @@
 head->type = udcGetChar(udc);
 head->reserved = udcGetChar(udc);
 head->itemCount = udcReadBits16(udc, isSwapped);
 }
+#endif /* OLD */
 
-void bwgSectionHeadFromMem(char **pPt, struct bwgSectionHead *head, boolean isSwapped)
+static void bwgSectionHeadFromMem(char **pPt, struct bwgSectionHead *head, boolean isSwapped)
 /* Read section header. */
 {
 char *pt = *pPt;
 head->chromId = memReadBits32(&pt, isSwapped);
@@ -68,16 +71,14 @@
 head->itemCount = memReadBits16(&pt, isSwapped);
 *pPt = pt;
 }
 
-static int bigWigBlockDumpIntersectingRange(struct bbiFile *bwf, char *chrom, 
-	bits32 rangeStart, bits32 rangeEnd, int maxCount, FILE *out)
+static int bigWigBlockDumpIntersectingRange(boolean isSwapped, char *blockPt, char *blockEnd, 
+	char *chrom, bits32 rangeStart, bits32 rangeEnd, int maxCount, FILE *out)
 /* Print out info on parts of block that intersect start-end, block starting at current position. */
 {
-boolean isSwapped = bwf->isSwapped;
-struct udcFile *udc = bwf->udc;
 struct bwgSectionHead head;
-bwgSectionHeadRead(bwf, &head);
+bwgSectionHeadFromMem(&blockPt, &head, isSwapped);
 bits16 i;
 float val;
 int outCount = 0;
 
@@ -87,11 +88,11 @@
 	{
 	fprintf(out, "#bedGraph section %s:%u-%u\n",  chrom, head.start, head.end);
 	for (i=0; i<head.itemCount; ++i)
 	    {
-	    bits32 start = udcReadBits32(udc, isSwapped);
-	    bits32 end = udcReadBits32(udc, isSwapped);
-	    udcMustReadOne(udc, val);
+	    bits32 start = memReadBits32(&blockPt, isSwapped);
+	    bits32 end = memReadBits32(&blockPt, isSwapped);
+	    val = memReadFloat(&blockPt, isSwapped);
 	    if (rangeIntersection(rangeStart, rangeEnd, start, end) > 0)
 		{
 		fprintf(out, "%s\t%u\t%u\t%g\n", chrom, start, end, val);
 		++outCount;
@@ -105,10 +106,10 @@
 	{
 	fprintf(out, "variableStep chrom=%s span=%u\n", chrom, head.itemSpan);
 	for (i=0; i<head.itemCount; ++i)
 	    {
-	    bits32 start = udcReadBits32(udc, isSwapped);
-	    udcMustReadOne(udc, val);
+	    bits32 start = memReadBits32(&blockPt, isSwapped);
+	    val = memReadFloat(&blockPt, isSwapped);
 	    if (rangeIntersection(rangeStart, rangeEnd, start, start+head.itemSpan) > 0)
 		{
 		fprintf(out, "%u\t%g\n", start+1, val);
 		++outCount;
@@ -123,15 +124,15 @@
 	boolean gotStart = FALSE;
 	bits32 start = head.start;
 	for (i=0; i<head.itemCount; ++i)
 	    {
-	    udcMustReadOne(udc, val);
+	    val = memReadFloat(&blockPt, isSwapped);
 	    if (rangeIntersection(rangeStart, rangeEnd, start, start+head.itemSpan) > 0)
 	        {
 		if (!gotStart)
 		    {
 		    fprintf(out, "fixedStep chrom=%s start=%u step=%u span=%u\n", 
-			    chrom, start, head.itemStep, head.itemSpan);
+			    chrom, start+1, head.itemStep, head.itemSpan);
 		    gotStart = TRUE;
 		    }
 		fprintf(out, "%g\n", val);
 		++outCount;
@@ -145,18 +146,12 @@
     default:
         internalErr();
 	break;
     }
+assert(blockPt == blockEnd);
 return outCount;
 }
 
-void bigWigBlockDump(struct bbiFile *bwf, char *chrom, FILE *out)
-/* Print out info on block starting at current position. */
-{
-bigWigBlockDumpIntersectingRange(bwf, chrom, 0, BIGNUM, 0, out);
-}
-
-
 struct bbiInterval *bigWigIntervalQuery(struct bbiFile *bwf, char *chrom, bits32 start, bits32 end,
 	struct lm *lm)
 /* Get data for interval.  Return list allocated out of lm. */
 {
@@ -165,24 +160,50 @@
 bbiAttachUnzoomedCir(bwf);
 struct bbiInterval *el, *list = NULL;
 struct fileOffsetSize *blockList = bbiOverlappingBlocks(bwf, bwf->unzoomedCir, 
 	chrom, start, end, NULL);
-struct fileOffsetSize *block;
+struct fileOffsetSize *block, *beforeGap, *afterGap;
 struct udcFile *udc = bwf->udc;
 boolean isSwapped = bwf->isSwapped;
 float val;
 int i;
 
-// slSort(&blockList, fileOffsetSizeCmp);
-struct fileOffsetSize *mergedBlocks = fileOffsetSizeMerge(blockList);
-for (block = mergedBlocks; block != NULL; block = block->next)
-    {
-    udcSeek(udc, block->offset);
-    char *blockBuf = needLargeMem(block->size);
-    udcRead(udc, blockBuf, block->size);
-    char *blockPt = blockBuf, *blockEnd = blockBuf + block->size;
-    while (blockPt < blockEnd)
+/* Set up for uncompression optionally. */
+char *uncompressBuf = NULL;
+if (bwf->uncompressBufSize > 0)
+    uncompressBuf = needLargeMem(bwf->uncompressBufSize);
+
+/* This loop is a little complicated because we merge the read requests for efficiency, but we 
+ * have to then go back through the data one unmerged block at a time. */
+for (block = blockList; block != NULL; )
+    {
+    /* Find contigious blocks and read them into mergedBuf. */
+    fileOffsetSizeFindGap(block, &beforeGap, &afterGap);
+    bits64 mergedOffset = block->offset;
+    bits64 mergedSize = beforeGap->offset + beforeGap->size - mergedOffset;
+    udcSeek(udc, mergedOffset);
+    char *mergedBuf = needLargeMem(mergedSize);
+    udcMustRead(udc, mergedBuf, mergedSize);
+    char *blockBuf = mergedBuf;
+
+    /* Loop through individual blocks within merged section. */
+    for (;block != afterGap; block = block->next)
+        {
+	/* Uncompress if necessary. */
+	char *blockPt, *blockEnd;
+	if (uncompressBuf)
+	    {
+	    blockPt = uncompressBuf;
+	    int uncSize = zUncompress(blockBuf, block->size, uncompressBuf, bwf->uncompressBufSize);
+	    blockEnd = blockPt + uncSize;
+	    }
+	else
 	{
+	    blockPt = blockBuf;
+	    blockEnd = blockPt + block->size;
+	    }
+
+	/* Deal with insides of block. */
 	struct bwgSectionHead head;
 	bwgSectionHeadFromMem(&blockPt, &head, isSwapped);
 	switch (head.type)
 	    {
@@ -252,11 +273,14 @@
 	    default:
 		internalErr();
 		break;
 	    }
+	assert(blockPt == blockEnd);
+	blockBuf += block->size;
 	}
+    freeMem(mergedBuf);
     }
-slFreeList(&mergedBlocks);
+freeMem(uncompressBuf);
 slFreeList(&blockList);
 slReverse(&list);
 return list;
 }
@@ -270,24 +294,65 @@
    errAbort("Trying to do bigWigIntervalDump on a non big-wig file.");
 bbiAttachUnzoomedCir(bwf);
 struct fileOffsetSize *blockList = bbiOverlappingBlocks(bwf, bwf->unzoomedCir, 
 	chrom, start, end, NULL);
-struct fileOffsetSize *block;
+struct fileOffsetSize *block, *beforeGap, *afterGap;
 struct udcFile *udc = bwf->udc;
 int printCount = 0;
 
-for (block = blockList; block != NULL; block = block->next)
-    {
-    udcSeek(udc, block->offset);
-    int oneCount = bigWigBlockDumpIntersectingRange(bwf, chrom, start, end, maxCount, out);
+/* Set up for uncompression optionally. */
+char *uncompressBuf = NULL;
+if (bwf->uncompressBufSize > 0)
+    uncompressBuf = needLargeMem(bwf->uncompressBufSize);
+
+/* This loop is a little complicated because we merge the read requests for efficiency, but we 
+ * have to then go back through the data one unmerged block at a time. */
+for (block = blockList; block != NULL; )
+    {
+    /* Find contigious blocks and read them into mergedBuf. */
+    fileOffsetSizeFindGap(block, &beforeGap, &afterGap);
+    bits64 mergedOffset = block->offset;
+    bits64 mergedSize = beforeGap->offset + beforeGap->size - mergedOffset;
+    udcSeek(udc, mergedOffset);
+    char *mergedBuf = needLargeMem(mergedSize);
+    udcMustRead(udc, mergedBuf, mergedSize);
+    char *blockBuf = mergedBuf;
+
+    /* Loop through individual blocks within merged section. */
+    for (;block != afterGap; block = block->next)
+        {
+	/* Uncompress if necessary. */
+	char *blockPt, *blockEnd;
+	if (uncompressBuf)
+	    {
+	    blockPt = uncompressBuf;
+	    int uncSize = zUncompress(blockBuf, block->size, uncompressBuf, bwf->uncompressBufSize);
+	    blockEnd = blockPt + uncSize;
+	    }
+	else
+	    {
+	    blockPt = blockBuf;
+	    blockEnd = blockPt + block->size;
+	    }
+
+	/* Do the actual dump. */
+	int oneCount = bigWigBlockDumpIntersectingRange(bwf->isSwapped, blockPt, blockEnd, 
+		chrom, start, end, maxCount, out);
+
+	/* Keep track of how many dumped, not exceeding maximum. */
     printCount += oneCount;
     if (maxCount != 0)
         {
 	if (oneCount >= maxCount)
 	    break;
 	maxCount -= oneCount;
 	}
+	blockBuf += block->size;
     }
+    freeMem(mergedBuf);
+    }
+freeMem(uncompressBuf);
+
 slFreeList(&blockList);
 return printCount;
 }