src/lib/bbiRead.c 1.11

1.11 2009/08/05 23:06:27 kent
Fixed integer overflow that was causing right half of display to be empty at some zoom values. Added back a comment that got deleted as well.
Index: src/lib/bbiRead.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/lib/bbiRead.c,v
retrieving revision 1.10
retrieving revision 1.11
diff -b -B -U 1000000 -r1.10 -r1.11
--- src/lib/bbiRead.c	15 Mar 2009 00:17:52 -0000	1.10
+++ src/lib/bbiRead.c	5 Aug 2009 23:06:27 -0000	1.11
@@ -1,569 +1,574 @@
 /* bbiRead - Big Binary Indexed file.  Stuff that's common between bigWig and bigBed on the 
  * read side. */
 
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "obscure.h"
 #include "localmem.h"
 #include "bPlusTree.h"
 #include "cirTree.h"
 #include "udc.h"
 #include "bbiFile.h"
 
 struct bbiZoomLevel *bbiBestZoom(struct bbiZoomLevel *levelList, int desiredReduction)
 /* Return zoom level that is the closest one that is less than or equal to 
  * desiredReduction. */
 {
 if (desiredReduction < 0)
    errAbort("bad value %d for desiredReduction in bbiBestZoom", desiredReduction);
 if (desiredReduction <= 1)
     return NULL;
 int closestDiff = BIGNUM;
 struct bbiZoomLevel *closestLevel = NULL;
 struct bbiZoomLevel *level;
 
 for (level = levelList; level != NULL; level = level->next)
     {
     int diff = desiredReduction - level->reductionLevel;
     if (diff >= 0 && diff < closestDiff)
         {
 	closestDiff = diff;
 	closestLevel = level;
 	}
     }
 return closestLevel;
 }
 
 struct bbiFile *bbiFileOpen(char *fileName, bits32 sig, char *typeName)
 /* Open up big wig or big bed file. */
 {
 /* This code needs to agree with code in two other places currently - bigBedFileCreate,
  * and bigWigFileCreate.  I'm thinking of refactoring to share at least between
  * bigBedFileCreate and bigWigFileCreate.  It'd be great so it could be structured
  * so that it could send the input in one chromosome at a time, and send in the zoom
  * stuff only after all the chromosomes are done.  This'd potentially reduce the memory
  * footprint by a factor of 2 or 4.  Still, for now it works. -JK */
 struct bbiFile *bbi;
 AllocVar(bbi);
 bbi->fileName = cloneString(fileName);
 struct udcFile *udc = bbi->udc = udcFileOpen(fileName, udcDefaultDir());
 
 /* Read magic number at head of file and use it to see if we are proper file type, and
  * see if we are byte-swapped. */
 bits32 magic;
 boolean isSwapped = FALSE;
 udcMustRead(udc, &magic, sizeof(magic));
 if (magic != sig)
     {
     magic = byteSwap32(magic);
     isSwapped = TRUE;
     if (magic != sig)
        errAbort("%s is not a %s file", fileName, typeName);
     }
 bbi->typeSig = sig;
 bbi->isSwapped = isSwapped;
 
 /* Read rest of defined bits of header, byte swapping as needed. */
 bbi->version = udcReadBits16(udc, isSwapped);
 bbi->zoomLevels = udcReadBits16(udc, isSwapped);
 bbi->chromTreeOffset = udcReadBits64(udc, isSwapped);
 bbi->unzoomedDataOffset = udcReadBits64(udc, isSwapped);
 bbi->unzoomedIndexOffset = udcReadBits64(udc, isSwapped);
 bbi->fieldCount = udcReadBits16(udc, isSwapped);
 bbi->definedFieldCount = udcReadBits16(udc, isSwapped);
 bbi->asOffset = udcReadBits64(udc, isSwapped);
 
 /* Skip over reserved area. */
 udcSeek(udc, udcTell(udc) + 20);
 
 /* Read zoom headers. */
 int i;
 struct bbiZoomLevel *level, *levelList = NULL;
 for (i=0; i<bbi->zoomLevels; ++i)
     {
     AllocVar(level);
     level->reductionLevel = udcReadBits32(udc, isSwapped);
     level->reserved = udcReadBits32(udc, isSwapped);
     level->dataOffset = udcReadBits64(udc, isSwapped);
     level->indexOffset = udcReadBits64(udc, isSwapped);
     slAddHead(&levelList, level);
     }
 slReverse(&levelList);
 bbi->levelList = levelList;
 
 /* Attach B+ tree of chromosome names and ids. */
 udcSeek(udc, bbi->chromTreeOffset);
 bbi->chromBpt =  bptFileAttach(fileName, udc);
 
 return bbi;
 }
 
 void bbiFileClose(struct bbiFile **pBwf)
 /* Close down a big wig/big bed file. */
 {
 struct bbiFile *bwf = *pBwf;
 if (bwf != NULL)
     {
     cirTreeFileDetach(&bwf->unzoomedCir);
     slFreeList(&bwf->levelList);
     slFreeList(&bwf->levelList);
     bptFileDetach(&bwf->chromBpt);
     udcFileClose(&bwf->udc);
     freeMem(bwf->fileName);
     freez(pBwf);
     }
 }
 
 
 struct fileOffsetSize *bbiOverlappingBlocks(struct bbiFile *bbi, struct cirTreeFile *ctf,
 	char *chrom, bits32 start, bits32 end, bits32 *retChromId)
 /* Fetch list of file blocks that contain items overlapping chromosome range. */
 {
 struct bbiChromIdSize idSize;
 if (!bptFileFind(bbi->chromBpt, chrom, strlen(chrom), &idSize, sizeof(idSize)))
     return NULL;
 if (bbi->isSwapped)
     idSize.chromId = byteSwap32(idSize.chromId);
 if (retChromId != NULL)
     *retChromId = idSize.chromId;
 return cirTreeFindOverlappingBlocks(ctf, idSize.chromId, start, end);
 }
 
 struct chromNameCallbackContext
 /* Some stuff that the bPlusTree traverser needs for context. */
     {
     struct bbiChromInfo *list;		/* The list we are building. */
     boolean isSwapped;			/* Need to byte-swap things? */
     };
 
 static void chromNameCallback(void *context, void *key, int keySize, void *val, int valSize)
 /* Callback that captures chromInfo from bPlusTree. */
 {
 struct chromNameCallbackContext *c = context;
 struct bbiChromInfo *info;
 struct bbiChromIdSize *idSize = val;
 assert(valSize == sizeof(*idSize));
 AllocVar(info);
 info->name = cloneStringZ(key, keySize);
 info->id = idSize->chromId;
 info->size = idSize->chromSize;
 if (c->isSwapped)
     {
     info->id = byteSwap32(info->id);
     info->size = byteSwap32(info->size);
     }
 slAddHead(&c->list, info);
 }
 
 struct bbiChromInfo *bbiChromList(struct bbiFile *bbi)
 /* Return list of chromosomes. */
 {
 struct chromNameCallbackContext context;
 context.list = NULL;
 context.isSwapped = bbi->isSwapped;
 bptFileTraverse(bbi->chromBpt, &context, chromNameCallback);
 slReverse(&context.list);
 return context.list;
 }
 
 bits32 bbiChromSize(struct bbiFile *bbi, char *chrom)
 /* Return chromosome size, or 0 if no such chromosome in file. */
 {
 struct bbiChromIdSize idSize;
 if (!bptFileFind(bbi->chromBpt, chrom, strlen(chrom), &idSize, sizeof(idSize)))
     return 0;
 return idSize.chromSize;
 }
 
 void bbiChromInfoFree(struct bbiChromInfo **pInfo)
 /* Free up one chromInfo */
 {
 struct bbiChromInfo *info = *pInfo;
 if (info != NULL)
     {
     freeMem(info->name);
     freez(pInfo);
     }
 }
 
 void bbiChromInfoFreeList(struct bbiChromInfo **pList)
 /* Free a list of dynamically allocated bbiChromInfo's */
 {
 struct bbiChromInfo *el, *next;
 
 for (el = *pList; el != NULL; el = next)
     {
     next = el->next;
     bbiChromInfoFree(&el);
     }
 *pList = NULL;
 }
 
 void bbiAttachUnzoomedCir(struct bbiFile *bbi)
 /* Make sure unzoomed cir is attached. */
 {
 if (bbi->unzoomedCir == NULL)
     {
     udcSeek(bbi->udc, bbi->unzoomedIndexOffset);
     bbi->unzoomedCir = cirTreeFileAttach(bbi->fileName, bbi->udc);
     }
 }
 
 enum bbiSummaryType bbiSummaryTypeFromString(char *string)
 /* Return summary type given a descriptive string. */
 {
 if (sameWord(string, "mean") || sameWord(string, "average"))
     return bbiSumMean;
 else if (sameWord(string, "max") || sameWord(string, "maximum"))
     return bbiSumMax;
 else if (sameWord(string, "min") || sameWord(string, "minimum"))
     return bbiSumMin;
 else if (sameWord(string, "coverage") || sameWord(string, "dataCoverage"))
     return bbiSumCoverage;
 else
     {
     errAbort("Unknown bbiSummaryType %s", string);
     return bbiSumMean;	/* Keep compiler quiet. */
     }
 }
 
 char *bbiSummaryTypeToString(enum bbiSummaryType type)
 /* Convert summary type from enum to string representation. */
 {
 switch (type)
     {
     case bbiSumMean:
         return "mean";
     case bbiSumMax:
         return "max";
     case bbiSumMin:
         return "min";
     case bbiSumCoverage:
         return "coverage";
     default:
 	errAbort("Unknown bbiSummaryType %d", (int)type);
 	return NULL;
     }
 }
 
 #ifdef UNUSED
 static void bbiSummaryOnDiskRead(struct bbiFile *bbi, struct bbiSummaryOnDisk *sum)
 /* Read in summary from file. */
 {
 struct udcFile *udc = bbi->udc;
 boolean isSwapped = bbi->isSwapped;
 sum->chromId = udcReadBits32(udc, isSwapped);
 sum->start = udcReadBits32(udc, isSwapped);
 sum->end = udcReadBits32(udc, isSwapped);
 sum->validCount = udcReadBits32(udc, isSwapped);
 udcMustReadOne(udc, sum->minVal);
 udcMustReadOne(udc, sum->maxVal);
 udcMustReadOne(udc, sum->sumData);
 udcMustReadOne(udc, sum->sumSquares);
 }
 #endif /* UNUSED */
 
 static struct bbiSummary *bbiSummaryFromOnDisk(struct bbiSummaryOnDisk *in)
 /* Create a bbiSummary unlinked to anything from input in onDisk format. */
 {
 struct bbiSummary *out;
 AllocVar(out);
 out->chromId = in->chromId;
 out->start = in->start;
 out->end = in->end;
 out->validCount = in->validCount;
 out->minVal = in->minVal;
 out->maxVal = in->maxVal;
 out->sumData = in->sumData;
 out->sumSquares = in->sumSquares;
 return out;
 }
 
 static struct bbiSummary *bbiSummariesInRegion(struct bbiZoomLevel *zoom, struct bbiFile *bbi, 
 	int chromId, bits32 start, bits32 end)
 /* Return list of all summaries in region at given zoom level of bbiFile. */
 {
 struct bbiSummary *sumList = NULL, *sum;
 struct udcFile *udc = bbi->udc;
 udcSeek(udc, zoom->indexOffset);
 struct cirTreeFile *ctf = cirTreeFileAttach(bbi->fileName, bbi->udc);
 struct fileOffsetSize *fragList = cirTreeFindOverlappingBlocks(ctf, chromId, start, end);
 struct fileOffsetSize *block, *blockList = fileOffsetSizeMerge(fragList);
 for (block = blockList; block != NULL; block = block->next)
     {
     /* Read info we need into memory. */
     udcSeek(udc, block->offset);
     char *blockBuf = needLargeMem(block->size);
     udcRead(udc, blockBuf, block->size);
     char *blockPt = blockBuf;
 
     struct bbiSummaryOnDisk *dSum;
     int itemSize = sizeof(*dSum);
     assert(block->size % itemSize == 0);
     int itemCount = block->size / itemSize;
     int i;
     for (i=0; i<itemCount; ++i)
 	{
 	dSum = (void *)blockPt;
 	blockPt += sizeof(*dSum);
 	if (dSum->chromId == chromId)
 	    {
 	    int s = max(dSum->start, start);
 	    int e = min(dSum->end, end);
 	    if (s < e)
 		{
 		sum = bbiSummaryFromOnDisk(dSum);
 		slAddHead(&sumList, sum);
 		}
 	    }
 	}
     freeMem(blockBuf);
     }
 slFreeList(&blockList);
 slFreeList(&fragList);
 cirTreeFileDetach(&ctf);
 slReverse(&sumList);
 return sumList;
 }
 
 static bits32 bbiSummarySlice(struct bbiFile *bbi, bits32 baseStart, bits32 baseEnd, 
 	struct bbiSummary *sumList, struct bbiSummaryElement *el)
 /* Update retVal with the average value if there is any data in interval.  Return number
  * of valid data bases in interval. */
 {
 bits32 validCount = 0;
 
 if (sumList != NULL)
     {
     double minVal = sumList->minVal;
     double maxVal = sumList->maxVal;
     double sumData = 0, sumSquares = 0;
 
     struct bbiSummary *sum;
     for (sum = sumList; sum != NULL && sum->start < baseEnd; sum = sum->next)
 	{
 	int overlap = rangeIntersection(baseStart, baseEnd, sum->start, sum->end);
 	if (overlap > 0)
 	    {
 	    double overlapFactor = (double)overlap / (sum->end - sum->start);
 	    validCount += sum->validCount * overlapFactor;
 	    sumData += sum->sumData * overlapFactor;
 	    sumSquares += sum->sumSquares * overlapFactor;
 	    if (maxVal < sum->maxVal)
 		maxVal = sum->maxVal;
 	    if (minVal > sum->minVal)
 		minVal = sum->minVal;
 	    }
 	}
     if (validCount > 0)
 	{
 	el->validCount = validCount;
 	el->minVal = minVal;
 	el->maxVal = maxVal;
 	el->sumData = sumData;
 	el->sumSquares = sumSquares;
 	}
     }
 return validCount;
 }
 
 static int bbiChromId(struct bbiFile *bbi, char *chrom)
 /* Return chromosome size */
 {
 struct bbiChromIdSize idSize;
 if (!bptFileFind(bbi->chromBpt, chrom, strlen(chrom), &idSize, sizeof(idSize)))
     return -1;
 return idSize.chromId;
 }
 
 static boolean bbiSummaryArrayFromZoom(struct bbiZoomLevel *zoom, struct bbiFile *bbi, 
 	char *chrom, bits32 start, bits32 end,
 	int summarySize, struct bbiSummaryElement *summary)
 /* Look up region in index and get data at given zoom level.  Summarize this data
  * in the summary array. */
 {
 boolean result = FALSE;
 int chromId = bbiChromId(bbi, chrom);
 if (chromId < 0)
     return FALSE;
 struct bbiSummary *sum, *sumList = bbiSummariesInRegion(zoom, bbi, chromId, start, end);
 if (sumList != NULL)
     {
     int i;
     bits32 baseStart = start, baseEnd;
     bits32 baseCount = end - start;
     sum = sumList;
     for (i=0; i<summarySize; ++i)
         {
 	/* Calculate end of this part of summary */
 	baseEnd = start + (bits64)baseCount*(i+1)/summarySize;
 
         /* Advance sum to skip over parts we are no longer interested in. */
 	while (sum != NULL && sum->end <= baseStart)
 	    sum = sum->next;
 
 	if (bbiSummarySlice(bbi, baseStart, baseEnd, sum, &summary[i]))
 	    result = TRUE;
 
 	/* Next time round start where we left off. */
 	baseStart = baseEnd;
 	}
     slFreeList(&sumList);
     }
 return result;
 }
 
 static bits32 bbiIntervalSlice(struct bbiFile *bbi, bits32 baseStart, bits32 baseEnd, 
 	struct bbiInterval *intervalList, struct bbiSummaryElement *el)
 /* Update retVal with the average value if there is any data in interval.  Return number
  * of valid data bases in interval. */
 {
 bits32 validCount = 0;
 
 if (intervalList != NULL)
     {
     struct bbiInterval *interval;
     double sumData = 0, sumSquares = 0;
     double minVal = intervalList->val;
     double maxVal = intervalList->val;
 
     for (interval = intervalList; interval != NULL && interval->start < baseEnd; 
 	    interval = interval->next)
 	{
 	int overlap = rangeIntersection(baseStart, baseEnd, interval->start, interval->end);
 	if (overlap > 0)
 	    {
 	    int intervalSize = interval->end - interval->start;
 	    double overlapFactor = (double)overlap / intervalSize;
 	    double intervalWeight = intervalSize * overlapFactor;
 	    validCount += intervalWeight;
 	    sumData += interval->val * intervalWeight;
 	    sumSquares += interval->val * interval->val * intervalWeight;
 	    if (maxVal < interval->val)
 		maxVal = interval->val;
 	    if (minVal > interval->val)
 		minVal = interval->val;
 	    }
 	}
     el->validCount = validCount;
     el->minVal = minVal;
     el->maxVal = maxVal;
     el->sumData = sumData;
     el->sumSquares = sumSquares;
     }
 return validCount;
 }
 
 
 static boolean bbiSummaryArrayFromFull(struct bbiFile *bbi, 
 	char *chrom, bits32 start, bits32 end, BbiFetchIntervals fetchIntervals,
 	int summarySize, struct bbiSummaryElement *summary)
 /* Summarize data, not using zoom. */
 {
 struct bbiInterval *intervalList = NULL, *interval;
 struct lm *lm = lmInit(0);
 intervalList = (*fetchIntervals)(bbi, chrom, start, end, lm);
 boolean result = FALSE;
 if (intervalList != NULL);
     {
     int i;
     bits32 baseStart = start, baseEnd;
     bits32 baseCount = end - start;
     interval = intervalList;
     for (i=0; i<summarySize; ++i)
         {
 	/* Calculate end of this part of summary */
-	baseEnd = start + baseCount*(i+1)/summarySize;
+	baseEnd = start + (bits64)baseCount*(i+1)/summarySize;
 	int end1 = baseEnd;
 	if (end1 == baseStart)
 	    end1 = baseStart+1;
 
         /* Advance interval to skip over parts we are no longer interested in. */
 	while (interval != NULL && interval->end <= baseStart)
 	    interval = interval->next;
 
 	if (bbiIntervalSlice(bbi, baseStart, end1, interval, &summary[i]))
 	    result = TRUE;
 
 	/* Next time round start where we left off. */
 	baseStart = baseEnd;
 	}
     }
 lmCleanup(&lm);
 return result;
 }
 
 boolean bbiSummaryArrayExtended(struct bbiFile *bbi, char *chrom, bits32 start, bits32 end,
 	BbiFetchIntervals fetchIntervals,
 	int summarySize, struct bbiSummaryElement *summary)
 /* Fill in summary with  data from indicated chromosome range in bigWig file. 
  * Returns FALSE if no data at that position. */
 {
 boolean result = FALSE;
 
 /* Protect from bad input. */
 if (start >= end)
     return result;
 bzero(summary, summarySize * sizeof(summary[0]));
 
 /* Figure out what size of data we want.  We actually want to get 4 data points per summary
  * value if possible to minimize the effect of a data point being split between summary pixels. */
 bits32 baseSize = end - start; 
 int fullReduction = (baseSize/summarySize);
 int zoomLevel = fullReduction/4;
 if (zoomLevel < 0)
     zoomLevel = 0;
 
 /* Get the closest zoom level less than what we're looking for. */
 struct bbiZoomLevel *zoom = bbiBestZoom(bbi->levelList, zoomLevel);
 if (zoom != NULL)
     result = bbiSummaryArrayFromZoom(zoom, bbi, chrom, start, end, summarySize, summary);
 else
     result = bbiSummaryArrayFromFull(bbi, chrom, start, end, fetchIntervals, summarySize, summary);
 return result;
 }
 
 boolean bbiSummaryArray(struct bbiFile *bbi, char *chrom, bits32 start, bits32 end,
 	BbiFetchIntervals fetchIntervals,
 	enum bbiSummaryType summaryType, int summarySize, double *summaryValues)
+/* Fill in summaryValues with  data from indicated chromosome range in bigWig file.
+ * Be sure to initialize summaryValues to a default value, which will not be touched
+ * for regions without data in file.  (Generally you want the default value to either
+ * be 0.0 or nan("") depending on the application.)  Returns FALSE if no data
+ * at that position. */
 {
 struct bbiSummaryElement *elements;
 AllocArray(elements, summarySize);
 boolean ret = bbiSummaryArrayExtended(bbi, chrom, start, end, 
 	fetchIntervals, summarySize, elements);
 if (ret)
     {
     int i;
     for (i=0; i<summarySize; ++i)
         {
 	struct bbiSummaryElement *el = &elements[i];
 	if (el->validCount > 0)
 	    {
 	    double val;
 	    switch (summaryType)
 		{
 		case bbiSumMean:
 		    val = el->sumData/el->validCount;
 		    break;
 		case bbiSumMax:
 		    val = el->maxVal;
 		    break;
 		case bbiSumMin:
 		    val = el->minVal;
 		    break;
 		case bbiSumCoverage:
 		    val = (double)el->validCount/(end-start);
 		    break;
 		default:
 		    internalErr();
 		    val = 0.0;
 		    break;
 		}
 	    summaryValues[i] = val;
 	    }
 	}
     }
 freeMem(elements);
 return ret;
 }