src/utils/bedGraphToBigWig/bedGraphToBigWig.c 1.21
1.21 2009/11/13 23:46:50 kent
Fixed bug in sumSquares.
Index: src/utils/bedGraphToBigWig/bedGraphToBigWig.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/utils/bedGraphToBigWig/bedGraphToBigWig.c,v
retrieving revision 1.20
retrieving revision 1.21
diff -b -B -U 1000000 -r1.20 -r1.21
--- src/utils/bedGraphToBigWig/bedGraphToBigWig.c 13 Nov 2009 19:02:38 -0000 1.20
+++ src/utils/bedGraphToBigWig/bedGraphToBigWig.c 13 Nov 2009 23:46:50 -0000 1.21
@@ -1,541 +1,541 @@
/* bedGraphToBigWig - Convert a bedGraph program to bigWig.. */
#include "common.h"
#include "linefile.h"
#include "localmem.h"
#include "hash.h"
#include "options.h"
#include "sqlNum.h"
#include "dystring.h"
#include "cirTree.h"
#include "sig.h"
#include "zlibFace.h"
#include "bPlusTree.h"
#include "bbiFile.h"
#include "bwgInternal.h"
#include "bigWig.h"
static char const rcsid[] = "$Id$";
static int blockSize = 256;
static int itemsPerSlot = 1024;
static boolean doCompress = FALSE;
void usage()
/* Explain usage and exit (errAbort does not return). */
{
errAbort(
"bedGraphToBigWig v %d - Convert a bedGraph program to bigWig.\n"
"usage:\n"
" bedGraphToBigWig in.bedGraph chrom.sizes out.bw\n"
"where in.bedGraph is a four column file in the format:\n"
" <chrom> <start> <end> <value>\n"
"and chrom.sizes is two column: <chromosome name> <size in bases>\n"
"and out.bw is the output indexed big wig file.\n"
"options:\n"
" -blockSize=N - Number of items to bundle in r-tree. Default %d\n"
" -itemsPerSlot=N - Number of data points bundled at lowest level. Default %d\n"
" -compress - If set use zlib compression."
/* Format arguments: file-format version, then the two option defaults above. */
, bbiCurrentVersion, blockSize, itemsPerSlot
);
}
/* Command line option table consumed by optionInit() in main(). */
static struct optionSpec options[] = {
{"blockSize", OPTION_INT},      /* r-tree branching factor */
{"itemsPerSlot", OPTION_INT},   /* data points per lowest-level section */
{"compress", OPTION_BOOLEAN},   /* zlib-compress data sections */
{NULL, 0},
};
struct sectionItem
/* An item in a section of a bedGraph - one interval and its value,
 * buffered in memory before being serialized to the output file. */
{
bits32 start, end; /* Position in chromosome, half open. */
float val; /* Single precision value (matches on-disk bigWig encoding). */
};
void writeSections(struct bbiChromUsage *usageList, struct lineFile *lf,
int itemsPerSlot, struct bbiBoundsArray *bounds, int sectionCount, FILE *f,
int resTryCount, int resScales[], int resSizes[],
boolean doCompress, bits32 *retMaxSectionSize)
/* Read through lf, chunking it into sections of up to itemsPerSlot items that get
 * written to f.  A section is flushed when the slot fills, the chromosome changes,
 * or input ends.  Saves the file offset and chrom range of each section in bounds
 * (exactly sectionCount of them) for later r-tree indexing.  Also tallies into
 * resSizes[i] how many summary items a zoom level of scale resScales[i] would need,
 * and returns the largest uncompressed section size via retMaxSectionSize (needed
 * for the reader's decompression buffer). */
{
int maxSectionSize = 0;
struct bbiChromUsage *usage = usageList;
int itemIx = 0, sectionIx = 0;
bits32 reserved32 = 0;
UBYTE reserved8 = 0;
/* VLAs sized by the caller; itemsPerSlot and resTryCount are small by construction. */
struct sectionItem items[itemsPerSlot];
struct sectionItem *lastB = NULL;
/* resEnds[i] is the end of the current bucket at candidate zoom scale i. */
bits32 resEnds[resTryCount];
int resTry;
for (resTry = 0; resTry < resTryCount; ++resTry)
resEnds[resTry] = 0;
struct dyString *stream = dyStringNew(0);
for (;;)
{
/* Get next line of input if any. */
char *row[5];
int rowSize = lineFileChopNext(lf, row, ArraySize(row));
/* Figure out whether need to output section. */
boolean sameChrom = FALSE;
if (rowSize > 0)
sameChrom = sameString(row[0], usage->name);
if (itemIx >= itemsPerSlot || rowSize == 0 || !sameChrom)
{
/* Figure out section position.  Assumes itemIx >= 1 here, which holds because
 * usageList was built from this same file, so the first row matches usage->name
 * and at least one item is buffered before any flush. */
bits32 chromId = usage->id;
bits32 sectionStart = items[0].start;
bits32 sectionEnd = items[itemIx-1].end;
/* Save section info for indexing. */
assert(sectionIx < sectionCount);
struct bbiBoundsArray *section = &bounds[sectionIx++];
section->offset = ftell(f);
section->range.chromIx = chromId;
section->range.start = sectionStart;
section->range.end = sectionEnd;
/* Output section header to stream (binary, via dyStringWriteOne). */
dyStringClear(stream);
UBYTE type = bwgTypeBedGraph;
bits16 itemCount = itemIx;
dyStringWriteOne(stream, chromId); // chromId
dyStringWriteOne(stream, sectionStart); // start
dyStringWriteOne(stream, sectionEnd); // end
dyStringWriteOne(stream, reserved32); // itemStep
dyStringWriteOne(stream, reserved32); // itemSpan
dyStringWriteOne(stream, type); // type
dyStringWriteOne(stream, reserved8); // reserved
dyStringWriteOne(stream, itemCount); // itemCount
/* Output each item in section to stream. */
int i;
for (i=0; i<itemIx; ++i)
{
struct sectionItem *item = &items[i];
dyStringWriteOne(stream, item->start);
dyStringWriteOne(stream, item->end);
dyStringWriteOne(stream, item->val);
}
/* Save stream to file, compressing if need be.  Max size is tracked on the
 * uncompressed data since that is what the reader's buffer must hold. */
if (stream->stringSize > maxSectionSize)
maxSectionSize = stream->stringSize;
if (doCompress)
{
size_t maxCompSize = zCompBufSize(stream->stringSize);
char compBuf[maxCompSize];
int compSize = zCompress(stream->string, stream->stringSize, compBuf, maxCompSize);
mustWrite(f, compBuf, compSize);
}
else
mustWrite(f, stream->string, stream->stringSize);
/* If at end of input we are done. */
if (rowSize == 0)
break;
/* Set up for next section. */
itemIx = 0;
if (!sameChrom)
{
/* Advance to next chromosome; input must list chroms in usageList order. */
usage = usage->next;
assert(usage != NULL);
assert(sameString(row[0], usage->name));
lastB = NULL;
for (resTry = 0; resTry < resTryCount; ++resTry)
resEnds[resTry] = 0;
}
}
/* Parse out input. */
lineFileExpectWords(lf, 4, rowSize);
bits32 start = lineFileNeedNum(lf, row, 1);
bits32 end = lineFileNeedNum(lf, row, 2);
float val = lineFileNeedDouble(lf, row, 3);
/* Verify that inputs meets our assumption - that it is a sorted bedGraph file. */
if (start > end)
errAbort("Start (%u) after end (%u) line %d of %s", start, end, lf->lineIx, lf->fileName);
if (lastB != NULL)
{
if (lastB->start > start)
errAbort("BedGraph not sorted on start line %d of %s", lf->lineIx, lf->fileName);
if (lastB->end > start)
errAbort("Overlapping regions in bedGraph line %d of %s", lf->lineIx, lf->fileName);
}
/* Do zoom counting: for each candidate resolution count how many fixed-width
 * buckets this interval touches (one new summary item per bucket). */
for (resTry = 0; resTry < resTryCount; ++resTry)
{
bits32 resEnd = resEnds[resTry];
if (start >= resEnd)
{
resSizes[resTry] += 1;
resEnds[resTry] = resEnd = start + resScales[resTry];
}
while (end > resEnd)
{
resSizes[resTry] += 1;
resEnds[resTry] = resEnd = resEnd + resScales[resTry];
}
}
/* Save values in output array. */
struct sectionItem *b = &items[itemIx];
b->start = start;
b->end = end;
b->val = val;
lastB = b;
itemIx += 1;
}
assert(sectionIx == sectionCount);
*retMaxSectionSize = maxSectionSize;
}
static struct bbiSummary *writeReducedOnceReturnReducedTwice(struct bbiChromUsage *usageList,
	struct lineFile *lf, bits32 initialReduction, bits32 initialReductionCount,
	int zoomIncrement, int blockSize, int itemsPerSlot, boolean doCompress,
	struct lm *lm, FILE *f, bits64 *retDataStart, bits64 *retIndexStart,
	struct bbiSummaryElement *totalSum)
/* Write out data reduced by factor of initialReduction, reading the bedGraph a second
 * time from lf.  Also calculate and keep in memory (allocated in lm) the next reduction
 * level, reduced again by zoomIncrement, which is returned as a list.  This is more work
 * than some ways, but it keeps us from having to keep the first reduction entirely in
 * memory.  Fills in *retDataStart and *retIndexStart with the file offsets of the zoom
 * data and its r-tree index, and accumulates whole-file statistics into totalSum. */
{
struct bbiSummary *twiceReducedList = NULL;
bits32 doubleReductionSize = initialReduction * zoomIncrement;
struct bbiChromUsage *usage = usageList;
struct bbiSummary oneSummary, *sum = NULL;
struct bbiBoundsArray *boundsArray, *boundsPt, *boundsEnd;
boundsPt = AllocArray(boundsArray, initialReductionCount);
boundsEnd = boundsPt + initialReductionCount;

*retDataStart = ftell(f);
writeOne(f, initialReductionCount);
boolean firstRow = TRUE;

struct bbiSumOutStream *stream = bbiSumOutStreamOpen(itemsPerSlot, f, doCompress);
for (;;)
    {
    /* Get next line of input if any. */
    char *row[5];
    int rowSize = lineFileChopNext(lf, row, ArraySize(row));

    /* Output last section and break if at end of file. */
    if (rowSize == 0 && sum != NULL)
	{
	bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize,
		&boundsPt, boundsEnd, usage->size, lm, stream);
	break;
	}

    /* Parse out row.  Input was validated on the first pass, so use the fast
     * sql conversion routines here. */
    char *chrom = row[0];
    bits32 start = sqlUnsigned(row[1]);
    bits32 end = sqlUnsigned(row[2]);
    float val = sqlFloat(row[3]);

    /* Update total summary stuff. */
    bits32 size = end-start;
    if (firstRow)
	{
	totalSum->validCount = size;
	totalSum->minVal = totalSum->maxVal = val;
	totalSum->sumData = val*size;
	totalSum->sumSquares = val*val*size;	/* val^2 per covered base (was val*size*size - fixed) */
	firstRow = FALSE;
	}
    else
	{
	totalSum->validCount += size;
	if (val < totalSum->minVal) totalSum->minVal = val;
	if (val > totalSum->maxVal) totalSum->maxVal = val;
	totalSum->sumData += val*size;
	totalSum->sumSquares += val*val*size;	/* val^2 per covered base (was val*size*size - fixed) */
	}

    /* If new chromosome output existing block. */
    if (differentString(chrom, usage->name))
	{
	usage = usage->next;
	bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize,
		&boundsPt, boundsEnd, usage->size, lm, stream);
	sum = NULL;
	}
    /* If start past existing block then output it. */
    else if (sum != NULL && sum->end <= start)
	{
	bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize,
		&boundsPt, boundsEnd, usage->size, lm, stream);
	sum = NULL;
	}

    /* If don't have a summary we're working on now, make one. */
    if (sum == NULL)
	{
	oneSummary.chromId = usage->id;
	oneSummary.start = start;
	oneSummary.end = start + initialReduction;
	if (oneSummary.end > usage->size) oneSummary.end = usage->size;
	oneSummary.minVal = oneSummary.maxVal = val;
	oneSummary.sumData = oneSummary.sumSquares = 0.0;
	oneSummary.validCount = 0;
	sum = &oneSummary;
	}

    /* Deal with case where might have to split an item between multiple summaries. This
     * loop handles all but the final affected summary in that case. */
    while (end > sum->end)
	{
	verbose(3, "Splitting start %d, end %d, sum->start %d, sum->end %d\n", start, end, sum->start, sum->end);
	/* Fold in bits that overlap with existing summary and output. */
	bits32 overlap = rangeIntersection(start, end, sum->start, sum->end);
	sum->validCount += overlap;
	if (sum->minVal > val) sum->minVal = val;
	if (sum->maxVal < val) sum->maxVal = val;
	sum->sumData += val * overlap;
	sum->sumSquares += val*val * overlap;
	bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize,
		&boundsPt, boundsEnd, usage->size, lm, stream);
	size -= overlap;

	/* Move summary to next part. */
	sum->start = start = sum->end;
	sum->end = start + initialReduction;
	if (sum->end > usage->size) sum->end = usage->size;
	sum->minVal = sum->maxVal = val;
	sum->sumData = sum->sumSquares = 0.0;
	sum->validCount = 0;
	}

    /* Add to summary. */
    sum->validCount += size;
    if (sum->minVal > val) sum->minVal = val;
    if (sum->maxVal < val) sum->maxVal = val;
    sum->sumData += val * size;
    sum->sumSquares += val*val * size;
    }
bbiSumOutStreamClose(&stream);

/* Write out 1st zoom index.  Use bits64 for the offset: an int would truncate
 * on output files larger than 2GB. */
bits64 indexOffset = *retIndexStart = ftell(f);
assert(boundsPt == boundsEnd);
cirTreeFileBulkIndexToOpenFile(boundsArray, sizeof(boundsArray[0]), initialReductionCount,
	blockSize, itemsPerSlot, NULL, bbiBoundsArrayFetchKey, bbiBoundsArrayFetchOffset,
	indexOffset, f);

freez(&boundsArray);
slReverse(&twiceReducedList);
return twiceReducedList;
}
void bedGraphToBigWig(char *inName, char *chromSizes, char *outName)
/* bedGraphToBigWig - Convert a bedGraph file to bigWig.  Makes multiple passes over
 * the input: pass 1 gathers per-chromosome usage stats, pass 2 writes the full
 * resolution data sections, and a further pass writes the first zoom level. */
{
verboseTime(2, NULL);
struct lineFile *lf = lineFileOpen(inName, TRUE);
struct hash *chromSizesHash = bbiChromSizesFromFile(chromSizes);
verbose(2, "%d chroms in %s\n", chromSizesHash->elCount, chromSizes);
int minDiff = 0, i;
double aveSize = 0;
bits64 bedCount = 0;
bits32 uncompressBufSize = 0;
/* Pass 1: per-chromosome usage list plus minDiff (smallest item size, used to pick
 * zoom resolutions), average item size, and total item count. */
struct bbiChromUsage *usageList = bbiChromUsageFromBedFile(lf, chromSizesHash, &minDiff, &aveSize, &bedCount);
verboseTime(2, "pass1");
verbose(2, "%d chroms in %s\n", slCount(usageList), inName);
/* Write out dummy header, zoom offsets.  These reserve space and are rewritten
 * with real values at the end, once all offsets are known. */
FILE *f = mustOpen(outName, "wb");
bbiWriteDummyHeader(f);
bbiWriteDummyZooms(f);
/* Write out dummy total summary. */
struct bbiSummaryElement totalSum;
ZeroVar(&totalSum);
bits64 totalSummaryOffset = ftell(f);
bbiSummaryElementWrite(f, &totalSum);
/* Write out chromosome/size database. */
bits64 chromTreeOffset = ftell(f);
bbiWriteChromInfo(usageList, blockSize, f);
/* Set up to keep track of possible initial reduction levels: a geometric series of
 * candidate scales starting at twice the smallest item, growing by resIncrement. */
int resTryCount = 10, resTry;
int resIncrement = 4;
int resScales[resTryCount], resSizes[resTryCount];
int res = minDiff * 2;
if (res > 0)
{
for (resTry = 0; resTry < resTryCount; ++resTry)
{
resSizes[resTry] = 0;
resScales[resTry] = res;
res *= resIncrement;
}
}
else
resTryCount = 0;
/* Write out primary full resolution data in sections, collect stats to use for reductions. */
bits64 dataOffset = ftell(f);
bits64 sectionCount = bbiCountSectionsNeeded(usageList, itemsPerSlot);
writeOne(f, sectionCount);
struct bbiBoundsArray *boundsArray;
AllocArray(boundsArray, sectionCount);
lineFileRewind(lf);
bits32 maxSectionSize = 0;
writeSections(usageList, lf, itemsPerSlot, boundsArray, sectionCount, f,
resTryCount, resScales, resSizes, doCompress, &maxSectionSize);
verboseTime(2, "pass2");
/* Write out primary data index. */
bits64 indexOffset = ftell(f);
cirTreeFileBulkIndexToOpenFile(boundsArray, sizeof(boundsArray[0]), sectionCount,
blockSize, 1, NULL, bbiBoundsArrayFetchKey, bbiBoundsArrayFetchOffset,
indexOffset, f);
verboseTime(2, "index write");
/* Declare arrays and vars that track the zoom levels we actually output. */
bits32 zoomAmounts[bbiMaxZoomLevels];
bits64 zoomDataOffsets[bbiMaxZoomLevels];
bits64 zoomIndexOffsets[bbiMaxZoomLevels];
int zoomLevels = 0;
/* Write out first zoomed section while storing in memory next zoom level. */
if (minDiff > 0)
{
bits64 dataSize = indexOffset - dataOffset;
int maxReducedSize = dataSize/2;
int initialReduction = 0, initialReducedCount = 0;
/* Figure out initialReduction for zoom: first candidate whose summary would take
 * no more than half the size of the full-resolution data. */
for (resTry = 0; resTry < resTryCount; ++resTry)
{
bits64 reducedSize = resSizes[resTry] * sizeof(struct bbiSummaryOnDisk);
if (doCompress)
reducedSize /= 2; // Estimate!
if (reducedSize <= maxReducedSize)
{
initialReduction = resScales[resTry];
initialReducedCount = resSizes[resTry];
break;
}
}
verbose(2, "initialReduction %d, initialReducedCount = %d\n",
initialReduction, initialReducedCount);
if (initialReduction > 0)
{
struct lm *lm = lmInit(0);
/* NOTE(review): zoomIncrement is assumed equal to resIncrement (both 4);
 * resIncrement is what is actually passed below. */
int zoomIncrement = 4;
lineFileRewind(lf);
struct bbiSummary *rezoomedList = writeReducedOnceReturnReducedTwice(usageList,
lf, initialReduction, initialReducedCount,
resIncrement, blockSize, itemsPerSlot, doCompress, lm,
f, &zoomDataOffsets[0], &zoomIndexOffsets[0], &totalSum);
verboseTime(2, "writeReducedOnceReturnReducedTwice");
zoomAmounts[0] = initialReduction;
zoomLevels = 1;
/* Keep reducing in memory until a level stops shrinking or we hit the cap. */
int zoomCount = initialReducedCount;
int reduction = initialReduction * zoomIncrement;
while (zoomLevels < bbiMaxZoomLevels)
{
int rezoomCount = slCount(rezoomedList);
if (rezoomCount >= zoomCount)
break;
zoomCount = rezoomCount;
zoomDataOffsets[zoomLevels] = ftell(f);
zoomIndexOffsets[zoomLevels] = bbiWriteSummaryAndIndex(rezoomedList,
blockSize, itemsPerSlot, doCompress, f);
zoomAmounts[zoomLevels] = reduction;
++zoomLevels;
reduction *= zoomIncrement;
rezoomedList = bbiSummarySimpleReduce(rezoomedList, reduction, lm);
}
lmCleanup(&lm);
verboseTime(2, "further reductions");
}
}
/* Figure out buffer size needed for uncompression if need be: largest of the data
 * sections and the biggest possible zoom block. */
if (doCompress)
{
int maxZoomUncompSize = itemsPerSlot * sizeof(struct bbiSummaryOnDisk);
uncompressBufSize = max(maxSectionSize, maxZoomUncompSize);
}
/* Go back and rewrite header. */
rewind(f);
bits32 sig = bigWigSig;
bits16 version = bbiCurrentVersion;
bits16 summaryCount = zoomLevels;
bits16 reserved16 = 0;
bits32 reserved32 = 0;
bits64 reserved64 = 0;
/* Write fixed header */
writeOne(f, sig);
writeOne(f, version);
writeOne(f, summaryCount);
writeOne(f, chromTreeOffset);
writeOne(f, dataOffset);
writeOne(f, indexOffset);
writeOne(f, reserved16); // fieldCount
writeOne(f, reserved16); // definedFieldCount
writeOne(f, reserved64); // autoSqlOffset
writeOne(f, totalSummaryOffset);
writeOne(f, uncompressBufSize);
for (i=0; i<2; ++i)
writeOne(f, reserved32);
/* Fixed header is defined to be exactly 64 bytes. */
assert(ftell(f) == 64);
/* Write summary headers with data. */
verbose(2, "Writing %d levels of zoom\n", zoomLevels);
for (i=0; i<zoomLevels; ++i)
{
verbose(3, "zoomAmounts[%d] = %d\n", i, (int)zoomAmounts[i]);
writeOne(f, zoomAmounts[i]);
writeOne(f, reserved32);
writeOne(f, zoomDataOffsets[i]);
writeOne(f, zoomIndexOffsets[i]);
}
/* Write rest of summary headers with no data. */
for (i=zoomLevels; i<bbiMaxZoomLevels; ++i)
{
writeOne(f, reserved32);
writeOne(f, reserved32);
writeOne(f, reserved64);
writeOne(f, reserved64);
}
/* Write total summary. */
fseek(f, totalSummaryOffset, SEEK_SET);
bbiSummaryElementWrite(f, &totalSum);
lineFileClose(&lf);
carefulClose(&f);
}
int main(int argc, char *argv[])
/* Process command line options and run the conversion. */
{
optionInit(&argc, argv, options);
/* Guard clause: exactly three positional arguments required (usage aborts). */
if (argc != 4)
    usage();
/* Fold any option overrides into the file-scope defaults. */
doCompress = optionExists("compress");
itemsPerSlot = optionInt("itemsPerSlot", itemsPerSlot);
blockSize = optionInt("blockSize", blockSize);
bedGraphToBigWig(argv[1], argv[2], argv[3]);
return 0;
}