src/utils/bedToBigBed/bedToBigBed.c 1.12
1.12 2009/11/05 19:35:01 kent
Bumping version number. Making sectionCount and zoomCount more consistent.
Index: src/utils/bedToBigBed/bedToBigBed.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/utils/bedToBigBed/bedToBigBed.c,v
retrieving revision 1.11
retrieving revision 1.12
diff -b -B -U 1000000 -r1.11 -r1.12
--- src/utils/bedToBigBed/bedToBigBed.c 28 Sep 2009 18:20:28 -0000 1.11
+++ src/utils/bedToBigBed/bedToBigBed.c 5 Nov 2009 19:35:01 -0000 1.12
@@ -1,580 +1,583 @@
/* bedToBigBed - Convert bed to bigBed.. */
#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "options.h"
#include "obscure.h"
#include "asParse.h"
#include "basicBed.h"
#include "sig.h"
#include "rangeTree.h"
#include "sqlNum.h"
#include "bigBed.h"
static char const rcsid[] = "$Id$";
/* Command line option values; the defaults below may be overridden in main(). */
int blockSize = 1024;	/* Items to bundle per r-tree node. */
int itemsPerSlot = 256;	/* Items per lowest-level data block. */
int bedFields = 0;	/* Count of standard bed fields; 0 means treat all input fields as standard. */
char *as = NULL;	/* Optional autoSql (.as) file describing nonstandard fields. */
void usage()
/* Explain usage and exit. */
{
/* errAbort prints the message and exits; the %d's are filled in from the option defaults above. */
errAbort(
"bedToBigBed - Convert bed file to bigBed.\n"
"usage:\n"
" bedToBigBed in.bed chrom.sizes out.bb\n"
"Where in.bed is in one of the ascii bed formats, but not including track lines\n"
"and chrom.sizes is two column: <chromosome name> <size in bases>\n"
"and out.bb is the output indexed big bed file.\n"
"The in.bed file must be sorted by chromosome,start.\n"
"\n"
"options:\n"
" -blockSize=N - Number of items to bundle in r-tree. Default %d\n"
" -itemsPerSlot=N - Number of data points bundled at lowest level. Default %d\n"
" -bedFields=N - Number of fields that fit standard bed definition. If undefined\n"
" assumes all fields in bed are defined.\n"
" -as=fields.as - If have non-standard fields, it's great to put a definition of\n"
" each field in a row in AutoSql format here.\n"
, blockSize, itemsPerSlot
);
}
/* Command line option specifications, consumed by optionInit() in main(). */
static struct optionSpec options[] = {
{"blockSize", OPTION_INT},
{"itemsPerSlot", OPTION_INT},
{"bedFields", OPTION_INT},
{"as", OPTION_STRING},
{NULL, 0},
};
void writeBlocks(struct bbiChromUsage *usageList, struct lineFile *lf, struct asObject *as,
bits16 definedFieldCount, int itemsPerSlot, struct bbiBoundsArray *bounds,
int sectionCount, FILE *f, int resTryCount, int resScales[], int resSizes[],
bits16 *retFieldCount, bits16 *retDefinedFieldCount)
/* Read through lf, writing it in f. Save starting points of blocks (every itemsPerSlot)
 * to boundsArray */
{
struct bbiChromUsage *usage = usageList;	/* Current chromosome's usage record. */
char *line, **row = NULL;
int fieldCount = 0, fieldAlloc=0, lastField = 0;
int itemIx = 0, sectionIx = 0;	/* Item index within block, block index within file. */
bits64 blockOffset = 0;	/* File offset where the current block began. */
int startPos = 0, endPos = 0;	/* Genomic range covered by the current block. */
bits32 chromId = 0;
boolean allocedAs = FALSE;	/* TRUE if we built the asObject here and must free it below. */
/* Will keep track of some things that help us determine how much to reduce. */
bits32 resEnds[resTryCount];	/* C99 variable-length array: current bucket end per candidate resolution. */
int resTry;
for (resTry = 0; resTry < resTryCount; ++resTry)
resEnds[resTry] = 0;
boolean atEnd = FALSE, sameChrom = FALSE;
bits32 start = 0, end = 0;
char *chrom = NULL;
for (;;)
{
/* Get next line of input if any. */
if (lineFileNextReal(lf, &line))
{
/* First time through figure out the field count, and if not set, the defined field count. */
if (fieldCount == 0)
{
if (as == NULL)
{
/* No autoSql given: count fields in first line and synthesize a bed definition. */
fieldCount = chopByWhite(line, NULL, 0);
if (definedFieldCount == 0)
definedFieldCount = fieldCount;
char *asText = bedAsDef(definedFieldCount, fieldCount);
as = asParseText(asText);
allocedAs = TRUE;
freeMem(asText);
}
else
{
fieldCount = slCount(as->columnList);
}
fieldAlloc = fieldCount + 1;	/* One extra slot so over-long lines are detected rather than truncated. */
lastField = fieldCount - 1;
AllocArray(row, fieldAlloc);
*retFieldCount = fieldCount;
*retDefinedFieldCount = definedFieldCount;
}
/* Chop up line and make sure the word count is right. */
int wordCount = chopByWhite(line, row, fieldAlloc);
lineFileExpectWords(lf, fieldCount, wordCount);
/* Parse out first three fields. */
chrom = row[0];
start = lineFileNeedNum(lf, row, 1);
end = lineFileNeedNum(lf, row, 2);
/* Check remaining fields are formatted right. */
if (fieldCount > 3)
{
/* Go through and check that numerical strings really are numerical. */
struct asColumn *asCol = slElementFromIx(as->columnList, 3);
int i;
for (i=3; i<fieldCount; ++i)
{
enum asTypes type = asCol->lowType->type;
if (! (asCol->isList || asCol->isArray))
{
if (asTypesIsInt(type))
lineFileNeedFullNum(lf, row, i);
else if (asTypesIsFloating(type))
lineFileNeedDouble(lf, row, i);
}
asCol = asCol->next;
}
}
sameChrom = sameString(chrom, usage->name);
}
else /* No next line */
{
atEnd = TRUE;
}
/* Check conditions that would end block and save block info and advance to next if need be. */
if (atEnd || !sameChrom || itemIx >= itemsPerSlot)
{
/* Save info on existing block. */
struct bbiBoundsArray *b = &bounds[sectionIx];
b->offset = blockOffset;
b->range.chromIx = chromId;
b->range.start = startPos;
b->range.end = endPos;
++sectionIx;
itemIx = 0;
if (atEnd)
break;
}
/* Advance to next chromosome if need be and get chromosome id. */
if (!sameChrom)
{
usage = usage->next;
/* Input must be sorted by chromosome; out-of-order input trips these asserts. */
assert(usage != NULL);
assert(sameString(chrom, usage->name));
for (resTry = 0; resTry < resTryCount; ++resTry)
resEnds[resTry] = 0;
}
chromId = usage->id;
/* At start of block we save a lot of info. */
if (itemIx == 0)
{
blockOffset = ftell(f);
startPos = start;
endPos = end;
}
/* Otherwise just update end. */
/* NOTE(review): there is no 'else' here, so this compound statement also runs when
 * itemIx == 0; harmless since endPos was just set to end above, but confirm intent. */
{
if (endPos < end)
endPos = end;
/* No need to update startPos since list is sorted. */
}
/* Write out data. */
writeOne(f, chromId);
writeOne(f, start);
writeOne(f, end);
if (fieldCount > 3)
{
int i;
/* Write fields 3 through next-to-last, each followed by a tab separator. */
for (i=3; i<lastField; ++i)
{
char *s = row[i];
int len = strlen(s);
mustWrite(f, s, len);
fputc('\t', f);
}
/* Write last field and terminal zero */
char *s = row[lastField];
int len = strlen(s);
mustWrite(f, s, len);
}
fputc(0, f);	/* Terminator for the rest-of-fields string; written even for plain bed3. */
itemIx += 1;
/* Do zoom counting. */
for (resTry = 0; resTry < resTryCount; ++resTry)
{
bits32 resEnd = resEnds[resTry];
if (start >= resEnd)
{
/* Item begins a new summary bucket at this resolution. */
resSizes[resTry] += 1;
resEnds[resTry] = resEnd = start + resScales[resTry];
}
while (end > resEnd)
{
/* Item spills over into additional buckets. */
resSizes[resTry] += 1;
resEnds[resTry] = resEnd = resEnd + resScales[resTry];
}
}
}
assert(sectionIx == sectionCount);
freez(&row);
if (allocedAs)
asObjectFreeList(&as);
}
struct rbTree *rangeTreeForBedChrom(struct lineFile *lf, char *chrom)
/* Read lines from bed file as long as they match chrom. Return a rangeTree that
 * corresponds to the coverage. */
{
struct rbTree *tree = rangeTreeNew();
char *line;
for (;;)
    {
    if (!lineFileNextReal(lf, &line))
        break;
    if (!startsWithWord(chrom, line))
        {
        /* Line belongs to the next chromosome; push it back for the next caller. */
        lineFileReuse(lf);
        break;
        }
    char *words[3];
    chopLine(line, words);
    unsigned rangeStart = sqlUnsigned(words[1]);
    unsigned rangeEnd = sqlUnsigned(words[2]);
    rangeTreeAddToCoverageDepth(tree, rangeStart, rangeEnd);
    }
return tree;
}
/* NOTE(review): the -/+ prefixed lines below are unified-diff markers from CVS rev 1.11 -> 1.12;
 * the surviving code is the '+' version (static linkage, bits32 reduction parameters). */
-struct bbiSummary *writeReducedOnceReturnReducedTwice(struct bbiChromUsage *usageList,
- int fieldCount, struct lineFile *lf, int initialReduction, int initialReductionCount,
+static struct bbiSummary *writeReducedOnceReturnReducedTwice(struct bbiChromUsage *usageList,
+ int fieldCount, struct lineFile *lf, bits32 initialReduction, bits32 initialReductionCount,
int zoomIncrement, int blockSize, int itemsPerSlot,
struct lm *lm, FILE *f, bits64 *retDataStart, bits64 *retIndexStart)
/* Write out data reduced by factor of initialReduction. Also calculate and keep in memory
 * next reduction level. This is more work than some ways, but it keeps us from having to
 * keep the first reduction entirely in memory. */
{
struct bbiSummary *twiceReducedList = NULL;
bits32 doubleReductionSize = initialReduction * zoomIncrement;
struct bbiChromUsage *usage = usageList;
struct bbiBoundsArray *boundsArray, *boundsPt, *boundsEnd;
boundsPt = AllocArray(boundsArray, initialReductionCount);
boundsEnd = boundsPt + initialReductionCount;
*retDataStart = ftell(f);
/* New in 1.12: the zoom data section starts with its element count. */
+writeOne(f, initialReductionCount);
/* This gets a little complicated I'm afraid. The strategy is to:
 * 1) Build up a range tree that represents coverage depth on that chromosome
 * This also has the nice side effect of getting rid of overlaps.
 * 2) Stream through the range tree, outputting the initial summary level and
 * further reducing.
 */
for (usage = usageList; usage != NULL; usage = usage->next)
{
struct bbiSummary oneSummary, *sum = NULL;
struct rbTree *rangeTree = rangeTreeForBedChrom(lf, usage->name);
struct range *range, *rangeList = rangeTreeList(rangeTree);
for (range = rangeList; range != NULL; range = range->next)
{
double val = ptToInt(range->val);	/* Coverage depth packed into the range's val pointer. */
int start = range->start;
int end = range->end;
/* If start past existing block then output it. */
if (sum != NULL && sum->end <= start)
{
bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize,
&boundsPt, boundsEnd, usage->size, lm, f);
sum = NULL;
}
/* If don't have a summary we're working on now, make one. */
if (sum == NULL)
{
oneSummary.chromId = usage->id;
oneSummary.start = start;
oneSummary.end = start + initialReduction;
if (oneSummary.end > usage->size) oneSummary.end = usage->size;
oneSummary.minVal = oneSummary.maxVal = val;
oneSummary.sumData = oneSummary.sumSquares = 0.0;
oneSummary.validCount = 0;
sum = &oneSummary;
}
/* Deal with case where might have to split an item between multiple summaries. This
 * loop handles all but the final affected summary in that case. */
while (end > sum->end)
{
verbose(3, "Splitting size %d at %d\n", end - start, end);
/* Fold in bits that overlap with existing summary and output. */
bits32 overlap = rangeIntersection(start, end, sum->start, sum->end);
sum->validCount += overlap;
if (sum->minVal > val) sum->minVal = val;
if (sum->maxVal < val) sum->maxVal = val;
sum->sumData += val * overlap;
/* NOTE(review): this adds val*overlap, not val*val*overlap, so sumSquares duplicates
 * sumData rather than accumulating a sum of squares; confirm against the bbi spec. */
sum->sumSquares += val * overlap;
bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize,
&boundsPt, boundsEnd, usage->size, lm, f);
/* Move summary to next part. */
sum->start = start = sum->end;
sum->end = start + initialReduction;
if (sum->end > usage->size) sum->end = usage->size;
sum->minVal = sum->maxVal = val;
sum->sumData = sum->sumSquares = 0.0;
sum->validCount = 0;
}
/* Add to summary. */
bits32 size = end - start;
sum->validCount += size;
if (sum->minVal > val) sum->minVal = val;
if (sum->maxVal < val) sum->maxVal = val;
sum->sumData += val * size;
/* NOTE(review): same val*size vs. val*val*size question as above. */
sum->sumSquares += val * size;
}
if (sum != NULL)
{
bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize,
&boundsPt, boundsEnd, usage->size, lm, f);
}
rangeTreeFree(&rangeTree);
}
/* Write out 1st zoom index. */
/* NOTE(review): 'int' may truncate ftell's result for output files over 2GB;
 * consider bits64 to match *retIndexStart. */
int indexOffset = *retIndexStart = ftell(f);
assert(boundsPt == boundsEnd);
cirTreeFileBulkIndexToOpenFile(boundsArray, sizeof(boundsArray[0]), initialReductionCount,
blockSize, itemsPerSlot, NULL, bbiBoundsArrayFetchKey, bbiBoundsArrayFetchOffset,
indexOffset, f);
freez(&boundsArray);
slReverse(&twiceReducedList);
return twiceReducedList;
}
void bbFileCreate(
char *inName, /* Input file in a tabular bed format <chrom><start><end> + whatever. */
char *chromSizes, /* Two column tab-separated file: <chromosome> <size>. */
int blockSize, /* Number of items to bundle in r-tree. 1024 is good. */
int itemsPerSlot, /* Number of items in lowest level of tree. 64 is good. */
bits16 definedFieldCount, /* Number of defined bed fields - 3-16 or so. 0 means all fields
 * are the defined bed ones. */
char *asFileName, /* If non-null points to a .as file that describes fields. */
char *outName) /* BigBed output file name. */
/* Convert tab-separated bed file to binary indexed, zoomed bigBed version. */
{
/* Set up timing measures. */
verboseTime(1, NULL);
struct lineFile *lf = lineFileOpen(inName, TRUE);
/* Load up as object if defined in file. */
struct asObject *as = NULL;
if (asFileName != NULL)
{
/* Parse it and do sanity check. */
as = asParseFile(asFileName);
if (as->next != NULL)
errAbort("Can only handle .as files containing a single object.");
}
/* Load in chromosome sizes. */
struct hash *chromSizesHash = bbiChromSizesFromFile(chromSizes);
verbose(2, "Read %d chromosomes and sizes from %s\n", chromSizesHash->elCount, chromSizes);
/* Do first pass, mostly just scanning file and counting hits per chromosome. */
int minDiff = 0;
double aveSpan = 0;
/* NOTE(review): -/+ lines are unified-diff markers; 1.12 adds a bedCount out-parameter. */
-struct bbiChromUsage *usageList = bbiChromUsageFromBedFile(lf, chromSizesHash, &minDiff, &aveSpan);
+bits64 bedCount = 0;
+struct bbiChromUsage *usageList = bbiChromUsageFromBedFile(lf, chromSizesHash, &minDiff, &aveSpan, &bedCount);
verboseTime(1, "pass1 - making usageList");
verbose(2, "%d chroms in %s. Average span of beds %f\n", slCount(usageList), inName, aveSpan);
/* Open output file and write dummy header. */
FILE *f = mustOpen(outName, "wb");
bbiWriteDummyHeader(f);
bbiWriteDummyZooms(f);
/* Write out asFile if any */
bits64 asOffset = 0;
if (asFileName != NULL)
{
int colCount = slCount(as->columnList);
asOffset = ftell(f);
FILE *asFile = mustOpen(asFileName, "r");
copyOpenFile(asFile, f);
fputc(0, f);
carefulClose(&asFile);
verbose(2, "%s has %d columns\n", asFileName, colCount);
}
/* Write out chromosome/size database. */
bits64 chromTreeOffset = ftell(f);
bbiWriteChromInfo(usageList, blockSize, f);
/* Set up to keep track of possible initial reduction levels. */
int resTryCount = 10, resTry;
int resIncrement = 4;
int resScales[resTryCount], resSizes[resTryCount];
int minZoom = 10;
int res = aveSpan;
if (res < minZoom)
res = minZoom;
for (resTry = 0; resTry < resTryCount; ++resTry)
{
resSizes[resTry] = 0;
resScales[resTry] = res;
res *= resIncrement;
}
/* Write out primary full resolution data in sections, collect stats to use for reductions. */
bits64 dataOffset = ftell(f);
/* New in 1.12: item count written at the head of the data section. */
+writeOne(f, bedCount);
bits32 blockCount = bbiCountSectionsNeeded(usageList, itemsPerSlot);
struct bbiBoundsArray *boundsArray;
AllocArray(boundsArray, blockCount);
lineFileRewind(lf);
bits16 fieldCount=0;
writeBlocks(usageList, lf, as, definedFieldCount, itemsPerSlot, boundsArray, blockCount, f,
resTryCount, resScales, resSizes, &fieldCount, &definedFieldCount);
verboseTime(1, "pass2 - checking and writing primary data");
/* Write out primary data index. */
bits64 indexOffset = ftell(f);
cirTreeFileBulkIndexToOpenFile(boundsArray, sizeof(boundsArray[0]), blockCount,
blockSize, 1, NULL, bbiBoundsArrayFetchKey, bbiBoundsArrayFetchOffset,
indexOffset, f);
freez(&boundsArray);
verboseTime(1, "index write");
/* Declare arrays and vars that track the zoom levels we actually output. */
bits32 zoomAmounts[bbiMaxZoomLevels];
bits64 zoomDataOffsets[bbiMaxZoomLevels];
bits64 zoomIndexOffsets[bbiMaxZoomLevels];
int zoomLevels = 0;
/* Write out first zoomed section while storing in memory next zoom level. */
if (aveSpan > 0)
{
bits64 dataSize = indexOffset - dataOffset;
/* NOTE(review): int may truncate dataSize/2 when the data section exceeds ~4GB; confirm. */
int maxReducedSize = dataSize/2;
int initialReduction = 0, initialReducedCount = 0;
/* Figure out initialReduction for zoom. */
for (resTry = 0; resTry < resTryCount; ++resTry)
{
bits64 reducedSize = resSizes[resTry] * sizeof(struct bbiSummaryOnDisk);
if (reducedSize <= maxReducedSize)
{
initialReduction = resScales[resTry];
initialReducedCount = resSizes[resTry];
break;
}
}
verbose(2, "initialReduction %d, initialReducedCount = %d\n",
initialReduction, initialReducedCount);
verbose(2, "dataSize %llu, reducedSize %llu, resScales[0] = %d\n", dataSize, (bits64)initialReducedCount*sizeof(struct bbiSummaryOnDisk), resScales[0]);
if (initialReduction > 0)
{
struct lm *lm = lmInit(0);
/* NOTE(review): the call below passes resIncrement (also 4) as the zoomIncrement
 * parameter while this local is used for the later reduction math; both are 4 today,
 * but the duplication is fragile -- confirm they are meant to stay equal. */
int zoomIncrement = 4;
lineFileRewind(lf);
struct bbiSummary *rezoomedList = writeReducedOnceReturnReducedTwice(usageList,
fieldCount, lf, initialReduction, initialReducedCount,
resIncrement, blockSize, itemsPerSlot, lm,
f, &zoomDataOffsets[0], &zoomIndexOffsets[0]);
verboseTime(1, "pass3 - writeReducedOnceReturnReducedTwice");
zoomAmounts[0] = initialReduction;
zoomLevels = 1;
int zoomCount = initialReducedCount;
int reduction = initialReduction * zoomIncrement;
while (zoomLevels < bbiMaxZoomLevels)
{
int rezoomCount = slCount(rezoomedList);
if (rezoomCount >= zoomCount)
break;	/* Stop when further reduction no longer shrinks the summary. */
zoomCount = rezoomCount;
zoomDataOffsets[zoomLevels] = ftell(f);
zoomIndexOffsets[zoomLevels] = bbiWriteSummaryAndIndex(rezoomedList,
blockSize, itemsPerSlot, f);
zoomAmounts[zoomLevels] = reduction;
++zoomLevels;
reduction *= zoomIncrement;
rezoomedList = bbiSummarySimpleReduce(rezoomedList, reduction, lm);
}
lmCleanup(&lm);
verboseTime(1, "further reductions");
}
}
/* Go back and rewrite header. */
rewind(f);
bits32 sig = bigBedSig;
/* Version bumped in 1.12 to track the new on-disk counts written above. */
-bits16 version = 1;
+bits16 version = bbiCurrentVersion;
bits16 summaryCount = zoomLevels;
bits32 reserved32 = 0;
/* NOTE(review): reserved64 is declared bits32, so each writeOne below emits only 4 bytes
 * where the empty zoom-header slots appear to need 8-byte offsets; likely should be bits64. */
bits32 reserved64 = 0;
/* Write fixed header */
writeOne(f, sig);
writeOne(f, version);
writeOne(f, summaryCount);
writeOne(f, chromTreeOffset);
writeOne(f, dataOffset);
writeOne(f, indexOffset);
writeOne(f, fieldCount);
writeOne(f, definedFieldCount);
writeOne(f, asOffset);
int i;
for (i=0; i<5; ++i)
writeOne(f, reserved32);
/* Write summary headers with data. */
verbose(2, "Writing %d levels of zoom\n", zoomLevels);
for (i=0; i<zoomLevels; ++i)
{
verbose(3, "zoomAmounts[%d] = %d\n", i, (int)zoomAmounts[i]);
writeOne(f, zoomAmounts[i]);
writeOne(f, reserved32);
writeOne(f, zoomDataOffsets[i]);
writeOne(f, zoomIndexOffsets[i]);
}
/* Write rest of summary headers with no data. */
for (i=zoomLevels; i<bbiMaxZoomLevels; ++i)
{
writeOne(f, reserved32);
writeOne(f, reserved32);
writeOne(f, reserved64);
writeOne(f, reserved64);
}
/* Clean up. */
lineFileClose(&lf);
carefulClose(&f);
freeHash(&chromSizesHash);
bbiChromUsageFreeList(&usageList);
asObjectFreeList(&as);
}
void bedToBigBed(char *inName, char *chromSizes, char *outName)
/* bedToBigBed - Convert bed file to bigBed.. */
{
/* Hand off to the bigBed builder, passing along the global option values. */
bbFileCreate(inName, chromSizes, blockSize, itemsPerSlot, bedFields, as, outName);
}
int main(int argc, char *argv[])
/* Process command line. */
{
optionInit(&argc, argv, options);
/* Pull any overrides of the tuning/format globals off the command line. */
as = optionVal("as", as);
bedFields = optionInt("bedFields", bedFields);
itemsPerSlot = optionInt("itemsPerSlot", itemsPerSlot);
blockSize = optionInt("blockSize", blockSize);
if (argc != 4)
    usage();
bedToBigBed(argv[1], argv[2], argv[3]);
optionFree();
return 0;
}