e947f44bed3e61a500f59423d3c560eff9bcf270 galt Fri Dec 13 15:03:41 2013 -0800
Merging in bigWigCat shared branch with --squash. This is Daniel Zerbino's work.

diff --git src/lib/bwgCreate.c src/lib/bwgCreate.c
index af76cd7..7cf8e3f 100644
--- src/lib/bwgCreate.c
+++ src/lib/bwgCreate.c
@@ -614,30 +614,102 @@
 int i;
 for (i = 0, uniq = uniqList; i < chromCount; ++i, uniq = uniq->next)
     {
     chromArray[i].name = uniq->val;
     chromArray[i].id = i;
     chromArray[i].size = hashIntVal(chromSizeHash, uniq->val);
     }

 /* Clean up, set return values and go home. */
 slFreeList(&uniqList);
 *retChromCount = chromCount;
 *retChromArray = chromArray;
 *retMaxChromNameSize = maxChromNameSize;
 }

+static int bwgStrcmp (const void * A, const void * B) {
+    char * stringA = *((char **) A);
+    char * stringB = *((char **) B);
+    return strcmp(stringA, stringB);
+}
+
+void bwgMakeAllChromInfo(struct bwgSection *sectionList, struct hash *chromSizeHash,
+    int *retChromCount, struct bbiChromInfo **retChromArray,
+    int *retMaxChromNameSize)
+/* Fill in chromId field in sectionList.  Return array of chromosome name/ids.
+ * The chromSizeHash is keyed by name, and has int values. */
+{
+/* Build up list of unique chromosome names. */
+int maxChromNameSize = 0;
+
+/* Get list of values */
+int chromCount = chromSizeHash->elCount;
+char ** chromName, ** chromNames;
+AllocArray(chromNames, chromCount);
+chromName = chromNames;
+struct hashEl* el;
+struct hashCookie cookie = hashFirst(chromSizeHash);
+for (el = hashNext(&cookie); el; el = hashNext(&cookie)) {
+    *chromName = el->name;
+    if (strlen(el->name) > maxChromNameSize)
+        maxChromNameSize = strlen(el->name);
+    chromName++;
+}
+qsort(chromNames, chromCount, sizeof(char *), bwgStrcmp);
+
+/* Allocate and fill in results array. */
+struct bbiChromInfo *chromArray;
+AllocArray(chromArray, chromCount);
+int i;
+for (i = 0; i < chromCount; ++i)
+    {
+    chromArray[i].name = chromNames[i];
+    chromArray[i].id = i;
+    chromArray[i].size = hashIntVal(chromSizeHash, chromNames[i]);
+    }
+
+// Assign IDs to sections:
+struct bwgSection *section;
+char *name = "";
+bits32 chromId = 0;
+for (section = sectionList; section != NULL; section = section->next)
+    {
+    if (!sameString(section->chrom, name))
+        {
+        for (i = 0; i < chromCount; ++i)
+            {
+            if (sameString(section->chrom, chromArray[i].name))
+                {
+                section->chromId = i;
+                break;
+                }
+            }
+        if (i == chromCount)
+            errAbort("Could not find %s in list of chromosomes\n", section->chrom);
+        chromId = section->chromId;
+        name = section->chrom;
+        }
+    else
+        section->chromId = chromId;
+    }
+
+/* Clean up, set return values and go home. */
+*retChromCount = chromCount;
+*retChromArray = chromArray;
+*retMaxChromNameSize = maxChromNameSize;
+}
+
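A quick aside on the bwgStrcmp/qsort pairing above: it is the standard C idiom for ordering an array of strings, and the sorted position then doubles as the chromosome id. A minimal self-contained sketch of the same pattern, with made-up chromosome names (cmpStrings plays the role of bwgStrcmp):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* qsort comparator over an array of char*: each element is a pointer to a
 * string, so the void* arguments point at char* values. */
static int cmpStrings(const void *a, const void *b)
{
return strcmp(*(char * const *)a, *(char * const *)b);
}

int main(void)
{
/* Hypothetical chromosome names, deliberately unsorted. */
char *chromNames[] = {"chr10", "chr2", "chr1", "chrM"};
int chromCount = sizeof(chromNames) / sizeof(chromNames[0]);
qsort(chromNames, chromCount, sizeof(char *), cmpStrings);
for (int i = 0; i < chromCount; ++i)
    printf("%d\t%s\n", i, chromNames[i]);   /* id follows sorted position */
return 0;
}

The result is plain strcmp order, so "chr10" sorts before "chr2"; that lexicographic order is what the chromosome b-tree keys expect, not a natural numeric order.
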
 int bwgAverageResolution(struct bwgSection *sectionList)
 /* Return the average resolution seen in sectionList. */
 {
 if (sectionList == NULL)
     return 1;
 bits64 totalRes = 0;
 bits32 sectionCount = 0;
 struct bwgSection *section;
 int i;
 for (section = sectionList; section != NULL; section = section->next)
     {
     int sectionRes = 0;
     switch (section->type)
         {
         case bwgTypeBedGraph:
@@ -784,119 +856,152 @@
         case bwgTypeVariableStep:
             bwgReduceVariableStep(section, chromSize, reduction, &outList);
             break;
         case bwgTypeFixedStep:
             bwgReduceFixedStep(section, chromSize, reduction, &outList);
             break;
         default:
             internalErr();
             return 0;
         }
     }
 slReverse(&outList);
 return outList;
 }

-void bwgCreate(struct bwgSection *sectionList, struct hash *chromSizeHash,
-    int blockSize, int itemsPerSlot, boolean doCompress, char *fileName)
-/* Create a bigWig file out of a sorted sectionList. */
-{
-bits64 sectionCount = slCount(sectionList);
-FILE *f = mustOpen(fileName, "wb");
-bits32 sig = bigWigSig;
-bits16 version = bbiCurrentVersion;
-bits16 summaryCount = 0;
-bits16 reserved16 = 0;
-bits32 reserved32 = 0;
-bits64 reserved64 = 0;
-bits64 dataOffset = 0, dataOffsetPos;
-bits64 indexOffset = 0, indexOffsetPos;
-bits64 chromTreeOffset = 0, chromTreeOffsetPos;
-bits64 totalSummaryOffset = 0, totalSummaryOffsetPos;
-bits32 uncompressBufSize = 0;
-bits64 uncompressBufSizePos;
-struct bbiSummary *reduceSummaries[10];
-bits32 reductionAmounts[10];
-bits64 reductionDataOffsetPos[10];
-bits64 reductionDataOffsets[10];
-bits64 reductionIndexOffsets[10];
-int i;
-
-/* Figure out chromosome ID's. */
-struct bbiChromInfo *chromInfoArray;
-int chromCount, maxChromNameSize;
-bwgMakeChromInfo(sectionList, chromSizeHash, &chromCount, &chromInfoArray, &maxChromNameSize);
-
+static void bwgComputeDynamicSummaries(struct bwgSection *sectionList, struct bbiSummary ** reduceSummaries, bits16 * summaryCount, struct bbiChromInfo *chromInfoArray, int chromCount, bits32 * reductionAmounts, boolean doCompress) {
 /* Figure out initial summary level - starting with a summary 10 times the amount
  * of the smallest item.  See if summarized data is smaller than half input data, if
  * not bump up reduction by a factor of 2 until it is, or until further summarizing
  * yields no size reduction. */
+int i;
 int minRes = bwgAverageResolution(sectionList);
 int initialReduction = minRes*10;
 bits64 fullSize = bwgTotalSectionSize(sectionList);
-bits64 maxReducedSize = fullSize/2;
-struct bbiSummary *firstSummaryList = NULL, *summaryList = NULL;
 bits64 lastSummarySize = 0, summarySize;
+bits64 maxReducedSize = fullSize/2;
+struct bbiSummary *summaryList = NULL;
 for (;;)
     {
     summaryList = bwgReduceSectionList(sectionList, chromInfoArray, initialReduction);
     bits64 summarySize = bbiTotalSummarySize(summaryList);
     if (doCompress)
         {
         summarySize *= 2;   // Compensate for summary not compressing as well as primary data
         }
     if (summarySize >= maxReducedSize && summarySize != lastSummarySize)
         {
         /* Need to do more reduction.  First scale reduction by amount that it missed
          * being small enough last time, with an extra 10% for good measure.  Then
          * just to keep from spinning through loop too many times, make sure this is
          * at least 2x the previous reduction. */
         int nextReduction = 1.1 * initialReduction * summarySize / maxReducedSize;
         if (nextReduction < initialReduction*2)
             nextReduction = initialReduction*2;
         initialReduction = nextReduction;
         bbiSummaryFreeList(&summaryList);
         lastSummarySize = summarySize;
         }
     else
         break;
     }
-summaryCount = 1;
-reduceSummaries[0] = firstSummaryList = summaryList;
+*summaryCount = 1;
+reduceSummaries[0] = summaryList;
 reductionAmounts[0] = initialReduction;

 /* Now calculate up to 10 levels of further summary. */
 bits64 reduction = initialReduction;
-for (i=0; i<ArraySize(reduceSummaries)-1; i++)
+for (i=0; i<9; i++)
     {
     reduction *= 4;
     if (reduction > 1000000000)
         break;
-    summaryList = bbiReduceSummaryList(reduceSummaries[summaryCount-1], chromInfoArray,
+    summaryList = bbiReduceSummaryList(reduceSummaries[*summaryCount-1], chromInfoArray,
         reduction);
     summarySize = bbiTotalSummarySize(summaryList);
     if (summarySize != lastSummarySize)
         {
-        reduceSummaries[summaryCount] = summaryList;
-        reductionAmounts[summaryCount] = reduction;
-        ++summaryCount;
+        reduceSummaries[*summaryCount] = summaryList;
+        reductionAmounts[*summaryCount] = reduction;
+        ++(*summaryCount);
         }
     int summaryItemCount = slCount(summaryList);
     if (summaryItemCount <= chromCount)
         break;
     }
+}
+
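The sizing loop at the top of bwgComputeDynamicSummaries is easier to follow outside the diff. Here is a minimal standalone sketch of the same heuristic with made-up numbers; summarySizeAt() is a stand-in for the real bwgReduceSectionList + bbiTotalSummarySize pass, and the doCompress adjustment is omitted:

#include <stdio.h>

typedef unsigned long long bits64;

/* Stand-in for summarizing at a given reduction and measuring the result.
 * The shrink model is invented purely so the loop has something to chew on. */
static bits64 summarySizeAt(bits64 fullSize, int reduction, int minRes)
{
return fullSize * 10 / (reduction / minRes + 10);
}

int main(void)
{
int minRes = 50;                      /* hypothetical average item resolution */
bits64 fullSize = 10000000;           /* hypothetical size of the primary data */
int initialReduction = minRes * 10;   /* start at 10x the smallest item */
bits64 maxReducedSize = fullSize / 2; /* want the first zoom under half the input */
bits64 lastSummarySize = 0;
for (;;)
    {
    bits64 summarySize = summarySizeAt(fullSize, initialReduction, minRes);
    if (summarySize >= maxReducedSize && summarySize != lastSummarySize)
        {
        /* Scale by how far we missed, plus 10%, but always at least double. */
        int nextReduction = 1.1 * initialReduction * summarySize / maxReducedSize;
        if (nextReduction < initialReduction * 2)
            nextReduction = initialReduction * 2;
        initialReduction = nextReduction;
        lastSummarySize = summarySize;
        }
    else
        break;
    }
printf("first zoom reduction: %d\n", initialReduction);
return 0;
}
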
+static void bwgComputeFixedSummaries(struct bwgSection * sectionList, struct bbiSummary ** reduceSummaries, bits16 * summaryCount, struct bbiChromInfo *chromInfoArray, bits32 * reductionAmounts) {
+// Hack: pre-defining summary levels, based off Ensembl default zoom levels
+// The last two values of this array were extrapolated following Jim's formula
+int i;
+#define REDUCTION_COUNT 10
+bits32 presetReductions[REDUCTION_COUNT] = {30, 65, 130, 260, 450, 648, 950, 1296, 4800, 19200};
+
+bits64 reduction = reductionAmounts[0] = presetReductions[0];
+reduceSummaries[0] = bwgReduceSectionList(sectionList, chromInfoArray, presetReductions[0]);
+
+for (i=1; i<REDUCTION_COUNT; i++)
+    {
+    reduction = reductionAmounts[i] = presetReductions[i];
+    reduceSummaries[i] = bbiReduceSummaryList(reduceSummaries[i-1], chromInfoArray,
+        reduction);
+    }
+
+*summaryCount = REDUCTION_COUNT;
+}
+
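The two helpers above produce different zoom ladders: bwgComputeDynamicSummaries grows levels by a factor of 4 from a data-dependent starting point (stopping past 1,000,000,000 or once a level has no more items than chromosomes), while bwgComputeFixedSummaries always uses the preset list. A small self-contained comparison, with a hypothetical initial reduction of 500 standing in for whatever the dynamic path would choose:

#include <stdio.h>

int main(void)
{
/* Preset levels copied from bwgComputeFixedSummaries above. */
int preset[10] = {30, 65, 130, 260, 450, 648, 950, 1296, 4800, 19200};

/* Dynamic ladder: multiply by 4 per level, capped at 1,000,000,000. */
long long dynamic[10];
int dynamicCount = 1;
dynamic[0] = 500;    /* pretend the dynamic sizing loop settled on 500 */
while (dynamicCount < 10)
    {
    long long next = dynamic[dynamicCount-1] * 4;
    if (next > 1000000000)
        break;
    dynamic[dynamicCount++] = next;
    }

printf("level\tfixed\tdynamic\n");
for (int i = 0; i < 10; ++i)
    {
    printf("%d\t%d\t", i, preset[i]);
    if (i < dynamicCount)
        printf("%lld\n", dynamic[i]);
    else
        printf("-\n");
    }
return 0;
}
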
+void bwgCreate(struct bwgSection *sectionList, struct hash *chromSizeHash,
+    int blockSize, int itemsPerSlot, boolean doCompress, boolean keepAllChromosomes,
+    boolean fixedSummaries, char *fileName)
+/* Create a bigWig file out of a sorted sectionList. */
+{
+bits64 sectionCount = slCount(sectionList);
+FILE *f = mustOpen(fileName, "wb");
+bits32 sig = bigWigSig;
+bits16 version = bbiCurrentVersion;
+bits16 summaryCount = 0;
+bits16 reserved16 = 0;
+bits32 reserved32 = 0;
+bits64 reserved64 = 0;
+bits64 dataOffset = 0, dataOffsetPos;
+bits64 indexOffset = 0, indexOffsetPos;
+bits64 chromTreeOffset = 0, chromTreeOffsetPos;
+bits64 totalSummaryOffset = 0, totalSummaryOffsetPos;
+bits32 uncompressBufSize = 0;
+bits64 uncompressBufSizePos;
+struct bbiSummary *reduceSummaries[10];
+bits32 reductionAmounts[10];
+bits64 reductionDataOffsetPos[10];
+bits64 reductionDataOffsets[10];
+bits64 reductionIndexOffsets[10];
+int i;
+
+/* Figure out chromosome ID's. */
+struct bbiChromInfo *chromInfoArray;
+int chromCount, maxChromNameSize;
+if (keepAllChromosomes)
+    bwgMakeAllChromInfo(sectionList, chromSizeHash, &chromCount, &chromInfoArray, &maxChromNameSize);
+else
+    bwgMakeChromInfo(sectionList, chromSizeHash, &chromCount, &chromInfoArray, &maxChromNameSize);
+
+if (fixedSummaries)
+    bwgComputeFixedSummaries(sectionList, reduceSummaries, &summaryCount, chromInfoArray, reductionAmounts);
+else
+    bwgComputeDynamicSummaries(sectionList, reduceSummaries, &summaryCount, chromInfoArray, chromCount, reductionAmounts, doCompress);
+
 /* Write fixed header. */
 writeOne(f, sig);
 writeOne(f, version);
 writeOne(f, summaryCount);
 chromTreeOffsetPos = ftell(f);
 writeOne(f, chromTreeOffset);
 dataOffsetPos = ftell(f);
 writeOne(f, dataOffset);
 indexOffsetPos = ftell(f);
 writeOne(f, indexOffset);
 writeOne(f, reserved16);  /* fieldCount */
 writeOne(f, reserved16);  /* definedFieldCount */
 writeOne(f, reserved64);  /* autoSqlOffset. */
 totalSummaryOffsetPos = ftell(f);
 writeOne(f, totalSummaryOffset);
@@ -950,31 +1055,31 @@
 cirTreeFileBulkIndexToOpenFile(sectionArray, sizeof(sectionArray[0]), sectionCount,
     blockSize, 1, NULL, bwgSectionFetchKey, bwgSectionFetchOffset,
     indexOffset, f);
 freez(&sectionArray);

 /* Write out summary sections. */
 verbose(2, "bwgCreate writing %d summaries\n", summaryCount);
 for (i=0; i<summaryCount; ++i)
     {
     reductionDataOffsets[i] = ftell(f);
     reductionIndexOffsets[i] = bbiWriteSummaryAndIndex(reduceSummaries[i], blockSize, itemsPerSlot, doCompress, f);
     verbose(3, "wrote %d of data, %d of index on level %d\n",
         (int)(reductionIndexOffsets[i] - reductionDataOffsets[i]),
         (int)(ftell(f) - reductionIndexOffsets[i]), i);
     }

 /* Calculate summary */
-struct bbiSummary *sum = firstSummaryList;
+struct bbiSummary *sum = reduceSummaries[0];
 if (sum != NULL)
     {
     totalSum.validCount = sum->validCount;
     totalSum.minVal = sum->minVal;
     totalSum.maxVal = sum->maxVal;
     totalSum.sumData = sum->sumData;
     totalSum.sumSquares = sum->sumSquares;
     for (sum = sum->next; sum != NULL; sum = sum->next)
         {
         totalSum.validCount += sum->validCount;
         if (sum->minVal < totalSum.minVal) totalSum.minVal = sum->minVal;
         if (sum->maxVal > totalSum.maxVal) totalSum.maxVal = sum->maxVal;
         totalSum.sumData += sum->sumData;
         totalSum.sumSquares += sum->sumSquares;
         }
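The fixed-header block above leans on a pattern that recurs throughout this writer: write a zero placeholder for an offset that is not known yet, remember its position with ftell(), and fseek() back to patch it once the real value exists (the later patching step is outside these hunks, but it is what the *Pos variables are for). A minimal sketch of the pattern in isolation; the file name and field are made up:

#include <stdio.h>

int main(void)
{
FILE *f = fopen("demo.bin", "w+b");
if (f == NULL)
    {
    perror("demo.bin");
    return 1;
    }

/* Write a zero placeholder for an offset we do not know yet,
 * remembering where the placeholder lives in the file. */
unsigned long long dataOffset = 0;
long dataOffsetPos = ftell(f);
fwrite(&dataOffset, sizeof(dataOffset), 1, f);

/* ... write the rest of the header, then the data section ... */
long dataStart = ftell(f);
const char payload[] = "payload";
fwrite(payload, sizeof(payload), 1, f);

/* Seek back and patch the placeholder with the real offset. */
dataOffset = (unsigned long long)dataStart;
fseek(f, dataOffsetPos, SEEK_SET);
fwrite(&dataOffset, sizeof(dataOffset), 1, f);

fclose(f);
printf("data section starts at offset %llu\n", dataOffset);
return 0;
}
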
@@ -1080,34 +1185,36 @@
                 }
             }
         }
     }
 return sectionList;
 }

 void bigWigFileCreate(
     char *inName,           /* Input file in ascii wiggle format. */
     char *chromSizes,       /* Two column tab-separated file: <chromosome> <size>. */
     int blockSize,          /* Number of items to bundle in r-tree.  1024 is good. */
     int itemsPerSlot,       /* Number of items in lowest level of tree.  512 is good. */
     boolean clipDontDie,    /* If TRUE then clip items off end of chrom rather than dying. */
     boolean compress,       /* If TRUE then compress data. */
+    boolean keepAllChromosomes,  /* If TRUE then store all chromosomes in chromosomal b-tree. */
+    boolean fixedSummaries,      /* If TRUE then impose fixed summary levels. */
     char *outName)
 /* Convert ascii format wig file (in fixedStep, variableStep or bedGraph format)
  * to binary big wig format. */
 {
 /* This code needs to agree with code in two other places currently - bigBedFileCreate,
  * and bbiFileOpen.  I'm thinking of refactoring to share at least between
  * bigBedFileCreate and bigWigFileCreate.  It'd be great if it could be structured
  * so that it could send the input in one chromosome at a time, and send in the zoom
  * stuff only after all the chromosomes are done.  This'd potentially reduce the memory
  * footprint by a factor of 2 or 4.  Still, for now it works. -JK */
 struct hash *chromSizeHash = bbiChromSizesFromFile(chromSizes);
 struct lm *lm = lmInit(0);
 struct bwgSection *sectionList = bwgParseWig(inName, clipDontDie, chromSizeHash, itemsPerSlot, lm);
 if (sectionList == NULL)
     errAbort("%s is empty of data", inName);
-bwgCreate(sectionList, chromSizeHash, blockSize, itemsPerSlot, compress, outName);
+bwgCreate(sectionList, chromSizeHash, blockSize, itemsPerSlot, compress, keepAllChromosomes, fixedSummaries, outName);
 lmCleanup(&lm);
 }
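For anyone wiring up the new parameters, here is a hypothetical call site exercising the two flags this commit adds. The signature is the one declared above; the file names, tuning values, and the helper function itself are placeholders, and it assumes bigWigFileCreate's prototype is visible through the usual kent includes (common.h and bigWig.h here):

#include "common.h"
#include "bigWig.h"

/* Hypothetical driver: build out.bw from in.wig and chrom.sizes, keeping every
 * chromosome from chrom.sizes in the b-tree and using the preset summary levels,
 * the combination a bigWigCat-friendly input would likely want. */
void makeCatFriendlyBigWig()
{
bigWigFileCreate("in.wig", "chrom.sizes",
    1024,     /* blockSize: items to bundle per r-tree node */
    512,      /* itemsPerSlot: items in lowest level of tree */
    TRUE,     /* clipDontDie */
    TRUE,     /* compress */
    TRUE,     /* keepAllChromosomes: store all chromosomes in the b-tree */
    TRUE,     /* fixedSummaries: impose the fixed summary levels */
    "out.bw");
}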