src/utils/bedToBigBed/bedToBigBed.c 12d0f12bd304787c52cab0780e367d36b020f84e

12d0f12bd304787c52cab0780e367d36b020f84e
kent
  Tue Feb 26 12:11:18 2013 -0800
Adding name index to bigBed files.  The write side I _think_ is working.  Still developing read side.
diff --git src/utils/bedToBigBed/bedToBigBed.c src/utils/bedToBigBed/bedToBigBed.c
index 54bbfa9..06e66b5 100644
--- src/utils/bedToBigBed/bedToBigBed.c
+++ src/utils/bedToBigBed/bedToBigBed.c
@@ -1,34 +1,36 @@
 /* bedToBigBed - Convert bed to bigBed.. */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "dystring.h"
 #include "obscure.h"
 #include "asParse.h"
 #include "basicBed.h"
 #include "sig.h"
 #include "rangeTree.h"
 #include "zlibFace.h"
 #include "sqlNum.h"
+#include "bPlusTree.h"
 #include "bigBed.h"
 
 char *version = "2.3";
 
 int blockSize = 256;
 int itemsPerSlot = 512;
+boolean doNameIndex = FALSE;
 int bedN = 0;   /* number of standard bed fields */
 int bedP = 0;   /* number of bed plus fields */
 char *as = NULL;
 static boolean doCompress = FALSE;
 static boolean tabSep = FALSE;
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "bedToBigBed v. %s - Convert bed file to bigBed. (BigBed version: %d)\n"
   "usage:\n"
   "   bedToBigBed in.bed chrom.sizes out.bb\n"
   "Where in.bed is in one of the ascii bed formats, but not including track lines\n"
   "and chrom.sizes is two column: <chromosome name> <size in bases>\n"
@@ -41,74 +43,112 @@
   "\n"
   "options:\n"
   "   -type=bedN[+[P]] : \n"
   "                      N is between 3 and 15, \n"
   "                      optional (+) if extra \"bedPlus\" fields, \n"
   "                      optional P specifies the number of extra fields. Not required, but preferred.\n"
   "                      Examples: -type=bed6 or -type=bed6+ or -type=bed6+3 \n"
   "                      (see http://genome.ucsc.edu/FAQ/FAQformat.html#format1)\n"
   "   -as=fields.as - If you have non-standard \"bedPlus\" fields, it's great to put a definition\n"
   "                   of each field in a row in AutoSql format here.\n"
   "   -blockSize=N - Number of items to bundle in r-tree.  Default %d\n"
   "   -itemsPerSlot=N - Number of data points bundled at lowest level. Default %d\n"
   "   -unc - If set, do not use compression.\n"
   "   -tab - If set, expect fields to be tab separated, normally\n"
   "           expects white space separator.\n"
+  "   -nameIndex - If set, make an index of the name field\n"
   , version, bbiCurrentVersion, blockSize, itemsPerSlot
   );
 }
 
 static struct optionSpec options[] = {
    {"blockSize", OPTION_INT},
    {"itemsPerSlot", OPTION_INT},
    {"type", OPTION_STRING},
    {"as", OPTION_STRING},
    {"unc", OPTION_BOOLEAN},
    {"tab", OPTION_BOOLEAN},
+   {"nameIndex", OPTION_BOOLEAN},
    {NULL, 0},
 };
 
-void writeBlocks(struct bbiChromUsage *usageList, struct lineFile *lf, struct asObject *as, 
+struct bbNamedFileChunk 
+/* A name associated with an offset into a possibly large file. */
+    {
+    char *name;
+    bits64 offset;
+    bits64 size;
+    };
+
+int bbNamedFileChunkCmpByName(const void *va, const void *vb)
+/* Compare two named offset object to facilitate qsorting by name. */
+{
+const struct bbNamedFileChunk *a = va, *b = vb;
+return strcmp(a->name, b->name);
+}
+
+static int maxBedNameSize;
+
+void bbNamedFileChunkKey(const void *va, char *keyBuf)
+/* Copy name to keyBuf for bPlusTree maker */
+{
+const struct bbNamedFileChunk *item = va;
+strncpy(keyBuf,item->name, maxBedNameSize);
+}
+
+void *bbNamedFileChunkVal(const void *va)
+/* Return pointer to val for bPlusTree maker. */
+{
+const struct bbNamedFileChunk *item = va;
+return (void *)&item->offset;
+}
+
+static void writeBlocks(struct bbiChromUsage *usageList, struct lineFile *lf, struct asObject *as, 
 	int itemsPerSlot, struct bbiBoundsArray *bounds, 
 	int sectionCount, boolean doCompress, FILE *f, 
 	int resTryCount, int resScales[], int resSizes[],
+	struct bbNamedFileChunk *namedChunks, int bedCount,
 	bits16 *retFieldCount, bits32 *retMaxBlockSize)
 /* Read through lf, writing it in f.  Save starting points of blocks (every itemsPerSlot)
  * to boundsArray */
 {
 int maxBlockSize = 0;
 struct bbiChromUsage *usage = usageList;
 char *line, *row[256];  // limit of 256 columns is arbitrary, but useful to catch pathological input
 int fieldCount = 0, lastField = 0;
 int itemIx = 0, sectionIx = 0;
-bits64 blockOffset = 0;
+bits64 blockStartOffset = 0, blockEndOffset = 0;
 int startPos = 0, endPos = 0;
 bits32 chromId = 0;
 boolean allocedAs = FALSE;
 struct dyString *stream = dyStringNew(0);
 
 /* Will keep track of some things that help us determine how much to reduce. */
 bits32 resEnds[resTryCount];
 int resTry;
 for (resTry = 0; resTry < resTryCount; ++resTry)
     resEnds[resTry] = 0;
 boolean atEnd = FALSE, sameChrom = FALSE;
 bits32 start = 0, end = 0;
 char *chrom = NULL;
 struct bed *bed;
 AllocVar(bed);
 
+/* Help keep track of which beds are in current chunk so as to write out
+ * namedChunks. */
+int sectionStartIx = 0, sectionEndIx = 0;
+
 for (;;)
     {
     /* Get next line of input if any. */
     if (lineFileNextReal(lf, &line))
 	{
 	/* First time through figure out the field count and if not set, the bedN. */
 	if (fieldCount == 0)
 	    {
 	    if (as == NULL)
 		{
 		if (tabSep)
 		    fieldCount = chopByChar(line, '\t', NULL, 256); // Do not use chopString, see GOTCHA
 		else
 		    fieldCount = chopByWhite(line, NULL, 0);
 		if (bedN == 0)
@@ -132,30 +172,33 @@
 	    if (fieldCount > ArraySize(row))
 		errAbort("Too many fields [%d], current maximum fields limit is %lu", fieldCount, (unsigned long)ArraySize(row));
 	    lastField = fieldCount - 1;
 	    *retFieldCount = fieldCount;
 
 	    if (bedP == -1)  // user did not specify how many plus columns there are.
 		{
 		bedP = fieldCount - bedN;
 		if (bedP < 1)
 		    errAbort("fieldCount input (%d) did not match the specification (%s)\n"
 			, fieldCount, optionVal("type", ""));
 		}
 	    if (fieldCount != bedN + bedP)
 		errAbort("fieldCount input (%d) did not match the specification (%s)\n"
 		    , fieldCount, optionVal("type", ""));
+
+	    if (namedChunks != NULL && fieldCount < 4)
+	        errAbort("No name field in bed, can't index.");
 	    }
 
 	/* Chop up line and make sure the word count is right. */
 	int wordCount;
 	if (tabSep)
 	    wordCount = chopTabs(line, row);
 	else
 	    wordCount = chopLine(line, row);
 	lineFileExpectWords(lf, fieldCount, wordCount);
 
 	loadAndValidateBed(row, bedN, fieldCount, lf, bed, as, FALSE);
 
 	chrom = bed->chrom;
 	start = bed->chromStart;
 	end = bed->chromEnd;
@@ -187,68 +230,89 @@
                 // free up the old not-big-enough piece
                 freez(&compBuf); // freez knows bout NULL
 
                 // get new scratch area
                 compBufSize = maxCompSize;
                 compBuf = needLargeMem(compBufSize);
                 }
 
 	    int compSize = zCompress(stream->string, stream->stringSize, compBuf, maxCompSize);
 	    mustWrite(f, compBuf, compSize);
 	    }
 	else
 	    mustWrite(f, stream->string, stream->stringSize);
 	dyStringClear(stream);
 
+	/* Save block offset and size for all named chunks in this section. */
+	if (namedChunks != NULL)
+	    {
+	    blockEndOffset = ftell(f);
+	    int i;
+	    for (i=sectionStartIx; i<sectionEndIx; ++i)
+	        {
+		struct bbNamedFileChunk *chunk = namedChunks + i;
+		chunk->offset = blockStartOffset;
+		chunk->size = blockEndOffset - blockStartOffset;
+		}
+	    sectionStartIx = sectionEndIx;
+	    }
+
 	/* Save info on existing block. */
 	struct bbiBoundsArray *b = &bounds[sectionIx];
-	b->offset = blockOffset;
+	b->offset = blockStartOffset;
 	b->range.chromIx = chromId;
 	b->range.start = startPos;
 	b->range.end = endPos;
 	++sectionIx;
 	itemIx = 0;
 
 	if (atEnd)
 	    break;
 	}
 
     /* Advance to next chromosome if need be and get chromosome id. */
     if (!sameChrom)
         {
 	usage = usage->next;
 	assert(usage != NULL);
 	assert(sameString(chrom, usage->name));
 	for (resTry = 0; resTry < resTryCount; ++resTry)
 	    resEnds[resTry] = 0;
 	}
     chromId = usage->id;
 
     /* At start of block we save a lot of info. */
     if (itemIx == 0)
         {
-	blockOffset = ftell(f);
+	blockStartOffset = ftell(f);
 	startPos = start;
 	endPos = end;
 	}
     /* Otherwise just update end. */
         {
 	if (endPos < end)
 	    endPos = end;
 	/* No need to update startPos since list is sorted. */
 	}
 
+    /* Save name into namedOffset if need be. */
+    if (namedChunks != NULL)
+        {
+	namedChunks[sectionEndIx].name = cloneString(bed->name);
+	sectionEndIx += 1;
+	}
+
     /* Write out data. */
     dyStringWriteOne(stream, chromId);
     dyStringWriteOne(stream, start);
     dyStringWriteOne(stream, end);
     if (fieldCount > 3)
         {
 	int i;
 	/* Write 3rd through next to last field and a tab separator. */
 	for (i=3; i<lastField; ++i)
 	    {
 	    char *s = row[i];
 	    dyStringAppend(stream, s);
 	    dyStringAppendC(stream, '\t');
 	    }
 	/* Write last field and terminal zero */
@@ -431,57 +495,59 @@
     indexOffset, f);
 
 freez(&boundsArray);
 slReverse(&twiceReducedList);
 return twiceReducedList;
 }
 
 
 void bbFileCreate(
 	char *inName, 	  /* Input file in a tabular bed format <chrom><start><end> + whatever. */
 	char *chromSizes, /* Two column tab-separated file: <chromosome> <size>. */
 	int blockSize,	  /* Number of items to bundle in r-tree.  1024 is good. */
 	int itemsPerSlot, /* Number of items in lowest level of tree.  64 is good. */
 	char *asFileName, /* If non-null points to a .as file that describes fields. */
 	boolean doCompress, /* If TRUE then compress data. */
+	boolean doNameIndex, /* If TRUE then index names as well. */
 	char *outName)    /* BigBed output file name. */
 /* Convert tab-separated bed file to binary indexed, zoomed bigBed version. */
 {
 /* Set up timing measures. */
 verboseTimeInit();
 struct lineFile *lf = lineFileOpen(inName, TRUE);
 
 /* Load up as object if defined in file. */
 struct asObject *as = NULL;
 if (asFileName != NULL)
     {
     /* Parse it and do sanity check. */
     as = asParseFile(asFileName);
     if (as->next != NULL)
         errAbort("Can only handle .as files containing a single object.");
     }
 
 /* Load in chromosome sizes. */
 struct hash *chromSizesHash = bbiChromSizesFromFile(chromSizes);
 verbose(2, "Read %d chromosomes and sizes from %s\n",  chromSizesHash->elCount, chromSizes);
 
 /* Do first pass, mostly just scanning file and counting hits per chromosome. */
 int minDiff = 0;
 double aveSpan = 0;
 bits64 bedCount = 0;
 bits32 uncompressBufSize = 0;
-struct bbiChromUsage *usageList = bbiChromUsageFromBedFile(lf, chromSizesHash, &minDiff, &aveSpan, &bedCount);
+int maxNameSize = 0;
+struct bbiChromUsage *usageList = bbiChromUsageFromBedFile(lf, chromSizesHash, &minDiff, &aveSpan, &bedCount, &maxNameSize);
 verboseTime(1, "pass1 - making usageList (%d chroms)", slCount(usageList));
 verbose(2, "%d chroms in %s. Average span of beds %f\n", slCount(usageList), inName, aveSpan);
 
 /* Open output file and write dummy header. */
 FILE *f = mustOpen(outName, "wb");
 bbiWriteDummyHeader(f);
 bbiWriteDummyZooms(f);
 
 /* Write out asFile if any */
 bits64 asOffset = 0;
 if (asFileName != NULL)
     {
     int colCount = slCount(as->columnList);
     asOffset = ftell(f);
     FILE *asFile = mustOpen(asFileName, "r");
@@ -521,32 +587,35 @@
 	verbose(2, "resTryCount reduced from 10 to %d\n", resTryCount);
 	break;
 	}
     res *= resIncrement;
     }
 
 /* Write out primary full resolution data in sections, collect stats to use for reductions. */
 bits64 dataOffset = ftell(f);
 writeOne(f, bedCount);
 bits32 blockCount = bbiCountSectionsNeeded(usageList, itemsPerSlot);
 struct bbiBoundsArray *boundsArray;
 AllocArray(boundsArray, blockCount);
 lineFileRewind(lf);
 bits16 fieldCount=0;
 bits32 maxBlockSize = 0;
+struct bbNamedFileChunk *namedChunks = NULL;
+if (doNameIndex)
+    AllocArray(namedChunks, bedCount);
 writeBlocks(usageList, lf, as, itemsPerSlot, boundsArray, blockCount, doCompress,
-	f, resTryCount, resScales, resSizes, &fieldCount, &maxBlockSize);
+	f, resTryCount, resScales, resSizes, namedChunks, bedCount, &fieldCount, &maxBlockSize);
 verboseTime(1, "pass2 - checking and writing primary data (%lld records, %d fields)", 
 	(long long)bedCount, fieldCount);
 
 /* Write out primary data index. */
 bits64 indexOffset = ftell(f);
 cirTreeFileBulkIndexToOpenFile(boundsArray, sizeof(boundsArray[0]), blockCount,
     blockSize, 1, NULL, bbiBoundsArrayFetchKey, bbiBoundsArrayFetchOffset, 
     indexOffset, f);
 freez(&boundsArray);
 verboseTime(1, "index write");
 
 /* Declare arrays and vars that track the zoom levels we actually output. */
 bits32 zoomAmounts[bbiMaxZoomLevels];
 bits64 zoomDataOffsets[bbiMaxZoomLevels];
 bits64 zoomIndexOffsets[bbiMaxZoomLevels];
@@ -598,64 +667,77 @@
 	        break;
 	    zoomCount = rezoomCount;
 	    zoomDataOffsets[zoomLevels] = ftell(f);
 	    zoomIndexOffsets[zoomLevels] = bbiWriteSummaryAndIndex(rezoomedList, 
 	    	blockSize, itemsPerSlot, doCompress, f);
 	    zoomAmounts[zoomLevels] = reduction;
 	    ++zoomLevels;
 	    reduction *= zoomIncrement;
 	    rezoomedList = bbiSummarySimpleReduce(rezoomedList, reduction, lm);
 	    }
 	lmCleanup(&lm);
 	verboseTime(1, "further reductions");
 	}
     }
 
+/* Write out name index if need be. */
+bits64 nameIndexOffset = 0;
+if (doNameIndex)
+    {
+    qsort(namedChunks, bedCount, sizeof(namedChunks[0]),  bbNamedFileChunkCmpByName);
+    nameIndexOffset = ftell(f);
+    maxBedNameSize = maxNameSize;
+    bptFileBulkIndexToOpenFile(namedChunks, sizeof(namedChunks[0]), bedCount,
+        blockSize, bbNamedFileChunkKey, maxNameSize, bbNamedFileChunkVal, 
+	sizeof(bits64) + sizeof(bits64), f);
+    verboseTime(1, "Sorting and writing name index");
+    }
+
 /* Figure out buffer size needed for uncompression if need be. */
 if (doCompress)
     {
     int maxZoomUncompSize = itemsPerSlot * sizeof(struct bbiSummaryOnDisk);
     uncompressBufSize = max(maxBlockSize, maxZoomUncompSize);
     }
 
 /* Go back and rewrite header. */
 rewind(f);
 bits32 sig = bigBedSig;
 bits16 version = bbiCurrentVersion;
 bits16 summaryCount = zoomLevels;
 bits32 reserved32 = 0;
 bits64 reserved64 = 0;
 
 bits16 definedFieldCount = bedN;
 
 /* Write fixed header */
 writeOne(f, sig);
 writeOne(f, version);
 writeOne(f, summaryCount);
 writeOne(f, chromTreeOffset);
 writeOne(f, dataOffset);
 writeOne(f, indexOffset);
 writeOne(f, fieldCount);
 writeOne(f, definedFieldCount);
 writeOne(f, asOffset);
 writeOne(f, totalSummaryOffset);
 writeOne(f, uncompressBufSize);
-int i;
-for (i=0; i<2; ++i)
-    writeOne(f, reserved32);
+writeOne(f, nameIndexOffset);
+assert(ftell(f) == 64);
 
 /* Write summary headers with data. */
+int i;
 verbose(2, "Writing %d levels of zoom\n", zoomLevels);
 for (i=0; i<zoomLevels; ++i)
     {
     verbose(3, "zoomAmounts[%d] = %d\n", i, (int)zoomAmounts[i]);
     writeOne(f, zoomAmounts[i]);
     writeOne(f, reserved32);
     writeOne(f, zoomDataOffsets[i]);
     writeOne(f, zoomIndexOffsets[i]);
     }
 /* Write rest of summary headers with no data. */
 for (i=zoomLevels; i<bbiMaxZoomLevels; ++i)
     {
     writeOne(f, reserved32);
     writeOne(f, reserved32);
     writeOne(f, reserved64);
@@ -671,41 +753,42 @@
 writeOne(f, sig);
 
 
 /* Clean up. */
 lineFileClose(&lf);
 carefulClose(&f);
 freeHash(&chromSizesHash);
 bbiChromUsageFreeList(&usageList);
 asObjectFreeList(&as);
 }
 
 void bedToBigBed(char *inName, char *chromSizes, char *outName)
 /* bedToBigBed - Convert bed file to bigBed.. */
 {
 bbFileCreate(inName, chromSizes, blockSize, itemsPerSlot, as, 
-	doCompress, outName);
+	doCompress, doNameIndex, outName);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 blockSize = optionInt("blockSize", blockSize);
 itemsPerSlot = optionInt("itemsPerSlot", itemsPerSlot);
 as = optionVal("as", as);
 doCompress = !optionExists("unc");
+doNameIndex = optionExists("nameIndex");
 tabSep = optionExists("tab");
 if (argc != 4)
     usage();
 if (optionExists("type"))
     {
     // parse type
     char *btype = cloneString(optionVal("type", ""));
     char *plus = strchr(btype, '+');
     if (plus)
 	{
 	*plus++ = 0;
 	if (isdigit(*plus))
 	    bedP = sqlUnsigned(plus);
 	else
 	    bedP = -1;