src/utils/bedToBigBed/bedToBigBed.c cfa3310ec7d0c3ef45a3f647c7e7164453d9d4e0

cfa3310ec7d0c3ef45a3f647c7e7164453d9d4e0
kent
  Tue Mar 5 01:01:42 2013 -0800
A little more progress on multiple extra index change. First time reader and writer have both worked in any small way together.
diff --git src/utils/bedToBigBed/bedToBigBed.c src/utils/bedToBigBed/bedToBigBed.c
index ef4e30a..ce17c3a 100644
--- src/utils/bedToBigBed/bedToBigBed.c
+++ src/utils/bedToBigBed/bedToBigBed.c
@@ -4,35 +4,37 @@
 #include "hash.h"
 #include "options.h"
 #include "dystring.h"
 #include "obscure.h"
 #include "asParse.h"
 #include "basicBed.h"
 #include "sig.h"
 #include "rangeTree.h"
 #include "zlibFace.h"
 #include "sqlNum.h"
 #include "bPlusTree.h"
 #include "bigBed.h"
 
 char *version = "2.4";
 
+/* Things set directly or indirectly by command line in main() routine. */
 int blockSize = 256;
 int itemsPerSlot = 512;
 char *extraIndex = NULL;
 int bedN = 0;   /* number of standard bed fields */
 int bedP = 0;   /* number of bed plus fields */
+char *asFile = NULL;
 char *asText = NULL;
 static boolean doCompress = FALSE;
 static boolean tabSep = FALSE;
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "bedToBigBed v. %s - Convert bed file to bigBed. (BigBed version: %d)\n"
   "usage:\n"
   "   bedToBigBed in.bed chrom.sizes out.bb\n"
   "Where in.bed is in one of the ascii bed formats, but not including track lines\n"
   "and chrom.sizes is two column: <chromosome name> <size in bases>\n"
   "and out.bb is the output indexed big bed file.\n"
   "Use the script: fetchChromSizes to obtain the actual chrom.sizes information\n"
@@ -134,31 +136,31 @@
 
 static void writeBlocks(struct bbiChromUsage *usageList, struct lineFile *lf, struct asObject *as, 
 	int itemsPerSlot, struct bbiBoundsArray *bounds, 
 	int sectionCount, boolean doCompress, FILE *f, 
 	int resTryCount, int resScales[], int resSizes[], 
 	struct bbExIndexMaker *eim,  int bedCount,
 	bits16 fieldCount, bits32 *retMaxBlockSize)
 /* Read through lf, writing it in f.  Save starting points of blocks (every itemsPerSlot)
  * to boundsArray */
 {
 int maxBlockSize = 0;
 struct bbiChromUsage *usage = usageList;
 char *line, *row[fieldCount+1];
 int lastField = fieldCount-1;
 int itemIx = 0, sectionIx = 0;
-bits64 blockStartOffset = 0, blockEndOffset = 0;
+bits64 blockStartOffset = 0;
 int startPos = 0, endPos = 0;
 bits32 chromId = 0;
 struct dyString *stream = dyStringNew(0);
 
 /* Will keep track of some things that help us determine how much to reduce. */
 bits32 resEnds[resTryCount];
 int resTry;
 for (resTry = 0; resTry < resTryCount; ++resTry)
     resEnds[resTry] = 0;
 boolean atEnd = FALSE, sameChrom = FALSE;
 bits32 start = 0, end = 0;
 char *chrom = NULL;
 struct bed *bed;
 AllocVar(bed);
 
@@ -217,42 +219,33 @@
                 // get new scratch area
                 compBufSize = maxCompSize;
                 compBuf = needLargeMem(compBufSize);
                 }
 
 	    int compSize = zCompress(stream->string, stream->stringSize, compBuf, maxCompSize);
 	    mustWrite(f, compBuf, compSize);
 	    }
 	else
 	    mustWrite(f, stream->string, stream->stringSize);
 	dyStringClear(stream);
 
 	/* Save block offset and size for all named chunks in this section. */
 	if (eim != NULL)
 	    {
+	    bits64 blockEndOffset = ftell(f);
 	    bbExIndexMakerAddOffsetSize(eim, blockStartOffset, blockEndOffset-blockStartOffset,
 		sectionStartIx, sectionEndIx);
-#ifdef OLD
-	    blockEndOffset = ftell(f);
-	    int i;
-	    for (i=sectionStartIx; i<sectionEndIx; ++i)
-	        {
-		struct bbNamedFileChunk *chunk = namedChunks + i;
-		chunk->offset = blockStartOffset;
-		chunk->size = blockEndOffset - blockStartOffset;
-		}
-#endif /* OLD */
 	    sectionStartIx = sectionEndIx;
 	    }
 
 	/* Save info on existing block. */
 	struct bbiBoundsArray *b = &bounds[sectionIx];
 	b->offset = blockStartOffset;
 	b->range.chromIx = chromId;
 	b->range.start = startPos;
 	b->range.end = endPos;
 	++sectionIx;
 	itemIx = 0;
 
 	if (atEnd)
 	    break;
 	}
@@ -721,47 +714,33 @@
 	        break;
 	    zoomCount = rezoomCount;
 	    zoomDataOffsets[zoomLevels] = ftell(f);
 	    zoomIndexOffsets[zoomLevels] = bbiWriteSummaryAndIndex(rezoomedList, 
 	    	blockSize, itemsPerSlot, doCompress, f);
 	    zoomAmounts[zoomLevels] = reduction;
 	    ++zoomLevels;
 	    reduction *= zoomIncrement;
 	    rezoomedList = bbiSummarySimpleReduce(rezoomedList, reduction, lm);
 	    }
 	lmCleanup(&lm);
 	verboseTime(1, "further reductions");
 	}
     }
 
-/* Write out name index if need be. */
-#ifdef SOON
-bits64 nameIndexOffset = 0;
-if (doNameIndex)
-    {
-    qsort(namedChunks, bedCount, sizeof(namedChunks[0]),  bbNamedFileChunkCmpByName);
-    nameIndexOffset = ftell(f);
-    maxBedNameSize = maxNameSize;
-    bptFileBulkIndexToOpenFile(namedChunks, sizeof(namedChunks[0]), bedCount,
-        blockSize, bbNamedFileChunkKey, maxNameSize, bbNamedFileChunkVal, 
-	sizeof(bits64) + sizeof(bits64), f);
-    verboseTime(1, "Sorting and writing name index");
-    }
-#endif /* SOON */
+/* Write out extra indexes if need be. */
 if (eim)
     {
-    warn("Oh dear, really don't know how to do this yet.");
     int i;
     for (i=0; i<eim->indexCount; ++i)
         {
 	eim->fileOffsets[i] = ftell(f);
 	maxBedNameSize = eim->maxFieldSize[i];
 	assert(sizeof(struct bbNamedFileChunk) == sizeof(eim->chunkArrayArray[i][0]));
 	bptFileBulkIndexToOpenFile(eim->chunkArrayArray[i], sizeof(eim->chunkArrayArray[i][0]), 
 	    bedCount, blockSize, bbNamedFileChunkKey, maxBedNameSize, bbNamedFileChunkVal, 
 	    sizeof(bits64) + sizeof(bits64), f);
 	verboseTime(1, "Sorting and writing extra index %d", i);
 	}
     }
 
 /* Figure out buffer size needed for uncompression if need be. */
 if (doCompress)
@@ -827,31 +806,32 @@
 repeatCharOut(f, 0, 52);    // reserved
 assert(ftell(f) - extHeaderOffset == extHeaderSize);
 
 /* Write extra index offsets if need be. */
 if (extraIndexCount != 0)
     {
     fseek(f, extraIndexListOffset, SEEK_SET);
     int i;
     for (i=0; i<extraIndexCount; ++i)
         {
 	// Write out fixed part of index info
 	bits16 type = 0;    // bPlusTree type
 	bits16 indexFieldCount = 1;
 	writeOne(f, type);
 	writeOne(f, indexFieldCount);
-	repeatCharOut(f, 0, 12);  // reserved
+	writeOne(f, eim->fileOffsets[i]);
+	repeatCharOut(f, 0, 4);  // reserved
 
 	// Write out field list - easy this time because for now always only one field.
 	bits16 fieldId = eim->indexFields[i];
 	writeOne(f, fieldId);
 	repeatCharOut(f, 0, 2); // reserved
 	}
     assert(ftell(f) == extraIndexListEndOffset);
     }
 
 /* Write end signature. */
 fseek(f, 0L, SEEK_END);
 writeOne(f, sig);
 
 
 /* Clean up. */
@@ -866,31 +846,31 @@
 /* bedToBigBed - Convert bed file to bigBed.. */
 {
 struct slName *extraIndexList = slNameListFromString(extraIndex, ',');
 struct asObject *as = asParseText(asText);
 asCompareObjAgainstStandardBed(as, bedN, TRUE); // abort if bedN columns are not standard
 bbFileCreate(inName, chromSizes, blockSize, itemsPerSlot, asText, as, 
 	doCompress, extraIndexList, outName);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 blockSize = optionInt("blockSize", blockSize);
 itemsPerSlot = optionInt("itemsPerSlot", itemsPerSlot);
-asText = optionVal("as", asText);
+asFile = optionVal("as", asFile);
 doCompress = !optionExists("unc");
 extraIndex = optionVal("extraIndex", NULL);
 tabSep = optionExists("tab");
 if (argc != 4)
     usage();
 if (optionExists("type"))
     {
     // parse type
     char *btype = cloneString(optionVal("type", ""));
     char *plus = strchr(btype, '+');
     if (plus)
 	{
 	*plus++ = 0;
 	if (isdigit(*plus))
 	    bedP = sqlUnsigned(plus);
@@ -929,24 +909,26 @@
     if (fieldCount > 256)
         errAbort("Too many columns in %s, you sure it's a bed file?", lf->fileName);
     lineFileClose(&lf);
 
     /* Set up so that it looks like we are straight up bed for that many fields,
      * or if more than or maximum defined fields, then for bed15+ */
     bedN = fieldCount;
     if (bedN > bedKnownFields)
         {
 	bedP = bedN - bedKnownFields;
 	bedN = bedKnownFields;
 	}
     }
    
 /* Make sure that fields are defined, from bed spec if nowhere else. */
-if (asText == NULL)
+if (asFile)
+    readInGulp(asFile, &asText, NULL);
+else
     asText = bedAsDef(bedN,  bedN + bedP);
 
 bedToBigBed(bedFileName, argv[2], argv[3]);
 optionFree();
 if (verboseLevel() > 1)
     printVmPeak();
 return 0;
 }