cfa3310ec7d0c3ef45a3f647c7e7164453d9d4e0 kent Tue Mar 5 01:01:42 2013 -0800 A little more progress on multiple extra index change. First time reader and writer have both worked in any small way together.
diff --git src/utils/bedToBigBed/bedToBigBed.c src/utils/bedToBigBed/bedToBigBed.c
index ef4e30a..ce17c3a 100644
--- src/utils/bedToBigBed/bedToBigBed.c
+++ src/utils/bedToBigBed/bedToBigBed.c
@@ -4,35 +4,37 @@
 #include "hash.h"
 #include "options.h"
 #include "dystring.h"
 #include "obscure.h"
 #include "asParse.h"
 #include "basicBed.h"
 #include "sig.h"
 #include "rangeTree.h"
 #include "zlibFace.h"
 #include "sqlNum.h"
 #include "bPlusTree.h"
 #include "bigBed.h"
 
 char *version = "2.4";
 
+/* Things set directly or indirectly by command line in main() routine. */
 int blockSize = 256;
 int itemsPerSlot = 512;
 char *extraIndex = NULL;
 int bedN = 0;   /* number of standard bed fields */
 int bedP = 0;   /* number of bed plus fields */
+char *asFile = NULL;
 char *asText = NULL;
 static boolean doCompress = FALSE;
 static boolean tabSep = FALSE;
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "bedToBigBed v. %s - Convert bed file to bigBed. (BigBed version: %d)\n"
   "usage:\n"
   "   bedToBigBed in.bed chrom.sizes out.bb\n"
   "Where in.bed is in one of the ascii bed formats, but not including track lines\n"
   "and chrom.sizes is two column: <chromosome name> <size in bases>\n"
   "and out.bb is the output indexed big bed file.\n"
   "Use the script: fetchChromSizes to obtain the actual chrom.sizes information\n"
@@ -134,31 +136,31 @@
 static void writeBlocks(struct bbiChromUsage *usageList, struct lineFile *lf, struct asObject *as,
         int itemsPerSlot, struct bbiBoundsArray *bounds, int sectionCount, boolean doCompress, FILE *f,
         int resTryCount, int resScales[], int resSizes[],
         struct bbExIndexMaker *eim, int bedCount,
         bits16 fieldCount, bits32 *retMaxBlockSize)
 /* Read through lf, writing it in f.  Save starting points of blocks (every itemsPerSlot)
  * to boundsArray */
 {
 int maxBlockSize = 0;
 struct bbiChromUsage *usage = usageList;
 char *line, *row[fieldCount+1];
 int lastField = fieldCount-1;
 int itemIx = 0, sectionIx = 0;
-bits64 blockStartOffset = 0, blockEndOffset = 0;
+bits64 blockStartOffset = 0;
 int startPos = 0, endPos = 0;
 bits32 chromId = 0;
 struct dyString *stream = dyStringNew(0);
 
 /* Will keep track of some things that help us determine how much to reduce. */
 bits32 resEnds[resTryCount];
 int resTry;
 for (resTry = 0; resTry < resTryCount; ++resTry)
     resEnds[resTry] = 0;
 boolean atEnd = FALSE, sameChrom = FALSE;
 bits32 start = 0, end = 0;
 char *chrom = NULL;
 struct bed *bed;
 AllocVar(bed);
@@ -217,42 +219,33 @@
             // get new scratch area
             compBufSize = maxCompSize;
             compBuf = needLargeMem(compBufSize);
             }
         int compSize = zCompress(stream->string, stream->stringSize, compBuf, maxCompSize);
         mustWrite(f, compBuf, compSize);
         }
     else
         mustWrite(f, stream->string, stream->stringSize);
     dyStringClear(stream);
 
     /* Save block offset and size for all named chunks in this section. */
     if (eim != NULL)
         {
+        bits64 blockEndOffset = ftell(f);
         bbExIndexMakerAddOffsetSize(eim, blockStartOffset, blockEndOffset-blockStartOffset,
             sectionStartIx, sectionEndIx);
-#ifdef OLD
-        blockEndOffset = ftell(f);
-        int i;
-        for (i=sectionStartIx; i<sectionEndIx; ++i)
-            {
-            struct bbNamedFileChunk *chunk = &namedChunks[i];
-            chunk->offset = blockStartOffset;
-            chunk->size = blockEndOffset - blockStartOffset;
-            }
-#endif /* OLD */
         sectionStartIx = sectionEndIx;
         }
 
     /* Save info on existing block. */
     struct bbiBoundsArray *b = &bounds[sectionIx];
     b->offset = blockStartOffset;
     b->range.chromIx = chromId;
     b->range.start = startPos;
     b->range.end = endPos;
     ++sectionIx;
     itemIx = 0;
 
     if (atEnd)
         break;
     }
@@ -721,47 +714,33 @@
             break;
         zoomCount = rezoomCount;
         zoomDataOffsets[zoomLevels] = ftell(f);
         zoomIndexOffsets[zoomLevels] = bbiWriteSummaryAndIndex(rezoomedList,
             blockSize, itemsPerSlot, doCompress, f);
         zoomAmounts[zoomLevels] = reduction;
         ++zoomLevels;
         reduction *= zoomIncrement;
         rezoomedList = bbiSummarySimpleReduce(rezoomedList, reduction, lm);
         }
     lmCleanup(&lm);
     verboseTime(1, "further reductions");
     }
 }
 
-/* Write out name index if need be. */
-#ifdef SOON
-bits64 nameIndexOffset = 0;
-if (doNameIndex)
-    {
-    qsort(namedChunks, bedCount, sizeof(namedChunks[0]), bbNamedFileChunkCmpByName);
-    nameIndexOffset = ftell(f);
-    maxBedNameSize = maxNameSize;
-    bptFileBulkIndexToOpenFile(namedChunks, sizeof(namedChunks[0]), bedCount,
-        blockSize, bbNamedFileChunkKey, maxNameSize, bbNamedFileChunkVal,
-        sizeof(bits64) + sizeof(bits64), f);
-    verboseTime(1, "Sorting and writing name index");
-    }
-#endif /* SOON */
+/* Write out extra indexes if need be. */
 if (eim)
     {
-    warn("Oh dear, really don't know how to do this yet.");
     int i;
     for (i=0; i<eim->indexCount; ++i)
         {
         eim->fileOffsets[i] = ftell(f);
         maxBedNameSize = eim->maxFieldSize[i];
         assert(sizeof(struct bbNamedFileChunk) == sizeof(eim->chunkArrayArray[i][0]));
         bptFileBulkIndexToOpenFile(eim->chunkArrayArray[i], sizeof(eim->chunkArrayArray[i][0]), bedCount,
             blockSize, bbNamedFileChunkKey, maxBedNameSize, bbNamedFileChunkVal,
             sizeof(bits64) + sizeof(bits64), f);
         verboseTime(1, "Sorting and writing extra index %d", i);
         }
     }
 
 /* Figure out buffer size needed for uncompression if need be. */
 if (doCompress)
@@ -827,31 +806,32 @@
 repeatCharOut(f, 0, 52);    // reserved
 assert(ftell(f) - extHeaderOffset == extHeaderSize);
 
 /* Write extra index offsets if need be. */
 if (extraIndexCount != 0)
     {
     fseek(f, extraIndexListOffset, SEEK_SET);
     int i;
     for (i=0; i<extraIndexCount; ++i)
         {
         writeOne(f, eim->fileOffsets[i]);
+        repeatCharOut(f, 0, 4);    // reserved
 
         // Write out field list - easy this time because for now always only one field.
         bits16 fieldId = eim->indexFields[i];
         writeOne(f, fieldId);
         repeatCharOut(f, 0, 2);    // reserved
         }
     assert(ftell(f) == extraIndexListEndOffset);
     }
 
 /* Write end signature. */
 fseek(f, 0L, SEEK_END);
 writeOne(f, sig);
 
 /* Clean up. */
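
Taken together, the hunks above are the writer half of the multiple-extra-index change: while blocks are written, each item to be indexed gets a record of its key plus the offset and size of the compressed block that ends up holding it (that is what the new blockEndOffset/bbExIndexMakerAddOffsetSize call records), and once all blocks are out those records are handed, key-sorted, to bptFileBulkIndexToOpenFile to append a B+ tree to the file, just as the retired SOON block did with an explicit qsort. As a standalone illustration only, with invented names rather than the real bbExIndexMaker structures, the bookkeeping amounts to something like:

/* All names here are invented for this sketch; they are not the kent structures. */
#include <stdlib.h>
#include <string.h>

struct namedChunk              /* one record per item in an extra index */
    {
    char key[64];              /* value of the indexed field, e.g. the bed name */
    unsigned long long offset; /* file offset of the block holding the item */
    unsigned long long size;   /* on-disk size of that (possibly compressed) block */
    };

static int namedChunkCmpByKey(const void *va, const void *vb)
/* qsort comparator: order records by key so an index can be bulk-built from them. */
{
const struct namedChunk *a = va, *b = vb;
return strcmp(a->key, b->key);
}

static void addOffsetSize(struct namedChunk *chunks, int startIx, int endIx,
                          unsigned long long blockStart, unsigned long long blockEnd)
/* After a block is flushed, stamp its offset and size onto every record collected
 * for the items written into that block. */
{
int i;
for (i = startIx; i < endIx; ++i)
    {
    chunks[i].offset = blockStart;
    chunks[i].size = blockEnd - blockStart;
    }
}

static void sortForIndexing(struct namedChunk *chunks, int count)
/* Sort all records by key before handing them to the bulk index builder. */
{
qsort(chunks, count, sizeof(chunks[0]), namedChunkCmpByKey);
}

The reason whole-block offsets are stamped rather than per-item offsets is that the block is the unit of compression, so the index only needs to say which block to fetch and how many bytes it occupies on disk.
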
@@ -866,31 +846,31 @@
 /* bedToBigBed - Convert bed file to bigBed.. */
 {
 struct slName *extraIndexList = slNameListFromString(extraIndex, ',');
 struct asObject *as = asParseText(asText);
 asCompareObjAgainstStandardBed(as, bedN, TRUE); // abort if bedN columns are not standard
 bbFileCreate(inName, chromSizes, blockSize, itemsPerSlot, asText, as, doCompress,
     extraIndexList, outName);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 blockSize = optionInt("blockSize", blockSize);
 itemsPerSlot = optionInt("itemsPerSlot", itemsPerSlot);
-asText = optionVal("as", asText);
+asFile = optionVal("as", asFile);
 doCompress = !optionExists("unc");
 extraIndex = optionVal("extraIndex", NULL);
 tabSep = optionExists("tab");
 if (argc != 4)
     usage();
 if (optionExists("type"))
     {
     // parse type
     char *btype = cloneString(optionVal("type", ""));
     char *plus = strchr(btype, '+');
     if (plus)
         {
         *plus++ = 0;
         if (isdigit(*plus))
             bedP = sqlUnsigned(plus);
@@ -929,24 +909,26 @@
     if (fieldCount > 256)
         errAbort("Too many columns in %s, you sure it's a bed file?", lf->fileName);
     lineFileClose(&lf);
 
     /* Set up so that it looks like we are straight up bed for that many fields,
      * or if more than our maximum defined fields, then for bed15+ */
     bedN = fieldCount;
     if (bedN > bedKnownFields)
         {
         bedP = bedN - bedKnownFields;
         bedN = bedKnownFields;
         }
     }
 
 /* Make sure that fields are defined, from bed spec if nowhere else. */
-if (asText == NULL)
+if (asFile)
+    readInGulp(asFile, &asText, NULL);
+else
     asText = bedAsDef(bedN, bedN + bedP);
 bedToBigBed(bedFileName, argv[2], argv[3]);
 optionFree();
 if (verboseLevel() > 1)
     printVmPeak();
 return 0;
 }
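
The commit message notes that the reader half now works as well. Conceptually a lookup against one of these extra indexes is: search the index by key, get back the stored (offset, size) pair, seek to that offset, read and uncompress that one block, then scan the block for the matching item. The real reader walks the on-disk B+ tree written by bptFileBulkIndexToOpenFile; the minimal stand-in below substitutes an in-memory key-sorted array (same invented record shape as the sketch above) purely to show the flow.

/* Invented names again; illustration only, not the kent reader code. */
#include <stdlib.h>
#include <string.h>

struct namedChunk
    {
    char key[64];
    unsigned long long offset;
    unsigned long long size;
    };

static int keyToChunkCmp(const void *vkey, const void *vel)
/* bsearch comparator: bare key string vs. a record. */
{
const char *key = vkey;
const struct namedChunk *el = vel;
return strcmp(key, el->key);
}

static const struct namedChunk *findNamedChunk(const struct namedChunk *chunks,
                                               int count, const char *name)
/* Return the record for name, or NULL if it is not indexed.  A caller would then
 * seek to ->offset, read ->size bytes, uncompress if the file is compressed, and
 * scan that single block for items whose indexed field equals name. */
{
return bsearch(name, chunks, count, sizeof(chunks[0]), keyToChunkCmp);
}

With that in place a name query can jump straight to the one block that holds the item instead of scanning the whole bigBed.
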