69087d3a65af31c39337085920e99e5b2db13082 galt Fri Jun 17 15:05:28 2022 -0700

    Ran the dbSNP pipeline designed by Angie for dbSNP v155. It produces huge
    bigBed output, and I found and fixed a problem it triggered in bedToBigBed.
    I also tweaked dbSnpJsonToTab to handle dbSNP data that has multiple study
    subversions, ignoring the old datasets and using only the latest one. Added
    a track description page that has lots of content and counts to update.
    dbSnp155 is ready for QA on hgwdev. refs #rm27751

diff --git src/utils/bedToBigBed/bedToBigBed.c src/utils/bedToBigBed/bedToBigBed.c
index 723ab13..13a9ac1 100644
--- src/utils/bedToBigBed/bedToBigBed.c
+++ src/utils/bedToBigBed/bedToBigBed.c
@@ -1,918 +1,918 @@
/* bedToBigBed - Convert bed to bigBed. */

/* Copyright (C) 2014 The Regents of the University of California
 * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "options.h"
#include "dystring.h"
#include "obscure.h"
#include "asParse.h"
#include "basicBed.h"
#include "memalloc.h"
#include "sig.h"
#include "rangeTree.h"
#include "zlibFace.h"
#include "sqlNum.h"
#include "bPlusTree.h"
#include "bigBed.h"
#include "bbiAlias.h"
#include "twoBit.h"

char *version = "2.9";   // when changing, change in bedToBigBed, bedGraphToBigWig, and wigToBigWig
/* Version history from 2.6 on at least -
 *  2.9 - ability to specify chromAlias bigBed as chromSizes file
 *  2.8 - various changes where developer didn't increment version id
 *  2.7 - added check for duplicate field names in asParse.c
 *  2.6 - made it not crash on empty input.
 */

/* Things set directly or indirectly by command line in main() routine. */
int blockSize = 256;
int itemsPerSlot = 512;
char *extraIndex = NULL;
int bedN = 0;   /* number of standard bed fields */
int bedP = 0;   /* number of bed plus fields */
char *asFile = NULL;
char *asText = NULL;
char *udcDir = NULL;
static boolean doCompress = FALSE;
static boolean tabSep = FALSE;
static boolean sizesIs2Bit = FALSE;
static boolean sizesIsBb = FALSE;
static boolean allow1bpOverlap = FALSE;

void usage()
/* Explain usage and exit. */
{
errAbort(
  "bedToBigBed v. %s - Convert bed file to bigBed. (bbi version: %d)\n"
  "usage:\n"
  "   bedToBigBed in.bed chrom.sizes out.bb\n"
  "Where in.bed is in one of the ascii bed formats, but not including track lines\n"
  "and chrom.sizes is a two-column file/URL: <chromosome name> <size in bases>\n"
  "and out.bb is the output indexed big bed file.\n"
  "If the assembly <db> is hosted by UCSC, chrom.sizes can be a URL like\n"
  "  http://hgdownload.soe.ucsc.edu/goldenPath/<db>/bigZips/<db>.chrom.sizes\n"
  "or you may use the script fetchChromSizes to download the chrom.sizes file.\n"
  "If you have bed annotations on patch sequences from NCBI, a more inclusive\n"
  "chrom.sizes file can be found using a URL like\n"
  "  http://hgdownload.soe.ucsc.edu/goldenPath/<db>/database/chromInfo.txt.gz\n"
  "If not hosted by UCSC, a chrom.sizes file can be generated by running\n"
  "twoBitInfo on the assembly .2bit file.\n"
  "The in.bed file must be sorted by chromosome,start;\n"
  "  to sort a bed file, use the unix sort command:\n"
  "     sort -k1,1 -k2,2n unsorted.bed > sorted.bed\n"
  "Sorting must be set to skip Unicode mapping (LC_COLLATE=C).\n"
  "\n"
  "options:\n"
  "   -type=bedN[+[P]] :\n"
  "      N is between 3 and 15,\n"
  "      optional (+) if extra \"bedPlus\" fields,\n"
  "      optional P specifies the number of extra fields. Not required, but preferred.\n"
  "      Examples: -type=bed6 or -type=bed6+ or -type=bed6+3\n"
  "      (see http://genome.ucsc.edu/FAQ/FAQformat.html#format1)\n"
  "   -as=fields.as - If you have non-standard \"bedPlus\" fields, it's great to put a definition\n"
  "                   of each field in a row in AutoSql format here.\n"
  "   -blockSize=N - Number of items to bundle in r-tree. Default %d\n"
  "   -itemsPerSlot=N - Number of data points bundled at lowest level. Default %d\n"
  "   -unc - If set, do not use compression.\n"
  "   -tab - If set, expect fields to be tab separated, normally\n"
  "          expects white space separator.\n"
  "   -extraIndex=fieldList - If set, make an index on each field in a comma separated list\n"
  "          extraIndex=name and extraIndex=name,id are commonly used.\n"
  "   -sizesIs2Bit -- If set, the chrom.sizes file is assumed to be a 2bit file.\n"
  "   -sizesIsBb -- If set, the chrom.sizes file is assumed to be a bigBed file.\n"
  "   -udcDir=/path/to/udcCacheDir -- sets the UDC cache dir for caching of remote files.\n"
  "   -allow1bpOverlap -- allow exons to overlap by at most one base pair\n"
  "   -maxAlloc=N -- Set the maximum memory allocation size to N bytes\n"
  , version, bbiCurrentVersion, blockSize, itemsPerSlot
  );
}

static struct optionSpec options[] = {
   {"blockSize", OPTION_INT},
   {"itemsPerSlot", OPTION_INT},
   {"type", OPTION_STRING},
   {"as", OPTION_STRING},
   {"unc", OPTION_BOOLEAN},
   {"tab", OPTION_BOOLEAN},
   {"sizesIs2Bit", OPTION_BOOLEAN},
   {"sizesIsBb", OPTION_BOOLEAN},
   {"extraIndex", OPTION_STRING},
   {"udcDir", OPTION_STRING},
   {"allow1bpOverlap", OPTION_BOOLEAN},
   {"maxAlloc", OPTION_LONG_LONG},
   {NULL, 0},
};

int bbNamedFileChunkCmpByName(const void *va, const void *vb)
/* Compare two named offset objects to facilitate qsorting by name. */
{
const struct bbNamedFileChunk *a = va, *b = vb;
return strcmp(a->name, b->name);
}

static int maxBedNameSize;

void bbNamedFileChunkKey(const void *va, char *keyBuf)
/* Copy name to keyBuf for bPlusTree maker */
{
const struct bbNamedFileChunk *item = va;
strncpy(keyBuf, item->name, maxBedNameSize);
}

void *bbNamedFileChunkVal(const void *va)
/* Return pointer to val for bPlusTree maker. */
{
const struct bbNamedFileChunk *item = va;
return (void *)&item->offset;
}

void bbExIndexMakerAddKeysFromRow(struct bbExIndexMaker *eim, char **row, int recordIx)
/* Save the keys that are being indexed by row in eim. */
{
int i;
for (i=0; i < eim->indexCount; ++i)
    {
    int rowIx = eim->indexFields[i];
    eim->chunkArrayArray[i][recordIx].name = cloneString(row[rowIx]);
    }
}

void bbExIndexMakerAddOffsetSize(struct bbExIndexMaker *eim, bits64 offset, bits64 size,
    long startIx, long endIx)
/* Update offset and size fields of all file chunks between startIx and endIx */
{
int i;
for (i=0; i < eim->indexCount; ++i)
    {
    struct bbNamedFileChunk *chunks = eim->chunkArrayArray[i];
    long j;
    for (j = startIx; j < endIx; ++j)
        {
        struct bbNamedFileChunk *chunk = chunks + j;
        chunk->offset = offset;
        chunk->size = size;
        }
    }
}
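/* A standalone sketch (compile separately) of the idea behind the extra-index
 * helpers above: each record to be indexed is keyed by a name (for dbSNP, an
 * rsID) and points at the (offset, size) of the file block holding it, and
 * lookup works against the name-sorted array that later feeds the B+ tree
 * writer.  The struct below is a simplified stand-in for bbNamedFileChunk,
 * which is declared in bigBed.h. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct namedChunk                     /* simplified stand-in for bbNamedFileChunk */
    {
    const char *name;                 /* key the user searches on */
    unsigned long long offset, size;  /* where the record's block lives in the file */
    };

static int namedChunkCmpByName(const void *va, const void *vb)
/* Comparator with the same shape as bbNamedFileChunkCmpByName above. */
{
const struct namedChunk *a = va, *b = vb;
return strcmp(a->name, b->name);
}

int main(void)
{
struct namedChunk chunks[] =
    { {"rs999", 4096, 512}, {"rs11", 0, 2048}, {"rs42", 2048, 2048} };
int n = sizeof(chunks)/sizeof(chunks[0]);
qsort(chunks, n, sizeof(chunks[0]), namedChunkCmpByName);
struct namedChunk key = {"rs42", 0, 0};
struct namedChunk *hit = bsearch(&key, chunks, n, sizeof(chunks[0]), namedChunkCmpByName);
if (hit != NULL)
    printf("%s -> offset %llu, size %llu\n", hit->name, hit->offset, hit->size);
return 0;
}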
static void writeBlocks(struct bbiChromUsage *usageList, struct lineFile *lf, struct asObject *as,
    int itemsPerSlot, struct bbiBoundsArray *bounds,
    int sectionCount, boolean doCompress, FILE *f,
    int resTryCount, int resScales[], int resSizes[],
    struct bbExIndexMaker *eim, int bedCount,
    bits16 fieldCount, bits32 *retMaxBlockSize)
/* Read through lf, writing it in f.  Save starting points of blocks (every itemsPerSlot)
 * to boundsArray */
{
int maxBlockSize = 0;
struct bbiChromUsage *usage = usageList;
char *line, *row[fieldCount+1];
int lastField = fieldCount-1;
int itemIx = 0, sectionIx = 0;
bits64 blockStartOffset = 0;
int startPos = 0, endPos = 0;
bits32 chromId = 0;
struct dyString *stream = dyStringNew(0);

/* Will keep track of some things that help us determine how much to reduce. */
bits32 resEnds[resTryCount];
int resTry;
for (resTry = 0; resTry < resTryCount; ++resTry)
    resEnds[resTry] = 0;
boolean atEnd = FALSE, sameChrom = FALSE;
bits32 start = 0, end = 0;
char *chrom = NULL;
struct bed *bed;
AllocVar(bed);

/* Help keep track of which beds are in current chunk so as to write out
 * namedChunks to eim if need be. */
long sectionStartIx = 0, sectionEndIx = 0;

for (;;)
    {
    /* Get next line of input if any. */
    if (lineFileNextReal(lf, &line))
        {
        /* Chop up line and make sure the word count is right. */
        int wordCount;
        if (tabSep)
            wordCount = chopTabs(line, row);
        else
            wordCount = chopLine(line, row);
        lineFileExpectWordsMesg(lf, fieldCount, wordCount, "If the input is a tab-sep file, do not forget to use the -tab option");
        loadAndValidateBedExt(row, bedN, fieldCount, lf, bed, as, FALSE, allow1bpOverlap);
        chrom = bed->chrom;
        start = bed->chromStart;
        end = bed->chromEnd;
        sameChrom = sameString(chrom, usage->name);
        }
    else   /* No next line */
        {
        atEnd = TRUE;
        }

    /* Check conditions that would end block and save block info and advance to next if need be. */
    if (atEnd || !sameChrom || itemIx >= itemsPerSlot)
        {
        /* Save stream to file, compressing if need be. */
        if (stream->stringSize > maxBlockSize)
            maxBlockSize = stream->stringSize;
        if (doCompress)
            {
            size_t maxCompSize = zCompBufSize(stream->stringSize);

            // keep around an area of scratch memory
            static int compBufSize = 0;
            static char *compBuf = NULL;
            // check to see if buffer needed for compression is big enough
            if (compBufSize < maxCompSize)
                {
                // free up the old not-big-enough piece
                freez(&compBuf);   // freez knows about NULL
                // get new scratch area
                compBufSize = maxCompSize;
                compBuf = needLargeMem(compBufSize);
                }
            int compSize = zCompress(stream->string, stream->stringSize, compBuf, maxCompSize);
            mustWrite(f, compBuf, compSize);
            }
        else
            mustWrite(f, stream->string, stream->stringSize);
        dyStringClear(stream);

        /* Save block offset and size for all named chunks in this section. */
        if (eim != NULL)
            {
            bits64 blockEndOffset = ftell(f);
            bbExIndexMakerAddOffsetSize(eim, blockStartOffset, blockEndOffset-blockStartOffset,
                sectionStartIx, sectionEndIx);
            sectionStartIx = sectionEndIx;
            }

        /* Save info on existing block. */
        struct bbiBoundsArray *b = &bounds[sectionIx];
        b->offset = blockStartOffset;
        b->range.chromIx = chromId;
        b->range.start = startPos;
        b->range.end = endPos;
        ++sectionIx;
        itemIx = 0;

        if (atEnd)
            break;
        }

    /* Advance to next chromosome if need be and get chromosome id. */
    if (!sameChrom)
        {
        usage = usage->next;
        assert(usage != NULL);
        assert(sameString(chrom, usage->name));
        for (resTry = 0; resTry < resTryCount; ++resTry)
            resEnds[resTry] = 0;
        }
    chromId = usage->id;

    /* At start of block we save a lot of info. */
    if (itemIx == 0)
        {
        blockStartOffset = ftell(f);
        startPos = start;
        endPos = end;
        }
    /* Otherwise just update end. */
        {
        if (endPos < end)
            endPos = end;
        /* No need to update startPos since list is sorted. */
        }

    /* Save name into namedOffset if need be. */
    if (eim != NULL)
        {
        bbExIndexMakerAddKeysFromRow(eim, row, sectionEndIx);
        sectionEndIx += 1;
        }

    /* Write out data. */
    dyStringWriteOne(stream, chromId);
    dyStringWriteOne(stream, start);
    dyStringWriteOne(stream, end);
    if (fieldCount > 3)
        {
        int i;
        /* Write 3rd through next to last field and a tab separator. */
        for (i=3; i<lastField; ++i)
            {
            char *s = row[i];
            dyStringAppend(stream, s);
            dyStringAppendC(stream, '\t');
            }
        /* Write last field and terminal zero */
        char *s = row[lastField];
        dyStringAppend(stream, s);
        }
    dyStringAppendC(stream, 0);

    itemIx += 1;

    /* Do zoom counting. */
    for (resTry = 0; resTry < resTryCount; ++resTry)
        {
        bits32 resEnd = resEnds[resTry];
-       if (start >= resEnd)
+       if (start >= resEnd && resEnd < usage->size)
            {
            resSizes[resTry] += 1;
            resEnds[resTry] = resEnd = start + resScales[resTry];
            }
        while (end > resEnd)
            {
            resSizes[resTry] += 1;
            resEnds[resTry] = resEnd = resEnd + resScales[resTry];
            }
        }
    }
assert(sectionIx == sectionCount);
freez(&bed);
*retMaxBlockSize = maxBlockSize;
}
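/* A standalone sketch (compile separately) of the one-line fix above, under my
 * reading of it: the item at position chromSize models a zero-length insertion
 * at the very end of a chromosome, which dbSNP v155 contains.  Without the
 * "resEnd < chromSize" guard, this pass-1 counter opens a summary slot that
 * starts at the chromosome end, a slot the clamped reduction pass in
 * bedWriteReducedOnceReturnReducedTwice never writes, so the two passes
 * disagree about initialReductionCount. */
#include <stdio.h>

struct item { unsigned start, end; };   /* one sorted bed item */

static int countSlots(const struct item *items, int n, unsigned scale,
                      unsigned chromSize, int withGuard)
/* Replay the resEnds/resSizes bookkeeping from writeBlocks for one zoom level. */
{
unsigned resEnd = 0;
int slots = 0, i;
for (i = 0; i < n; ++i)
    {
    unsigned start = items[i].start, end = items[i].end;
    if (start >= resEnd && (!withGuard || resEnd < chromSize))
        {
        slots += 1;
        resEnd = start + scale;
        }
    while (end > resEnd)
        {
        slots += 1;
        resEnd = resEnd + scale;
        }
    }
return slots;
}

int main(void)
{
unsigned chromSize = 1000, scale = 400;
struct item items[] = { {0, 350}, {600, 900}, {1000, 1000} };  /* last: insertion at chrom end */
int n = sizeof(items)/sizeof(items[0]);
printf("unguarded: %d slots, guarded: %d slots\n",
       countSlots(items, n, scale, chromSize, 0),    /* prints 3 */
       countSlots(items, n, scale, chromSize, 1));   /* prints 2 */
return 0;
}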
struct rbTree *rangeTreeForBedChrom(struct lineFile *lf, char *chrom)
/* Read lines from bed file as long as they match chrom.  Return a rangeTree that
 * corresponds to the coverage. */
{
struct rbTree *tree = rangeTreeNew();
char *line;
while (lineFileNextReal(lf, &line))
    {
    if (!startsWithWord(chrom, line))
        {
        lineFileReuse(lf);
        break;
        }
    char *row[3];
    chopLine(line, row);
    unsigned start = sqlUnsigned(row[1]);
    unsigned end = sqlUnsigned(row[2]);
    rangeTreeAddToCoverageDepth(tree, start, end);
    }
return tree;
}

static struct bbiSummary *bedWriteReducedOnceReturnReducedTwice(struct bbiChromUsage *usageList,
    int fieldCount, struct lineFile *lf, bits32 initialReduction, bits32 initialReductionCount,
    int zoomIncrement, int blockSize, int itemsPerSlot, boolean doCompress,
    struct lm *lm, FILE *f, bits64 *retDataStart, bits64 *retIndexStart,
    struct bbiSummaryElement *totalSum)
/* Write out data reduced by factor of initialReduction.  Also calculate and keep in memory
 * next reduction level.  This is more work than some ways, but it keeps us from having to
 * keep the first reduction entirely in memory. */
{
struct bbiSummary *twiceReducedList = NULL;
bits32 doubleReductionSize = initialReduction * zoomIncrement;
struct bbiChromUsage *usage = usageList;
struct bbiBoundsArray *boundsArray, *boundsPt, *boundsEnd;
boundsPt = AllocArray(boundsArray, initialReductionCount);
boundsEnd = boundsPt + initialReductionCount;

*retDataStart = ftell(f);
writeOne(f, initialReductionCount);

/* This gets a little complicated I'm afraid.  The strategy is to:
 *   1) Build up a range tree that represents coverage depth on that chromosome
 *      This also has the nice side effect of getting rid of overlaps.
 *   2) Stream through the range tree, outputting the initial summary level and
 *      further reducing. */
boolean firstTime = TRUE;
struct bbiSumOutStream *stream = bbiSumOutStreamOpen(itemsPerSlot, f, doCompress);
for (usage = usageList; usage != NULL; usage = usage->next)
    {
    struct bbiSummary oneSummary, *sum = NULL;
    struct rbTree *rangeTree = rangeTreeForBedChrom(lf, usage->name);
    struct range *range, *rangeList = rangeTreeList(rangeTree);
    for (range = rangeList; range != NULL; range = range->next)
        {
        /* Grab values we want from range. */
        double val = ptToInt(range->val);
        int start = range->start;
        int end = range->end;
        bits32 size = end - start;

        // we want to make sure we count zero size elements
        if (size == 0)
            size = 1;

        /* Add to total summary. */
        if (firstTime)
            {
            totalSum->validCount = size;
            totalSum->minVal = totalSum->maxVal = val;
            totalSum->sumData = val*size;
            totalSum->sumSquares = val*val*size;
            firstTime = FALSE;
            }
        else
            {
            totalSum->validCount += size;
            if (val < totalSum->minVal) totalSum->minVal = val;
            if (val > totalSum->maxVal) totalSum->maxVal = val;
            totalSum->sumData += val*size;
            totalSum->sumSquares += val*val*size;
            }

        /* If start past existing block then output it. */
        if (sum != NULL && sum->end <= start && sum->end < usage->size)
            {
            bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize,
                &boundsPt, boundsEnd, lm, stream);
            sum = NULL;
            }

        /* If don't have a summary we're working on now, make one. */
        if (sum == NULL)
            {
            oneSummary.chromId = usage->id;
            oneSummary.start = start;
            oneSummary.end = start + initialReduction;
            if (oneSummary.end > usage->size)
                oneSummary.end = usage->size;
            oneSummary.minVal = oneSummary.maxVal = val;
            oneSummary.sumData = oneSummary.sumSquares = 0.0;
            oneSummary.validCount = 0;
            sum = &oneSummary;
            }

        /* Deal with case where might have to split an item between multiple summaries.  This
         * loop handles all but the final affected summary in that case. */
        while (end > sum->end)
            {
            /* Fold in bits that overlap with existing summary and output. */
            int overlap = rangeIntersection(start, end, sum->start, sum->end);
            assert(overlap > 0);
            verbose(3, "Splitting size %d at %d, overlap %d\n", end - start, sum->end, overlap);
            sum->validCount += overlap;
            if (sum->minVal > val) sum->minVal = val;
            if (sum->maxVal < val) sum->maxVal = val;
            sum->sumData += val * overlap;
            sum->sumSquares += val*val * overlap;
            bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize,
                &boundsPt, boundsEnd, lm, stream);
            size -= overlap;

            /* Move summary to next part. */
            sum->start = start = sum->end;
            sum->end = start + initialReduction;
            if (sum->end > usage->size)
                sum->end = usage->size;
            sum->minVal = sum->maxVal = val;
            sum->sumData = sum->sumSquares = 0.0;
            sum->validCount = 0;
            }

        /* Add to summary. */
        sum->validCount += size;
        if (sum->minVal > val) sum->minVal = val;
        if (sum->maxVal < val) sum->maxVal = val;
        sum->sumData += val * size;
        sum->sumSquares += val*val * size;
        }
    if (sum != NULL)
        {
        bbiOutputOneSummaryFurtherReduce(sum, &twiceReducedList, doubleReductionSize,
            &boundsPt, boundsEnd, lm, stream);
        }
    rangeTreeFree(&rangeTree);
    }
bbiSumOutStreamClose(&stream);

/* Write out 1st zoom index. */
int indexOffset = *retIndexStart = ftell(f);
assert(boundsPt == boundsEnd);
cirTreeFileBulkIndexToOpenFile(boundsArray, sizeof(boundsArray[0]), initialReductionCount,
    blockSize, itemsPerSlot, NULL, bbiBoundsArrayFetchKey, bbiBoundsArrayFetchOffset,
    indexOffset, f);

freez(&boundsArray);
slReverse(&twiceReducedList);
return twiceReducedList;
}
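/* A standalone sketch (compile separately) of the coverage idea behind
 * rangeTreeForBedChrom above: collapse a chromosome's sorted, possibly
 * overlapping items into non-overlapping (start, end, depth) runs before
 * summarizing.  The real code builds a red-black range tree via
 * rangeTreeAddToCoverageDepth; the boundary sweep below is a simplified
 * stand-in that produces the same runs. */
#include <stdio.h>
#include <stdlib.h>

struct event { unsigned pos; int delta; };   /* +1 at item start, -1 at item end */

static int eventCmp(const void *va, const void *vb)
/* Sort events by position, ends before starts at the same position. */
{
const struct event *a = va, *b = vb;
if (a->pos != b->pos)
    return (a->pos < b->pos) ? -1 : 1;
return a->delta - b->delta;
}

int main(void)
{
unsigned items[][2] = { {100, 300}, {200, 400}, {500, 600} };  /* two overlap, one apart */
int n = sizeof(items)/sizeof(items[0]), i, depth = 0;
unsigned prev = 0;
struct event ev[6];
for (i = 0; i < n; ++i)
    {
    ev[2*i].pos = items[i][0];    ev[2*i].delta = +1;
    ev[2*i+1].pos = items[i][1];  ev[2*i+1].delta = -1;
    }
qsort(ev, 2*n, sizeof(ev[0]), eventCmp);
for (i = 0; i < 2*n; ++i)
    {
    if (depth > 0 && ev[i].pos > prev)
        printf("%u-%u depth %d\n", prev, ev[i].pos, depth);   /* one coverage run */
    depth += ev[i].delta;
    prev = ev[i].pos;
    }
return 0;
}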
struct bbExIndexMaker *bbExIndexMakerNew(struct slName *extraIndexList, struct asObject *as)
/* Return an index maker corresponding to extraIndexList.  Checks that all fields
 * mentioned are in autoSql definition, and for now that they are all text fields. */
{
/* Fill in scalar fields and return quickly if no extra indexes. */
struct bbExIndexMaker *eim;
AllocVar(eim);
eim->indexCount = slCount(extraIndexList);
if (eim->indexCount == 0)
    return eim;   // Not much to do in this case

/* Allocate arrays according to field count. */
AllocArray(eim->indexFields, eim->indexCount);
AllocArray(eim->maxFieldSize, eim->indexCount);
AllocArray(eim->chunkArrayArray, eim->indexCount);
AllocArray(eim->fileOffsets, eim->indexCount);

/* Loop through each field checking that it is indeed something we can index
 * and if so saving information about it */
int indexIx = 0;
struct slName *name;
for (name = extraIndexList; name != NULL; name = name->next)
    {
    struct asColumn *col = asColumnFind(as, name->name);
    if (col == NULL)
        errAbort("extraIndex field %s not a standard bed field or found in 'as' file.",
            name->name);
    if (!sameString(col->lowType->name, "string"))
        errAbort("Sorry for now can only index string fields.");
    eim->indexFields[indexIx] = slIxFromElement(as->columnList, col);
    ++indexIx;
    }
return eim;
}
void bbExIndexMakerAllocChunkArrays(struct bbExIndexMaker *eim, int recordCount)
/* Allocate the big part of the extra index maker - the part that holds which
 * chunk is used for each record. */
{
eim->recordCount = recordCount;
int i;
for (i=0; i < eim->indexCount; ++i)
    AllocArray(eim->chunkArrayArray[i], recordCount);
}

void bbExIndexMakerFree(struct bbExIndexMaker **pEim)
/* Free up memory associated with bbExIndexMaker */
{
struct bbExIndexMaker *eim = *pEim;
if (eim != NULL)
    {
    if (eim->chunkArrayArray != NULL)
        {
        int i;
        for (i=0; i < eim->indexCount; ++i)
            freeMem(eim->chunkArrayArray[i]);
        }
    freeMem(eim->indexFields);
    freeMem(eim->maxFieldSize);
    freeMem(eim->chunkArrayArray);
    freeMem(eim->fileOffsets);
    freez(pEim);
    }
}
void bbFileCreate(
    char *inName,      /* Input file in a tabular bed format <chrom><start><end> + whatever. */
    char *chromSizes,  /* Two column tab-separated file: <chromosome> <size>. */
    int blockSize,     /* Number of items to bundle in r-tree.  1024 is good. */
    int itemsPerSlot,  /* Number of items in lowest level of tree.  64 is good. */
    char *asText,      /* Field definitions in a string */
    struct asObject *as,  /* Field definitions parsed out */
    boolean doCompress,   /* If TRUE then compress data. */
    struct slName *extraIndexList,  /* List of extra indexes to add */
    char *outName)     /* BigBed output file name. */
/* Convert tab-separated bed file to binary indexed, zoomed bigBed version. */
{
/* Set up timing measures. */
verboseTimeInit();
struct lineFile *lf = lineFileOpen(inName, TRUE);

bits16 fieldCount = slCount(as->columnList);
bits16 extraIndexCount = slCount(extraIndexList);

struct bbExIndexMaker *eim = NULL;
if (extraIndexList != NULL)
    eim = bbExIndexMakerNew(extraIndexList, as);

/* Do first pass, mostly just scanning file and counting hits per chromosome. */
int minDiff = 0;
double aveSize = 0;
bits64 bedCount = 0;
bits32 uncompressBufSize = 0;
struct bbiChromUsage *usageList = NULL;
if (sizesIsBb)
    usageList = bbiChromUsageFromBedFileAlias(lf, chromSizes, eim, &minDiff, &aveSize, &bedCount, tabSep);
else
    {
    struct hash *chromSizesHash = NULL;
    if (sizesIs2Bit)
        chromSizesHash = twoBitChromHash(chromSizes);
    else
        chromSizesHash = bbiChromSizesFromFile(chromSizes);
    verbose(2, "Read %d chromosomes and sizes from %s\n", chromSizesHash->elCount, chromSizes);
    usageList = bbiChromUsageFromBedFile(lf, chromSizesHash, eim, &minDiff, &aveSize, &bedCount, tabSep);
    freeHash(&chromSizesHash);
    }
verboseTime(1, "pass1 - making usageList (%d chroms)", slCount(usageList));
verbose(2, "%d chroms in %s. Average span of beds %f\n", slCount(usageList), inName, aveSize);

/* Open output file and write dummy header. */
FILE *f = mustOpen(outName, "wb");
bbiWriteDummyHeader(f);
bbiWriteDummyZooms(f);

/* Write out autoSql string */
bits64 asOffset = ftell(f);
mustWrite(f, asText, strlen(asText) + 1);
verbose(2, "as definition has %d columns\n", fieldCount);

/* Write out dummy total summary. */
struct bbiSummaryElement totalSum;
ZeroVar(&totalSum);
bits64 totalSummaryOffset = ftell(f);
bbiSummaryElementWrite(f, &totalSum);

/* Write out dummy header extension */
bits64 extHeaderOffset = ftell(f);
bits16 extHeaderSize = 64;
repeatCharOut(f, 0, extHeaderSize);

/* Write out extra index stuff if need be. */
bits64 extraIndexListOffset = 0;
bits64 extraIndexListEndOffset = 0;
if (extraIndexList != NULL)
    {
    extraIndexListOffset = ftell(f);
    int extraIndexSize = 16 + 4*1;   // Fixed record size 16, plus 1 times field size of 4
    repeatCharOut(f, 0, extraIndexSize*extraIndexCount);
    extraIndexListEndOffset = ftell(f);
    }

/* Write out chromosome/size database. */
bits64 chromTreeOffset = ftell(f);
bbiWriteChromInfo(usageList, blockSize, f);

/* Set up to keep track of possible initial reduction levels. */
int resScales[bbiMaxZoomLevels], resSizes[bbiMaxZoomLevels];
int resTryCount = bbiCalcResScalesAndSizes(aveSize, resScales, resSizes);

/* Write out primary full resolution data in sections, collect stats to use for reductions. */
bits64 dataOffset = ftell(f);
bits32 blockCount = 0;
bits32 maxBlockSize = 0;
struct bbiBoundsArray *boundsArray = NULL;
writeOne(f, bedCount);
if (bedCount > 0)
    {
    blockCount = bbiCountSectionsNeeded(usageList, itemsPerSlot);
    AllocArray(boundsArray, blockCount);
    lineFileRewind(lf);
    if (eim)
        bbExIndexMakerAllocChunkArrays(eim, bedCount);
    writeBlocks(usageList, lf, as, itemsPerSlot, boundsArray, blockCount, doCompress, f,
        resTryCount, resScales, resSizes, eim, bedCount, fieldCount, &maxBlockSize);
    }
verboseTime(1, "pass2 - checking and writing primary data (%lld records, %d fields)",
    (long long)bedCount, fieldCount);

/* Write out primary data index. */
bits64 indexOffset = ftell(f);
cirTreeFileBulkIndexToOpenFile(boundsArray, sizeof(boundsArray[0]), blockCount,
    blockSize, 1, NULL, bbiBoundsArrayFetchKey, bbiBoundsArrayFetchOffset,
    indexOffset, f);
freez(&boundsArray);
verboseTime(2, "index write");

/* Declare arrays and vars that track the zoom levels we actually output. */
bits32 zoomAmounts[bbiMaxZoomLevels];
bits64 zoomDataOffsets[bbiMaxZoomLevels];
bits64 zoomIndexOffsets[bbiMaxZoomLevels];

/* Call monster zoom maker library function that bedGraphToBigWig also uses. */
int zoomLevels = 0;
if (bedCount > 0)
    {
    zoomLevels = bbiWriteZoomLevels(lf, f, blockSize, itemsPerSlot,
        bedWriteReducedOnceReturnReducedTwice, fieldCount,
        doCompress, indexOffset - dataOffset,
        usageList, resTryCount, resScales, resSizes,
        zoomAmounts, zoomDataOffsets, zoomIndexOffsets, &totalSum);
    }

/* Write out extra indexes if need be. */
if (eim)
    {
    int i;
    for (i=0; i < eim->indexCount; ++i)
        {
        eim->fileOffsets[i] = ftell(f);
        maxBedNameSize = eim->maxFieldSize[i];
        qsort(eim->chunkArrayArray[i], bedCount, sizeof(struct bbNamedFileChunk),
            bbNamedFileChunkCmpByName);
        assert(sizeof(struct bbNamedFileChunk) == sizeof(eim->chunkArrayArray[i][0]));
        bptFileBulkIndexToOpenFile(eim->chunkArrayArray[i], sizeof(eim->chunkArrayArray[i][0]),
            bedCount, blockSize, bbNamedFileChunkKey, maxBedNameSize, bbNamedFileChunkVal,
            sizeof(bits64) + sizeof(bits64), f);
        verboseTime(1, "Sorting and writing extra index %d", i);
        }
    }

/* Figure out buffer size needed for uncompression if need be. */
if (doCompress)
    {
    int maxZoomUncompSize = itemsPerSlot * sizeof(struct bbiSummaryOnDisk);
    uncompressBufSize = max(maxBlockSize, maxZoomUncompSize);
    }

/* Go back and rewrite header. */
rewind(f);
bits32 sig = bigBedSig;
bits16 version = bbiCurrentVersion;
bits16 summaryCount = zoomLevels;
bits32 reserved32 = 0;
bits64 reserved64 = 0;
bits16 definedFieldCount = bedN;

/* Write fixed header */
writeOne(f, sig);
writeOne(f, version);
writeOne(f, summaryCount);
writeOne(f, chromTreeOffset);
writeOne(f, dataOffset);
writeOne(f, indexOffset);
writeOne(f, fieldCount);
writeOne(f, definedFieldCount);
writeOne(f, asOffset);
writeOne(f, totalSummaryOffset);
writeOne(f, uncompressBufSize);
writeOne(f, extHeaderOffset);
assert(ftell(f) == 64);

/* Write summary headers with data. */
int i;
verbose(2, "Writing %d levels of zoom\n", zoomLevels);
for (i=0; i<zoomLevels; ++i)
    {
    verbose(3, "zoomAmounts[%d] = %d\n", i, (int)zoomAmounts[i]);
    writeOne(f, zoomAmounts[i]);
    writeOne(f, reserved32);
    writeOne(f, zoomDataOffsets[i]);
    writeOne(f, zoomIndexOffsets[i]);
    }
/* Write rest of summary headers with no data. */
for (i=zoomLevels; i<bbiMaxZoomLevels; ++i)
    {
    writeOne(f, reserved32);
    writeOne(f, reserved32);
    writeOne(f, reserved64);
    writeOne(f, reserved64);
    }

/* Write total summary. */
fseek(f, totalSummaryOffset, SEEK_SET);
bbiSummaryElementWrite(f, &totalSum);

/* Write extended header */
fseek(f, extHeaderOffset, SEEK_SET);
writeOne(f, extHeaderSize);
writeOne(f, extraIndexCount);
writeOne(f, extraIndexListOffset);
repeatCharOut(f, 0, 52);   // reserved
assert(ftell(f) - extHeaderOffset == extHeaderSize);

/* Write extra index offsets if need be. */
if (extraIndexCount != 0)
    {
    fseek(f, extraIndexListOffset, SEEK_SET);
    int i;
    for (i=0; i<extraIndexCount; ++i)
        {
        // Write out fixed part of index info
        bits16 type = 0;   // bPlusTree type
        bits16 indexFieldCount = 1;
        writeOne(f, type);
        writeOne(f, indexFieldCount);
        writeOne(f, eim->fileOffsets[i]);
        repeatCharOut(f, 0, 4);   // reserved

        // Write out field list - easy this time because for now always only one field.
        bits16 fieldId = eim->indexFields[i];
        writeOne(f, fieldId);
        repeatCharOut(f, 0, 2);   // reserved
        }
    assert(ftell(f) == extraIndexListEndOffset);
    }

/* Write end signature. */
fseek(f, 0L, SEEK_END);
writeOne(f, sig);

/* Clean up. */
lineFileClose(&lf);
carefulClose(&f);
bbiChromUsageFreeList(&usageList);
asObjectFreeList(&as);
}
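/* The fixed header that bbFileCreate writes above is exactly 64 bytes; the
 * assert(ftell(f) == 64) depends on the writeOne sizes adding up.  A
 * standalone sketch (compile separately) of that layout as a packed struct.
 * The struct and field comments are mine; the real file is written in native
 * byte order, and readers work out the byte order from the signature. */
#include <stdint.h>
#include <stdio.h>

#pragma pack(push, 1)
struct bigBedFixedHeader          /* hypothetical mirror of the 64-byte header */
    {
    uint32_t sig;                 /* bigBedSig magic number */
    uint16_t version;             /* bbiCurrentVersion */
    uint16_t summaryCount;        /* number of zoom levels written */
    uint64_t chromTreeOffset;     /* chromosome B+ tree */
    uint64_t dataOffset;          /* start of primary data */
    uint64_t indexOffset;         /* primary r-tree index */
    uint16_t fieldCount;          /* total bed fields */
    uint16_t definedFieldCount;   /* standard bedN fields */
    uint64_t asOffset;            /* autoSql definition string */
    uint64_t totalSummaryOffset;  /* whole-file summary element */
    uint32_t uncompressBufSize;   /* 0 means data is uncompressed */
    uint64_t extHeaderOffset;     /* extended header holding extra-index info */
    };
#pragma pack(pop)

_Static_assert(sizeof(struct bigBedFixedHeader) == 64, "header must be 64 bytes");

int main(void)
{
printf("sizeof(struct bigBedFixedHeader) = %zu\n", sizeof(struct bigBedFixedHeader));
return 0;
}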
void bedToBigBed(char *inName, char *chromSizes, char *outName)
/* bedToBigBed - Convert bed file to bigBed. */
{
struct slName *extraIndexList = slNameListFromString(extraIndex, ',');
struct asObject *as = asParseText(asText);
if (as == NULL)
    errAbort("AutoSql file (%s) not in legal format.", asFile);
asCompareObjAgainstStandardBed(as, bedN, TRUE);   // abort if bedN columns are not standard
bbFileCreate(inName, chromSizes, blockSize, itemsPerSlot, asText, as, doCompress,
    extraIndexList, outName);
}
int main(int argc, char *argv[])
/* Process command line. */
{
optionInit(&argc, argv, options);
blockSize = optionInt("blockSize", blockSize);
itemsPerSlot = optionInt("itemsPerSlot", itemsPerSlot);
asFile = optionVal("as", asFile);
doCompress = !optionExists("unc");
sizesIs2Bit = optionExists("sizesIs2Bit");
sizesIsBb = optionExists("sizesIsBb");
extraIndex = optionVal("extraIndex", NULL);
tabSep = optionExists("tab");
allow1bpOverlap = optionExists("allow1bpOverlap");
udcDir = optionVal("udcDir", udcDefaultDir());
size_t maxAlloc = optionLongLong("maxAlloc", 0);
if (argc != 4)
    usage();
udcSetDefaultDir(udcDir);
if (maxAlloc > 0)
    setMaxAlloc(maxAlloc);
if (optionExists("type"))
    {
    // parse type
    char *btype = cloneString(optionVal("type", ""));
    char *plus = strchr(btype, '+');
    if (plus)
        {
        *plus++ = 0;
        if (isdigit(*plus))
            bedP = sqlUnsigned(plus);
        }
    if (!startsWith("bed", btype))
        errAbort("type must begin with \"bed\"");
    btype += 3;
    bedN = sqlUnsigned(btype);
    if (bedN < 3)
        errAbort("Bed must be 3 or higher, found %d\n", bedN);
    if (bedN > 15)
        errAbort("Bed must be 15 or lower, found %d\n", bedN);
    }
else
    {
    if (asText)
        errAbort("If you specify the .as file, you must specify the -type as well so that\n"
                 "the number of standard BED columns is known.");
    }

/* If they haven't set bedN and bedP from the type var on the command line, then we sniff it
 * out from the file. */
char *bedFileName = argv[1];
if (bedN == 0)
    {
    /* Just read in single line and count fields. */
    struct lineFile *lf = lineFileOpen(bedFileName, TRUE);
    char *line;
    if (!lineFileNextReal(lf, &line))
        errAbort("%s is empty", lf->fileName);
    int fieldCount;
    if (tabSep)
        fieldCount = chopByChar(line, '\t', NULL, 0);   // Do not use chopString, see GOTCHA
    else
        fieldCount = chopByWhite(line, NULL, 0);
    if (fieldCount > 256)
        errAbort("Too many columns in %s, you sure it's a bed file?", lf->fileName);
    lineFileClose(&lf);

    /* Set up so that it looks like we are straight up bed for that many fields,
     * or if more than our maximum of defined fields, then for bed15+ */
    bedN = fieldCount;
    if (bedN > bedKnownFields)
        {
        bedP = bedN - bedKnownFields;
        bedN = bedKnownFields;
        }
    }

/* Make sure that fields are defined, from bed spec if nowhere else. */
if (asFile)
    readInGulp(asFile, &asText, NULL);
else
    asText = bedAsDef(bedN, bedN + bedP);

bedToBigBed(bedFileName, argv[2], argv[3]);
optionFree();
if (verboseLevel() > 1)
    printVmPeak();
return 0;
}
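/* A standalone sketch (compile separately) pulling the -type=bedN[+[P]]
 * parsing from main() above into a testable helper; the function name and
 * error convention are mine. */
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int parseBedType(const char *type, int *retN, int *retP)
/* Parse e.g. "bed6", "bed6+", or "bed6+3" into N standard and P extra fields.
 * Returns 0 on success, -1 on malformed input. */
{
if (strncmp(type, "bed", 3) != 0)
    return -1;
char *end;
long n = strtol(type + 3, &end, 10);
if (end == type + 3 || n < 3 || n > 15)
    return -1;
int p = 0;
if (*end == '+')
    {
    ++end;
    if (isdigit((unsigned char)*end))
        p = (int)strtol(end, &end, 10);
    }
if (*end != '\0')
    return -1;
*retN = (int)n;
*retP = p;
return 0;
}

int main(void)
{
int n, p;
if (parseBedType("bed6+3", &n, &p) == 0)
    printf("bedN=%d bedP=%d\n", n, p);   /* prints bedN=6 bedP=3 */
return 0;
}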