4dd0e74f2da650f4f41ccfc29fc18df024e303d9 braney Tue Nov 29 18:03:37 2016 -0800 get BLAT custom tracks to work on assembly hubs diff --git src/utils/bedToBigBed/bedToBigBed.c src/utils/bedToBigBed/bedToBigBed.c index 3bc96b9..1323441 100644 --- src/utils/bedToBigBed/bedToBigBed.c +++ src/utils/bedToBigBed/bedToBigBed.c @@ -4,47 +4,50 @@ * See README in this or parent directory for licensing information. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "dystring.h" #include "obscure.h" #include "asParse.h" #include "basicBed.h" #include "sig.h" #include "rangeTree.h" #include "zlibFace.h" #include "sqlNum.h" #include "bPlusTree.h" #include "bigBed.h" +#include "twoBit.h" char *version = "2.7"; /* Version history from 2.6 on at least - * 2.7 - Added check for duplicate field names in asParse.c * 2.6 - Made it not crash on empty input. * */ /* Things set directly or indirectly by command lne in main() routine. */ int blockSize = 256; int itemsPerSlot = 512; char *extraIndex = NULL; int bedN = 0; /* number of standard bed fields */ int bedP = 0; /* number of bed plus fields */ char *asFile = NULL; char *asText = NULL; +char *udcDir = NULL; static boolean doCompress = FALSE; static boolean tabSep = FALSE; +static boolean sizesIs2Bit = FALSE; void usage() /* Explain usage and exit. */ { errAbort( "bedToBigBed v. %s - Convert bed file to bigBed. (BigBed version: %d)\n" "usage:\n" " bedToBigBed in.bed chrom.sizes out.bb\n" "Where in.bed is in one of the ascii bed formats, but not including track lines\n" "and chrom.sizes is a two-column file/URL: <chromosome name> <size in bases>\n" "and out.bb is the output indexed big bed file.\n" "If the assembly <db> is hosted by UCSC, chrom.sizes can be a URL like\n" " http://hgdownload.cse.ucsc.edu/goldenPath/<db>/bigZips/<db>.chrom.sizes\n" "or you may use the script fetchChromSizes to download the chrom.sizes file.\n" "If not hosted by UCSC, a chrom.sizes file can be generated by running\n" @@ -58,42 +61,46 @@ " -type=bedN[+[P]] : \n" " N is between 3 and 15, \n" " optional (+) if extra \"bedPlus\" fields, \n" " optional P specifies the number of extra fields. Not required, but preferred.\n" " Examples: -type=bed6 or -type=bed6+ or -type=bed6+3 \n" " (see http://genome.ucsc.edu/FAQ/FAQformat.html#format1)\n" " -as=fields.as - If you have non-standard \"bedPlus\" fields, it's great to put a definition\n" " of each field in a row in AutoSql format here.\n" " -blockSize=N - Number of items to bundle in r-tree. Default %d\n" " -itemsPerSlot=N - Number of data points bundled at lowest level. Default %d\n" " -unc - If set, do not use compression.\n" " -tab - If set, expect fields to be tab separated, normally\n" " expects white space separator.\n" " -extraIndex=fieldList - If set, make an index on each field in a comma separated list\n" " extraIndex=name and extraIndex=name,id are commonly used.\n" + " -sizesIs2Bit -- If set, the chrom.sizes file is assumed to be a 2bit file.\n" + " -udcDir=/path/to/udcCacheDir -- sets the UDC cache dir for caching of remote files.\n" , version, bbiCurrentVersion, blockSize, itemsPerSlot ); } static struct optionSpec options[] = { {"blockSize", OPTION_INT}, {"itemsPerSlot", OPTION_INT}, {"type", OPTION_STRING}, {"as", OPTION_STRING}, {"unc", OPTION_BOOLEAN}, {"tab", OPTION_BOOLEAN}, + {"sizesIs2Bit", OPTION_BOOLEAN}, {"extraIndex", OPTION_STRING}, + {"udcDir", OPTION_STRING}, {NULL, 0}, }; int bbNamedFileChunkCmpByName(const void *va, const void *vb) /* Compare two named offset object to facilitate qsorting by name. */ { const struct bbNamedFileChunk *a = va, *b = vb; return strcmp(a->name, b->name); } static int maxBedNameSize; void bbNamedFileChunkKey(const void *va, char *keyBuf) /* Copy name to keyBuf for bPlusTree maker */ { @@ -557,31 +564,36 @@ char *outName) /* BigBed output file name. */ /* Convert tab-separated bed file to binary indexed, zoomed bigBed version. */ { /* Set up timing measures. */ verboseTimeInit(); struct lineFile *lf = lineFileOpen(inName, TRUE); bits16 fieldCount = slCount(as->columnList); bits16 extraIndexCount = slCount(extraIndexList); struct bbExIndexMaker *eim = NULL; if (extraIndexList != NULL) eim = bbExIndexMakerNew(extraIndexList, as); /* Load in chromosome sizes. */ -struct hash *chromSizesHash = bbiChromSizesFromFile(chromSizes); +struct hash *chromSizesHash = NULL; + +if (sizesIs2Bit) + chromSizesHash = twoBitChromHash(chromSizes); +else + chromSizesHash = bbiChromSizesFromFile(chromSizes); verbose(2, "Read %d chromosomes and sizes from %s\n", chromSizesHash->elCount, chromSizes); /* Do first pass, mostly just scanning file and counting hits per chromosome. */ int minDiff = 0; double aveSize = 0; bits64 bedCount = 0; bits32 uncompressBufSize = 0; struct bbiChromUsage *usageList = bbiChromUsageFromBedFile(lf, chromSizesHash, eim, &minDiff, &aveSize, &bedCount, tabSep); verboseTime(1, "pass1 - making usageList (%d chroms)", slCount(usageList)); verbose(2, "%d chroms in %s. Average span of beds %f\n", slCount(usageList), inName, aveSize); /* Open output file and write dummy header. */ FILE *f = mustOpen(outName, "wb"); bbiWriteDummyHeader(f); @@ -789,34 +801,38 @@ struct slName *extraIndexList = slNameListFromString(extraIndex, ','); struct asObject *as = asParseText(asText); asCompareObjAgainstStandardBed(as, bedN, TRUE); // abort if bedN columns are not standard bbFileCreate(inName, chromSizes, blockSize, itemsPerSlot, asText, as, doCompress, extraIndexList, outName); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); blockSize = optionInt("blockSize", blockSize); itemsPerSlot = optionInt("itemsPerSlot", itemsPerSlot); asFile = optionVal("as", asFile); doCompress = !optionExists("unc"); +sizesIs2Bit = optionExists("sizesIs2Bit"); extraIndex = optionVal("extraIndex", NULL); tabSep = optionExists("tab"); +udcDir = optionVal("udcDir", udcDefaultDir()); if (argc != 4) usage(); +udcSetDefaultDir(udcDir); + if (optionExists("type")) { // parse type char *btype = cloneString(optionVal("type", "")); char *plus = strchr(btype, '+'); if (plus) { *plus++ = 0; if (isdigit(*plus)) bedP = sqlUnsigned(plus); } if (!startsWith("bed", btype)) errAbort("type must begin with \"bed\""); btype +=3; bedN = sqlUnsigned(btype);