9b033f6d0e8b14a79908339c61c7153e7004f19b markd Thu Sep 8 12:27:11 2022 -0700 change bedToBigBed to actually describe the use of chromAlias diff --git src/utils/bedToBigBed/bedToBigBed.c src/utils/bedToBigBed/bedToBigBed.c index 13a9ac1..f601d30 100644 --- src/utils/bedToBigBed/bedToBigBed.c +++ src/utils/bedToBigBed/bedToBigBed.c @@ -28,89 +28,107 @@ * 2.6 - Made it not crash on empty input. * */ /* Things set directly or indirectly by command line in main() routine. */ int blockSize = 256; int itemsPerSlot = 512; char *extraIndex = NULL; int bedN = 0; /* number of standard bed fields */ int bedP = 0; /* number of bed plus fields */ char *asFile = NULL; char *asText = NULL; char *udcDir = NULL; static boolean doCompress = FALSE; static boolean tabSep = FALSE; static boolean sizesIs2Bit = FALSE; -static boolean sizesIsBb = FALSE; +static boolean sizesIsChromAliasBb = FALSE; static boolean allow1bpOverlap = FALSE; void usage() /* Explain usage and exit. */ { errAbort( "bedToBigBed v. %s - Convert bed file to bigBed. 
(bbi version: %d)\n" "usage:\n" " bedToBigBed in.bed chrom.sizes out.bb\n" "Where in.bed is in one of the ascii bed formats, but not including track lines\n" "and chrom.sizes is a two-column file/URL: <chromosome name> <size in bases>\n" "and out.bb is the output indexed big bed file.\n" + "\n" "If the assembly <db> is hosted by UCSC, chrom.sizes can be a URL like\n" " http://hgdownload.soe.ucsc.edu/goldenPath/<db>/bigZips/<db>.chrom.sizes\n" "or you may use the script fetchChromSizes to download the chrom.sizes file.\n" "If you have bed annotations on patch sequences from NCBI, a more inclusive\n" "chrom.sizes file can be found using a URL like\n" " http://hgdownload.soe.ucsc.edu/goldenPath/<db>/database/chromInfo.txt.gz\n" "If not hosted by UCSC, a chrom.sizes file can be generated by running\n" - "twoBitInfo on the assembly .2bit file.\n" + "twoBitInfo on the assembly .2bit file, or the 2bit file can be used directly\n" + "if the -sizesIs2Bit option is specified.\n" + "\n" + "The chrom.sizes file may also be a chromAlias bigBed file, or a URL to\n" + "such a file, by specifying the -sizesIsChromAliasBb option. 
With a\n" + "chromAlias, the input BED file may have chromosome names matching any\n" + "of the sequence name aliases in the chromAlias file.\n" + "\n" + "For UCSC provided genomes, the chromAlias files can be found under:\n" + " https://hgdownload.soe.ucsc.edu/goldenPath/<db>/bigZips/<db>.chromAlias.bb\n" + "For UCSC GenArk assembly hubs, the chrom aliases are named in the form:\n" + " https://hgdownload.soe.ucsc.edu/hubs/GCF/006/542/625/GCF_006542625.1/GCF_006542625.1.chromAlias.bb\n" + "For a description of generating chromAlias files for your own assembly hub, see:\n" + " http://genomewiki.ucsc.edu/index.php/Chrom_Alias\n" + "\n" "The in.bed file must be sorted by chromosome,start,\n" " to sort a bed file, use the unix sort command:\n" " sort -k1,1 -k2,2n unsorted.bed > sorted.bed\n" "Sorting must be set to skip Unicode mapping (LC_COLLATE=C).\n" "\n" "options:\n" " -type=bedN[+[P]] : \n" " N is between 3 and 15, \n" " optional (+) if extra \"bedPlus\" fields, \n" " optional P specifies the number of extra fields. Not required, but preferred.\n" " Examples: -type=bed6 or -type=bed6+ or -type=bed6+3 \n" " (see http://genome.ucsc.edu/FAQ/FAQformat.html#format1)\n" " -as=fields.as - If you have non-standard \"bedPlus\" fields, it's great to put a definition\n" " of each field in a row in AutoSql format here.\n" " -blockSize=N - Number of items to bundle in r-tree. Default %d\n" " -itemsPerSlot=N - Number of data points bundled at lowest level. 
Default %d\n" " -unc - If set, do not use compression.\n" " -tab - If set, expect fields to be tab separated, normally\n" " expects white space separator.\n" " -extraIndex=fieldList - If set, make an index on each field in a comma separated list\n" " extraIndex=name and extraIndex=name,id are commonly used.\n" " -sizesIs2Bit -- If set, the chrom.sizes file is assumed to be a 2bit file.\n" - " -sizesIsBb -- If set, the chrom.sizes file is assumed to be a bigBed file.\n" + " -sizesIsChromAliasBb -- If set, the chrom.sizes file is assumed to be a chromAlias\n" + " bigBed file or a URL to such a file (see above).\n" + " -sizesIsBb -- Obsolete name for -sizesIsChromAliasBb.\n" " -udcDir=/path/to/udcCacheDir -- sets the UDC cache dir for caching of remote files.\n" " -allow1bpOverlap -- allow exons to overlap by at most one base pair\n" " -maxAlloc=N -- Set the maximum memory allocation size to N bytes\n" , version, bbiCurrentVersion, blockSize, itemsPerSlot ); } static struct optionSpec options[] = { {"blockSize", OPTION_INT}, {"itemsPerSlot", OPTION_INT}, {"type", OPTION_STRING}, {"as", OPTION_STRING}, {"unc", OPTION_BOOLEAN}, {"tab", OPTION_BOOLEAN}, {"sizesIs2Bit", OPTION_BOOLEAN}, + {"sizesIsChromAliasBb", OPTION_BOOLEAN}, {"sizesIsBb", OPTION_BOOLEAN}, {"extraIndex", OPTION_STRING}, {"udcDir", OPTION_STRING}, {"allow1bpOverlap", OPTION_BOOLEAN}, {"maxAlloc", OPTION_LONG_LONG}, {NULL, 0}, }; int bbNamedFileChunkCmpByName(const void *va, const void *vb) /* Compare two named offset objects to facilitate qsorting by name. */ { const struct bbNamedFileChunk *a = va, *b = vb; return strcmp(a->name, b->name); } @@ -587,31 +605,31 @@ struct lineFile *lf = lineFileOpen(inName, TRUE); bits16 fieldCount = slCount(as->columnList); bits16 extraIndexCount = slCount(extraIndexList); struct bbExIndexMaker *eim = NULL; if (extraIndexList != NULL) eim = bbExIndexMakerNew(extraIndexList, as); /* Do first pass, mostly just scanning file and counting hits per chromosome. 
*/ int minDiff = 0; double aveSize = 0; bits64 bedCount = 0; bits32 uncompressBufSize = 0; struct bbiChromUsage *usageList = NULL; -if (sizesIsBb) +if (sizesIsChromAliasBb) usageList = bbiChromUsageFromBedFileAlias(lf, chromSizes, eim, &minDiff, &aveSize, &bedCount, tabSep); else { struct hash *chromSizesHash = NULL; if (sizesIs2Bit) chromSizesHash = twoBitChromHash(chromSizes); else chromSizesHash = bbiChromSizesFromFile(chromSizes); verbose(2, "Read %d chromosomes and sizes from %s\n", chromSizesHash->elCount, chromSizes); usageList = bbiChromUsageFromBedFile(lf, chromSizesHash, eim, &minDiff, &aveSize, &bedCount, tabSep); freeHash(&chromSizesHash); } verboseTime(1, "pass1 - making usageList (%d chroms)", slCount(usageList)); verbose(2, "%d chroms in %s. Average span of beds %f\n", slCount(usageList), inName, aveSize); @@ -824,31 +842,33 @@ errAbort("AutoSql file (%s) not in legal format.", asFile); asCompareObjAgainstStandardBed(as, bedN, TRUE); // abort if bedN columns are not standard bbFileCreate(inName, chromSizes, blockSize, itemsPerSlot, asText, as, doCompress, extraIndexList, outName); } int main(int argc, char *argv[]) /* Process command line. 
*/ { optionInit(&argc, argv, options); blockSize = optionInt("blockSize", blockSize); itemsPerSlot = optionInt("itemsPerSlot", itemsPerSlot); asFile = optionVal("as", asFile); doCompress = !optionExists("unc"); sizesIs2Bit = optionExists("sizesIs2Bit"); -sizesIsBb = optionExists("sizesIsBb"); +sizesIsChromAliasBb = optionExists("sizesIsChromAliasBb") || optionExists("sizesIsBb"); +if (sizesIs2Bit && sizesIsChromAliasBb) + errAbort("can't specify both -sizesIs2Bit and -sizesIsChromAliasBb"); extraIndex = optionVal("extraIndex", NULL); tabSep = optionExists("tab"); allow1bpOverlap = optionExists("allow1bpOverlap"); udcDir = optionVal("udcDir", udcDefaultDir()); size_t maxAlloc = optionLongLong("maxAlloc", 0); if (argc != 4) usage(); udcSetDefaultDir(udcDir); if (maxAlloc > 0) setMaxAlloc(maxAlloc); if (optionExists("type")) { // parse type char *btype = cloneString(optionVal("type", ""));