d0054a39067a773d5342af78a80d964835d47a60 markd Thu Oct 16 13:28:21 2025 -0700 add bedToBigBed -fixScores to correct scores that are out-of-range or invalid diff --git src/utils/bedToBigBed/bedToBigBed.c src/utils/bedToBigBed/bedToBigBed.c index 7f9e5b97711..e0ac7b1e662 100644 --- src/utils/bedToBigBed/bedToBigBed.c +++ src/utils/bedToBigBed/bedToBigBed.c @@ -33,30 +33,31 @@ /* Things set directly or indirectly by command line in main() routine. */ int blockSize = 256; int itemsPerSlot = 512; char *extraIndex = NULL; int bedN = 0; /* number of standard bed fields */ int bedP = 0; /* number of bed plus fields */ char *asFile = NULL; char *asText = NULL; char *udcDir = NULL; static boolean doCompress = FALSE; static boolean tabSep = FALSE; static boolean sizesIs2Bit = FALSE; static boolean sizesIsChromAliasBb = FALSE; static boolean allow1bpOverlap = FALSE; +static boolean fixScores = FALSE; void usage() /* Explain usage and exit. */ { errAbort( "bedToBigBed v. %s - Convert bed file to bigBed. (bbi version: %d)\n" "usage:\n" " bedToBigBed in.bed chrom.sizes out.bb\n" "Where in.bed is in one of the ascii bed formats, but not including track lines\n" "and chrom.sizes is a two-column file/URL: <chromosome name> <size in bases>\n" "and out.bb is the output indexed big bed file.\n" "\n" "If the assembly <db> is hosted by UCSC, chrom.sizes can be a URL like\n" " http://hgdownload.soe.ucsc.edu/goldenPath/<db>/bigZips/<db>.chrom.sizes\n" "or you may use the script fetchChromSizes to download the chrom.sizes file.\n" @@ -95,49 +96,51 @@ " -as=fields.as - If you have non-standard \"bedPlus\" fields, it's great to put a definition\n" " of each field in a row in AutoSql format here.\n" " -blockSize=N - Number of items to bundle in r-tree. Default %d\n" " -itemsPerSlot=N - Number of data points bundled at lowest level. 
Default %d\n" " -unc - If set, do not use compression.\n" " -tab - If set, expect fields to be tab separated, normally\n" " expects white space separator.\n" " -extraIndex=fieldList - If set, make an index on each field in a comma separated list\n" " extraIndex=name and extraIndex=name,id are commonly used.\n" " -sizesIs2Bit -- If set, the chrom.sizes file is assumed to be a 2bit file.\n" " -sizesIsChromAliasBb -- If set, then chrom.sizes file is assumed to be a chromAlias\n" " bigBed file or a URL to a such a file (see above).\n" " -sizesIsBb -- Obsolete name for -sizesIsChromAliasBb.\n" " -udcDir=/path/to/udcCacheDir -- sets the UDC cache dir for caching of remote files.\n" " -allow1bpOverlap -- allow exons to overlap by at most one base pair\n" + " -fixScores -- change non-integer scores to 0 and for scores into range 0..1000\n" " -maxAlloc=N -- Set the maximum memory allocation size to N bytes\n" " -sort -- sort the input file\n" , version, bbiCurrentVersion, blockSize, itemsPerSlot ); } static struct optionSpec options[] = { {"blockSize", OPTION_INT}, {"itemsPerSlot", OPTION_INT}, {"type", OPTION_STRING}, {"as", OPTION_STRING}, {"unc", OPTION_BOOLEAN}, {"tab", OPTION_BOOLEAN}, {"sizesIs2Bit", OPTION_BOOLEAN}, {"sizesIsChromAliasBb", OPTION_BOOLEAN}, {"sizesIsBb", OPTION_BOOLEAN}, {"extraIndex", OPTION_STRING}, {"udcDir", OPTION_STRING}, {"allow1bpOverlap", OPTION_BOOLEAN}, + {"fixScores", OPTION_BOOLEAN}, {"maxAlloc", OPTION_LONG_LONG}, {"sort", OPTION_BOOLEAN}, {NULL, 0}, }; static struct lineFile *rewindFile(char *inName, struct lineFile *lf) /* set up lineFile to point at the beginning of the file. If we're reading from a decompressing * pipe, we need to close and reopen the pipe. 
*/ { if (lf->pl) { lineFileClose(&lf); lf = lineFileOpen(inName, TRUE); } else @@ -215,49 +218,55 @@ bits64 blockStartOffset = 0; int startPos = 0, endPos = 0; bits32 chromId = 0; struct dyString *stream = dyStringNew(0); /* Will keep track of some things that help us determine how much to reduce. */ bits32 resEnds[resTryCount]; int resTry; for (resTry = 0; resTry < resTryCount; ++resTry) resEnds[resTry] = 0; boolean atEnd = FALSE, sameChrom = FALSE; bits32 start = 0, end = 0; char *chrom = NULL; struct bed *bed; AllocVar(bed); +unsigned opts = 0; +if (allow1bpOverlap) + opts |= BED_ALLOW_1BP_OVERLAP; +if (fixScores) + opts |= BED_FIX_SCORE; + /* Help keep track of which beds are in current chunk so as to write out * namedChunks to eim if need be. */ long sectionStartIx = 0, sectionEndIx = 0; for (;;) { /* Get next line of input if any. */ if (lineFileNextReal(lf, &line)) { /* Chop up line and make sure the word count is right. */ int wordCount; if (tabSep) wordCount = chopTabs(line, row); else wordCount = chopLine(line, row); lineFileExpectWordsMesg(lf, fieldCount, wordCount, "If the input is a tab-sep file, do not forget to use the -tab option"); - loadAndValidateBedExt(row, bedN, fieldCount, lf, bed, as, FALSE, allow1bpOverlap); + loadAndValidateBedOpts(row, bedN, fieldCount, lf, bed, as, opts); chrom = bed->chrom; start = bed->chromStart; end = bed->chromEnd; sameChrom = sameString(chrom, usage->name); } else /* No next line */ { atEnd = TRUE; } /* Check conditions that would end block and save block info and advance to next if need be. */ if (atEnd || !sameChrom || itemIx >= itemsPerSlot) @@ -342,32 +351,34 @@ { bbExIndexMakerAddKeysFromRow(eim, row, sectionEndIx); sectionEndIx += 1; } /* Write out data. */ dyStringWriteOne(stream, chromId); dyStringWriteOne(stream, start); dyStringWriteOne(stream, end); if (fieldCount > 3) { int i; /* Write 3rd through next to last field and a tab separator. 
*/ for (i=3; i<lastField; ++i) { - char *s = row[i]; - dyStringAppend(stream, s); + if ((opts & BED_FIX_SCORE) && (i == 4)) + dyStringPrintf(stream, "%d", bed->score); // keep fixed score + else + dyStringAppend(stream, row[i]); dyStringAppendC(stream, '\t'); } /* Write last field and terminal zero */ char *s = row[lastField]; dyStringAppend(stream, s); } dyStringAppendC(stream, 0); itemIx += 1; /* Do zoom counting. */ for (resTry = 0; resTry < resTryCount; ++resTry) { bits32 resEnd = resEnds[resTry]; if (start >= resEnd && resEnd < usage->size) @@ -870,30 +881,31 @@ int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); blockSize = optionInt("blockSize", blockSize); itemsPerSlot = optionInt("itemsPerSlot", itemsPerSlot); asFile = optionVal("as", asFile); doCompress = !optionExists("unc"); sizesIs2Bit = optionExists("sizesIs2Bit"); sizesIsChromAliasBb = optionExists("sizesIsChromAliasBb") || optionExists("sizesIsBb"); if (sizesIs2Bit && sizesIsChromAliasBb) errAbort("can't specify both -sizesIs2Bit and -sizesIsChromAliasBb"); extraIndex = optionVal("extraIndex", NULL); tabSep = optionExists("tab"); allow1bpOverlap = optionExists("allow1bpOverlap"); +fixScores = optionExists("fixScores"); udcDir = optionVal("udcDir", udcDefaultDir()); size_t maxAlloc = optionLongLong("maxAlloc", 0); if (argc != 4) usage(); char *bedFileName = argv[1]; mustBeReadableAndRegularFile(bedFileName); udcSetDefaultDir(udcDir); if (maxAlloc > 0) setMaxAlloc(maxAlloc); if (optionExists("type")) {