af3fb358fdeee7bf9080bdee719eb0cc1d4b4c72 hiram Sat Nov 22 15:32:57 2025 -0800 allow input to be a bigBed file refs #36672 diff --git src/hg/utils/twoBitMask/twoBitMask.c src/hg/utils/twoBitMask/twoBitMask.c index eb201da6a81..c78b57a69b7 100644 --- src/hg/utils/twoBitMask/twoBitMask.c +++ src/hg/utils/twoBitMask/twoBitMask.c @@ -11,48 +11,48 @@ #include "memalloc.h" #include "repMask.h" #include "twoBit.h" #include "bed.h" void usage() /* Explain usage and exit. */ { errAbort( "twoBitMask - apply masking to a .2bit file, creating a new .2bit file\n" "usage:\n" " twoBitMask input.2bit maskFile output.2bit\n" "options:\n" " -add Don't remove pre-existing masking before applying maskFile.\n" - " -type=.XXX Type of maskFile is XXX (bed or out).\n" - "maskFile can be a RepeatMasker .out file or a .bed file. It must not\n" - "contain rows for sequences which are not in input.2bit.\n" + " -type=.XXX Type of maskFile is XXX (bed or bb or out).\n" + "maskFile can be a RepeatMasker .out file, a bigBed .bb file, or a .bed file.\n" + "It must not contain rows for sequences which are not in input.2bit.\n" ); } /* Options: */ boolean add = FALSE; char *type = NULL; static struct optionSpec options[] = { {"add", OPTION_BOOLEAN}, {"type", OPTION_STRING}, {NULL, 0}, }; -unsigned slurpInput(char *inName, struct hash *tbHash, +static unsigned slurpInput(char *inName, struct hash *tbHash, struct hash *bitmapHash, struct twoBit **list) /* Read .2bit file inName into memory and return list of twoBit items. * Populate tbHash with twoBit items, and bitmapHash with bitmaps for * easy masking. Both are hashed by twoBit sequence name. */ { struct twoBit *twoBitList = NULL; struct twoBit *twoBit = NULL; struct twoBitFile *tbf = twoBitOpen(inName); int version = tbf->version; *list = twoBitList = twoBitFromOpenFile(tbf); /* Free and clear the masking data (unless -add). Hash twoBits by name. */ for (twoBit = twoBitList; twoBit != NULL; twoBit = twoBit->next) { Bits *bits = bitAlloc(twoBit->size); if (add) @@ -64,54 +64,54 @@ bitSetRange(bits, twoBit->maskStarts[i], twoBit->maskSizes[i]); } } /* Free the current representation of masking -- it will be replaced. */ twoBit->maskBlockCount = 0; freez(&(twoBit->maskStarts)); freez(&(twoBit->maskSizes)); /* Hash twoBit and our new bitmap by sequence name. */ hashAddUnique(tbHash, twoBit->name, twoBit); hashAddUnique(bitmapHash, twoBit->name, bits); } return version; } -void addMasking(struct hash *twoBitHash, struct hash *bitmapHash, char *seqName, +static void addMasking(struct hash *twoBitHash, struct hash *bitmapHash, char *seqName, unsigned start, unsigned end) /* Set bits in range. */ { if (end > start) { struct twoBit *tb = (struct twoBit *)hashMustFindVal(twoBitHash, seqName); if ((end > tb->size) || (start >= tb->size)) errAbort("bed range (%d - %d) is off the end of chromosome %s size %d", start, end, seqName, tb->size); Bits *bits = (Bits *)hashMustFindVal(bitmapHash, seqName); bitSetRange(bits, start, (end - start)); } } struct unsignedRange { struct unsignedRange *next; unsigned start; unsigned size; }; -void bitmapToMaskArray(struct hash *bitmapHash, struct hash *tbHash) +static void bitmapToMaskArray(struct hash *bitmapHash, struct hash *tbHash) /* Translate each bitmap in bitmapHash into an array of mask coordinates * in the corresponding twoBit in tbHash. Assume tbHash's mask array is * empty at the start -- we allocate it here. Free bitmap when done. */ { struct hashCookie cookie = hashFirst(tbHash); struct hashEl *hel = NULL; while ((hel = hashNext(&cookie)) != NULL) { char *seqName = hel->name; struct twoBit *tb = (struct twoBit *)(hel->val); struct hashEl *bHel = hashLookup(bitmapHash, seqName); Bits *bits; unsigned start=0, end=0; @@ -148,57 +148,77 @@ for (i = 0, range = rangeList; range != NULL; i++, range = range->next) { tb->maskStarts[i] = range->start; tb->maskSizes[i] = range->size; } } lmCleanup(&lm); bitFree(&bits); bHel->val = NULL; } } } -void maskWithBed(char *bedName, struct hash *tbHash, struct hash *bitmapHash) +static void maskWithBigBed(char *bbName, struct hash *tbHash, struct hash *bitmapHash) +/* Read coordinates from bbName and apply them to twoBits in tbHash. */ +{ +struct bbiFile *bbi = bigBedFileOpen(bbName); +struct bbiChromInfo *chrom, *chromList = bbiChromList(bbi); +for (chrom = chromList; chrom != NULL; chrom = chrom->next) + { + struct lm *lm = lmInit(0); + struct bigBedInterval *list = bigBedIntervalQuery(bbi,chrom->name,0,chrom->size,0,lm); + struct bigBedInterval *el; + for (el = list; el != NULL; el = el->next) + { + addMasking(tbHash, bitmapHash, chrom->name, el->start, el->end); + } + lmCleanup(&lm); + } +bigBedFileClose(&bbi); +bitmapToMaskArray(bitmapHash, tbHash); +} + +static void maskWithBed(char *bedName, struct hash *tbHash, struct hash *bitmapHash) /* Read coordinates from bedName and apply them to twoBits in tbHash. */ { struct lineFile *lf = lineFileOpen(bedName, TRUE); int wordCount; char *words[13]; boolean alreadyWarned = FALSE; while ((wordCount = lineFileChop(lf, words)) != 0) { struct bed bed; /* warn if bed has at least 12 fields -- no support for blocks */ if (wordCount >= 12 && !alreadyWarned) { warn("Warning: BED file %s has >=%d fields which means it might " "contain block coordinates, but this program uses only the " "first three fields (the entire span -- no support for blocks).", bedName, wordCount); alreadyWarned = TRUE; } bedStaticLoad(words, &bed); addMasking(tbHash, bitmapHash, bed.chrom, bed.chromStart, bed.chromEnd); } bitmapToMaskArray(bitmapHash, tbHash); } -void maskWithOut(char *outName, struct hash *tbHash, struct hash *bitmapHash) +static void maskWithOut(char *outName, struct hash *tbHash, struct hash *bitmapHash) /* Read coordinates from outName and apply them to twoBits in tbHash. */ { struct lineFile *lf = lineFileOpen(outName, TRUE); char *line; int lineSize; /* Make sure we have a .out header. */ if (!lineFileNext(lf, &line, &lineSize)) errAbort("Empty %s", lf->fileName); if (!startsWith(" SW perc perc", line)) { if (!startsWith(" SW perc perc", line)) errAbort("%s doesn't seem to be a RepeatMasker .out file, first " "line seen:\n%s", lf->fileName, line); } @@ -238,30 +258,32 @@ FILE *f = NULL; if (! twoBitIsFile(inName)) { if (twoBitIsSpec(inName)) errAbort("Sorry, this works only on whole .2bit files, not specs."); else errAbort("Input %s does not look like a proper .2bit file.", inName); } unsigned version = slurpInput(inName, tbHash, bitmapHash, &twoBitList); /* Read mask data into bitmapHash, store it in twoBits: */ if ((type && endsWith(type, "bed")) || endsWith(maskName, ".bed")) maskWithBed(maskName, tbHash, bitmapHash); +else if ((type && endsWith(type, "bb")) || endsWith(maskName, ".bb")) + maskWithBigBed(maskName, tbHash, bitmapHash); else if ((type && endsWith(type, "out")) || endsWith(maskName, ".out")) maskWithOut(maskName, tbHash, bitmapHash); else errAbort("Sorry, maskFile must end in \".bed\" or \".out\"."); /* Create a new .2bit file, write it out from twoBits. */ f = mustOpen(outName, "wb"); twoBitWriteHeaderExt(twoBitList, f, version == 1); for (twoBit = twoBitList; twoBit != NULL; twoBit = twoBit->next) { twoBitWriteOne(twoBit, f); } carefulClose(&f); /* Don't bother freeing twoBitList and hashes here -- just exit. */