d248ea47e2ffed0b22206642ad3477c1213ec5cc braney Wed Mar 3 14:15:32 2021 -0800 add -countBlocks option. Support bigBeds. diff --git src/hg/featureBits/featureBits.c src/hg/featureBits/featureBits.c index ca3a900..abd1e84 100644 --- src/hg/featureBits/featureBits.c +++ src/hg/featureBits/featureBits.c @@ -31,74 +31,77 @@ {"not", OPTION_BOOLEAN}, {"countGaps", OPTION_BOOLEAN}, {"noRandom", OPTION_BOOLEAN}, {"noHap", OPTION_BOOLEAN}, {"primaryChroms", OPTION_BOOLEAN}, {"dots", OPTION_INT}, {"minFeatureSize", OPTION_INT}, {"enrichment", OPTION_BOOLEAN}, {"where", OPTION_STRING}, {"bin", OPTION_STRING}, {"chromSize", OPTION_STRING}, {"binSize", OPTION_INT}, {"binOverlap", OPTION_INT}, {"bedRegionIn", OPTION_STRING}, {"bedRegionOut", OPTION_STRING}, + {"countBlocks", OPTION_BOOLEAN}, {NULL, 0} }; int minSize = 1; /* Minimum size of feature. */ char *clChrom = "all"; /* Which chromosome. */ boolean orLogic = FALSE; /* Do ors instead of ands? */ boolean notResults = FALSE; /* negate results? */ char *where = NULL; /* Extra selection info. */ char *chromSizes = NULL; /* read chrom sizes from file instead of database . */ boolean countGaps = FALSE; /* Count gaps in denominator? */ boolean noRandom = FALSE; /* Exclude _random chromosomes? */ boolean noHap = FALSE; /* Exclude _hap|_alt chromosomes? */ boolean primaryChroms = FALSE; /* only include primary chroms */ int dots = 0; /* print dots every N chroms (scaffolds) processed */ boolean calcEnrichment = FALSE; /* Calculate coverage/enrichment? */ int binSize = 500000; /* Default bin size. */ int binOverlap = 250000; /* Default bin size. */ +boolean countBlocks = FALSE; /* Count blocks in bed12 rather than extent. */ /* to process chroms without constantly looking up in chromInfo, create * this list of them from the chromInfo once. */ static struct chromInfo *chromInfoList = NULL; static struct hash *gapHash = NULL; void usage() /* Explain usage and exit. */ { errAbort( "featureBits - Correlate tables via bitmap projections. \n" "usage:\n" " featureBits database table(s)\n" "This will return the number of bits in all the tables anded together\n" "Pipe warning: output goes to stderr.\n" "Options:\n" " -bed=output.bed Put intersection into bed format. Can use stdout.\n" " -fa=output.fa Put sequence in intersection into .fa file\n" " -faMerge For fa output merge overlapping features.\n" " -minSize=N Minimum size to output (default 1)\n" " -chrom=chrN Restrict to one chromosome\n" " -chromSize=sizefile Read chrom sizes from file instead of database. \n" " (chromInfo three column format)\n" " -or Or tables together instead of anding them\n" " -not Output negation of resulting bit set.\n" " -countGaps Count gaps in denominator\n" + " -countBlocks Count blocks in bed12 files rather than entire extent.\n" " -noRandom Don't include _random (or Un) chromosomes\n" " -noHap Don't include _hap|_alt chromosomes\n" " -primaryChroms Primary assembly (chroms without '_' in name)\n" " -dots=N Output dot every N chroms (scaffolds) processed\n" " -minFeatureSize=n Don't include bits of the track that are smaller than\n" " minFeatureSize, useful for differentiating between\n" " alignment gaps and introns.\n" " -bin=output.bin Put bin counts in output file\n" " -binSize=N Bin size for generating counts in bin file (default 500000)\n" " -binOverlap=N Bin overlap for generating counts in bin file (default 250000)\n" " -bedRegionIn=input.bed Read in a bed file for bin counts in specific regions \n" " and write to bedRegionsOut\n" " -bedRegionOut=output.bed Write a bed file of bin counts in specific regions \n" " from bedRegionIn\n" " -enrichment Calculates coverage and enrichment assuming first table\n" @@ -347,56 +350,132 @@ struct psl *psl; chromFileName(track, chrom, fileName); if (!fileExists(fileName)) return; lf = pslFileOpen(fileName); while ((psl = pslNext(lf)) != NULL) { if (sameString(psl->tName, chrom)) setPslBits(lf, acc, psl, 0, chromSize); pslFree(&psl); } lineFileClose(&lf); } +static int howManyFields(char *fileName) +/* Figure out how many fields the first bed has in this file. */ +{ +struct lineFile *lf = lineFileOpen(fileName, TRUE); +char *line, *row[256]; +unsigned numFields = 0; +if (lineFileNextReal(lf, &line)) + numFields = chopByWhite(line, row, ArraySize(row)); + +lineFileClose(&lf); +return numFields; +} + +static struct bed *cacheBeds(char *fileName) +/* Read in the beds from this file and keep them in a cache. */ +{ +static char *cachedFile; +static struct bed *cachedBeds; + +if ((cachedFile == NULL) || differentString(cachedFile, fileName)) + { + if (cachedBeds) + { + bedFreeList(&cachedBeds); + freez(&cachedFile); + } + + cachedFile = cloneString(fileName); + unsigned numFields = howManyFields(cachedFile); + cachedBeds = bedLoadNAllChrom(fileName, numFields > 12 ? 12 : numFields, NULL); + } + +return cachedBeds; +} + void fbOrBed(Bits *acc, char *track, char *chrom, int chromSize) -/* Or in bits of psl file that correspond to chrom. */ +/* Or in bits of bed file that correspond to chrom. */ { -struct lineFile *lf; char fileName[512]; -char *row[3]; -int s, e, w; chromFileName(track, chrom, fileName); if (!fileExists(fileName)) return; -lf = lineFileOpen(fileName, TRUE); -while (lineFileRow(lf, row)) +struct bed *bed, *beds = cacheBeds(fileName); + +for(bed = beds; bed; bed = bed->next) { - if (sameString(row[0], chrom)) + if (sameString(bed->chrom, chrom)) { - s = lineFileNeedNum(lf, row, 1); - if (s < 0) outOfRange(lf, chrom, chromSize); - e = lineFileNeedNum(lf, row, 2); - if (e > chromSize) outOfRange(lf, chrom, chromSize); - w = e - s; + if (countBlocks && (bed->blockCount > 0)) + { + int ii; + for(ii=0; ii < bed->blockCount; ii++) + bitSetRange(acc, bed->chromStart + bed->chromStarts[ii], bed->blockSizes[ii]); + } + else + { + int w; + w = bed->chromEnd - bed->chromStart; if (w > 0) - bitSetRange(acc, s, w); + bitSetRange(acc, bed->chromStart, w); } } -lineFileClose(&lf); + } +} + +void fbOrBigBed(Bits *acc, char *track, char *chrom, int chromSize) +/* Or in a bigBed file. */ +{ +char fileName[512]; +chromFileName(track, chrom, fileName); +if (!fileExists(fileName)) + return; + +struct lm *lm = lmInit(0); +struct bbiFile *bbi = bigBedFileOpen(fileName); +unsigned fieldCount = bbi->definedFieldCount; +char *bedRow[fieldCount]; +struct bigBedInterval *interval, *intervalList = bigBedIntervalQuery(bbi, chrom, 0, chromSize, 0, lm); + +for (interval = intervalList; interval != NULL; interval = interval->next) + { + if (countBlocks && (fieldCount >= 12)) + { + char startBuf[16], endBuf[16]; + bigBedIntervalToRow(interval, chrom, startBuf, endBuf, bedRow, ArraySize(bedRow)); + struct bed *bed = bedLoadN(bedRow, fieldCount); + int ii; + for(ii=0; ii < bed->blockCount; ii++) + bitSetRange(acc, bed->chromStart + bed->chromStarts[ii], bed->blockSizes[ii]); + bedFree(&bed); + } + else + { + int w = interval->end - interval->start; + if (w > 0) + bitSetRange(acc, interval->start, w); + } + } + +lmCleanup(&lm); +bbiFileClose(&bbi); } void fbOrChain(Bits *acc, char *track, char *chrom, int chromSize) /* Or in a chain file. */ { struct lineFile *lf; char fileName[512]; struct chain *chain; struct cBlock *b; chromFileName(track, chrom, fileName); if (!fileExists(fileName)) return; lf = lineFileOpen(fileName, TRUE); while ((chain = chainRead(lf)) != NULL) @@ -429,30 +508,34 @@ void orFile(Bits *acc, char *track, char *chrom, int chromSize) /* Or some sort of file into bits. */ { if (isFileType(track, "psl")) { fbOrPsl(acc, track, chrom, chromSize); } else if (isFileType(track, "bed")) { fbOrBed(acc, track, chrom, chromSize); } else if (isFileType(track, "chain")) { fbOrChain(acc, track, chrom, chromSize); } +else if (isFileType(track, "bb")) + { + fbOrBigBed(acc, track, chrom, chromSize); + } else errAbort("can't determine file type of: %s", track); } void orTable(char *database, Bits *acc, char *track, char *chrom, int chromSize, struct sqlConnection *conn) /* Or in table if it exists. Else do nothing. */ { char t[512], *s; char table[HDB_MAX_TABLE_STRING]; isolateTrackPartOfSpec(track, t); s = strrchr(t, '.'); if (s != NULL) { @@ -916,30 +999,31 @@ hFreeConn(&conn); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, optionSpecs); if (argc < 3) usage(); clChrom = optionVal("chrom", clChrom); chromSizes = optionVal("chromSize", NULL); orLogic = optionExists("or"); notResults = optionExists("not"); countGaps = optionExists("countGaps"); +countBlocks = optionExists("countBlocks"); noRandom = optionExists("noRandom"); noHap = optionExists("noHap"); primaryChroms = optionExists("primaryChroms"); dots = optionInt("dots", dots); where = optionVal("where", NULL); calcEnrichment = optionExists("enrichment"); if (calcEnrichment && argc != 4) errAbort("You must specify two tables with -enrichment"); if (calcEnrichment && orLogic) errAbort("You can't use -or with -enrichment"); if (calcEnrichment && notResults) errAbort("You can't use -not with -enrichment"); if (notResults && optionExists("fa") && !optionExists("faMerge")) errAbort("Must specify -faMerge if using -not with -fa");