70668f5dd4d6bbcf079a1025649e17b9885335c7 angie Mon May 13 10:17:53 2013 -0700 Added 2 new args to annoStreamer:nextRow: minChrom and minEnd(which could also be called regionStart depending on point of view). Streamers may use those hints to skip over data that precedes minChrom and minEnd, to avoid the overhead of creating annoRows that annoGrators will then have to skip over. When primary data are sparse and grator data are very dense, this saves significant memory and user-cycles. Unfortunately mysql can still be the bottleneck for elapsed time. Room for improvement in annoStreamDb: when assembly has a reasonably small number of sequences (<1000), genome-wide queries could be internally broken down into per-seq queries; that would let us skip over chroms that precede minChrom. refs #6152 diff --git src/lib/annoStreamBigWig.c src/lib/annoStreamBigWig.c index fb67696..14f9a4b 100644 --- src/lib/annoStreamBigWig.c +++ src/lib/annoStreamBigWig.c @@ -1,159 +1,178 @@ /* annoStreamBigWig -- subclass of annoStreamer for bigWig file or URL */ #include "annoStreamBigWig.h" #include "bigWig.h" char *annoRowBigWigAsText = "table annoRowBigWig\n" "\"autoSql description of a single annoRowBigWig value, for filtering\"\n" " (\n" " float value; \"data value for this range\"\n" " )\n" ; struct annoStreamBigWig { struct annoStreamer streamer; // Parent class members & methods // Private members struct bbiFile *bbi; // bbi handle for bigWig file/URL. 
struct lm *intervalQueryLm; // localmem object for bigWigIntervalQuery struct bbiInterval *intervalList; // results of bigWigIntervalQuery struct bbiInterval *nextInterval; // next result to be translated into row struct bbiChromInfo *chromList; // list of chromosomes for which bbi actually has data struct bbiChromInfo *queryChrom; // most recently queried chrom (or NULL) for whole-genome }; static void asbwSetRegion(struct annoStreamer *vSelf, char *chrom, uint regionStart, uint regionEnd) /* Set region -- and free localmem from previous query if necessary. * Calls annoStreamerSetRegion, clears intervalList and nextInterval, then calls lmCleanup on intervalQueryLm. */ { annoStreamerSetRegion(vSelf, chrom, regionStart, regionEnd); struct annoStreamBigWig *self = (struct annoStreamBigWig *)vSelf; self->nextInterval = self->intervalList = NULL; lmCleanup(&(self->intervalQueryLm)); } -static void asbwDoQuery(struct annoStreamBigWig *self) +static void asbwDoQuery(struct annoStreamBigWig *self, char *minChrom, uint minEnd) /* Store results of an interval query. [Would be nice to make a streaming version of this.] * The results go in intervalList, and nextInterval is reset to the head of that list. * If minChrom is non-NULL, it and minEnd are hints for skipping leading data: * - region query (streamer chrom is set): minChrom must equal the region's chrom (else errAbort), * and the query starts at the larger of regionStart and minEnd; * - genome-wide query (streamer chrom is NULL): each call advances queryChrom to the next entry of chromList, * then skips entries whose names compare (strcmp) less than minChrom; if the chrom then queried is * minChrom, the query starts at minEnd instead of 0. When the list is exhausted, chromList is set to NULL * so that a later call does not start over. * NOTE(review): in the region case start may exceed regionEnd when minEnd > regionEnd -- confirm that * bigWigIntervalQuery tolerates that. In the genome-wide case minEnd (uint) is stored in an int start. */ { struct annoStreamer *sSelf = &(self->streamer); if (self->intervalQueryLm == NULL) self->intervalQueryLm = lmInit(0); if (sSelf->chrom != NULL) { - self->intervalList = bigWigIntervalQuery(self->bbi, sSelf->chrom, - sSelf->regionStart, sSelf->regionEnd, + uint start = sSelf->regionStart; + if (minChrom) + { + if (differentString(minChrom, sSelf->chrom)) + errAbort("annoStreamBigWig %s: nextRow minChrom='%s' but region chrom='%s'", + sSelf->name, minChrom, sSelf->chrom); + if (start < minEnd) + start = minEnd; + } + self->intervalList = bigWigIntervalQuery(self->bbi, sSelf->chrom, start, sSelf->regionEnd, self->intervalQueryLm); } else { // Genome-wide query: break it into chrom-by-chrom queries. 
if (self->queryChrom == NULL) self->queryChrom = self->chromList; else self->queryChrom = self->queryChrom->next; + if (minChrom != NULL) + { + // Skip chroms that precede minChrom + while (self->queryChrom != NULL && strcmp(self->queryChrom->name, minChrom) < 0) + self->queryChrom = self->queryChrom->next; + } if (self->queryChrom == NULL) { self->chromList = NULL; // EOF, don't start over! self->intervalList = NULL; } else { char *chrom = self->queryChrom->name; + int start = 0; + if (minChrom != NULL && sameString(chrom, minChrom)) + start = minEnd; uint end = self->queryChrom->size; - self->intervalList = bigWigIntervalQuery(self->bbi, chrom, 0, end, self->intervalQueryLm); + self->intervalList = bigWigIntervalQuery(self->bbi, chrom, start, end, + self->intervalQueryLm); } } self->nextInterval = self->intervalList; } static struct annoRow *annoRowFromContigBbiIntervals(char *name, char *chrom, struct bbiInterval *startIv, struct bbiInterval *endIv, boolean rightJoinFail, struct lm *callerLm) /* Given a range of non-NULL contiguous bbiIntervals (i.e. no gaps between intervals), * translate into annoRow with annoVector as data. * The vector is allocated with one float per base from startIv->start to endIv->end, and each interval's * val is repeated once per base of that interval. name is used only in the errAbort message. * NOTE(review): the vecOff > baseCount check runs after an interval's values have already been written, * so it detects an overrun rather than preventing it -- confirm callers always pass contiguous intervals. * NOTE(review): vals is not freed here; confirm whether annoRowWigNew copies it or takes ownership. */ { float *vals; int baseCount = endIv->end - startIv->start; AllocArray(vals, baseCount); int vecOff = 0; struct bbiInterval *iv; for (iv = startIv; iv != endIv->next; iv = iv->next) { int i; for (i = 0; i < (iv->end - iv->start); i++) vals[vecOff++] = iv->val; if (vecOff > baseCount) errAbort("annoStreamBigWig %s: overflowed baseCount (%s:%d-%d)", name, chrom, startIv->start, endIv->end); } return annoRowWigNew(chrom, startIv->start, endIv->end, rightJoinFail, vals, callerLm); } -static struct annoRow *asbwNextRow(struct annoStreamer *sSelf, struct lm *callerLm) +static struct annoRow *asbwNextRow(struct annoStreamer *sSelf, char *minChrom, uint minEnd, + struct lm *callerLm) /* Return a single annoRow, or NULL if there are no more items. 
 * minChrom and minEnd are passed through to asbwDoQuery, which is called only when intervalList is NULL. * Intervals that fail the filters without setting rightFail are skipped; an interval that fails with * rightFail set is returned by itself as a row. Otherwise the first passing interval is merged with up to * 16 * 1024 following intervals, stopping at the first one that fails the filters or is not contiguous * (its start differs from the previous interval's end). * NOTE(review): if the filter call inside the merge loop sets rightFail for a later interval, that flag is * passed along with the row built from the earlier passing intervals -- confirm this is intended. */ { struct annoStreamBigWig *self = (struct annoStreamBigWig *)sSelf; if (self->intervalList == NULL) - asbwDoQuery(self); + asbwDoQuery(self, minChrom, minEnd); if (self->nextInterval == NULL) return NULL; // Skip past any left-join failures until we get a right-join failure, a passing interval, or EOF. boolean rightFail = FALSE; struct bbiInterval *startIv = self->nextInterval; while (annoFilterWigValueFails(sSelf->filters, self->nextInterval->val, &rightFail)) { if (rightFail) break; startIv = self->nextInterval = self->nextInterval->next; if (self->nextInterval == NULL) return NULL; } char *chrom = sSelf->chrom ? sSelf->chrom : self->queryChrom->name; if (rightFail) return annoRowFromContigBbiIntervals(sSelf->name, chrom, startIv, startIv, rightFail, callerLm); struct bbiInterval *endIv = startIv, *iv; int maxCount = 16 * 1024, count; for (iv = startIv->next, count = 0; iv != NULL && count < maxCount; iv = iv->next, count++) { // collect contiguous intervals; then make annoRow with vector. if (annoFilterWigValueFails(sSelf->filters, iv->val, &rightFail)) break; if (iv->start == endIv->end) endIv = iv; else break; } self->nextInterval = endIv->next; return annoRowFromContigBbiIntervals(sSelf->name, chrom, startIv, endIv, rightFail, callerLm); } static void asbwClose(struct annoStreamer **pVSelf) /* Close bbi handle and free self. * Also clears intervalList and frees the interval-query localmem. Returns immediately if pVSelf is NULL. * NOTE(review): *pVSelf itself is not checked for NULL before being dereferenced. */ { if (pVSelf == NULL) return; struct annoStreamBigWig *self = *(struct annoStreamBigWig **)pVSelf; bigWigFileClose(&(self->bbi)); self->intervalList = NULL; lmCleanup(&(self->intervalQueryLm)); annoStreamerFree(pVSelf); } struct annoStreamer *annoStreamBigWigNew(char *fileOrUrl, struct annoAssembly *aa) /* Create an annoStreamer (subclass) object from a file or URL. 
 * Opens fileOrUrl with bigWigFileOpen, parses annoRowBigWigAsText for the autoSql object, initializes the * parent streamer with annoStreamerInit, sets rowType to arWig and installs asbwSetRegion, asbwNextRow and * asbwClose as the setRegion, nextRow and close methods. * NOTE(review): self->chromList is not assigned anywhere in the visible code; confirm where it is * populated, since the genome-wide branch of asbwDoQuery iterates it. */ { struct bbiFile *bbi = bigWigFileOpen(fileOrUrl); struct asObject *asObj = asParseText(annoRowBigWigAsText); struct annoStreamBigWig *self = NULL; AllocVar(self); struct annoStreamer *streamer = &(self->streamer); annoStreamerInit(streamer, aa, asObj, fileOrUrl); streamer->rowType = arWig; streamer->setRegion = asbwSetRegion; streamer->nextRow = asbwNextRow; streamer->close = asbwClose; self->bbi = bbi; return (struct annoStreamer *)self; }