70668f5dd4d6bbcf079a1025649e17b9885335c7 angie Mon May 13 10:17:53 2013 -0700 Added 2 new args to annoStreamer:nextRow: minChrom and minEnd (which could also be called regionStart depending on point of view). Streamers may use those hints to skip over data that precedes minChrom and minEnd, to avoid the overhead of creating annoRows that annoGrators will then have to skip over. When primary data are sparse and grator data are very dense, this saves significant memory and user-cycles. Unfortunately mysql can still be the bottleneck for elapsed time. Room for improvement in annoStreamDb: when assembly has a reasonably small number of sequences (<1000), genome-wide queries could be internally broken down into per-seq queries; that would let us skip over chroms that precede minChrom. refs #6152 diff --git src/lib/annoStreamBigBed.c src/lib/annoStreamBigBed.c index 4350c3f..4b8541d 100644 --- src/lib/annoStreamBigBed.c +++ src/lib/annoStreamBigBed.c @@ -1,147 +1,164 @@ /* annoStreamBigBed -- subclass of annoStreamer for bigBed file or URL */ #include "annoStreamBigBed.h" #include "bigBed.h" #include "localmem.h" #include "sqlNum.h" struct annoStreamBigBed { struct annoStreamer streamer; // Parent class members & methods // Private members struct bbiFile *bbi; // bbi handle for bigBed file/URL. 
struct lm *intervalQueryLm; // localmem object for bigBedIntervalQuery struct bigBedInterval *intervalList; // results of bigBedIntervalQuery struct bigBedInterval *nextInterval; // next result to be translated into row struct bbiChromInfo *chromList; // list of chromosomes for which bbi actually has data struct bbiChromInfo *queryChrom; // most recently queried chrom (or NULL) for whole-genome int maxItems; // max items returned from bigBedIntervalQuery char **row; // storage for results of bigBedIntervalToRow char *startBuf; // storage for stringified start from bigBedIntervalToRow char *endBuf; // storage for stringified end from bigBedIntervalToRow }; static void asbbSetRegion(struct annoStreamer *vSelf, char *chrom, uint regionStart, uint regionEnd) /* Set region -- and free localmem from previous query if necessary. */ { annoStreamerSetRegion(vSelf, chrom, regionStart, regionEnd); struct annoStreamBigBed *self = (struct annoStreamBigBed *)vSelf; self->nextInterval = self->intervalList = NULL; lmCleanup(&(self->intervalQueryLm)); } -static void asbbDoQuery(struct annoStreamBigBed *self) +static void asbbDoQuery(struct annoStreamBigBed *self, char *minChrom, uint minEnd) /* Store results of an interval query. [Would be nice to make a streaming version of this.] 
*/ { struct annoStreamer *sSelf = &(self->streamer); if (self->intervalQueryLm == NULL) self->intervalQueryLm = lmInit(0); if (sSelf->chrom != NULL) { - self->intervalList = bigBedIntervalQuery(self->bbi, sSelf->chrom, - sSelf->regionStart, sSelf->regionEnd, + uint start = sSelf->regionStart; + if (minChrom) + { + if (differentString(minChrom, sSelf->chrom)) + errAbort("annoStreamBigBed %s: nextRow minChrom='%s' but region chrom='%s'", + sSelf->name, minChrom, sSelf->chrom); + if (start < minEnd) + start = minEnd; + } + self->intervalList = bigBedIntervalQuery(self->bbi, sSelf->chrom, start, sSelf->regionEnd, self->maxItems, self->intervalQueryLm); } else { // Genome-wide query: break it into chrom-by-chrom queries. if (self->queryChrom == NULL) self->queryChrom = self->chromList; else self->queryChrom = self->queryChrom->next; + if (minChrom != NULL) + { + // Skip chroms that precede minChrom + while (self->queryChrom != NULL && strcmp(self->queryChrom->name, minChrom) < 0) + self->queryChrom = self->queryChrom->next; + } if (self->queryChrom == NULL) { self->chromList = NULL; // EOF, don't start over! self->intervalList = NULL; } else { char *chrom = self->queryChrom->name; + int start = 0; + if (minChrom != NULL && sameString(chrom, minChrom)) + start = minEnd; uint end = self->queryChrom->size; - self->intervalList = bigBedIntervalQuery(self->bbi, chrom, 0, end, + self->intervalList = bigBedIntervalQuery(self->bbi, chrom, start, end, self->maxItems, self->intervalQueryLm); } } self->nextInterval = self->intervalList; } static char **nextRowUnfiltered(struct annoStreamBigBed *self) /* Convert the next available interval into a row of words, or return NULL. */ { struct annoStreamer *sSelf = &(self->streamer); if (self->nextInterval == NULL) return NULL; char *chrom = sSelf->chrom ? 
sSelf->chrom : self->queryChrom->name; int fieldCount = bigBedIntervalToRow(self->nextInterval, chrom, self->startBuf, self->endBuf, self->row, sSelf->numCols+1); if (fieldCount != sSelf->numCols) errAbort("annoStreamBigBed %s: expected %d columns, got %d", sSelf->name, sSelf->numCols, fieldCount); self->nextInterval = self->nextInterval->next; return self->row; } -static struct annoRow *asbbNextRow(struct annoStreamer *vSelf, struct lm *lm) +static struct annoRow *asbbNextRow(struct annoStreamer *vSelf, char *minChrom, uint minEnd, struct lm *lm) /* Return a single annoRow, or NULL if there are no more items. */ { struct annoStreamBigBed *self = (struct annoStreamBigBed *)vSelf; if (self->intervalList == NULL) - asbbDoQuery(self); + asbbDoQuery(self, minChrom, minEnd); char **row = nextRowUnfiltered(self); if (row == NULL) return NULL; // Skip past any left-join failures until we get a right-join failure, a passing row, or EOF. boolean rightFail = FALSE; while (annoFilterRowFails(vSelf->filters, row, vSelf->numCols, &rightFail)) { if (rightFail) break; row = nextRowUnfiltered(self); if (row == NULL) return NULL; } char *chrom = row[0]; uint chromStart = sqlUnsigned(row[1]); uint chromEnd = sqlUnsigned(row[2]); return annoRowFromStringArray(chrom, chromStart, chromEnd, rightFail, row, vSelf->numCols, lm); } static void asbbClose(struct annoStreamer **pVSelf) /* Close bbi handle and free self. */ { if (pVSelf == NULL) return; struct annoStreamBigBed *self = *(struct annoStreamBigBed **)pVSelf; bigBedFileClose(&(self->bbi)); self->intervalList = NULL; lmCleanup(&(self->intervalQueryLm)); freeMem(self->row); freeMem(self->startBuf); freeMem(self->endBuf); annoStreamerFree(pVSelf); } struct annoStreamer *annoStreamBigBedNew(char *fileOrUrl, struct annoAssembly *aa, int maxItems) /* Create an annoStreamer (subclass) object from a file or URL; if * maxItems is 0, all items from a query will be returned, otherwise * each query is limited to maxItems. 
*/ { struct bbiFile *bbi = bigBedFileOpen(fileOrUrl); struct asObject *asObj = bigBedAsOrDefault(bbi); struct annoStreamBigBed *self = NULL; AllocVar(self); struct annoStreamer *streamer = &(self->streamer); annoStreamerInit(streamer, aa, asObj, fileOrUrl); streamer->rowType = arWords; streamer->setRegion = asbbSetRegion; streamer->nextRow = asbbNextRow; streamer->close = asbbClose; self->bbi = bbi; self->maxItems = maxItems; AllocArray(self->row, streamer->numCols + 1); self->startBuf = needMem(32); self->endBuf = needMem(32); self->chromList = bbiChromList(bbi); return (struct annoStreamer *)self; }