70668f5dd4d6bbcf079a1025649e17b9885335c7
angie
  Mon May 13 10:17:53 2013 -0700
Added 2 new args to annoStreamer:nextRow: minChrom and minEnd (which could also be called regionStart depending on point of view).
Streamers may use those hints to skip over data that precedes
minChrom and minEnd, to avoid the overhead of creating annoRows
that annoGrators will then have to skip over.  When primary data
are sparse and grator data are very dense, this saves significant
memory and user-cycles.  Unfortunately mysql can still be the
bottleneck for elapsed time.  Room for improvement in annoStreamDb:
when assembly has a reasonably small number of sequences (<1000),
genome-wide queries could be internally broken down into per-seq
queries; that would let us skip over chroms that precede minChrom.
refs #6152

diff --git src/lib/annoStreamBigWig.c src/lib/annoStreamBigWig.c
index fb67696..14f9a4b 100644
--- src/lib/annoStreamBigWig.c
+++ src/lib/annoStreamBigWig.c
@@ -1,159 +1,178 @@
 /* annoStreamBigWig -- subclass of annoStreamer for bigWig file or URL */
 
 #include "annoStreamBigWig.h"
 #include "bigWig.h"
 
 char *annoRowBigWigAsText =
 "table annoRowBigWig\n"
 "\"autoSql description of a single annoRowBigWig value, for filtering\"\n"
 "    (\n"
 "    float value;  \"data value for this range\"\n"
 "    )\n"
     ;
 
 struct annoStreamBigWig
     {
     struct annoStreamer streamer;	// Parent class members & methods
     // Private members
     struct bbiFile *bbi;		// bbi handle for bigBed file/URL.
     struct lm *intervalQueryLm;		// localmem object for bigWigIntervalQuery
     struct bbiInterval *intervalList;	// results of bigWigIntervalQuery
     struct bbiInterval *nextInterval;	// next result to be translated into row
     struct bbiChromInfo *chromList;	// list of chromosomes for which bbi actually has data
     struct bbiChromInfo *queryChrom;	// most recently queried chrom (or NULL) for whole-genome
     };
 
 
 static void asbwSetRegion(struct annoStreamer *vSelf, char *chrom, uint regionStart, uint regionEnd)
 /* Set region -- and free localmem from previous query if necessary. */
 {
 annoStreamerSetRegion(vSelf, chrom, regionStart, regionEnd);
 struct annoStreamBigWig *self = (struct annoStreamBigWig *)vSelf;
 self->nextInterval = self->intervalList = NULL;
 lmCleanup(&(self->intervalQueryLm));
 }
 
-static void asbwDoQuery(struct annoStreamBigWig *self)
+static void asbwDoQuery(struct annoStreamBigWig *self, char *minChrom, uint minEnd)
 /* Store results of an interval query. [Would be nice to make a streaming version of this.] */
 {
 struct annoStreamer *sSelf = &(self->streamer);
 if (self->intervalQueryLm == NULL)
     self->intervalQueryLm = lmInit(0);
 if (sSelf->chrom != NULL)
     {
-    self->intervalList = bigWigIntervalQuery(self->bbi, sSelf->chrom,
-					     sSelf->regionStart, sSelf->regionEnd,
+    uint start = sSelf->regionStart;
+    if (minChrom)
+	{
+	if (differentString(minChrom, sSelf->chrom))
+	    errAbort("annoStreamBigWig %s: nextRow minChrom='%s' but region chrom='%s'",
+		     sSelf->name, minChrom, sSelf->chrom);
+	if (start < minEnd)
+	    start = minEnd;
+	}
+    self->intervalList = bigWigIntervalQuery(self->bbi, sSelf->chrom, start, sSelf->regionEnd,
 					     self->intervalQueryLm);
     }
 else
     {
     // Genome-wide query: break it into chrom-by-chrom queries.
     if (self->queryChrom == NULL)
 	self->queryChrom = self->chromList;
     else
 	self->queryChrom = self->queryChrom->next;
+    if (minChrom != NULL)
+	{
+	// Skip chroms that precede minChrom
+	while (self->queryChrom != NULL && strcmp(self->queryChrom->name, minChrom) < 0)
+	    self->queryChrom = self->queryChrom->next;
+	}
     if (self->queryChrom == NULL)
 	{
 	self->chromList = NULL; // EOF, don't start over!
 	self->intervalList = NULL;
 	}
     else
 	{
 	char *chrom = self->queryChrom->name;
+	int start = 0;
+	if (minChrom != NULL && sameString(chrom, minChrom))
+	    start = minEnd;
 	uint end = self->queryChrom->size;
-	self->intervalList = bigWigIntervalQuery(self->bbi, chrom, 0, end, self->intervalQueryLm);
+	self->intervalList = bigWigIntervalQuery(self->bbi, chrom, start, end,
+						 self->intervalQueryLm);
 	}
     }
 self->nextInterval = self->intervalList;
 }
 
 static struct annoRow *annoRowFromContigBbiIntervals(char *name, char *chrom,
 				struct bbiInterval *startIv, struct bbiInterval *endIv,
 				boolean rightJoinFail, struct lm *callerLm)
 /* Given a range of non-NULL contiguous bbiIntervals (i.e. no gaps between intervals),
  * translate into annoRow with annoVector as data. */
 {
 float *vals;
 int baseCount = endIv->end - startIv->start;
 AllocArray(vals, baseCount);
 int vecOff = 0;
 struct bbiInterval *iv;
 for (iv = startIv;  iv != endIv->next;  iv = iv->next)
     {
     int i;
     for (i = 0;  i < (iv->end - iv->start);  i++)
 	vals[vecOff++] = iv->val;
     if (vecOff > baseCount)
 	errAbort("annoStreamBigWig %s: overflowed baseCount (%s:%d-%d)",
 		 name, chrom, startIv->start, endIv->end);
     }
 return annoRowWigNew(chrom, startIv->start, endIv->end, rightJoinFail, vals, callerLm);
 }
 
-static struct annoRow *asbwNextRow(struct annoStreamer *sSelf, struct lm *callerLm)
+static struct annoRow *asbwNextRow(struct annoStreamer *sSelf, char *minChrom, uint minEnd,
+				   struct lm *callerLm)
 /* Return a single annoRow, or NULL if there are no more items. */
 {
 struct annoStreamBigWig *self = (struct annoStreamBigWig *)sSelf;
 if (self->intervalList == NULL)
-    asbwDoQuery(self);
+    asbwDoQuery(self, minChrom, minEnd);
 if (self->nextInterval == NULL)
     return NULL;
 // Skip past any left-join failures until we get a right-join failure, a passing interval, or EOF.
 boolean rightFail = FALSE;
 struct bbiInterval *startIv = self->nextInterval;
 while (annoFilterWigValueFails(sSelf->filters, self->nextInterval->val, &rightFail))
     {
     if (rightFail)
 	break;
     startIv = self->nextInterval = self->nextInterval->next;
     if (self->nextInterval == NULL)
 	return NULL;
     }
 char *chrom = sSelf->chrom ? sSelf->chrom : self->queryChrom->name;
 if (rightFail)
     return annoRowFromContigBbiIntervals(sSelf->name, chrom, startIv, startIv, rightFail,
 					 callerLm);
 struct bbiInterval *endIv = startIv, *iv;
 int maxCount = 16 * 1024, count;
 for (iv = startIv->next, count = 0;  iv != NULL && count < maxCount;  iv = iv->next, count++)
     {
     // collect contiguous intervals; then make annoRow with vector.
     if (annoFilterWigValueFails(sSelf->filters, iv->val, &rightFail))
 	break;
     if (iv->start == endIv->end)
 	endIv = iv;
     else
 	break;
     }
 self->nextInterval = endIv->next;
 return annoRowFromContigBbiIntervals(sSelf->name, chrom, startIv, endIv, rightFail, callerLm);
 }
 
 static void asbwClose(struct annoStreamer **pVSelf)
 /* Close bbi handle and free self. */
 {
 if (pVSelf == NULL)
     return;
 struct annoStreamBigWig *self = *(struct annoStreamBigWig **)pVSelf;
 bigWigFileClose(&(self->bbi));
 self->intervalList = NULL;
 lmCleanup(&(self->intervalQueryLm));
 annoStreamerFree(pVSelf);
 }
 
 struct annoStreamer *annoStreamBigWigNew(char *fileOrUrl, struct annoAssembly *aa)
 /* Create an annoStreamer (subclass) object from a file or URL. */
 {
 struct bbiFile *bbi = bigWigFileOpen(fileOrUrl);
 struct asObject *asObj = asParseText(annoRowBigWigAsText);
 struct annoStreamBigWig *self = NULL;
 AllocVar(self);
 struct annoStreamer *streamer = &(self->streamer);
 annoStreamerInit(streamer, aa, asObj, fileOrUrl);
 streamer->rowType = arWig;
 streamer->setRegion = asbwSetRegion;
 streamer->nextRow = asbwNextRow;
 streamer->close = asbwClose;
 self->bbi = bbi;
 return (struct annoStreamer *)self;
 }