043ea3a998c9846a647f87ae14d4f27bc01af3a6
angie
  Wed May 28 13:59:55 2014 -0700
When position was set to a chrom not present in a VCF+tabix file, annoStreamVcf just kept reading rows out of the file instead of
paying attention to the return value of lineFileSetTabixRegion.
This inefficiency led to carefulMem running into its 6G limit.
Fix: if lineFileSetTabixRegion fails in asvSetRegion, set self->eof
to prevent reading out rows from the wrong position.  If we're trying
to skip ahead to minChrom using lineFileSetTabixRegion in
nextRowUnfiltered, and it fails, just keep current position in case
the file has more chroms after minChrom.
fixes #13343

diff --git src/lib/annoStreamVcf.c src/lib/annoStreamVcf.c
index 2c15149..2d60f07 100644
--- src/lib/annoStreamVcf.c
+++ src/lib/annoStreamVcf.c
@@ -20,34 +20,40 @@
     struct annoRow *nextPosQ;		// FIFO (max len=1) for stashing row while draining indelQ
     struct lm *qLm;			// Local mem for saving rows in Q's (callerLm disappears)
     int numFileCols;			// Number of columns in VCF file.
     int maxRecords;			// Maximum number of annoRows to return.
     int recordCount;			// Number of annoRows we have returned so far.
     boolean isTabix;			// True if we are accessing compressed VCF via tabix index
     boolean eof;			// True when we have hit end of file or maxRecords
     };
 
 
 static void asvSetRegion(struct annoStreamer *vSelf, char *chrom, uint regionStart, uint regionEnd)
 /* Set region -- and free current sqlResult if there is one. */
 {
 annoStreamerSetRegion(vSelf, chrom, regionStart, regionEnd);
 struct annoStreamVcf *self = (struct annoStreamVcf *)vSelf;
-if (self->isTabix)
-    lineFileSetTabixRegion(self->vcff->lf, chrom, regionStart, regionEnd);
 self->indelQ = self->nextPosQ = NULL;
 self->eof = FALSE;
+if (self->isTabix)
+    {
+    // If this region is not in tabix index, set self->eof so we won't keep grabbing rows
+    // from the old position.
+    boolean gotRegion = lineFileSetTabixRegion(self->vcff->lf, chrom, regionStart, regionEnd);
+    if (! gotRegion)
+	self->eof = TRUE;
+    }
 }
 
 static char *asvGetHeader(struct annoStreamer *vSelf)
 /* Return VCF header (e.g. for use by formatter) */
 {
 struct annoStreamVcf *self = (struct annoStreamVcf *)vSelf;
 return cloneString(self->vcff->headerString);
 }
 
 static char **nextRowRaw(struct annoStreamVcf *self)
 /* Get the next VCF record and put the row text into autoSql words.
  * Return pointer to self->asWords if we get a row, otherwise NULL. */
 {
 char *words[self->numFileCols];
 int wordCount;
@@ -122,30 +128,32 @@
 	}
     else
 	{
 	regionStart = max(regionStart, minEnd);
 	}
     }
 char **words = nextRowRaw(self);
 if (regionChrom != NULL && words != NULL)
     {
     char *rowChrom = getProperChromName(self, words[0]);
     if (self->isTabix && strcmp(rowChrom, regionChrom) < 0)
 	{
 	uint regionEnd = sSelf->regionEnd;
 	if (minChrom != NULL && sSelf->chrom == NULL)
 	    regionEnd = annoAssemblySeqSize(sSelf->assembly, minChrom);
+	// If lineFileSetTabixRegion fails, just keep the current file position
+	// -- hopefully we'll just be skipping to the next row after region{Chrom,Start,End}.
 	lineFileSetTabixRegion(self->vcff->lf, regionChrom, regionStart, regionEnd);
 	}
     while (words != NULL &&
 	   (strcmp(rowChrom, regionChrom) < 0 ||
 	    (sameString(rowChrom, regionChrom) && self->record->chromEnd < regionStart)))
 	words = nextRowRaw(self);
     }
 // Tabix doesn't give us any rows past end of region, but if not using tabix,
 // detect when we're past end of region:
 if (words != NULL && !self->isTabix && sSelf->chrom != NULL
     && self->record->chromStart > sSelf->regionEnd)
     {
     words = NULL;
     self->record = NULL;
     }