40ed3e546ef868bffe4c338d93d6528138ecfc44
angie
  Wed Sep 30 09:25:12 2015 -0700
Add test cases (and bug fixes!) to make sure that we get the desired
behavior when an insertion is adjacent to a non-insertion, and when
an insertion falls at the start or end of the search region (if applicable):

- Include insertions that fall at the start or end of the search region

- If the primary row is an insertion, keep secondary non-insertion rows
to the left and right.

- If the primary row is a non-insertion, keep secondary insertion rows
at its start and end.

(i.e. keep insertions at boundaries -- but don't let any non-insertions
slip through)

The VCF logic is more complicated because VCF indels always include an
extra base to the left, so they appear to start one base before they actually
do, and can be interspersed with non-indels that start at that extra base.

diff --git src/lib/annoStreamTab.c src/lib/annoStreamTab.c
index 17de91a..d1f8675 100644
--- src/lib/annoStreamTab.c
+++ src/lib/annoStreamTab.c
@@ -61,30 +61,39 @@
 		" (current region: %s:%d-%d, new region: %s:%d-%d)",
 		vSelf->chrom, vSelf->regionStart, vSelf->regionEnd,
 		chrom, regionStart, regionEnd);
     }
 annoStreamerSetRegion(vSelf, chrom, regionStart, regionEnd);
 if (keepOpen)
     self->eof = FALSE;
 else
     {
     lineFileClose(&(self->lf));
     self->lf = astLFOpen(self->fileOrUrl);
     self->eof = FALSE;
     }
 }
 
+static void reuseRow(struct annoStreamTab *self)
+// When a row falls after the region, undo the damage of lineFileChopNext,
+// tell lf to reuse the line, and set EOF - we are all done until & unless the region changes.
+{
+unChop(self->asWords, self->streamer.numCols);
+lineFileReuse(self->lf);
+self->eof = TRUE;
+}
+
 INLINE boolean isAllDigits(char *s)
 {
 return (isNotEmpty(s) && countLeadingDigits(s) == strlen(s));
 }
 
 static void checkWordCountAndBin(struct annoStreamTab *self, int wordCount)
 /* Auto-detect initial bin column and set self->omitBin if autoSql doesn't have bin. */
 {
 if (wordCount == self->streamer.numCols + 1 &&
     isAllDigits(self->asWords[0]))
     {
     self->fileWordCount = self->streamer.numCols + 1;
     char *asFirstColumnName = self->streamer.asObj->columnList->name;
     if (!sameString(asFirstColumnName, "bin"))
 	self->omitBin = 1;
@@ -131,49 +140,61 @@
 	return NULL;
 	}
     if (self->fileWordCount == 0)
 	checkWordCountAndBin(self, wordCount);
     lineFileExpectWords(self->lf, self->fileWordCount, wordCount);
     if (regionChrom == NULL)
 	// Whole-genome query and no minChrom hint; no need to check region.
 	done = TRUE;
     else
 	{
 	// We're searching within a region -- is this row in range?
 	char *thisChrom = self->asWords[self->omitBin + self->chromIx];
 	uint thisStart = atoll(self->asWords[self->omitBin + self->startIx]);
 	uint thisEnd = atoll(self->asWords[self->omitBin + self->endIx]);
 	int chromDif = strcmp(thisChrom, regionChrom);
-	if (chromDif < 0 ||
-	    (chromDif == 0 && thisEnd <= regionStart))
-	    // This row precedes the region -- keep looking.
+	if (chromDif < 0)
+	    // This chrom precedes the region -- keep looking.
 	    continue;
-	else if (chromDif == 0 && thisEnd > regionStart && thisStart < regionEnd)
+	else if (chromDif > 0)
+	    {
+            // This chrom falls after the end of region -- done.
+            reuseRow(self);
+	    return NULL;
+	    }
+        else
+            {
+            // Same chromosome -- check coords.
+            if (thisEnd > regionStart && thisStart < regionEnd)
                 // This row overlaps region; return it.
                 done = TRUE;
+            else if (thisStart == thisEnd &&
+                     (thisEnd == regionStart || thisStart == regionEnd))
+                // This row is an insertion adjacent to region; return it.
+                done = TRUE;
+            else if (thisEnd <= regionStart)
+                // This row precedes the region -- keep looking.
+                continue;
             else
                 {
-	    // This row falls after the region. Undo the damage of lineFileChopNext,
-            // tell lf to reuse the line, set EOF and return NULL - we are all done
-	    // until & unless region changes.
-	    unChop(self->asWords, sSelf->numCols);
-	    lineFileReuse(self->lf);
-	    self->eof = TRUE;
+                // This row falls after the end of region -- done.
+                reuseRow(self);
                 return NULL;
                 }
             }
 	}
+    }
 return self->asWords + self->omitBin;
 }
 
 static struct annoRow *astNextRow(struct annoStreamer *vSelf, char *minChrom, uint minEnd,
 				  struct lm *callerLm)
 /* Return the next annoRow that passes filters, or NULL if there are no more items. */
 {
 struct annoStreamTab *self = (struct annoStreamTab *)vSelf;
 char **words = nextRowUnfiltered(self, minChrom, minEnd);
 if (words == NULL)
     return NULL;
 // Skip past any left-join failures until we get a right-join failure, a passing row, or EOF.
 boolean rightFail = FALSE;
 while (annoFilterRowFails(vSelf->filters, words, vSelf->numCols, &rightFail))
     {