bf1d058d73bdb153279eb4e530d1257d2ccc5675
angie
  Mon Jun 23 10:43:04 2014 -0700
Added ENCODE Regulatory summary tracks for clustered DNase and TFBS, with support for filtering based on BED5 score and factor/cellType/treatment.
refs #11461

diff --git src/hg/lib/annoStreamDb.c src/hg/lib/annoStreamDb.c
index 6aa94f0..1ca9e27 100644
--- src/hg/lib/annoStreamDb.c
+++ src/hg/lib/annoStreamDb.c
@@ -5,30 +5,35 @@
 
 #include "annoStreamDb.h"
 #include "annoGratorQuery.h"
 #include "binRange.h"
 #include "hdb.h"
 #include "sqlNum.h"
 
 struct annoStreamDb
     {
     struct annoStreamer streamer;	// Parent class members & methods
     // Private members
     struct sqlConnection *conn;		// Database connection (e.g. hg19 or customTrash)
     struct sqlResult *sr;		// SQL query result from which we grab rows
     char *table;			// Table name, must exist in database
 
+    struct dyString *(*makeBaselineQuery)(struct annoStreamDb *self, boolean *retHasWhere);
+    /* Provide baseline query, by default just 'select * from <table>'.
+     * Override this to make a query with specific fields, joins etc.
+     * If the returned query includes a join/where, set *retHasWhere to TRUE. */
+
     // These members enable us to extract coords from the otherwise unknown row:
     char *chromField;			// Name of chrom-ish column in table
     char *startField;			// Name of chromStart-ish column in table
     char *endField;			// Name of chromEnd-ish column in table
     int chromIx;			// Index of chrom-ish col in autoSql or bin-less table
     int startIx;			// Index of chromStart-ish col in autoSql or bin-less table
     int endIx;				// Index of chromEnd-ish col in autoSql or bin-less table
 
     // These members enable us to produce {chrom, start}-sorted output:
     char *endFieldIndexName;		// SQL index on end field, if any (can mess up sorting)
     boolean notSorted;			// TRUE if table is not sorted (e.g. genbank-updated)
     boolean hasBin;			// 1 if SQL table's first column is bin
     boolean omitBin;			// 1 if table hasBin and autoSql doesn't have bin
     boolean mergeBins;			// TRUE if query results will be in bin order
     struct annoRow *bigItemQueue;	// If mergeBins, accumulate coarse-bin items here
@@ -103,56 +108,68 @@
 /* Set region -- and free current sqlResult if there is one. */
 {
 annoStreamerSetRegion(vSelf, chrom, regionStart, regionEnd);
 struct annoStreamDb *self = (struct annoStreamDb *)vSelf;
 sqlFreeResult(&(self->sr));
 resetMergeState(self);
 resetChunkState(self);
 }
 
 static char **nextRowFromSqlResult(struct annoStreamDb *self)
 /* Stream rows directly from self->sr. */
 {
 return sqlNextRow(self->sr);
 }
 
+static struct dyString *asdMakeBaselineQuery(struct annoStreamDb *self, boolean *retHasWhere)
+/* Return a baseline query, i.e. "select * from <table>".  This is the default implementation
+ * of annoStreamDb.makeBaselineQuery. */
+{
+if (retHasWhere)
+    *retHasWhere = FALSE;
+return sqlDyStringCreate("select * from %s ", self->table);
+}
+
+
 static void asdDoQuerySimple(struct annoStreamDb *self, char *minChrom, uint minEnd)
 /* Return a sqlResult for a query on table items in position range.
- * If doing a whole genome query. just 'select * from' table. */
+ * If doing a whole genome query, just select all rows from table. */
 // NOTE: it would be possible to implement filters at this level, as in hgTables.
 {
 struct annoStreamer *streamer = &(self->streamer);
-struct dyString *query = sqlDyStringCreate("select * from %s", self->table);
+boolean hasWhere = FALSE;
+struct dyString *query = self->makeBaselineQuery(self, &hasWhere);
 if (!streamer->positionIsGenome)
     {
     if (minChrom && differentString(minChrom, streamer->chrom))
 	errAbort("annoStreamDb %s: nextRow minChrom='%s' but region chrom='%s'",
 		 streamer->name, minChrom, streamer->chrom);
     if (self->hasBin)
 	{
 	// Results will be in bin order, but we can restore chromStart order by
 	// accumulating initial coarse-bin items and merge-sorting them with
 	// subsequent finest-bin items which will be in chromStart order.
 	resetMergeState(self);
 	self->mergeBins = TRUE;
 	self->qLm = lmInit(0);
 	}
     if (self->endFieldIndexName != NULL)
 	// Don't let mysql use a (chrom, chromEnd) index because that messes up
 	// sorting by chromStart.
 	sqlDyStringPrintf(query, " IGNORE INDEX (%s)", self->endFieldIndexName);
-    sqlDyStringPrintf(query, " where %s='%s'", self->chromField, streamer->chrom);
+    sqlDyStringAppend(query, hasWhere ? " and " : " where ");
+    sqlDyStringPrintf(query, "%s='%s'", self->chromField, streamer->chrom);
     int chromSize = annoAssemblySeqSize(streamer->assembly, streamer->chrom);
     if (streamer->regionStart != 0 || streamer->regionEnd != chromSize)
 	{
 	dyStringAppend(query, " and ");
 	if (self->hasBin)
 	    hAddBinToQuery(streamer->regionStart, streamer->regionEnd, query);
 	sqlDyStringPrintf(query, "%s < %u and %s > %u", self->startField, streamer->regionEnd,
 		       self->endField, streamer->regionStart);
 	}
     if (self->notSorted)
 	sqlDyStringPrintf(query, " order by %s", self->startField);
     }
 else if (self->notSorted)
     sqlDyStringPrintf(query, " order by %s,%s", self->chromField, self->startField);
 if (self->maxOutRows > 0)
@@ -226,34 +243,35 @@
 while ((row = sqlNextRow(sr)) != NULL)
     {
     if (ix >= rowBuf->size)
 	errAbort("annoStreamDb %s: rowBuf overflow, got more than %d rows",
 		 sSelf->name, rowBuf->size);
     rowBuf->buf[ix++] = lmCloneRow(rowBuf->lm, row, sSelf->numCols+self->omitBin);
     }
 // Set rowBuf->size to the number of rows we actually stored.
 rowBuf->size = ix;
 sqlFreeResult(&sr);
 updateNextChunkState(self, queryMaxItems);
 }
 
 static void asdDoQueryChunking(struct annoStreamDb *self, char *minChrom, uint minEnd)
 /* Return a sqlResult for a query on table items in position range.
- * If doing a whole genome query. just 'select * from' table. */
+ * If doing a whole genome query, just select all rows from table. */
 {
 struct annoStreamer *sSelf = &(self->streamer);
-struct dyString *query = sqlDyStringCreate("select * from %s ", self->table);
+boolean hasWhere = FALSE;
+struct dyString *query = self->makeBaselineQuery(self, &hasWhere);
 if (sSelf->chrom != NULL && self->rowBuf.size > 0 && !self->doNextChunk)
     {
     // We're doing a region query, we already got some rows, and don't need another chunk:
     resetRowBuf(&self->rowBuf);
     self->eof = TRUE;
     }
 if (self->useMaxOutRows)
     {
     self->maxOutRows -= self->rowBuf.size;
     if (self->maxOutRows <= 0)
 	self->eof = TRUE;
     }
 if (self->eof)
     return;
 int queryMaxItems = ASD_CHUNK_SIZE;
@@ -276,31 +294,32 @@
     // sorting by chromStart.
     sqlDyStringPrintf(query, " IGNORE INDEX (%s) ", self->endFieldIndexName);
 if (sSelf->chrom != NULL)
     {
     uint start = sSelf->regionStart;
     if (minChrom)
 	{
 	if (differentString(minChrom, sSelf->chrom))
 	    errAbort("annoStreamDb %s: nextRow minChrom='%s' but region chrom='%s'",
 		     sSelf->name, minChrom, sSelf->chrom);
 	if (start < minEnd)
 	    start = minEnd;
 	}
     if (self->doNextChunk && start < self->nextChunkStart)
 	start = self->nextChunkStart;
-    sqlDyStringPrintf(query, "where %s = '%s' and ", self->chromField, sSelf->chrom);
+    sqlDyStringAppend(query, hasWhere ? " and " : " where ");
+    sqlDyStringPrintf(query, "%s = '%s' and ", self->chromField, sSelf->chrom);
     if (self->hasBin)
 	{
 	if (self->doNextChunk && self->gotFinestBin)
 	    // It would be way more elegant to make a hAddBinTopLevelOnly but this will do:
 	    dyStringPrintf(query, "bin > %d and ", self->minFinestBin);
 	hAddBinToQuery(start, sSelf->regionEnd, query);
 	}
     if (self->doNextChunk)
 	sqlDyStringPrintf(query, "%s >= %u and ", self->startField, self->nextChunkStart);
     sqlDyStringPrintf(query, "%s < %u and %s > %u ", self->startField, sSelf->regionEnd,
 		      self->endField, start);
     if (self->notSorted)
 	sqlDyStringPrintf(query, "order by %s ", self->startField);
     sqlDyStringPrintf(query, "limit %d", queryMaxItems);
     bufferRowsFromSqlQuery(self, query->string, queryMaxItems);
@@ -332,31 +351,32 @@
 	    if (self->qLm == NULL)
 		self->qLm = lmInit(0);
 	    }
 	}
     if (self->queryChrom == NULL)
 	self->eof = TRUE;
     else
 	{
 	char *chrom = self->queryChrom->name;
 	int start = 0;
 	if (minChrom != NULL && sameString(chrom, minChrom))
 	    start = minEnd;
 	if (self->doNextChunk && start < self->nextChunkStart)
 	    start = self->nextChunkStart;
 	uint end = annoAssemblySeqSize(self->streamer.assembly, self->queryChrom->name);
-	sqlDyStringPrintf(query, "where %s = '%s' ", self->chromField, chrom);
+	sqlDyStringAppend(query, hasWhere ? " and " : " where ");
+	sqlDyStringPrintf(query, "%s = '%s' ", self->chromField, chrom);
 	if (start > 0 || self->doNextChunk)
 	    {
 	    dyStringAppend(query, "and ");
 	    if (self->hasBin)
 		{
 		if (self->doNextChunk && self->gotFinestBin)
 		    // It would be way more elegant to make a hAddBinTopLevelOnly but this will do:
 		    dyStringPrintf(query, "bin > %d and ", self->minFinestBin);
 		hAddBinToQuery(start, end, query);
 		}
 	    if (self->doNextChunk)
 		sqlDyStringPrintf(query, "%s >= %u and ", self->startField, self->nextChunkStart);
 	    // region end is chromSize, so no need to constrain startField here:
 	    sqlDyStringPrintf(query, "%s > %u ", self->endField, start);
 	    }
@@ -645,30 +665,31 @@
 streamer->nextRow = asdNextRow;
 streamer->close = asdClose;
 self->conn = conn;
 self->table = cloneString(table);
 char *asFirstColumnName = streamer->asObj->columnList->name;
 if (sqlFieldIndex(self->conn, self->table, "bin") == 0)
     {
     self->hasBin = 1;
     self->minFinestBin = binFromRange(0, 1);
     }
 if (self->hasBin && !sameString(asFirstColumnName, "bin"))
     self->omitBin = 1;
 if (!asdInitBed3Fields(self))
     errAbort("annoStreamDbNew: can't figure out which fields of %s.%s to use as "
 	     "{chrom, chromStart, chromEnd}.", db, table);
+self->makeBaselineQuery = asdMakeBaselineQuery;
 // When a table has an index on endField, sometimes the query optimizer uses it
 // and that ruins the sorting.  Fortunately most tables don't anymore.
 self->endFieldIndexName = sqlTableIndexOnField(self->conn, self->table, self->endField);
 self->notSorted = FALSE;
 // Special case: genbank-updated tables are not sorted because new mappings are
 // tacked on at the end.
 if (isIncrementallyUpdated(table))
     self->notSorted = TRUE;
 self->mergeBins = FALSE;
 self->maxOutRows = maxOutRows;
 self->useMaxOutRows = (maxOutRows > 0);
 self->needQuery = TRUE;
 self->chromList = annoAssemblySeqNames(aa);
 if (slCount(self->chromList) > 1000)
     {