bf1d058d73bdb153279eb4e530d1257d2ccc5675
angie
Mon Jun 23 10:43:04 2014 -0700
Added ENCODE Regulatory summary tracks for clustered DNase and TFBS, with support for filtering based on BED5 score and factor/cellType/treatment.
refs #11461
diff --git src/hg/lib/annoStreamDb.c src/hg/lib/annoStreamDb.c
index 6aa94f0..1ca9e27 100644
--- src/hg/lib/annoStreamDb.c
+++ src/hg/lib/annoStreamDb.c
@@ -5,30 +5,35 @@
#include "annoStreamDb.h"
#include "annoGratorQuery.h"
#include "binRange.h"
#include "hdb.h"
#include "sqlNum.h"
struct annoStreamDb
{
struct annoStreamer streamer; // Parent class members & methods
// Private members
struct sqlConnection *conn; // Database connection (e.g. hg19 or customTrash)
struct sqlResult *sr; // SQL query result from which we grab rows
char *table; // Table name, must exist in database
+ struct dyString *(*makeBaselineQuery)(struct annoStreamDb *self, boolean *retHasWhere);
+ /* Provide baseline query, by default just 'select * from <table>'.
+ * Override this to make a query with specific fields, joins etc.
+ * If the returned query includes a join/where, set *retHasWhere to TRUE. */
+
// These members enable us to extract coords from the otherwise unknown row:
char *chromField; // Name of chrom-ish column in table
char *startField; // Name of chromStart-ish column in table
char *endField; // Name of chromEnd-ish column in table
int chromIx; // Index of chrom-ish col in autoSql or bin-less table
int startIx; // Index of chromStart-ish col in autoSql or bin-less table
int endIx; // Index of chromEnd-ish col in autoSql or bin-less table
// These members enable us to produce {chrom, start}-sorted output:
char *endFieldIndexName; // SQL index on end field, if any (can mess up sorting)
boolean notSorted; // TRUE if table is not sorted (e.g. genbank-updated)
boolean hasBin; // 1 if SQL table's first column is bin
boolean omitBin; // 1 if table hasBin and autoSql doesn't have bin
boolean mergeBins; // TRUE if query results will be in bin order
struct annoRow *bigItemQueue; // If mergeBins, accumulate coarse-bin items here
@@ -103,56 +108,68 @@
/* Set region -- and free current sqlResult if there is one. */
{
annoStreamerSetRegion(vSelf, chrom, regionStart, regionEnd);
struct annoStreamDb *self = (struct annoStreamDb *)vSelf;
sqlFreeResult(&(self->sr));
resetMergeState(self);
resetChunkState(self);
}
static char **nextRowFromSqlResult(struct annoStreamDb *self)
/* Stream rows directly from self->sr. */
{
return sqlNextRow(self->sr);
}
+static struct dyString *asdMakeBaselineQuery(struct annoStreamDb *self, boolean *retHasWhere)
+/* Default implementation of annoStreamDb.makeBaselineQuery: return "select * from <table> "
+ * and, if retHasWhere is non-NULL, set *retHasWhere to FALSE (no where clause yet). */
+{
+if (retHasWhere)
+ *retHasWhere = FALSE;
+return sqlDyStringCreate("select * from %s ", self->table);
+}
+
+
static void asdDoQuerySimple(struct annoStreamDb *self, char *minChrom, uint minEnd)
/* Return a sqlResult for a query on table items in position range.
- * If doing a whole genome query. just 'select * from' table. */
+ * If doing a whole genome query, just select all rows from table. */
// NOTE: it would be possible to implement filters at this level, as in hgTables.
{
struct annoStreamer *streamer = &(self->streamer);
-struct dyString *query = sqlDyStringCreate("select * from %s", self->table);
+boolean hasWhere = FALSE;
+struct dyString *query = self->makeBaselineQuery(self, &hasWhere);
if (!streamer->positionIsGenome)
{
if (minChrom && differentString(minChrom, streamer->chrom))
errAbort("annoStreamDb %s: nextRow minChrom='%s' but region chrom='%s'",
streamer->name, minChrom, streamer->chrom);
if (self->hasBin)
{
// Results will be in bin order, but we can restore chromStart order by
// accumulating initial coarse-bin items and merge-sorting them with
// subsequent finest-bin items which will be in chromStart order.
resetMergeState(self);
self->mergeBins = TRUE;
self->qLm = lmInit(0);
}
if (self->endFieldIndexName != NULL)
// Don't let mysql use a (chrom, chromEnd) index because that messes up
// sorting by chromStart.
sqlDyStringPrintf(query, " IGNORE INDEX (%s)", self->endFieldIndexName);
- sqlDyStringPrintf(query, " where %s='%s'", self->chromField, streamer->chrom);
+ sqlDyStringAppend(query, hasWhere ? " and " : " where ");
+ sqlDyStringPrintf(query, "%s='%s'", self->chromField, streamer->chrom);
int chromSize = annoAssemblySeqSize(streamer->assembly, streamer->chrom);
if (streamer->regionStart != 0 || streamer->regionEnd != chromSize)
{
dyStringAppend(query, " and ");
if (self->hasBin)
hAddBinToQuery(streamer->regionStart, streamer->regionEnd, query);
sqlDyStringPrintf(query, "%s < %u and %s > %u", self->startField, streamer->regionEnd,
self->endField, streamer->regionStart);
}
if (self->notSorted)
sqlDyStringPrintf(query, " order by %s", self->startField);
}
else if (self->notSorted)
sqlDyStringPrintf(query, " order by %s,%s", self->chromField, self->startField);
if (self->maxOutRows > 0)
@@ -226,34 +243,35 @@
while ((row = sqlNextRow(sr)) != NULL)
{
if (ix >= rowBuf->size)
errAbort("annoStreamDb %s: rowBuf overflow, got more than %d rows",
sSelf->name, rowBuf->size);
rowBuf->buf[ix++] = lmCloneRow(rowBuf->lm, row, sSelf->numCols+self->omitBin);
}
// Set rowBuf->size to the number of rows we actually stored.
rowBuf->size = ix;
sqlFreeResult(&sr);
updateNextChunkState(self, queryMaxItems);
}
static void asdDoQueryChunking(struct annoStreamDb *self, char *minChrom, uint minEnd)
/* Return a sqlResult for a query on table items in position range.
- * If doing a whole genome query. just 'select * from' table. */
+ * If doing a whole genome query, just select all rows from table. */
{
struct annoStreamer *sSelf = &(self->streamer);
-struct dyString *query = sqlDyStringCreate("select * from %s ", self->table);
+boolean hasWhere = FALSE;
+struct dyString *query = self->makeBaselineQuery(self, &hasWhere);
if (sSelf->chrom != NULL && self->rowBuf.size > 0 && !self->doNextChunk)
{
// We're doing a region query, we already got some rows, and don't need another chunk:
resetRowBuf(&self->rowBuf);
self->eof = TRUE;
}
if (self->useMaxOutRows)
{
self->maxOutRows -= self->rowBuf.size;
if (self->maxOutRows <= 0)
self->eof = TRUE;
}
if (self->eof)
return;
int queryMaxItems = ASD_CHUNK_SIZE;
@@ -276,31 +294,32 @@
// sorting by chromStart.
sqlDyStringPrintf(query, " IGNORE INDEX (%s) ", self->endFieldIndexName);
if (sSelf->chrom != NULL)
{
uint start = sSelf->regionStart;
if (minChrom)
{
if (differentString(minChrom, sSelf->chrom))
errAbort("annoStreamDb %s: nextRow minChrom='%s' but region chrom='%s'",
sSelf->name, minChrom, sSelf->chrom);
if (start < minEnd)
start = minEnd;
}
if (self->doNextChunk && start < self->nextChunkStart)
start = self->nextChunkStart;
- sqlDyStringPrintf(query, "where %s = '%s' and ", self->chromField, sSelf->chrom);
+ sqlDyStringAppend(query, hasWhere ? " and " : " where ");
+ sqlDyStringPrintf(query, "%s = '%s' and ", self->chromField, sSelf->chrom);
if (self->hasBin)
{
if (self->doNextChunk && self->gotFinestBin)
// It would be way more elegant to make a hAddBinTopLevelOnly but this will do:
dyStringPrintf(query, "bin > %d and ", self->minFinestBin);
hAddBinToQuery(start, sSelf->regionEnd, query);
}
if (self->doNextChunk)
sqlDyStringPrintf(query, "%s >= %u and ", self->startField, self->nextChunkStart);
sqlDyStringPrintf(query, "%s < %u and %s > %u ", self->startField, sSelf->regionEnd,
self->endField, start);
if (self->notSorted)
sqlDyStringPrintf(query, "order by %s ", self->startField);
sqlDyStringPrintf(query, "limit %d", queryMaxItems);
bufferRowsFromSqlQuery(self, query->string, queryMaxItems);
@@ -332,31 +351,32 @@
if (self->qLm == NULL)
self->qLm = lmInit(0);
}
}
if (self->queryChrom == NULL)
self->eof = TRUE;
else
{
char *chrom = self->queryChrom->name;
int start = 0;
if (minChrom != NULL && sameString(chrom, minChrom))
start = minEnd;
if (self->doNextChunk && start < self->nextChunkStart)
start = self->nextChunkStart;
uint end = annoAssemblySeqSize(self->streamer.assembly, self->queryChrom->name);
- sqlDyStringPrintf(query, "where %s = '%s' ", self->chromField, chrom);
+ sqlDyStringAppend(query, hasWhere ? " and " : " where ");
+ sqlDyStringPrintf(query, "%s = '%s' ", self->chromField, chrom);
if (start > 0 || self->doNextChunk)
{
dyStringAppend(query, "and ");
if (self->hasBin)
{
if (self->doNextChunk && self->gotFinestBin)
// It would be way more elegant to make a hAddBinTopLevelOnly but this will do:
dyStringPrintf(query, "bin > %d and ", self->minFinestBin);
hAddBinToQuery(start, end, query);
}
if (self->doNextChunk)
sqlDyStringPrintf(query, "%s >= %u and ", self->startField, self->nextChunkStart);
// region end is chromSize, so no need to constrain startField here:
sqlDyStringPrintf(query, "%s > %u ", self->endField, start);
}
@@ -645,30 +665,31 @@
streamer->nextRow = asdNextRow;
streamer->close = asdClose;
self->conn = conn;
self->table = cloneString(table);
char *asFirstColumnName = streamer->asObj->columnList->name;
if (sqlFieldIndex(self->conn, self->table, "bin") == 0)
{
self->hasBin = 1;
self->minFinestBin = binFromRange(0, 1);
}
if (self->hasBin && !sameString(asFirstColumnName, "bin"))
self->omitBin = 1;
if (!asdInitBed3Fields(self))
errAbort("annoStreamDbNew: can't figure out which fields of %s.%s to use as "
"{chrom, chromStart, chromEnd}.", db, table);
+self->makeBaselineQuery = asdMakeBaselineQuery;
// When a table has an index on endField, sometimes the query optimizer uses it
// and that ruins the sorting. Fortunately most tables don't anymore.
self->endFieldIndexName = sqlTableIndexOnField(self->conn, self->table, self->endField);
self->notSorted = FALSE;
// Special case: genbank-updated tables are not sorted because new mappings are
// tacked on at the end.
if (isIncrementallyUpdated(table))
self->notSorted = TRUE;
self->mergeBins = FALSE;
self->maxOutRows = maxOutRows;
self->useMaxOutRows = (maxOutRows > 0);
self->needQuery = TRUE;
self->chromList = annoAssemblySeqNames(aa);
if (slCount(self->chromList) > 1000)
{