bf1d058d73bdb153279eb4e530d1257d2ccc5675 angie Mon Jun 23 10:43:04 2014 -0700 Added ENCODE Regulatory summary tracks for clustered DNase and TFBS,with support for filtering based on BED5 score and factor/cellType/treatment. refs #11461 diff --git src/hg/lib/annoStreamDb.c src/hg/lib/annoStreamDb.c index 6aa94f0..1ca9e27 100644 --- src/hg/lib/annoStreamDb.c +++ src/hg/lib/annoStreamDb.c @@ -5,30 +5,35 @@ #include "annoStreamDb.h" #include "annoGratorQuery.h" #include "binRange.h" #include "hdb.h" #include "sqlNum.h" struct annoStreamDb { struct annoStreamer streamer; // Parent class members & methods // Private members struct sqlConnection *conn; // Database connection (e.g. hg19 or customTrash) struct sqlResult *sr; // SQL query result from which we grab rows char *table; // Table name, must exist in database + struct dyString *(*makeBaselineQuery)(struct annoStreamDb *self, boolean *retHasWhere); + /* Provide baseline query, by default just 'select * from <table>'. + * Override this to make a query with specific fields, joins etc. + * If the returned query includes a join/where, set *retHasWhere to TRUE. */ + // These members enable us to extract coords from the otherwise unknown row: char *chromField; // Name of chrom-ish column in table char *startField; // Name of chromStart-ish column in table char *endField; // Name of chromEnd-ish column in table int chromIx; // Index of chrom-ish col in autoSql or bin-less table int startIx; // Index of chromStart-ish col in autoSql or bin-less table int endIx; // Index of chromEnd-ish col in autoSql or bin-less table // These members enable us to produce {chrom, start}-sorted output: char *endFieldIndexName; // SQL index on end field, if any (can mess up sorting) boolean notSorted; // TRUE if table is not sorted (e.g. 
genbank-updated) boolean hasBin; // 1 if SQL table's first column is bin boolean omitBin; // 1 if table hasBin and autoSql doesn't have bin boolean mergeBins; // TRUE if query results will be in bin order struct annoRow *bigItemQueue; // If mergeBins, accumulate coarse-bin items here @@ -103,56 +108,68 @@ /* Set region -- and free current sqlResult if there is one. */ { annoStreamerSetRegion(vSelf, chrom, regionStart, regionEnd); struct annoStreamDb *self = (struct annoStreamDb *)vSelf; sqlFreeResult(&(self->sr)); resetMergeState(self); resetChunkState(self); } static char **nextRowFromSqlResult(struct annoStreamDb *self) /* Stream rows directly from self->sr. */ { return sqlNextRow(self->sr); } +static struct dyString *asdMakeBaselineQuery(struct annoStreamDb *self, boolean *retHasWhere) +/* Return a baseline query, i.e. "select * from <table>". This is the default implementation + * of annoStreamDb.makeBaselineQuery. */ +{ +if (retHasWhere) + *retHasWhere = FALSE; +return sqlDyStringCreate("select * from %s ", self->table); +} + + static void asdDoQuerySimple(struct annoStreamDb *self, char *minChrom, uint minEnd) /* Return a sqlResult for a query on table items in position range. - * If doing a whole genome query. just 'select * from' table. */ + * If doing a whole genome query, just select all rows from table. */ // NOTE: it would be possible to implement filters at this level, as in hgTables. 
{ struct annoStreamer *streamer = &(self->streamer); -struct dyString *query = sqlDyStringCreate("select * from %s", self->table); +boolean hasWhere = FALSE; +struct dyString *query = self->makeBaselineQuery(self, &hasWhere); if (!streamer->positionIsGenome) { if (minChrom && differentString(minChrom, streamer->chrom)) errAbort("annoStreamDb %s: nextRow minChrom='%s' but region chrom='%s'", streamer->name, minChrom, streamer->chrom); if (self->hasBin) { // Results will be in bin order, but we can restore chromStart order by // accumulating initial coarse-bin items and merge-sorting them with // subsequent finest-bin items which will be in chromStart order. resetMergeState(self); self->mergeBins = TRUE; self->qLm = lmInit(0); } if (self->endFieldIndexName != NULL) // Don't let mysql use a (chrom, chromEnd) index because that messes up // sorting by chromStart. sqlDyStringPrintf(query, " IGNORE INDEX (%s)", self->endFieldIndexName); - sqlDyStringPrintf(query, " where %s='%s'", self->chromField, streamer->chrom); + sqlDyStringAppend(query, hasWhere ? 
" and " : " where "); + sqlDyStringPrintf(query, "%s='%s'", self->chromField, streamer->chrom); int chromSize = annoAssemblySeqSize(streamer->assembly, streamer->chrom); if (streamer->regionStart != 0 || streamer->regionEnd != chromSize) { dyStringAppend(query, " and "); if (self->hasBin) hAddBinToQuery(streamer->regionStart, streamer->regionEnd, query); sqlDyStringPrintf(query, "%s < %u and %s > %u", self->startField, streamer->regionEnd, self->endField, streamer->regionStart); } if (self->notSorted) sqlDyStringPrintf(query, " order by %s", self->startField); } else if (self->notSorted) sqlDyStringPrintf(query, " order by %s,%s", self->chromField, self->startField); if (self->maxOutRows > 0) @@ -226,34 +243,35 @@ while ((row = sqlNextRow(sr)) != NULL) { if (ix >= rowBuf->size) errAbort("annoStreamDb %s: rowBuf overflow, got more than %d rows", sSelf->name, rowBuf->size); rowBuf->buf[ix++] = lmCloneRow(rowBuf->lm, row, sSelf->numCols+self->omitBin); } // Set rowBuf->size to the number of rows we actually stored. rowBuf->size = ix; sqlFreeResult(&sr); updateNextChunkState(self, queryMaxItems); } static void asdDoQueryChunking(struct annoStreamDb *self, char *minChrom, uint minEnd) /* Return a sqlResult for a query on table items in position range. - * If doing a whole genome query. just 'select * from' table. */ + * If doing a whole genome query, just select all rows from table. 
*/ { struct annoStreamer *sSelf = &(self->streamer); -struct dyString *query = sqlDyStringCreate("select * from %s ", self->table); +boolean hasWhere = FALSE; +struct dyString *query = self->makeBaselineQuery(self, &hasWhere); if (sSelf->chrom != NULL && self->rowBuf.size > 0 && !self->doNextChunk) { // We're doing a region query, we already got some rows, and don't need another chunk: resetRowBuf(&self->rowBuf); self->eof = TRUE; } if (self->useMaxOutRows) { self->maxOutRows -= self->rowBuf.size; if (self->maxOutRows <= 0) self->eof = TRUE; } if (self->eof) return; int queryMaxItems = ASD_CHUNK_SIZE; @@ -276,31 +294,32 @@ // sorting by chromStart. sqlDyStringPrintf(query, " IGNORE INDEX (%s) ", self->endFieldIndexName); if (sSelf->chrom != NULL) { uint start = sSelf->regionStart; if (minChrom) { if (differentString(minChrom, sSelf->chrom)) errAbort("annoStreamDb %s: nextRow minChrom='%s' but region chrom='%s'", sSelf->name, minChrom, sSelf->chrom); if (start < minEnd) start = minEnd; } if (self->doNextChunk && start < self->nextChunkStart) start = self->nextChunkStart; - sqlDyStringPrintf(query, "where %s = '%s' and ", self->chromField, sSelf->chrom); + sqlDyStringAppend(query, hasWhere ? 
" and " : " where "); + sqlDyStringPrintf(query, "%s = '%s' and ", self->chromField, sSelf->chrom); if (self->hasBin) { if (self->doNextChunk && self->gotFinestBin) // It would be way more elegant to make a hAddBinTopLevelOnly but this will do: dyStringPrintf(query, "bin > %d and ", self->minFinestBin); hAddBinToQuery(start, sSelf->regionEnd, query); } if (self->doNextChunk) sqlDyStringPrintf(query, "%s >= %u and ", self->startField, self->nextChunkStart); sqlDyStringPrintf(query, "%s < %u and %s > %u ", self->startField, sSelf->regionEnd, self->endField, start); if (self->notSorted) sqlDyStringPrintf(query, "order by %s ", self->startField); sqlDyStringPrintf(query, "limit %d", queryMaxItems); bufferRowsFromSqlQuery(self, query->string, queryMaxItems); @@ -332,31 +351,32 @@ if (self->qLm == NULL) self->qLm = lmInit(0); } } if (self->queryChrom == NULL) self->eof = TRUE; else { char *chrom = self->queryChrom->name; int start = 0; if (minChrom != NULL && sameString(chrom, minChrom)) start = minEnd; if (self->doNextChunk && start < self->nextChunkStart) start = self->nextChunkStart; uint end = annoAssemblySeqSize(self->streamer.assembly, self->queryChrom->name); - sqlDyStringPrintf(query, "where %s = '%s' ", self->chromField, chrom); + sqlDyStringAppend(query, hasWhere ? 
" and " : " where "); + sqlDyStringPrintf(query, "%s = '%s' ", self->chromField, chrom); if (start > 0 || self->doNextChunk) { dyStringAppend(query, "and "); if (self->hasBin) { if (self->doNextChunk && self->gotFinestBin) // It would be way more elegant to make a hAddBinTopLevelOnly but this will do: dyStringPrintf(query, "bin > %d and ", self->minFinestBin); hAddBinToQuery(start, end, query); } if (self->doNextChunk) sqlDyStringPrintf(query, "%s >= %u and ", self->startField, self->nextChunkStart); // region end is chromSize, so no need to constrain startField here: sqlDyStringPrintf(query, "%s > %u ", self->endField, start); } @@ -645,30 +665,31 @@ streamer->nextRow = asdNextRow; streamer->close = asdClose; self->conn = conn; self->table = cloneString(table); char *asFirstColumnName = streamer->asObj->columnList->name; if (sqlFieldIndex(self->conn, self->table, "bin") == 0) { self->hasBin = 1; self->minFinestBin = binFromRange(0, 1); } if (self->hasBin && !sameString(asFirstColumnName, "bin")) self->omitBin = 1; if (!asdInitBed3Fields(self)) errAbort("annoStreamDbNew: can't figure out which fields of %s.%s to use as " "{chrom, chromStart, chromEnd}.", db, table); +self->makeBaselineQuery = asdMakeBaselineQuery; // When a table has an index on endField, sometimes the query optimizer uses it // and that ruins the sorting. Fortunately most tables don't anymore. self->endFieldIndexName = sqlTableIndexOnField(self->conn, self->table, self->endField); self->notSorted = FALSE; // Special case: genbank-updated tables are not sorted because new mappings are // tacked on at the end. if (isIncrementallyUpdated(table)) self->notSorted = TRUE; self->mergeBins = FALSE; self->maxOutRows = maxOutRows; self->useMaxOutRows = (maxOutRows > 0); self->needQuery = TRUE; self->chromList = annoAssemblySeqNames(aa); if (slCount(self->chromList) > 1000) {