06845edb9038bb632170891cebcdc8f477f2ea8d angie Fri Dec 4 08:59:38 2015 -0800 Add dbNSFP v3.1a (including VEST scores) to hg38 for hgVai. Add hgVai options for adding in transcript status info: GENCODE tags when applicable, knownCanonical for knownGene, refSeqStatus for refGene. refs #16502, #16503 diff --git src/hg/lib/annoStreamDb.c src/hg/lib/annoStreamDb.c index dad9640..f68f39d 100644 --- src/hg/lib/annoStreamDb.c +++ src/hg/lib/annoStreamDb.c @@ -52,50 +52,50 @@ struct slName *queryChrom; // most recently queried chrom for whole-genome (or NULL) boolean eof; // TRUE when we are done (maxItems or no more items) boolean needQuery; // TRUE when we haven't yet queried, or need to query again boolean doNextChunk; // TRUE if rowBuf ends before end of chrom/region uint nextChunkStart; // Start coord for next chunk of rows to query // Info for joining in related tables/fields struct joinerDtf *mainTableDtfList; // Fields from the main table to include in output struct joinerDtf *relatedDtfList; // Fields from related tables to include in output struct joiner *joiner; // Parsed all.joiner schema struct joinMixer *joinMixer; // Plan for joining related tables using sql and/or hash // (NULL if no joining is necessary) uint sqlRowSize; // Number of columns from sql query (may include related) uint bigRowSize; // Number of columns from sql + joinMixer->hashJoins boolean hasLeftJoin; // If we have to use 'left join' we'll have to 'order by'. + boolean naForMissing; // If true, insert "n/a" for missing related table values + // to match hgTables. struct rowBuf // Temporary storage for rows from chunked query { struct lm *lm; // storage for rows char ***buf; // array of pointers to rows int size; // number of rows int ix; // offset in buffer, [0..size] } rowBuf; char **(*nextRowRaw)(struct annoStreamDb *self); // Depending on query style, use either sqlNextRow or temporary row storage to get next row. // This may return NULL but set self->needQuery; asdNextRow watches for that. void (*doQuery)(struct annoStreamDb *self, char *minChrom, uint minEnd); // Depending on query style, perform either a single query or (series of) chunked query }; -static boolean naForMissing = TRUE; // This should be made into a UI option. - //#*** TODO: make a way to pass the filter with dtf into annoStreamDb. struct annoFilterDb // annoFilter has columnIx which works fine for all fields of main table, // but for joining filters we will need dtf. { struct annoFilter filter; // parent class struct joinerDtf *dtf; // {database, table, field} in case this is from // some table to be joined with the main table }; // For performance reasons, even if !useMaxItems (no limit), we need to limit the // number of rows that are returned from a query, so we can slurp them into memory and // close the sqlResult before mysql gets unhappy about the result being open so long. #define ASD_CHUNK_SIZE 100000 @@ -246,31 +246,31 @@ return listOut; } static void asdMakeBaselineQuery(struct annoStreamDb *self) /* Build a dy SQL query with no position constraints (select ... from ...) * possibly including joins and filters if specified (where ...). */ { if (self->relatedDtfList) { struct joinerDtf *outputFieldList = slCat(joinerDtfCloneList(self->mainTableDtfList), joinerDtfCloneList(self->relatedDtfList)); if (self->joiner == NULL) self->joiner = joinerRead(JOINER_FILE); int expectedRows = sqlRowCount(self->conn, self->table); self->joinMixer = joinMixerNew(self->joiner, self->db, self->table, outputFieldList, - expectedRows, naForMissing); + expectedRows, self->naForMissing); self->sqlRowSize = slCount(self->joinMixer->sqlFieldList); self->bigRowSize = self->joinMixer->bigRowSize; joinerDtfFreeList(&outputFieldList); } else { self->sqlRowSize = slCount(self->mainTableDtfList); self->bigRowSize = self->sqlRowSize; } struct dyString *query = sqlDyStringCreate("select "); appendFieldList(self, query); dyStringAppend(query, " from "); self->hasLeftJoin = appendTableList(self, query); boolean hasWhere = FALSE; self->baselineQuery = dyStringCannibalize(&query); @@ -699,31 +699,31 @@ static struct annoRow *rowToAnnoRow(struct annoStreamDb *self, char **row, boolean rightFail, struct lm *lm) /* Extract coords from row and return an annoRow including right-fail status. */ { char **finalRow = row + self->omitBin; uint numCols = self->streamer.numCols; char *swizzleRow[numCols]; if (self->joinMixer) { uint i; for (i = 0; i < numCols; i++) { uint outIx = self->joinMixer->outIxs[i+self->omitBin]; if (row[outIx] == NULL) - swizzleRow[i] = naForMissing ? "n/a" : ""; + swizzleRow[i] = self->naForMissing ? "n/a" : ""; else swizzleRow[i] = row[outIx]; } finalRow = swizzleRow; } char *chrom = finalRow[self->chromIx]; uint chromStart = sqlUnsigned(finalRow[self->startIx]); uint chromEnd = sqlUnsigned(finalRow[self->endIx]); return annoRowFromStringArray(chrom, chromStart, chromEnd, rightFail, finalRow, numCols, lm); } static char **getFinestBinItem(struct annoStreamDb *self, char **row, boolean *pRightFail, char *minChrom, uint minEnd) /* If row is a coarse-bin item, add it to bigItemQueue, get the next row(s) and * add any subsequent coarse-bin items to bigItemQueue. As soon as we get an item from a @@ -898,76 +898,84 @@ } else asObj = hel->val; return asObj; } static void makeDottedTriple(char *dtfString, size_t dtfStringSize, char *db, char *table, char *field) /* In case we don't have a struct joinerDtf for a field that we want to look up, * but we do have the db, table and field name, concat with dots into dtfString. * Unlike joinerDtfToSqlFieldString, don't bother checking whether db is the main db. */ { safef(dtfString, dtfStringSize, "%s.%s.%s", db, table, field); } +char *annoStreamDbColumnNameFromDtf(char *db, char *mainTable, struct joinerDtf *dtf) +/* Return a string with the autoSql column name that would be assigned according to dtf's + * db, table and field. */ +{ +char colName[PATH_LEN*2]; +if (differentString(dtf->table, mainTable) || differentString(dtf->database, db)) + { + joinerDtfToSqlFieldString(dtf, db, colName, sizeof(colName)); + // asParse rejects names that have '.' in them, which makes sense because it's for SQL, + // so replace the '.'s with '_'s. + subChar(colName, '.', '_'); + } +else + safecpy(colName, sizeof(colName), dtf->field); +return cloneString(colName); +} + static void addOneColumn(struct dyString *dy, struct joinerDtf *dtf, char *db, char *mainTable, struct asColumn *col, struct hash *dtfNames) /* Append an autoSql text line describing col to dy. * If col is an array whose size is some other column that has not yet been added, * coerce its type to string to avoid asParseText errAbort. */ { // First see if this col depends on a linked size column that hasn't been added yet. boolean sizeColIsMissing = FALSE; if (col->isArray && !col->fixedSize && isNotEmpty(col->linkedSizeName)) { // col's size comes from another column -- has that column already been added? char linkedDtfString[PATH_LEN]; makeDottedTriple(linkedDtfString, sizeof(linkedDtfString), dtf->database, dtf->table, col->linkedSizeName); if (!hashLookup(dtfNames, linkedDtfString)) sizeColIsMissing = TRUE; } if (col->isArray && sizeColIsMissing) { // The size column is missing, so this can't be a valid array in autoSql -- // ignore col->lowType and call it a (comma-separated) string. - dyStringAppend(dy, " string"); + dyStringAppend(dy, " lstring"); } else { dyStringPrintf(dy, " %s", col->lowType->name); if (col->isArray) { dyStringAppendC(dy, '['); if (col->fixedSize) dyStringPrintf(dy, "%d", col->fixedSize); else dyStringAppend(dy, col->linkedSizeName); dyStringAppendC(dy, ']'); } } -char colName[PATH_LEN]; -if (differentString(dtf->table, mainTable) || differentString(dtf->database, db)) - { - joinerDtfToSqlFieldString(dtf, db, colName, sizeof(colName)); - // asParse rejects names that have '.' in them, which makes sense because it's for SQL, - // so replace the '.'s with '_'s. - subChar(colName, '.', '_'); - } -else - safecpy(colName, sizeof(colName), col->name); +char *colName = annoStreamDbColumnNameFromDtf(db, mainTable, dtf); dyStringPrintf(dy, " %s; \"%s\"\n", colName, col->comment); // Store plain old dotted triple in dtfNames in case we need to look it up later. char dtfString[PATH_LEN]; makeDottedTriple(dtfString, sizeof(dtfString), dtf->database, dtf->table, dtf->field); hashAdd(dtfNames, dtfString, NULL); } static struct asObject *asdAutoSqlFromTableFields(struct annoStreamDb *self, struct asObject *mainAsObj) /* Get autoSql for each table in self->relatedDtfList and append the columns * included in self->relatedDtfList to the main table asObj columns. */ { struct dyString *newAsText = dyStringCreate("table %sCustom\n" "\"query based on %s with customized fields.\"\n" " (", @@ -1066,31 +1074,32 @@ static boolean isPubsTable(char *table) // Not absolutely every pubs* table is unsorted, but most of them are. { return startsWith("pubs", table); } static struct asObject *asdParseConfig(struct annoStreamDb *self, struct jsonElement *configEl) /* Extract the autoSql for self->table from the database. * If configEl is not NULL, expect it to be a description of related tables and fields like this: * config = { "relatedTables": [ { "table": "hg19.kgXref", * "fields": ["geneSymbol", "description"] }, * { "table": "hg19.knownCanonical", * "fields": ["clusterId"] } * ] } * If so, unpack the [db.]tables and fields into self->relatedDtfList and append autoSql - * column descriptions for each field to the autoSql object that describes our output. */ + * column descriptions for each field to the autoSql object that describes our output. + * It might also have "naForMissing": true/false; if so, set self->naForMissing. */ { //#*** TODO: hAnnoGetAutoSqlForDbTable should do its own split-table checking char maybeSplitTable[HDB_MAX_TABLE_STRING]; if (!hFindSplitTable(self->db, NULL, self->table, maybeSplitTable, NULL)) errAbort("annoStreamDbNew: can't find table (or split table) for '%s.%s'", self->db, self->table); struct asObject *asObj = hAnnoGetAutoSqlForDbTable(self->db, maybeSplitTable, NULL, TRUE); makeMainTableDtfList(self, asObj); if (configEl != NULL) { struct hash *config = jsonObjectVal(configEl, "config"); struct jsonElement *relatedTablesEl = hashFindVal(config, "relatedTables"); if (relatedTablesEl) { // relatedTables is a list of objects whose keys are related [db.]table names @@ -1112,30 +1121,33 @@ { struct jsonElement *fieldListEl = hashMustFindVal(tfObj, "fields"); struct slRef *fieldList = jsonListVal(fieldListEl, "fieldList"); struct slRef *fieldRef; for (fieldRef = fieldList; fieldRef != NULL; fieldRef = fieldRef->next) { struct jsonElement *fieldEl = fieldRef->val; char *tfField = jsonStringVal(fieldEl, "field"); slAddHead(&self->relatedDtfList, joinerDtfNew(tfDb, tfTable, tfField)); } } } slReverse(&self->relatedDtfList); asObj = asdAutoSqlFromTableFields(self, asObj); } + struct jsonElement *naForMissingEl = hashFindVal(config, "naForMissing"); + if (naForMissingEl != NULL) + self->naForMissing = jsonBooleanVal(naForMissingEl, "naForMissing"); } return asObj; } // Why isn't this in jksql.h? #define NOSQLINJ "NOSQLINJ " static char *sqlExplain(struct sqlConnection *conn, char *query) /* For now, just turn the values back into a multi-line "#"-comment string. */ { char *trimmedQuery = query; if (startsWith(NOSQLINJ, trimmedQuery)) trimmedQuery = trimmedQuery + strlen(NOSQLINJ); struct dyString *dy = dyStringCreate("# Output of 'explain %s':\n", trimmedQuery); char explainQuery[PATH_LEN*8];