src/hg/instinct/bioInt2/bioController.c 1.11
1.11 2009/07/23 19:48:39 jsanborn
shortened table name prefixes
Index: src/hg/instinct/bioInt2/bioController.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/instinct/bioInt2/bioController.c,v
retrieving revision 1.10
retrieving revision 1.11
diff -b -B -U 1000000 -r1.10 -r1.11
--- src/hg/instinct/bioInt2/bioController.c 27 May 2009 19:31:34 -0000 1.10
+++ src/hg/instinct/bioInt2/bioController.c 23 Jul 2009 19:48:39 -0000 1.11
@@ -1,483 +1,491 @@
/* bioController - controller for the bioIntegrator pipeline: registers the
 * standard analyses for a cohort and runs them. */
#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "options.h"
#include "jksql.h"
#include "hPrint.h"
#include "hdb.h"
#include "dystring.h"
#include "bioIntDb.h"
#include "bioIntDriver.h"
#include "cprob.h"
#include "hgStatsLib.h"
#include "bioController.h"
void usage()
/* Print the command-line synopsis through errAbort. */
{
    static const char *usageText =
        "bioController - controller for bioIntegrator pipeline\n"
        "usage:\n"
        " bioController cohort_id\n"
        " -cohort_id = number of cohort in cohorts table";

    /* The text contains no conversion specifiers, so printing it via "%s"
     * yields exactly the same message as using it as the format itself. */
    errAbort("%s", usageText);
}
/* Database name handed to bioController() by main(). */
#define BIOINT_DB "bioInt"
/* Option table passed to optionInit(); it holds only the terminating entry,
 * i.e. no command-line options are declared here. */
static struct optionSpec options[] = {
{NULL, 0}
};
char *getTableName(struct datasets *daList, char *module)
/* Build "<module>_<data_table>_<data_table>..." with one suffix per dataset
 * in daList.  Returns NULL when daList is empty; otherwise returns the
 * string handed back by dyStringCannibalize. */
{
    struct dyString *nameBuf;
    struct datasets *cur;

    if (daList == NULL)
        return NULL;

    nameBuf = dyStringNew(10);
    dyStringPrintf(nameBuf, "%s", module);

    cur = daList;
    while (cur != NULL)
        {
        dyStringPrintf(nameBuf, "_%s", cur->data_table);
        cur = cur->next;
        }

    return dyStringCannibalize(&nameBuf);
}
boolean analysesExists(struct sqlConnection *biConn, struct analyses *an)
/* Return the result of sqlExists for a query on AN_TABLE that matches every
 * field of an (id, cohort_id, module_id, result_table, input_tables). */
{
    char sql[256];
    boolean found;

    safef(sql, sizeof(sql),
          "select * from %s where id = %d "
          "and cohort_id = %d "
          "and module_id = %d "
          "and result_table = \"%s\" "
          "and input_tables = \"%s\" ",
          AN_TABLE,
          an->id,
          an->cohort_id,
          an->module_id,
          an->result_table,
          an->input_tables);

    found = sqlExists(biConn, sql);
    return found;
}
boolean analysesWithResultTableExists(struct sqlConnection *biConn, char *result_table)
/* Return the result of sqlExists for a query on AN_TABLE selecting rows whose
 * result_table column equals result_table. */
{
    char sql[256];
    boolean found;

    safef(sql, sizeof(sql),
          "select * from %s where result_table = \"%s\" ",
          AN_TABLE, result_table);

    found = sqlExists(biConn, sql);
    return found;
}
struct hash *analysisParamsHash(struct sqlConnection *biConn, struct analyses *an)
{
struct hash *hash = hashNew(0);
char query[256];
safef(query, sizeof(query),
"select * from %s where analysis_id = %d",
AP_TABLE, an->id);
struct analysisParams *ap, *apList = analysisParamsLoadByQuery(biConn, query);
for (ap = apList; ap; ap = ap->next)
hashAdd(hash, ap->name, ap->val);
return hash;
}
void storeAnalysisParams(struct sqlConnection *biConn,
struct analyses *an, struct hash *params)
{
struct hashEl *el;
struct hashCookie cookie = hashFirst(params);
while ((el = hashNext(&cookie)) != NULL)
{
char *name = el->name;
char *val = el->val;
struct analysisParams *ap;
AllocVar(ap);
ap->analysis_id = an->id;
ap->name = cloneString(name);
ap->val = cloneString(val);
analysisParamsSaveToDbEscaped(biConn, ap, AP_TABLE, 50);
analysisParamsFree(&ap);
}
}
boolean matchingParams(struct sqlConnection *biConn, int analysis_id, struct hash *params)
{
char query[256];
safef(query, sizeof(query),
"select * from %s where analysis_id = %d",
AP_TABLE, analysis_id);
struct analysisParams *ap, *apList = analysisParamsLoadByQuery(biConn, query);
if (!apList) // no analysis params for this id
{
if (hashNumEntries(params) == 0)
return TRUE; // none here either, a match
else
return FALSE; // params exist in hash, no match
}
/* apList not NULL, run through list check that every analysisParam matches
* key-value to element in hash */
struct hashEl *el;
char *name, *val;
boolean matching = TRUE;
for (ap = apList; ap; ap = ap->next)
{
el = hashLookup(params, ap->name);
if (!el)
{
matching = FALSE;
continue;
}
val = el->val;
if (!sameString(val, ap->val))
matching = FALSE;
}
/* need to check reverse direction */
struct hashCookie cookie = hashFirst(params);
while ((el = hashNext(&cookie)) != NULL)
{
name = el->name;
val = el->val;
boolean foundMatch = FALSE;
for (ap = apList; ap; ap = ap->next)
if (sameString(ap->name, name) && sameString(ap->val, val))
foundMatch = TRUE;
if (!foundMatch)
matching = FALSE;
}
return matching;
}
char *createAnalysesResultTableName(struct sqlConnection *biConn, int cohort_id,
                                    struct analysisModules *am, char *input_tables,
                                    struct hash *params)
/* Choose the result table name for an analysis of module am over the
 * comma-separated input_tables with the given params.
 *
 * The name is "<prefix>_<table1>_<table2>...", where <prefix> is a short
 * code for the module.  If an analysis with that result table already exists
 * but its stored params differ, "_1", "_2", ... is appended until a name is
 * found that is either unused or belongs to an analysis with identical
 * params.  Aborts after 100 attempts.
 *
 * Returns NULL when input_tables is NULL, otherwise a cloned string. */
{
    if (!input_tables)
        return NULL;

    struct slName *sl, *slList = slNameListFromComma(input_tables);
    struct dyString *dy = dyStringNew(10);

    /* Create prefix, e.g. "m_table1_table2".  Known modules get a short code;
     * any other module keeps its full name so that two different modules can
     * never produce the same table name. */
    char *modulePrefix = am->name;
    if (sameString(am->name, "meta"))
        modulePrefix = "m";
    else if (sameString(am->name, "metaGeneset"))
        modulePrefix = "mg";
    else if (sameString(am->name, "factorGraph"))
        modulePrefix = "fg";

    dyStringPrintf(dy, "%s", modulePrefix);
    for (sl = slList; sl; sl = sl->next)
        dyStringPrintf(dy, "_%s", sl->name);
    char *prefix = dyStringCannibalize(&dy);

    /* Initial table name is simply prefix */
    char name[256];
    safef(name, sizeof(name), "%s", prefix);

    int count = 0;
    boolean foundName = FALSE;
    while (!foundName && count < 100)
        {
        count++;

        /* Name not used by any analysis yet: take it. */
        if (!analysesWithResultTableExists(biConn, name))
            {
            foundName = TRUE;
            break;
            }

        /* Name is in use; reuse it only if that analysis has the same params. */
        int id = findIdInTable(biConn, AN_TABLE, "id", "result_table", name);
        if (matchingParams(biConn, id, params))
            {
            foundName = TRUE;
            break;
            }

        /* If we got here, params don't match the existing analysis id;
         * append a number to the prefix and try again. */
        safef(name, sizeof(name), "%s_%d", prefix, count);
        }

    if (!foundName)
        errAbort("Could not find a unique table name after 100 attempts\n");

    char *result_table = cloneString(name);
    return result_table;
}
struct analyses *createAnalyses(struct sqlConnection *biConn, int cohort_id,
                                struct analysisModules *am, char *input_tables,
                                struct hash *params)
/* Build the analyses record for running module am on input_tables for the
 * given cohort, and save it (together with params) to the database if an
 * identical record is not already there.
 *
 * Returns NULL when input_tables is NULL; otherwise returns the record,
 * which the callers in this file release with analysesFree(). */
{
    if (!input_tables)
        return NULL;

    /* Get unique name or name already in database for exact matching analysis */
    char *result_table = createAnalysesResultTableName(biConn, cohort_id, am, input_tables, params);
    int id = findIdInTable(biConn, AN_TABLE, "id", "result_table", result_table);

    struct analyses *an;
    AllocVar(an);
    an->id = id;
    an->cohort_id = cohort_id;
    an->module_id = am->id;
    /* result_table is already a private clone; hand it to the record instead
     * of cloning it a second time and leaking the first copy. */
    an->result_table = result_table;
    an->input_tables = cloneString(input_tables);

    if (!analysesExists(biConn, an))
        {
        analysesSaveToDbEscaped(biConn, an, AN_TABLE, 20);
        storeAnalysisParams(biConn, an, params);
        }

    return an;
}
struct datasets *datasetsInCohort(struct sqlConnection *biConn, int cohort_id)
/* Load the rows of DA_TABLE joined to DC_TABLE (on id = dataset_id) whose
 * DC_TABLE cohort_id equals cohort_id. */
{
    char sql[256];
    struct datasets *found;

    safef(sql, sizeof(sql),
          "select * from %s join %s on %s.id = %s.dataset_id "
          "where %s.cohort_id = %d;",
          DA_TABLE, DC_TABLE,
          DA_TABLE, DC_TABLE,
          DC_TABLE, cohort_id);

    found = datasetsLoadByQuery(biConn, sql);
    return found;
}
struct analysisModules *analysisModulesMatching(struct sqlConnection *biConn,
                                                char *field, char *val)
/* Load the AM_TABLE rows whose column named by field equals the string val. */
{
    char sql[128];
    struct analysisModules *found;

    safef(sql, sizeof(sql),
          "select * from %s where %s = \"%s\" ",
          AM_TABLE, field, val);

    found = analysisModulesLoadByQuery(biConn, sql);
    return found;
}
void biAnalysisAddModule(struct sqlConnection *biConn,
                         struct biAnalysis *ba, int module_id)
/* Look up the AM_TABLE row with id module_id and set ba->pipeline from the
 * module type and ba->analyze from the module name.  Unrecognized types or
 * names leave the corresponding field NULL.  Does nothing when ba is NULL;
 * aborts when no module has that id. */
{
    if (!ba)
        return;

    char query[128];
    safef(query, sizeof(query),
          "select * from %s where id = %d",
          AM_TABLE, module_id);

    struct analysisModules *am = analysisModulesLoadByQuery(biConn, query);
    if (!am)
        errAbort("No module with id = %d", module_id);

    /* Set pipeline by module type (gene, set, ...) */
    if (sameString(am->type, "gene"))
        ba->pipeline = geneLevelPipeline;
    else if (sameString(am->type, "set"))
        ba->pipeline = genesetLevelPipeline;
    else if (sameString(am->type, "pathway"))
        ba->pipeline = pathwayLevelPipeline;
    else
        ba->pipeline = NULL;

    /* Set analysis algorithm by module name */
    if (sameString(am->name, "meta"))
        ba->analyze = metaGene;
    else if (sameString(am->name, "metaGeneset"))
        ba->analyze = metaGeneset;
    else if (sameString(am->name, "factorGraph"))
        ba->analyze = factorGraph;
    else
        ba->analyze = NULL;

    /* Only function pointers were copied into ba, so the module record can
     * be released here (it was previously leaked). */
    analysisModulesFree(&am);
}
struct biAnalysis *biAnalysisListForCohort(struct sqlConnection *biConn,
                                           char *db, int cohort_id)
/* Load the AN_TABLE rows for cohort_id (ordered by id) and turn each one
 * into a biAnalysis with its db, result table, parameters, input tables and
 * module callbacks filled in.  The returned list is in the same order as the
 * query result.  Aborts when the cohort has no analyses. */
{
    char sql[128];
    struct analyses *rows, *row;
    struct biAnalysis *result = NULL;

    safef(sql, sizeof(sql),
          "select * from %s where cohort_id = %d order by id",
          AN_TABLE, cohort_id);

    rows = analysesLoadByQuery(biConn, sql);
    if (rows == NULL)
        errAbort("No analyses for cohort = %d", cohort_id);

    row = rows;
    while (row != NULL)
        {
        struct biAnalysis *item;

        AllocVar(item);
        item->db = cloneString(db);
        item->tableName = cloneString(row->result_table);
        item->parameters = analysisParamsHash(biConn, row);
        item->inputTables = slNameListFromComma(row->input_tables);
        biAnalysisAddModule(biConn, item, row->module_id);

        slAddHead(&result, item);
        row = row->next;
        }

    /* Items were pushed on the head, so flip the list back into query order. */
    slReverse(&result);
    analysesFreeList(&rows);
    return result;
}
boolean cohortExists(struct sqlConnection *biConn, int cohort_id)
/* Return the result of sqlExists for a query on CO_TABLE selecting the row
 * with id cohort_id. */
{
    char sql[128];
    boolean found;

    safef(sql, sizeof(sql),
          "select * from %s where id = %d",
          CO_TABLE, cohort_id);

    found = sqlExists(biConn, sql);
    return found;
}
boolean analysisModuleExists(struct sqlConnection *biConn, struct analysisModules *am)
/* Return the result of sqlExists for a query on AM_TABLE matching am's id,
 * name and type. */
{
    char sql[128];
    boolean found;

    safef(sql, sizeof(sql),
          "select * from %s where id = %d "
          "and name = \"%s\" "
          "and type = \"%s\" ",
          AM_TABLE,
          am->id,
          am->name,
          am->type);

    found = sqlExists(biConn, sql);
    return found;
}
void storeAnalysisModule(struct sqlConnection *biConn, char *name, char *type)
{
struct analysisModules *am;
AllocVar(am);
am->name = cloneString(name);
am->type = cloneString(type);
am->id = findIdInTable(biConn, AM_TABLE, "id", "name", am->name);
if (!analysisModuleExists(biConn, am))
analysisModulesSaveToDb(biConn, am, AM_TABLE, 10);
analysisModulesFree(&am);
}
void setupAnalysisModules(struct sqlConnection *biConn)
/* Store the analysis modules this controller relies on, in table order. */
{
    /* { module name, module type } -- add further modules as new rows. */
    static char *knownModules[][2] = {
        {"meta", "gene"},           /* Meta-Gene module */
        {"metaGeneset", "set"},     /* Meta-geneset module */
        {"factorGraph", "pathway"}, /* Pathway Factor Graph module */
    };
    int moduleCount = (int)(sizeof(knownModules) / sizeof(knownModules[0]));
    int i;

    for (i = 0; i < moduleCount; i++)
        storeAnalysisModule(biConn, knownModules[i][0], knownModules[i][1]);
}
void createStandardPipeline(struct sqlConnection *biConn, int cohort_id)
{
struct datasets *da, *daList = datasetsInCohort(biConn, cohort_id);
struct dyString *dy = dyStringNew(10);
for (da = daList; da; da = da->next)
{
dyStringPrintf(dy, "%s", da->data_table);
if (da->next)
dyStringPrintf(dy, ",");
}
char *input_tables = dyStringCannibalize(&dy);
struct analysisModules *am;
/* Gene analysis modules */
am = analysisModulesMatching(biConn, "name", "meta");
if (!am)
errAbort("No analysis module named meta");
struct hash *params = hashNew(0);
hashAdd(params, "fxn", cloneString("fishers"));
struct analyses *gene = createAnalyses(biConn, cohort_id, am, input_tables, params);
char *gene_result_table = cloneString(gene->result_table);
analysesFree(&gene);
analysisModulesFree(&am);
hashFree(¶ms);
/* Set (geneset/pathway) analysis modules */
am = analysisModulesMatching(biConn, "name", "metaGeneset");
if (!am)
errAbort("No analysis module named metaGeneset");
params = hashNew(0);
hashAdd(params, "fxn", cloneString("fishers"));
struct analyses *set = createAnalyses(biConn, cohort_id, am, gene_result_table, params);
analysesFree(&set);
analysisModulesFree(&am);
hashFree(¶ms);
/* Set factor graph analysis modules */
am = analysisModulesMatching(biConn, "name", "factorGraph");
if (!am)
errAbort("No analysis module named factorGraph");
params = hashNew(0);
hashAdd(params, "fxn", cloneString("factorGraphDAI"));
struct analyses *fg = createAnalyses(biConn, cohort_id, am, input_tables, params);
analysesFree(&fg);
analysisModulesFree(&am);
hashFree(¶ms);
}
void prepareDatabase(struct sqlConnection *biConn)
{
if (!sqlTableExists(biConn, AN_TABLE))
createAnalysesTable(biConn, AN_TABLE);
if (!sqlTableExists(biConn, AP_TABLE))
createAnalysisParamsTable(biConn, AP_TABLE);
if (!sqlTableExists(biConn, AM_TABLE))
createAnalysisModulesTable(biConn, AM_TABLE);
setupAnalysisModules(biConn);
}
void bioController(char *db, int cohort_id)
{
struct sqlConnection *biConn = hAllocConnProfile("localDb", db);
if (!cohortExists(biConn, cohort_id))
{
hFreeConn(&biConn);
errAbort("Cohort %d does not exist.", cohort_id);
}
prepareDatabase(biConn);
createStandardPipeline(biConn, cohort_id);
struct biAnalysis *baList = biAnalysisListForCohort(biConn, db, cohort_id);
hFreeConn(&biConn);
runAnalysisPipeline(baList);
}
int main(int argc, char *argv[])
/* Process command line: expects exactly one argument, the integer cohort id. */
{
    optionInit(&argc, argv, options);
    if (argc != 2)
        usage();

    /* atoi() would silently turn "abc" into cohort 0 and accept "12abc";
     * parse strictly so that a typo is reported instead of being run. */
    char *end = NULL;
    long cohort = strtol(argv[1], &end, 10);
    if (end == argv[1] || *end != '\0' || cohort != (long)(int)cohort)
        errAbort("cohort_id must be an integer, got '%s'", argv[1]);

    bioController(BIOINT_DB, (int)cohort);
    return 0;
}