src/hg/instinct/bioInt2/bioController.c 1.4

1.4 2009/03/22 01:07:28 jsanborn
updated
Index: src/hg/instinct/bioInt2/bioController.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/instinct/bioInt2/bioController.c,v
retrieving revision 1.3
retrieving revision 1.4
diff -b -B -U 1000000 -r1.3 -r1.4
--- src/hg/instinct/bioInt2/bioController.c	21 Mar 2009 21:31:54 -0000	1.3
+++ src/hg/instinct/bioInt2/bioController.c	22 Mar 2009 01:07:28 -0000	1.4
@@ -1,53 +1,353 @@
-/* mapProbesToGenes - Will maps probes in BED format to overlapping gene(s). */
+/* bioController - controller for bioIntegrator pipeline. */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "jksql.h"
 #include "hPrint.h"
 #include "hdb.h"
 #include "dystring.h"
 #include "bioIntDb.h"
 #include "bioIntDriver.h"
 #include "cprob.h"
 #include "hgStatsLib.h"
 #include "bioController.h"
 
 void usage()
 /* Explain usage and exit. */
+/* The synopsis describes the one positional argument main() accepts: the
+ * integer cohort id that is passed on to bioController().  The text is
+ * emitted through errAbort(). */
 {
 errAbort(
 	 "bioController - controller for bioIntegrator pipeline\n"
 	 "usage:\n"
-	 "   bioIntegrator datasets\n"
-	 "   -datasets = comma-separated list of datasets\n"
+	 "   bioController cohort_id\n"
+	 "   cohort_id = integer id of the cohort (cohorts.id) to analyze\n"
 	 );
 }
 
+/* Name of the database that main() passes to bioController(). */
 #define BIOINT_DB "bioInt"
 
+/* Option table handed to optionInit() in main().  It contains only a
+ * {NULL, 0} entry (presumably the list terminator), so no named options are
+ * declared here. */
 static struct optionSpec options[] = {
     {NULL, 0}
 };                            
 
-void bioController(char *db, char *datasets)
+void createAnalysisModulesTable(struct sqlConnection *biConn, char *tableName)
+/* Issue a CREATE TABLE statement on biConn for a table called tableName with
+ * the columns id, name and type, using id as primary key. */
 {
-/* datasets is a comma-separated string, each a different dataset table */
-struct slName *slDatasets = slNameListFromComma(datasets);
+struct dyString *sql = dyStringNew(1024);
+
+/* The whole statement is assembled in one call; only the table name varies. */
+dyStringPrintf(sql,
+	       "CREATE TABLE %s (\n"
+	       "id int unsigned not null,\n"
+	       "name varchar(255) not null,\n"
+	       "type varchar(255) not null,\n"
+	       "PRIMARY KEY(id)\n"
+	       ")\n",
+	       tableName);
+
+sqlUpdate(biConn, sql->string);
+dyStringFree(&sql);
+}
+
+void createAnalysesTable(struct sqlConnection *biConn, char *tableName)
+/* Issue a CREATE TABLE statement on biConn for a table called tableName with
+ * the columns id, cohort_id, module_id, result_table and input_tables, using
+ * id as primary key. */
+{
+struct dyString *sql = dyStringNew(1024);
+
+/* The whole statement is assembled in one call; only the table name varies. */
+dyStringPrintf(sql,
+	       "CREATE TABLE %s (\n"
+	       "id int unsigned not null,\n"
+	       "cohort_id int unsigned not null,\n"
+	       "module_id int unsigned not null,\n"
+	       "result_table varchar(255) not null,\n"
+	       "input_tables longblob not null,\n"
+	       "PRIMARY KEY(id)\n"
+	       ")\n",
+	       tableName);
+
+sqlUpdate(biConn, sql->string);
+dyStringFree(&sql);
+}
+
+void createAnalysisParamsTable(struct sqlConnection *biConn, char *tableName)
+/* Issue a CREATE TABLE statement on biConn for a table called tableName with
+ * the columns analysis_id, name and val, indexed (not uniquely keyed) on
+ * analysis_id. */
+{
+struct dyString *sql = dyStringNew(1024);
+
+/* The whole statement is assembled in one call; only the table name varies. */
+dyStringPrintf(sql,
+	       "CREATE TABLE %s (\n"
+	       "analysis_id int unsigned not null,\n"
+	       "name varchar(255) not null,\n"
+	       "val varchar(255) not null,\n"
+	       "KEY(analysis_id)\n"
+	       ")\n",
+	       tableName);
+
+sqlUpdate(biConn, sql->string);
+dyStringFree(&sql);
+}
+
+char *getTableName(struct datasets *daList, char *module)
+/* Build a name of the form module_<data_table>_<data_table>..., appending one
+ * "_<data_table>" suffix per entry of daList in list order.  Returns NULL when
+ * daList is empty; otherwise returns the string handed back by
+ * dyStringCannibalize(). */
+{
+if (daList == NULL)
+    return NULL;
+
+struct dyString *name = dyStringNew(10);
+dyStringPrintf(name, "%s", module);
+
+struct datasets *cur = daList;
+while (cur != NULL)
+    {
+    dyStringPrintf(name, "_%s", cur->data_table);
+    cur = cur->next;
+    }
+
+return dyStringCannibalize(&name);
+}
+
+
+boolean analysesExists(struct sqlConnection *biConn, struct analyses *an)
+/* Return whether the analyses table already holds a row matching every field
+ * of an (id, cohort_id, module_id, result_table and input_tables). */
+{
+/* result_table and input_tables are assembled from the table names of every
+ * dataset in a cohort, so their length has no fixed bound.  Build the query
+ * in a growable string rather than a fixed 256-byte buffer, which a cohort
+ * with a handful of datasets would already exceed. */
+struct dyString *query = dyStringNew(256);
+dyStringPrintf(query,
+	       "select * from analyses where id = %d "
+	       "and cohort_id = %d "
+	       "and module_id = %d "
+	       "and result_table = \"%s\" "
+	       "and input_tables = \"%s\" ",
+	       an->id, an->cohort_id, an->module_id, an->result_table, an->input_tables);
+
+boolean exists = sqlExists(biConn, query->string);
+dyStringFree(&query);
+return exists;
+}
+
+void createAnalyses(struct sqlConnection *biConn, int cohort_id, int module_id, 
+		    char *result_table, char *input_tables)
+/* Save a row describing one analysis to the analyses table unless
+ * analysesExists() reports an identical row.  The row id is whatever
+ * findIdInTable() returns for result_table; both strings are cloned, so the
+ * caller keeps ownership of its arguments. */
+{
+struct analyses *row;
+AllocVar(row);
+
+row->id = findIdInTable(biConn, "analyses", "id", "result_table", result_table);
+row->cohort_id = cohort_id;
+row->module_id = module_id;
+row->result_table = cloneString(result_table);
+row->input_tables = cloneString(input_tables);
+
+boolean alreadyStored = analysesExists(biConn, row);
+if (!alreadyStored)
+    analysesSaveToDbEscaped(biConn, row, "analyses", 20);
+
+analysesFree(&row);
+}
+
+struct datasets *datasetsInCohort(struct sqlConnection *biConn, int cohort_id)
+/* Query datasets joined to datasetCohort for rows whose cohort_id matches and
+ * return the list produced by datasetsLoadByQuery(). */
+{
+char sql[256];
+safef(sql, sizeof(sql),
+      "select * from datasets "
+      "join datasetCohort on datasets.id = datasetCohort.dataset_id "
+      "where datasetCohort.cohort_id = %d;",
+      cohort_id);
+
+struct datasets *inCohort = datasetsLoadByQuery(biConn, sql);
+return inCohort;
+}
+
+struct analysisModules *analysisModulesOfType(struct sqlConnection *biConn, 
+					      char *type)
+/* Query analysisModules for rows whose type column equals type and return the
+ * list produced by analysisModulesLoadByQuery(). */
+{
+char sql[128];
+safef(sql, sizeof(sql), "select * from analysisModules where type = \"%s\" ", type);
+
+struct analysisModules *matching = analysisModulesLoadByQuery(biConn, sql);
+return matching;
+}
+
+
+void createGeneLevelAnalyses(struct sqlConnection *biConn, int cohort_id)
+/* Register one analysis for the cohort per analysisModules row of type
+ * "gene".  input_tables is the comma-separated list of the cohort's dataset
+ * data tables; result_table comes from getTableName().  Prints a message and
+ * does nothing when the cohort has no datasets. */
+{
+struct datasets *da, *daList = datasetsInCohort(biConn, cohort_id);
+
+/* getTableName() returns NULL for an empty dataset list; stop here instead of
+ * handing a NULL result table on to createAnalyses(). */
+if (!daList)
+    {
+    fprintf(stdout, "No datasets in cohort %d, no gene level analysis to perform.\n",
+	    cohort_id);
+    return;
+    }
+
+struct dyString *dy = dyStringNew(10);
+for (da = daList; da; da = da->next)
+    {
+    dyStringPrintf(dy, "%s", da->data_table);
+    if (da->next)
+	dyStringPrintf(dy, ",");
+    }
+char *input_tables = dyStringCannibalize(&dy);
+
+/* Gene analysis modules */
+struct analysisModules *am, *amList = analysisModulesOfType(biConn, "gene");
+
+for (am = amList; am; am = am->next)
+    {
+    char *result_table = getTableName(daList, am->name);
+    createAnalyses(biConn, cohort_id, am->id, result_table, input_tables);
+    /* createAnalyses() clones the strings it stores, so ours can go. */
+    freeMem(result_table);
+    }
+
+/* Release everything allocated above; none of it is returned to the caller. */
+analysisModulesFreeList(&amList);
+freeMem(input_tables);
+datasetsFreeList(&daList);
+}
+
+void createSetLevelAnalyses(struct sqlConnection *biConn, int cohort_id)
+/* Register one analysis per (module of type "set", existing gene-level
+ * analysis of the cohort) pair.  The result table is named
+ * <set module name>_<gene-level result table> and the gene-level result table
+ * is recorded as the input.  Prints a message and returns when the cohort has
+ * no gene-level analyses. */
+{
+char query[256];
+safef(query, sizeof(query),
+      "select * from analyses "
+      "join analysisModules on analyses.module_id = analysisModules.id "
+      "where analyses.cohort_id = %d and analysisModules.type = \"%s\" ",
+      cohort_id, "gene");
+
+struct analyses *an, *anList = analysesLoadByQuery(biConn, query);
+
+if (!anList)
+    {
+    fprintf(stdout, "No set level analysis to perform.\n");
+    return;
+    }
+
+/* Set (geneset/pathway) analysis modules */
+struct analysisModules *am, *amList = analysisModulesOfType(biConn, "set");
+
+/* Gene-level result table names already concatenate every dataset table of
+ * the cohort, so the derived name is built in a growable string rather than
+ * a fixed 128-byte buffer. */
+struct dyString *result_table = dyStringNew(128);
+
+for (am = amList; am; am = am->next)
+    { // loop through modules
+    for (an = anList; an; an = an->next)
+	{ // loop through gene-level results (one from each gene module)
+	dyStringClear(result_table);
+	dyStringPrintf(result_table, "%s_%s", am->name, an->result_table);
+	createAnalyses(biConn, cohort_id, am->id, result_table->string, an->result_table);
+	}
+    }
+
+/* Nothing built here is returned, so release all of it. */
+dyStringFree(&result_table);
+analysisModulesFreeList(&amList);
+analysesFreeList(&anList);
+}
+
+
+void biAnalysisAddModule(struct sqlConnection *biConn, 
+			 struct biAnalysis *ba, int module_id)
+/* Look up the name of the analysisModules row with the given id and set
+ * ba->analyze accordingly: metaGene for the module named "meta", NULL for any
+ * other name.  Does nothing when ba is NULL; aborts through errAbort() when
+ * no module has that id. */
+{
+if (!ba)
+    return;
+
+char query[128];
+safef(query, sizeof(query), 
+      "select name from analysisModules where id = %d", 
+      module_id);
+
+/* One query serves as both the existence check and the lookup. */
+char *module = sqlQuickString(biConn, query);
+if (module == NULL)
+    errAbort("No module with id = %d", module_id);
+
+if (sameString(module, "meta"))
+    ba->analyze = metaGene;
+else
+    ba->analyze = NULL;
+
+/* The looked-up name is only needed for the comparison above. */
+freeMem(module);
+}
+
+struct biAnalysis *biAnalysisListForCohort(struct sqlConnection *biConn, 
+					   char *db, int cohort_id)
+/* Build one biAnalysis per analyses row of the cohort, in query order.  Each
+ * entry gets a clone of db, a clone of the row's result_table as tableName,
+ * a fresh parameter hash, a list of the cohort's data_table names as
+ * inputTables, and the analyze callback chosen by biAnalysisAddModule().
+ * Aborts through errAbort() when the cohort has no analyses.
+ * NOTE(review): inputTables is taken from the cohort's datasets for every
+ * analysis; the row's own input_tables field is not consulted -- confirm
+ * that this is intended for set-level analyses. */
+{
+char sql[128];
+safef(sql, sizeof(sql), 
+      "select * from analyses where cohort_id = %d",
+      cohort_id);
+
+struct analyses *analysisList = analysesLoadByQuery(biConn, sql);
+if (analysisList == NULL)
+    errAbort("No analyses for cohort = %d", cohort_id);
+
+struct datasets *datasetList = datasetsInCohort(biConn, cohort_id);
+
+struct biAnalysis *result = NULL;
+struct analyses *an;
+for (an = analysisList; an != NULL; an = an->next)
+    {
+    /* Each analysis owns a separate list; names are added at the head. */
+    struct slName *tables = NULL;
+    struct datasets *da;
+    for (da = datasetList; da != NULL; da = da->next)
+	slNameAddHead(&tables, da->data_table);
+
+    struct biAnalysis *ba;
+    AllocVar(ba);
+    ba->db = cloneString(db);
+    ba->tableName  = cloneString(an->result_table);
+    ba->parameters = hashNew(0);
+    ba->inputTables = tables;
+
+    biAnalysisAddModule(biConn, ba, an->module_id);
+
+    slAddHead(&result, ba);
+    }
+/* Entries were pushed at the head; restore query order. */
+slReverse(&result);
+
+analysesFreeList(&analysisList);
+datasetsFreeList(&datasetList);
+
+return result;
+}
+
+boolean cohortExists(struct sqlConnection *biConn, int cohort_id)
+/* Return the result of sqlExists() for a lookup of cohort_id in cohorts.id. */
+{
+char sql[128];
+safef(sql, sizeof(sql), "select * from cohorts where id = %d", cohort_id);
+
+boolean found = sqlExists(biConn, sql);
+return found;
+}
+
+boolean analysisModuleExists(struct sqlConnection *biConn, struct analysisModules *am)
+/* Return whether analysisModules already has a row with am's id, name and
+ * type. */
+{
+/* name and type are varchar(255) columns, so together they can be far longer
+ * than a fixed 128-byte query buffer; build the query in a growable string. */
+struct dyString *query = dyStringNew(128);
+dyStringPrintf(query,
+	       "select * from analysisModules where id = %d "
+	       "and name = \"%s\" "
+	       "and type = \"%s\" ", 
+	       am->id, am->name, am->type);
+
+boolean exists = sqlExists(biConn, query->string);
+dyStringFree(&query);
+return exists;
+}
+
+void storeAnalysisModule(struct sqlConnection *biConn, char *name, char *type)
+/* Save a (name, type) module row to analysisModules unless
+ * analysisModuleExists() reports an identical row.  The row id is whatever
+ * findIdInTable() returns for name. */
+{
+struct analysisModules *module;
+AllocVar(module);
+
+module->name = cloneString(name);
+module->type = cloneString(type);
+module->id = findIdInTable(biConn, "analysisModules", "id", "name", module->name);
+
+boolean alreadyStored = analysisModuleExists(biConn, module);
+if (!alreadyStored)
+    analysisModulesSaveToDb(biConn, module, "analysisModules", 10);
+
+analysisModulesFree(&module);
+}
+
+void setupAnalysisModules(struct sqlConnection *biConn)
+/* Store every known analysis module through storeAnalysisModule(). */
+{
+/* One {name, type} pair per module; set up more modules by adding rows. */
+static char *modules[][2] = {
+    {"meta", "gene"},	/* Meta-Gene module */
+};
+
+size_t i;
+for (i = 0; i < sizeof(modules) / sizeof(modules[0]); i++)
+    storeAnalysisModule(biConn, modules[i][0], modules[i][1]);
+}
+
+void prepareDatabase(struct sqlConnection *biConn)
+/* Create the analyses, analysisParams and analysisModules tables when
+ * sqlTableExists() reports them missing, then store the module rows. */
+{
+boolean haveAnalyses = sqlTableExists(biConn, "analyses");
+if (!haveAnalyses)
+    {
+    createAnalysesTable(biConn, "analyses");
+    }
+
+boolean haveParams = sqlTableExists(biConn, "analysisParams");
+if (!haveParams)
+    {
+    createAnalysisParamsTable(biConn, "analysisParams");
+    }
+
+boolean haveModules = sqlTableExists(biConn, "analysisModules");
+if (!haveModules)
+    {
+    createAnalysisModulesTable(biConn, "analysisModules");
+    }
+
+/* Runs on every call, after the analysisModules table is known to exist. */
+setupAnalysisModules(biConn);
+}
+
+void bioController(char *db, int cohort_id)
+/* Drive one run for a cohort: open a connection to db, check that the cohort
+ * exists, make sure the analysis tables and module rows are present, register
+ * gene-level and then set-level analyses, build the biAnalysis list and hand
+ * it to runAnalysisPipeline().  Aborts through errAbort() when cohort_id is
+ * not found in the cohorts table. */
+{
+struct sqlConnection *biConn = hAllocConnProfile("localDb", db);
+
+if (!cohortExists(biConn, cohort_id))
+    {
+    /* Release the connection first; errAbort() is the error exit. */
+    hFreeConn(&biConn);
+    errAbort("Cohort %d does not exist.", cohort_id);
+    }
+
+prepareDatabase(biConn);
+
+/* Gene-level rows must be stored first: createSetLevelAnalyses() queries the
+ * analyses table for them. */
+createGeneLevelAnalyses(biConn, cohort_id);
+
+createSetLevelAnalyses(biConn, cohort_id);
 
-struct biAnalysis *baList = registerGeneLevelAnalyses(db, slDatasets);    
+struct biAnalysis *baList = biAnalysisListForCohort(biConn, db, cohort_id);
+/* The connection is released before the pipeline is started. */
+hFreeConn(&biConn);
 
-runAnalysisPipeline(db, datasets, baList);
+runAnalysisPipeline(baList);
 }
 
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 2)
     usage();
 
-bioController(BIOINT_DB, argv[1]);
+/* atoi() would silently turn a non-numeric argument into cohort 0 and accept
+ * trailing garbage, so parse with strtol() and reject anything that is not a
+ * complete integer fitting in an int. */
+char *end = NULL;
+long cohort_id = strtol(argv[1], &end, 10);
+if (end == argv[1] || *end != '\0' || (long)(int)cohort_id != cohort_id)
+    errAbort("Invalid cohort id '%s': expected an integer.", argv[1]);
+
+bioController(BIOINT_DB, (int)cohort_id);
 return 0;
 }