src/hg/instinct/bioInt2/bioController.c 1.6

1.6 2009/03/24 03:07:55 jsanborn
updated
Index: src/hg/instinct/bioInt2/bioController.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/instinct/bioInt2/bioController.c,v
retrieving revision 1.5
retrieving revision 1.6
diff -b -B -U 4 -r1.5 -r1.6
--- src/hg/instinct/bioInt2/bioController.c	23 Mar 2009 18:19:29 -0000	1.5
+++ src/hg/instinct/bioInt2/bioController.c	24 Mar 2009 03:07:55 -0000	1.6
@@ -99,106 +98,210 @@
 
 return sqlExists(biConn, query);
 }      
 
-void createAnalyses(struct sqlConnection *biConn, int cohort_id, int module_id, 
-		    char *result_table, char *input_tables)
+boolean analysesWithResultTableExists(struct sqlConnection *biConn, char *result_table)
 {
-int id = findIdInTable(biConn, "analyses", "id", "result_table", result_table);
-
-struct analyses *an;
-AllocVar(an);
-an->id = id;  
-an->cohort_id = cohort_id;
-an->module_id = module_id;
-an->result_table = cloneString(result_table);
-an->input_tables = cloneString(input_tables);
-
-if (!analysesExists(biConn, an))
-    analysesSaveToDbEscaped(biConn, an, "analyses", 20);
+char query[256];
+safef(query, sizeof(query),
+      "select * from analyses where result_table = \"%s\" ",
+      result_table);
 
-analysesFree(&an);
+return sqlExists(biConn, query);
 }
 
-struct datasets *datasetsInCohort(struct sqlConnection *biConn, int cohort_id)
+struct hash *analysisParamsHash(struct sqlConnection *biConn, struct analyses *an)
 {
+struct hash *hash = hashNew(0);
 char query[256];
 safef(query, sizeof(query), 
-      "select * from datasets join datasetCohort on datasets.id = datasetCohort.dataset_id "
-      "where datasetCohort.cohort_id = %d;",
-      cohort_id);
+      "select * from analysisParams where analysis_id = %d",
+      an->id);
+struct analysisParams *ap, *apList = analysisParamsLoadByQuery(biConn, query);
 
-return datasetsLoadByQuery(biConn, query);
+for (ap = apList; ap; ap = ap->next)
+    hashAdd(hash, ap->name, ap->val);
+
+return hash;
 }
 
-struct analysisModules *analysisModulesOfType(struct sqlConnection *biConn, 
-					      char *type)
+void storeAnalysisParams(struct sqlConnection *biConn, 
+			 struct analyses *an, struct hash *params)
 {
-char query[128];
-safef(query, sizeof(query),
-      "select * from analysisModules where type = \"%s\" ",
-      type);
+struct hashEl *el;
+struct hashCookie cookie = hashFirst(params);
+while ((el = hashNext(&cookie)) != NULL)
+    {
+    char *name = el->name;
+    char *val = el->val;  
 
-return analysisModulesLoadByQuery(biConn, query);
+    struct analysisParams *ap;
+    AllocVar(ap);
+    ap->analysis_id = an->id;
+    ap->name = cloneString(name);
+    ap->val = cloneString(val);
+
+    analysisParamsSaveToDbEscaped(biConn, ap, "analysisParams", 50);
+
+    analysisParamsFree(&ap);
+    }
 }
 
 
-void createGeneLevelAnalyses(struct sqlConnection *biConn, int cohort_id)
+boolean matchingParams(struct sqlConnection *biConn, int analysis_id, struct hash *params)
 {
-struct datasets *da, *daList = datasetsInCohort(biConn, cohort_id);
+char query[256];
+safef(query, sizeof(query), 
+      "select * from analysisParams where analysis_id = %d",
+      analysis_id);
 
-struct dyString *dy = dyStringNew(10);
-for (da = daList; da; da = da->next)
+struct analysisParams *ap, *apList = analysisParamsLoadByQuery(biConn, query);
+
+if (!apList) // no analysis params for this id 
     {
-    dyStringPrintf(dy, "%s", da->data_table);
-    if (da->next)
-	dyStringPrintf(dy, ",");
+    if (hashNumEntries(params) == 0)  
+	return TRUE;     // none here either, a match
+    else
+	return FALSE;    // params exist in hash, no match
     }
-char *input_tables = dyStringCannibalize(&dy);
 
-/* Gene analysis modules */
-struct analysisModules *am, *amList = analysisModulesOfType(biConn, "gene");
+/* apList is not NULL: walk the list and check that every analysisParam has a
+ * matching key-value element in the hash */
+struct hashEl *el;
+char *name, *val;
+boolean matching = TRUE;
+for (ap = apList; ap; ap = ap->next)
+    {
+    el = hashLookup(params, ap->name);
+    if (!el)
+	{
+	matching = FALSE;
+	continue;
+	}
+
+    val = el->val;
+    if (!sameString(val, ap->val))
+	matching = FALSE;
+    }
 
-for (am = amList; am; am = am->next)
+/* need to check reverse direction */
+struct hashCookie cookie = hashFirst(params);
+while ((el = hashNext(&cookie)) != NULL)
     {
-    char *result_table = getTableName(daList, am->name);
-    createAnalyses(biConn, cohort_id, am->id, result_table, input_tables);
+    name = el->name;
+    val = el->val;
+
+    boolean foundMatch = FALSE;
+    for (ap = apList; ap; ap = ap->next)
+	if (sameString(ap->name, name) && sameString(ap->val, val))
+	    foundMatch = TRUE;
+
+    if (!foundMatch)
+	matching = FALSE;
     }
+
+return matching;
 }
 
-void createSetLevelAnalyses(struct sqlConnection *biConn, int cohort_id)
+char *createAnalysesResultTableName(struct sqlConnection *biConn, int cohort_id, 
+				    struct analysisModules *am, char *input_tables, 
+				    struct hash *params)
 {
-/* Get all gene-level analyses */
-char query[256];
-safef(query, sizeof(query),
-      "select * from analyses "
-      "join analysisModules on analyses.module_id = analysisModules.id "
-      "where analyses.cohort_id = %d and analysisModules.type = \"%s\" ",
-      cohort_id, "gene");
+if (!input_tables)
+    return NULL;
 
-struct analyses *an, *anList = analysesLoadByQuery(biConn, query);
+struct slName *sl, *slList = slNameListFromComma(input_tables);
+struct dyString *dy = dyStringNew(10);
 
-if (!anList)
+/* Create prefix, e.g. "moduleName_table1_table2" */
+dyStringPrintf(dy, "%s", am->name);
+for (sl = slList; sl; sl = sl->next)
+    dyStringPrintf(dy, "_%s", sl->name);
+char *prefix = dyStringCannibalize(&dy);
+
+/* Initial table name is simply prefix */
+char name[256];
+safef(name, sizeof(name), "%s", prefix);
+
+int count = 0;
+boolean foundName = FALSE;
+while (!foundName && count < 100)
     {
-    fprintf(stdout, "No set level analysis to perform.\n");
-    return;
+    count++;
+    if (!analysesWithResultTableExists(biConn, name))
+	{
+	foundName = TRUE;
+	break;
     }
 
-/* Set (geneset/pathway) analysis modules */
-struct analysisModules *am, *amList = analysisModulesOfType(biConn, "set");
+    int id = findIdInTable(biConn, "analyses", "id", "result_table", name);
+    if (matchingParams(biConn, id, params))
+	{ 
+	foundName = TRUE;
+	break;
+	}
 
-for (am = amList; am; am = am->next)
-    { // loop through modules
-    for (an = anList; an; an = an->next)
-	{ // loop through gene-level results (one from each gene module)
-	char result_table[128];
-	safef(result_table, sizeof(result_table),
-	      "%s_%s", am->name, an->result_table);
-	createAnalyses(biConn, cohort_id, am->id, result_table, an->result_table);
+    /* If we got here, params don't match the existing analysis id;
+     * append a number to the prefix and try again. */
+    safef(name, sizeof(name), "%s_%d", prefix, count);
 	}
+
+if (!foundName)
+    errAbort("Could not find a unique table name after 100 attempts\n");
+
+char *result_table = cloneString(name);
+return result_table;
+}
+
+struct analyses *createAnalyses(struct sqlConnection *biConn, int cohort_id, 
+				struct analysisModules *am, char *input_tables, 
+				struct hash *params)
+{
+if (!input_tables)
+    return NULL;
+
+/* Get unique name or name already in database for exact matching analysis */
+char *result_table = createAnalysesResultTableName(biConn, cohort_id, am, input_tables, params);
+int id = findIdInTable(biConn, "analyses", "id", "result_table", result_table);
+
+struct analyses *an;
+AllocVar(an);
+an->id = id;  
+an->cohort_id = cohort_id;
+an->module_id = am->id;
+an->result_table = cloneString(result_table);
+an->input_tables = cloneString(input_tables);
+
+if (!analysesExists(biConn, an))
+    {
+    analysesSaveToDbEscaped(biConn, an, "analyses", 20);
+    storeAnalysisParams(biConn, an, params);
     }
+
+return an;
 }
 
+struct datasets *datasetsInCohort(struct sqlConnection *biConn, int cohort_id)
+{
+char query[256];
+safef(query, sizeof(query), 
+      "select * from datasets join datasetCohort on datasets.id = datasetCohort.dataset_id "
+      "where datasetCohort.cohort_id = %d;",
+      cohort_id);
+
+return datasetsLoadByQuery(biConn, query);
+}
+
+struct analysisModules *analysisModulesMatching(struct sqlConnection *biConn, 
+						char *field, char *val)
+{
+char query[128];
+safef(query, sizeof(query),
+      "select * from analysisModules where %s = \"%s\" ",
+      field, val);
+
+return analysisModulesLoadByQuery(biConn, query);
+}
 
 void biAnalysisAddModule(struct sqlConnection *biConn, 
 			 struct biAnalysis *ba, int module_id)
 {
@@ -206,31 +309,30 @@
     return;
 
 char query[128];
 safef(query, sizeof(query), 
-      "select name from analysisModules where id = %d", 
+      "select * from analysisModules where id = %d", 
       module_id);
 
-if (!sqlExists(biConn, query))
+struct analysisModules *am = analysisModulesLoadByQuery(biConn, query);
+if (!am)
     errAbort("No module with id = %d", module_id);
 
-char *module = sqlQuickString(biConn, query);
-
-if (sameString(module, "meta"))
-    {
+/* Set pipeline by module type (gene, set, ...) */
+if (sameString(am->type, "gene"))
     ba->pipeline = geneLevelPipeline;
-    ba->analyze = metaGene;
-    }
-else if (sameString(module, "metaGeneset"))
-    {
+else if (sameString(am->type, "set"))
     ba->pipeline = genesetLevelPipeline;
-    ba->analyze = metaGeneset;
-    }
 else
-    {
     ba->pipeline = NULL;
+
+/* Set analysis algorithm by module name */
+if (sameString(am->name, "meta"))
+    ba->analyze = metaGene;
+else if (sameString(am->name, "metaGeneset"))
+    ba->analyze = metaGeneset;
+else
     ba->analyze = NULL;
-    }
 }
 
 struct biAnalysis *biAnalysisListForCohort(struct sqlConnection *biConn, 
 					   char *db, int cohort_id)
@@ -251,9 +353,9 @@
     {
     AllocVar(ba);
     ba->db = cloneString(db);
     ba->tableName  = cloneString(an->result_table);
-    ba->parameters = hashNew(0);
+    ba->parameters = analysisParamsHash(biConn, an);
     ba->inputTables = slNameListFromComma(an->input_tables);
     
     biAnalysisAddModule(biConn, ba, an->module_id);
 
@@ -308,8 +410,50 @@
 /* Set up more modules below, similar to above */
 storeAnalysisModule(biConn, "metaGeneset", "set");
 }
 
+void createStandardPipeline(struct sqlConnection *biConn, int cohort_id)
+{
+struct datasets *da, *daList = datasetsInCohort(biConn, cohort_id);
+
+struct dyString *dy = dyStringNew(10);
+for (da = daList; da; da = da->next)
+    {
+    dyStringPrintf(dy, "%s", da->data_table);
+    if (da->next)
+	dyStringPrintf(dy, ",");
+    }
+char *input_tables = dyStringCannibalize(&dy);
+
+struct analysisModules *am;
+
+/* Gene analysis modules */
+am = analysisModulesMatching(biConn, "name", "meta");
+if (!am)
+    errAbort("No analysis module named meta");
+
+struct hash *params = hashNew(0);
+hashAdd(params, "fxn", cloneString("fishers"));
+struct analyses *gene = createAnalyses(biConn, cohort_id, am, input_tables, params);
+char *gene_result_table = cloneString(gene->result_table);
+analysesFree(&gene);
+analysisModulesFree(&am);
+hashFree(&params);
+
+/* Set (geneset/pathway) analysis modules */
+am = analysisModulesMatching(biConn, "name", "metaGeneset");
+if (!am)
+    errAbort("No analysis module named metaGeneset");
+
+params = hashNew(0);
+hashAdd(params, "fxn", cloneString("fishers"));
+struct analyses *set = createAnalyses(biConn, cohort_id, am, gene_result_table, params);
+analysesFree(&set);
+analysisModulesFree(&am);
+hashFree(&params);
+}
+
+
 void prepareDatabase(struct sqlConnection *biConn)
 {
 if (!sqlTableExists(biConn, "analyses"))
     createAnalysesTable(biConn, "analyses");
@@ -331,14 +475,11 @@
     {
     hFreeConn(&biConn);
     errAbort("Cohort %d does not exist.", cohort_id);
     }
-
 prepareDatabase(biConn);
 
-createGeneLevelAnalyses(biConn, cohort_id);
-
-createSetLevelAnalyses(biConn, cohort_id);
+createStandardPipeline(biConn, cohort_id);
 
 struct biAnalysis *baList = biAnalysisListForCohort(biConn, db, cohort_id);
 hFreeConn(&biConn);