src/hg/instinct/bioInt2/bioController.c 1.2

1.2 2009/03/21 19:54:10 jsanborn
added routine to set cohorts
Index: src/hg/instinct/bioInt2/bioController.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/instinct/bioInt2/bioController.c,v
retrieving revision 1.1
retrieving revision 1.2
diff -b -B -U 1000000 -r1.1 -r1.2
--- src/hg/instinct/bioInt2/bioController.c	20 Mar 2009 06:06:31 -0000	1.1
+++ src/hg/instinct/bioInt2/bioController.c	21 Mar 2009 19:54:10 -0000	1.2
@@ -1,425 +1,460 @@
 /* mapProbesToGenes - Will maps probes in BED format to overlapping gene(s). */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "jksql.h"
 #include "hPrint.h"
 #include "hdb.h"  
 #include "dystring.h"
 #include "bioIntDb.h"
 #include "bioIntDriver.h"
 #include "cprob.h"
 #include "hgStatsLib.h"
 #include "bioController.h"
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
 	 "bioController - controller for bioIntegrator pipeline\n"
 	 "usage:\n"
 	 "   bioIntegrator datasets\n"
 	 "   -datasets = comma-separated list of datasets\n"
 	 );
 }
 
 #define BIOINT_DB "bioInt"
 
 static struct optionSpec options[] = {
     {NULL, 0}
 }; 
 
 void printSlName(struct slName *slList)
 {
 if (!slList)
     fprintf(stdout, "nothing in slName list\n");
 
 struct slName *sl;
 
 fprintf(stdout, "num samples = %d\n", slCount(slList));
 for (sl = slList; sl; sl = sl->next)
     {
     fprintf(stdout, "%s", sl->name);
     if (sl->next)
 	fprintf(stdout, ",");
     }
 fprintf(stdout, "\n");
 }
 
 void slPairDoubleFree(struct slPair **pEl)
 {
 struct slPair *el;
 if ((el = *pEl) == NULL) return;
 
 freeMem(el->name);
 struct slDouble *sdList = el->val;
 slFreeList(&sdList);
 freez(pEl);
 }
 
 void slPairDoubleFreeList(struct slPair **pList)
 {
 struct slPair *el, *next;
 
 for (el = *pList; el != NULL; el = next)
     {
     next = el->next;
     slPairDoubleFree(&el);
     }
 *pList = NULL;
 } 
 
 void analysisResultFree(struct analysisResult **pEl)
 {
 struct analysisResult *el;
 if ((el = *pEl) == NULL) return;
 
 freeMem(el->sample);
 freeMem(el->feature);
 freez(pEl);
 }
 
 void analysisResultFreeList(struct analysisResult **pList)
 {
 struct analysisResult *el, *next;
 
 for (el = *pList; el != NULL; el = next)
     {
     next = el->next;
     analysisResultFree(&el);
     }
 *pList = NULL;
 } 
 
 struct slName *getAvailableGenes(char *db, struct biResults *br)
 {
 struct sqlConnection *biConn = hAllocConnProfile("localDb", db);
 
 char query[256];
 safef(query, sizeof(query), "select DISTINCT geneSymbol from kgXref join knownGene on kgXref.kgId = knownGene.name");
 
 struct slName *sl, *slList = sqlQuickList(biConn, query);
 hFreeConn(&biConn);
 
 struct slName *geneList = NULL;
 for (sl = slList; sl; sl = sl->next)
     {
     struct slName *probes = br->probesForGene(br, sl->name);
     int numProbes = slCount(probes);
     slNameFreeList(&probes);
     if (numProbes == 0)
 	continue;
 
     slNameAddHead(&geneList, sl->name);
     }
 
 return geneList;
 } 
 
 
 struct slPair *geneLevelData(struct biResults *br, struct biData *bdList, char *sample, char *gene) 
 {
 struct biData *bd;
 struct slPair *sp, *spList = NULL;
 for (bd = bdList; bd; bd = bd->next)
     {
     char *dataset = bd->name;
     struct slName *sl, *probes = br->probesForGeneInDataset(br, gene, dataset);
 
     AllocVar(sp);
     sp->name = cloneString(bd->type);
     sp->val = NULL;
     struct slDouble *sd, *sdList = NULL;
     for (sl = probes; sl; sl = sl->next)
 	{
 	struct hashEl *el = hashLookup(bd->hash, sl->name);
 	if (!el)
 	    continue;
 	struct slDouble *sdVal = el->val;
 	sd = slDoubleNew(sdVal->val);
 	slAddHead(&sdList, sd);
 	}
     sp->val = sdList;
     slAddHead(&spList, sp);
 
     slNameFreeList(&probes);
     }
 
 return spList;
 }
 
 struct analysisResult *geneLevelAnalysis(struct biAnalysis *ba, struct biResults *br, 
 					 struct slName *genes)
 {
 fprintf(stdout, "starting gene level analysis\n");
 
 struct slName *gene, *sample, *samples = br->allSamplesInCommon(br);
 
 struct analysisResult *ar, *arList = NULL;
 for (sample = samples; sample; sample = sample->next)
     {
     struct biData *bdList = br->dataForSample(br, sample->name);
     for (gene = genes; gene; gene = gene->next)
 	{
 	struct slPair *spList = geneLevelData(br, bdList, sample->name, gene->name);
 	if (!spList)
 	    continue;
 	ar = ba->analyze(ba, spList, sample->name, gene->name);
 	slPairDoubleFreeList(&spList);
 	if (!ar)
 	    continue;
 	slAddHead(&arList, ar);
 	}
     fprintf(stdout, ".");    
     biDataFree(&bdList);    
     }
 fprintf(stdout, "\n");
 
 return arList;
 }
 
 int findIdInTable(struct sqlConnection *biConn, char *tableName,
 		  char *idField, char *sField, char *name)
 {
 if (sqlTableSize(biConn, tableName) == 0)  /* brand new table, return 0 */
     return 0;
 
 char query[256];
 safef(query, sizeof(query),
       "select DISTINCT %s from %s where %s = \"%s\";",
       idField, tableName, sField, name);
 if (sqlExists(biConn, query))  /* sample name found, use same id */
     return sqlQuickNum(biConn, query);
 
 /* Else, find maximum sample id and add one to it */
 safef(query, sizeof(query),
       "select max(%s) from %s;",
       idField, tableName);
 int maxId = sqlQuickNum(biConn, query);
 return maxId + 1;
 }   
 
 
 void createAnalysisFeaturesTable(struct sqlConnection *biConn, char *tableName)
 {
 struct dyString *dy = newDyString(1024);
 dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
-dyStringPrintf(dy, "feature_id int unsigned not null,\n");
+dyStringPrintf(dy, "id int unsigned not null,\n");
 dyStringPrintf(dy, "feature_name varchar(255) not null,\n");
-dyStringPrintf(dy, "KEY(feature_id),\n");
+dyStringPrintf(dy, "KEY(id),\n");
 dyStringPrintf(dy, "KEY(feature_name),\n");
-dyStringPrintf(dy, "KEY(feature_id, feature_name),\n");
-dyStringPrintf(dy, "KEY(feature_name, feature_id)\n");
+dyStringPrintf(dy, "KEY(id, feature_name),\n");
+dyStringPrintf(dy, "KEY(feature_name, id)\n");
 dyStringPrintf(dy, ")\n");
 sqlUpdate(biConn,dy->string);
 dyStringFree(&dy);
 } 
 
 void createAnalysisValsTable(struct sqlConnection *biConn, char *tableName)
 {
 struct dyString *dy = newDyString(1024);
 dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
 dyStringPrintf(dy, "sample_id int unsigned not null,\n");
 dyStringPrintf(dy, "feature_id int unsigned not null,\n");
 dyStringPrintf(dy, "val float not null,\n");
 dyStringPrintf(dy, "conf float not null,\n");
 dyStringPrintf(dy, "KEY(feature_id, sample_id),\n");
 dyStringPrintf(dy, "KEY(sample_id, feature_id)\n");
 dyStringPrintf(dy, ")\n");
 sqlUpdate(biConn,dy->string);
 dyStringFree(&dy);
 } 
 
 boolean analysisFeatureExists(struct sqlConnection *biConn, struct analysisFeatures *af)
 {
 char query[256];
 safef(query, sizeof(query),
-      "select * from analysisFeatures where feature_id = %d "
+      "select * from analysisFeatures where id = %d "
       "and feature_name = \"%s\" ",
-      af->feature_id, af->feature_name);
+      af->id, af->feature_name);
 
 return sqlExists(biConn, query);
 } 
 
+struct hash *getAnalysisFeaturesHash(struct sqlConnection *biConn)
+{
+fprintf(stdout, "getting analysis features hash.\n");
+
+struct hash *hash = hashNew(0);
+char query[128];
+safef(query, sizeof(query), "select * from analysisFeatures");
+
+struct analysisFeatures *af, *afList = analysisFeaturesLoadByQuery(biConn, query);
+
+for (af = afList; af; af = af->next)
+    hashAddInt(hash, af->feature_name, af->id);
+
+analysisFeaturesFreeList(&afList);
+return hash;
+}  
+
 struct hash *storeAnalysisFeaturesInDb(struct sqlConnection *biConn, struct analysisResult *arList)
 {
 fprintf(stdout, "storing analysis features\n");
 
 char *tableName = "analysisFeatures";
 if (!sqlTableExists(biConn, tableName))
     createAnalysisFeaturesTable(biConn, tableName);
 
+/* Get existing analysis features */
+struct hash *hash = getAnalysisFeaturesHash(biConn);
+
 struct analysisFeatures *af;
 struct analysisResult *ar;
-struct hash *hash = hashNew(0);
 for (ar = arList; ar; ar = ar->next)
     {
     if (hashLookup(hash, ar->feature))
 	continue;  // already visited, ignore
 
-    int feature_id = findIdInTable(biConn, tableName, "feature_id", "feature_name", ar->feature); 
+    int feature_id = findIdInTable(biConn, tableName, "id", "feature_name", ar->feature); 
 
     AllocVar(af);
-    af->feature_id = feature_id;
+    af->id = feature_id;
     af->feature_name = cloneString(ar->feature);
     if (!analysisFeatureExists(biConn, af))
 	analysisFeaturesSaveToDb(biConn, af, tableName, 10);
 
-    hashAddInt(hash, af->feature_name, af->feature_id);
+    hashAddInt(hash, af->feature_name, af->id);
     analysisFeaturesFree(&af);
     }
 
 return hash;
 }
 
 struct hash *getSampleIdHash(struct sqlConnection *biConn)
 {
 fprintf(stdout, "getting sample id hash.\n");
 
 struct hash *hash = hashNew(0);
 char query[128];
 safef(query, sizeof(query), "select * from samples");
 
 struct samples *sa, *saList = samplesLoadByQuery(biConn, query);
 
 for (sa = saList; sa; sa = sa->next)
     hashAddInt(hash, sa->name, sa->id);
 
+samplesFreeList(&saList);
 return hash;
 }
 
 
 void storeAnalysisResultsInDb(struct sqlConnection *biConn, struct biAnalysis *ba, 
 			      struct analysisResult *arList)
 {
 fprintf(stdout, "storing analysis in table %s in %s db\n", 
 	ba->tableName, ba->db);
 
 if (!sqlTableExists(biConn, ba->tableName))
     createAnalysisValsTable(biConn, ba->tableName);
 
 struct hash *featureIds = storeAnalysisFeaturesInDb(biConn, arList); 
 struct hash *sampleIds = getSampleIdHash(biConn);
 
 struct analysisVals *av = AllocA(struct analysisVals);
 struct analysisResult *ar;
 for (ar = arList; ar; ar = ar->next)
     {
     int sample_id = hashIntValDefault(sampleIds, ar->sample, -1);
     int feature_id = hashIntValDefault(featureIds, ar->feature, -1);
 
     if (sample_id == -1 || feature_id == -1)
 	continue;
 
     av->sample_id  = sample_id;
     av->feature_id = feature_id;
     av->val        = ar->val;
     av->conf       = ar->conf;
 
     analysisValsSaveToDb(biConn, av, ba->tableName, 50);
     }
 
 analysisValsFree(&av);
 }
 
 boolean analysisExists(struct sqlConnection *biConn, struct biAnalysis *ba)
 {
 if (!sqlTableExists(biConn, ba->tableName))
     return FALSE;
 
 if (sqlTableSize(biConn, ba->tableName) == 0)
     return FALSE;
 
 return TRUE;
 }
 
+boolean analysisListExists(char *db, struct biAnalysis *baList)
+{
+struct sqlConnection *biConn = hAllocConnProfile("localDb", db);
+
+boolean exists = TRUE;
+struct biAnalysis *ba;
+for (ba = baList; ba; ba = ba->next)
+    if (!analysisExists(biConn, ba))
+	exists = FALSE;
+
+hFreeConn(&biConn);
+return exists;
+}
+
 
 void geneLevelPipeline(struct biAnalysis *baList, struct biResults *br, struct slName *genes)
 {
 if (!baList)
     return;
 
 /* This assumes all results go to same db (which is safe for now) */
 struct sqlConnection *biConn = hAllocConnProfile("localDb", baList->db);
 
 struct biAnalysis *ba;
 for (ba = baList; ba; ba = ba->next)
     {
     if (analysisExists(biConn, ba))
 	continue;
    
     struct analysisResult *arList = geneLevelAnalysis(ba, br, genes);
 
     fprintf(stdout, "storing in db\n");
     uglyTime(NULL);
     storeAnalysisResultsInDb(biConn, ba, arList);
     uglyTime("done");
 
     analysisResultFreeList(&arList);
     }
 
 hFreeConn(&biConn);
 }
 
 struct biResults *retrieveData(char *db, struct slName *datasets, boolean toLogP)
 {
 struct slName *dataset;
 
 struct biQuery *bq, *bqList = NULL;
 for (dataset = datasets; dataset; dataset = dataset->next)
     {
     bq = biQueryNew(db, dataset->name);  // Add Dataset 
     bq->getAllProbes = TRUE;        // Set flag to retrieve all probes
     biQueryAppend(&bqList, bq);     // Append query to query list 
     }
 
 /* Get results from all queries in list */
 struct biResults *br = biQueryResults(bqList);
 
 /* Convert to log p */
 if (toLogP)
     br->toLogP(br);
 
 return br;
 }
 
 void runPipelineOnDatasets(char *db, char *datasets)
 {
 /* datasets is a comma-separated string, each a different dataset table */
 struct slName *slDatasets = slNameListFromComma(datasets);
 
-/* Get raw gene/sample data for all overlapping samples in dataset list */
-uglyTime(NULL);
-struct biResults *br = retrieveData(db, slDatasets, TRUE);
-uglyTime("retrieveData");
+struct biAnalysis *baList = registerGeneLevelAnalyses(db, slDatasets);
 
-if (!br)
-    errAbort("No gene data!\n");
+if (!analysisListExists(db, baList))
+    {
+    /* Get raw gene/sample data for all overlapping samples in dataset list */
+    uglyTime(NULL);
+    struct biResults *br = retrieveData(db, slDatasets, TRUE);
+    uglyTime("retrieveData");
 
-/* Run gene level analyses (meta-gene, pathlet, etc.) */
-struct slName *genes = getAvailableGenes(db, br);
-uglyTime("Num available genes = %d", slCount(genes));
+    if (!br)
+	errAbort("No gene data!\n");
 
-struct biAnalysis *baList = registerGeneLevelAnalyses(db, slDatasets);
+    /* Run gene level analyses (meta-gene, pathlet, etc.) */
+    struct slName *genes = getAvailableGenes(db, br);
+    uglyTime("Num available genes = %d", slCount(genes));
 
-uglyTime(NULL);
-geneLevelPipeline(baList, br, genes);
-uglyTime("geneLevelPipeline");
+    geneLevelPipeline(baList, br, genes);
+    }
 
 /* Run pathway/geneset level analyses */
 
 
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 2)
     usage();
 
 runPipelineOnDatasets(BIOINT_DB, argv[1]);
 
 return 0;
 }