src/hg/instinct/bioInt2/bioLevelI.c 1.1
1.1 2009/03/21 21:31:54 jsanborn
moved stuff around
Index: src/hg/instinct/bioInt2/bioLevelI.c
===================================================================
RCS file: src/hg/instinct/bioInt2/bioLevelI.c
diff -N src/hg/instinct/bioInt2/bioLevelI.c
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ src/hg/instinct/bioInt2/bioLevelI.c 21 Mar 2009 21:31:54 -0000 1.1
@@ -0,0 +1,430 @@
+/* mapProbesToGenes - Will maps probes in BED format to overlapping gene(s). */
+#include "common.h"
+#include "linefile.h"
+#include "hash.h"
+#include "options.h"
+#include "jksql.h"
+#include "hPrint.h"
+#include "hdb.h"
+#include "dystring.h"
+#include "bioIntDb.h"
+#include "bioIntDriver.h"
+#include "cprob.h"
+#include "hgStatsLib.h"
+#include "bioController.h"
+
+
+void printSlName(struct slName *slList)
+{
+if (!slList)
+ fprintf(stdout, "nothing in slName list\n");
+
+struct slName *sl;
+
+fprintf(stdout, "num samples = %d\n", slCount(slList));
+for (sl = slList; sl; sl = sl->next)
+ {
+ fprintf(stdout, "%s", sl->name);
+ if (sl->next)
+ fprintf(stdout, ",");
+ }
+fprintf(stdout, "\n");
+}
+
+void slPairDoubleFree(struct slPair **pEl)
+{
+struct slPair *el;
+if ((el = *pEl) == NULL) return;
+
+freeMem(el->name);
+struct slDouble *sdList = el->val;
+slFreeList(&sdList);
+freez(pEl);
+}
+
+void slPairDoubleFreeList(struct slPair **pList)
+{
+struct slPair *el, *next;
+
+for (el = *pList; el != NULL; el = next)
+ {
+ next = el->next;
+ slPairDoubleFree(&el);
+ }
+*pList = NULL;
+}
+
+void analysisResultFree(struct analysisResult **pEl)
+{
+struct analysisResult *el;
+if ((el = *pEl) == NULL) return;
+
+freeMem(el->sample);
+freeMem(el->feature);
+freez(pEl);
+}
+
+void analysisResultFreeList(struct analysisResult **pList)
+{
+struct analysisResult *el, *next;
+
+for (el = *pList; el != NULL; el = next)
+ {
+ next = el->next;
+ analysisResultFree(&el);
+ }
+*pList = NULL;
+}
+
+struct slName *getAvailableGenes(char *db, struct biResults *br)
+{
+struct sqlConnection *biConn = hAllocConnProfile("localDb", db);
+
+char query[256];
+safef(query, sizeof(query), "select DISTINCT geneSymbol from kgXref join knownGene on kgXref.kgId = knownGene.name");
+
+struct slName *sl, *slList = sqlQuickList(biConn, query);
+hFreeConn(&biConn);
+
+struct slName *geneList = NULL;
+for (sl = slList; sl; sl = sl->next)
+ {
+ struct slName *probes = br->probesForGene(br, sl->name);
+ int numProbes = slCount(probes);
+ slNameFreeList(&probes);
+ if (numProbes == 0)
+ continue;
+
+ slNameAddHead(&geneList, sl->name);
+ }
+
+return geneList;
+}
+
+
+struct slPair *geneLevelData(struct biResults *br, struct biData *bdList, char *sample, char *gene)
+{
+struct biData *bd;
+struct slPair *sp, *spList = NULL;
+for (bd = bdList; bd; bd = bd->next)
+ {
+ char *dataset = bd->name;
+ struct slName *sl, *probes = br->probesForGeneInDataset(br, gene, dataset);
+
+ AllocVar(sp);
+ sp->name = cloneString(bd->type);
+ sp->val = NULL;
+ struct slDouble *sd, *sdList = NULL;
+ for (sl = probes; sl; sl = sl->next)
+ {
+ struct hashEl *el = hashLookup(bd->hash, sl->name);
+ if (!el)
+ continue;
+ struct slDouble *sdVal = el->val;
+ sd = slDoubleNew(sdVal->val);
+ slAddHead(&sdList, sd);
+ }
+ sp->val = sdList;
+ slAddHead(&spList, sp);
+
+ slNameFreeList(&probes);
+ }
+
+return spList;
+}
+
+struct analysisResult *geneLevelAnalysis(struct biAnalysis *ba, struct biResults *br,
+ struct slName *genes)
+{
+fprintf(stdout, "starting gene level analysis\n");
+
+struct slName *gene, *sample, *samples = br->allSamplesInCommon(br);
+
+struct analysisResult *ar, *arList = NULL;
+for (sample = samples; sample; sample = sample->next)
+ {
+ struct biData *bdList = br->dataForSample(br, sample->name);
+ for (gene = genes; gene; gene = gene->next)
+ {
+ struct slPair *spList = geneLevelData(br, bdList, sample->name, gene->name);
+ if (!spList)
+ continue;
+ ar = ba->analyze(ba, spList, sample->name, gene->name);
+ slPairDoubleFreeList(&spList);
+ if (!ar)
+ continue;
+ slAddHead(&arList, ar);
+ }
+ fprintf(stdout, ".");
+ biDataFree(&bdList);
+ }
+fprintf(stdout, "\n");
+
+return arList;
+}
+
+int findIdInTable(struct sqlConnection *biConn, char *tableName,
+ char *idField, char *sField, char *name)
+{
+if (sqlTableSize(biConn, tableName) == 0) /* brand new table, return 0 */
+ return 0;
+
+char query[256];
+safef(query, sizeof(query),
+ "select DISTINCT %s from %s where %s = \"%s\";",
+ idField, tableName, sField, name);
+if (sqlExists(biConn, query)) /* sample name found, use same id */
+ return sqlQuickNum(biConn, query);
+
+/* Else, find maximum sample id and add one to it */
+safef(query, sizeof(query),
+ "select max(%s) from %s;",
+ idField, tableName);
+int maxId = sqlQuickNum(biConn, query);
+return maxId + 1;
+}
+
+
+void createAnalysisFeaturesTable(struct sqlConnection *biConn, char *tableName)
+{
+struct dyString *dy = newDyString(1024);
+dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
+dyStringPrintf(dy, "id int unsigned not null,\n");
+dyStringPrintf(dy, "feature_name varchar(255) not null,\n");
+dyStringPrintf(dy, "KEY(id),\n");
+dyStringPrintf(dy, "KEY(feature_name),\n");
+dyStringPrintf(dy, "KEY(id, feature_name),\n");
+dyStringPrintf(dy, "KEY(feature_name, id)\n");
+dyStringPrintf(dy, ")\n");
+sqlUpdate(biConn,dy->string);
+dyStringFree(&dy);
+}
+
+void createAnalysisValsTable(struct sqlConnection *biConn, char *tableName)
+{
+struct dyString *dy = newDyString(1024);
+dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
+dyStringPrintf(dy, "sample_id int unsigned not null,\n");
+dyStringPrintf(dy, "feature_id int unsigned not null,\n");
+dyStringPrintf(dy, "val float not null,\n");
+dyStringPrintf(dy, "conf float not null,\n");
+dyStringPrintf(dy, "KEY(feature_id, sample_id),\n");
+dyStringPrintf(dy, "KEY(sample_id, feature_id)\n");
+dyStringPrintf(dy, ")\n");
+sqlUpdate(biConn,dy->string);
+dyStringFree(&dy);
+}
+
+boolean analysisFeatureExists(struct sqlConnection *biConn, struct analysisFeatures *af)
+{
+char query[256];
+safef(query, sizeof(query),
+ "select * from analysisFeatures where id = %d "
+ "and feature_name = \"%s\" ",
+ af->id, af->feature_name);
+
+return sqlExists(biConn, query);
+}
+
+struct hash *getAnalysisFeaturesHash(struct sqlConnection *biConn)
+{
+fprintf(stdout, "getting analysis features hash.\n");
+
+struct hash *hash = hashNew(0);
+char query[128];
+safef(query, sizeof(query), "select * from analysisFeatures");
+
+struct analysisFeatures *af, *afList = analysisFeaturesLoadByQuery(biConn, query);
+
+for (af = afList; af; af = af->next)
+ hashAddInt(hash, af->feature_name, af->id);
+
+analysisFeaturesFreeList(&afList);
+return hash;
+}
+
+struct hash *storeAnalysisFeaturesInDb(struct sqlConnection *biConn, struct analysisResult *arList)
+{
+fprintf(stdout, "storing analysis features\n");
+
+char *tableName = "analysisFeatures";
+if (!sqlTableExists(biConn, tableName))
+ createAnalysisFeaturesTable(biConn, tableName);
+
+/* Get existing analysis features */
+struct hash *hash = getAnalysisFeaturesHash(biConn);
+
+struct analysisFeatures *af;
+struct analysisResult *ar;
+for (ar = arList; ar; ar = ar->next)
+ {
+ if (hashLookup(hash, ar->feature))
+ continue; // already visited, ignore
+
+ int feature_id = findIdInTable(biConn, tableName, "id", "feature_name", ar->feature);
+
+ AllocVar(af);
+ af->id = feature_id;
+ af->feature_name = cloneString(ar->feature);
+ if (!analysisFeatureExists(biConn, af))
+ analysisFeaturesSaveToDb(biConn, af, tableName, 10);
+
+ hashAddInt(hash, af->feature_name, af->id);
+ analysisFeaturesFree(&af);
+ }
+
+return hash;
+}
+
+struct hash *getSampleIdHash(struct sqlConnection *biConn)
+{
+fprintf(stdout, "getting sample id hash.\n");
+
+struct hash *hash = hashNew(0);
+char query[128];
+safef(query, sizeof(query), "select * from samples");
+
+struct samples *sa, *saList = samplesLoadByQuery(biConn, query);
+
+for (sa = saList; sa; sa = sa->next)
+ hashAddInt(hash, sa->name, sa->id);
+
+samplesFreeList(&saList);
+return hash;
+}
+
+
+void storeAnalysisResultsInDb(struct sqlConnection *biConn, struct biAnalysis *ba,
+ struct analysisResult *arList)
+{
+fprintf(stdout, "storing analysis in table %s in %s db\n",
+ ba->tableName, ba->db);
+
+if (!sqlTableExists(biConn, ba->tableName))
+ createAnalysisValsTable(biConn, ba->tableName);
+
+struct hash *featureIds = storeAnalysisFeaturesInDb(biConn, arList);
+struct hash *sampleIds = getSampleIdHash(biConn);
+
+struct analysisVals *av = AllocA(struct analysisVals);
+struct analysisResult *ar;
+for (ar = arList; ar; ar = ar->next)
+ {
+ int sample_id = hashIntValDefault(sampleIds, ar->sample, -1);
+ int feature_id = hashIntValDefault(featureIds, ar->feature, -1);
+
+ if (sample_id == -1 || feature_id == -1)
+ continue;
+
+ av->sample_id = sample_id;
+ av->feature_id = feature_id;
+ av->val = ar->val;
+ av->conf = ar->conf;
+
+ analysisValsSaveToDb(biConn, av, ba->tableName, 50);
+ }
+
+analysisValsFree(&av);
+}
+
+boolean analysisExists(struct sqlConnection *biConn, struct biAnalysis *ba)
+{
+if (!sqlTableExists(biConn, ba->tableName))
+ return FALSE;
+
+if (sqlTableSize(biConn, ba->tableName) == 0)
+ return FALSE;
+
+return TRUE;
+}
+
+boolean analysisListExists(char *db, struct biAnalysis *baList)
+{
+struct sqlConnection *biConn = hAllocConnProfile("localDb", db);
+
+boolean exists = TRUE;
+struct biAnalysis *ba;
+for (ba = baList; ba; ba = ba->next)
+ if (!analysisExists(biConn, ba))
+ exists = FALSE;
+
+hFreeConn(&biConn);
+return exists;
+}
+
+
+void geneLevelPipeline(struct biAnalysis *baList, struct biResults *br, struct slName *genes)
+{
+if (!baList)
+ return;
+
+/* This assumes all results go to same db (which is safe for now) */
+struct sqlConnection *biConn = hAllocConnProfile("localDb", baList->db);
+
+struct biAnalysis *ba;
+for (ba = baList; ba; ba = ba->next)
+ {
+ if (analysisExists(biConn, ba))
+ continue;
+
+ struct analysisResult *arList = geneLevelAnalysis(ba, br, genes);
+
+ fprintf(stdout, "storing in db\n");
+ uglyTime(NULL);
+ storeAnalysisResultsInDb(biConn, ba, arList);
+ uglyTime("done");
+
+ analysisResultFreeList(&arList);
+ }
+
+hFreeConn(&biConn);
+}
+
+struct biResults *retrieveData(char *db, struct slName *datasets, boolean toLogP)
+{
+struct slName *dataset;
+
+struct biQuery *bq, *bqList = NULL;
+for (dataset = datasets; dataset; dataset = dataset->next)
+ {
+ bq = biQueryNew(db, dataset->name); // Add Dataset
+ bq->getAllProbes = TRUE; // Set flag to retrieve all probes
+ biQueryAppend(&bqList, bq); // Append query to query list
+ }
+
+/* Get results from all queries in list */
+struct biResults *br = biQueryResults(bqList);
+
+/* Convert to log p */
+if (toLogP)
+ br->toLogP(br);
+
+return br;
+}
+
+void runAnalysisPipeline(char *db, char *datasets, struct biAnalysis *baList)
+{
+/* datasets is a comma-separated string, each a different dataset table */
+struct slName *slDatasets = slNameListFromComma(datasets);
+
+if (!analysisListExists(db, baList))
+ {
+ /* Get raw gene/sample data for all overlapping samples in dataset list */
+ uglyTime(NULL);
+ struct biResults *br = retrieveData(db, slDatasets, TRUE);
+ uglyTime("retrieveData");
+
+ if (!br)
+ errAbort("No gene data!\n");
+
+ /* Run gene level analyses (meta-gene, pathlet, etc.) */
+ struct slName *genes = getAvailableGenes(db, br);
+ uglyTime("Num available genes = %d", slCount(genes));
+
+ geneLevelPipeline(baList, br, genes);
+ }
+
+/* Run pathway/geneset level analyses */
+
+
+}