src/hg/instinct/bioInt2/bioLevelI.c 1.2
1.2 2009/03/22 01:07:28 jsanborn
updated
Index: src/hg/instinct/bioInt2/bioLevelI.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/instinct/bioInt2/bioLevelI.c,v
retrieving revision 1.1
retrieving revision 1.2
diff -b -B -U 1000000 -r1.1 -r1.2
--- src/hg/instinct/bioInt2/bioLevelI.c 21 Mar 2009 21:31:54 -0000 1.1
+++ src/hg/instinct/bioInt2/bioLevelI.c 22 Mar 2009 01:07:28 -0000 1.2
@@ -1,430 +1,430 @@
/* mapProbesToGenes - Will maps probes in BED format to overlapping gene(s). */
#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "options.h"
#include "jksql.h"
#include "hPrint.h"
#include "hdb.h"
#include "dystring.h"
#include "bioIntDb.h"
#include "bioIntDriver.h"
#include "cprob.h"
#include "hgStatsLib.h"
#include "bioController.h"
void printSlName(struct slName *slList)
{
if (!slList)
fprintf(stdout, "nothing in slName list\n");
struct slName *sl;
fprintf(stdout, "num samples = %d\n", slCount(slList));
for (sl = slList; sl; sl = sl->next)
{
fprintf(stdout, "%s", sl->name);
if (sl->next)
fprintf(stdout, ",");
}
fprintf(stdout, "\n");
}
void slPairDoubleFree(struct slPair **pEl)
{
struct slPair *el;
if ((el = *pEl) == NULL) return;
freeMem(el->name);
struct slDouble *sdList = el->val;
slFreeList(&sdList);
freez(pEl);
}
void slPairDoubleFreeList(struct slPair **pList)
{
struct slPair *el, *next;
for (el = *pList; el != NULL; el = next)
{
next = el->next;
slPairDoubleFree(&el);
}
*pList = NULL;
}
void analysisResultFree(struct analysisResult **pEl)
{
struct analysisResult *el;
if ((el = *pEl) == NULL) return;
freeMem(el->sample);
freeMem(el->feature);
freez(pEl);
}
void analysisResultFreeList(struct analysisResult **pList)
{
struct analysisResult *el, *next;
for (el = *pList; el != NULL; el = next)
{
next = el->next;
analysisResultFree(&el);
}
*pList = NULL;
}
struct slName *getAvailableGenes(char *db, struct biResults *br)
{
struct sqlConnection *biConn = hAllocConnProfile("localDb", db);
char query[256];
safef(query, sizeof(query), "select DISTINCT geneSymbol from kgXref join knownGene on kgXref.kgId = knownGene.name");
struct slName *sl, *slList = sqlQuickList(biConn, query);
hFreeConn(&biConn);
struct slName *geneList = NULL;
for (sl = slList; sl; sl = sl->next)
{
struct slName *probes = br->probesForGene(br, sl->name);
int numProbes = slCount(probes);
slNameFreeList(&probes);
if (numProbes == 0)
continue;
slNameAddHead(&geneList, sl->name);
}
return geneList;
}
struct slPair *geneLevelData(struct biResults *br, struct biData *bdList, char *sample, char *gene)
{
struct biData *bd;
struct slPair *sp, *spList = NULL;
for (bd = bdList; bd; bd = bd->next)
{
char *dataset = bd->name;
struct slName *sl, *probes = br->probesForGeneInDataset(br, gene, dataset);
AllocVar(sp);
sp->name = cloneString(bd->type);
sp->val = NULL;
struct slDouble *sd, *sdList = NULL;
for (sl = probes; sl; sl = sl->next)
{
struct hashEl *el = hashLookup(bd->hash, sl->name);
if (!el)
continue;
struct slDouble *sdVal = el->val;
sd = slDoubleNew(sdVal->val);
slAddHead(&sdList, sd);
}
sp->val = sdList;
slAddHead(&spList, sp);
slNameFreeList(&probes);
}
return spList;
}
struct analysisResult *geneLevelAnalysis(struct biAnalysis *ba, struct biResults *br,
struct slName *genes)
{
fprintf(stdout, "starting gene level analysis\n");
struct slName *gene, *sample, *samples = br->allSamplesInCommon(br);
struct analysisResult *ar, *arList = NULL;
for (sample = samples; sample; sample = sample->next)
{
struct biData *bdList = br->dataForSample(br, sample->name);
for (gene = genes; gene; gene = gene->next)
{
struct slPair *spList = geneLevelData(br, bdList, sample->name, gene->name);
if (!spList)
continue;
ar = ba->analyze(ba, spList, sample->name, gene->name);
slPairDoubleFreeList(&spList);
if (!ar)
continue;
slAddHead(&arList, ar);
}
fprintf(stdout, ".");
biDataFree(&bdList);
}
fprintf(stdout, "\n");
return arList;
}
int findIdInTable(struct sqlConnection *biConn, char *tableName,
char *idField, char *sField, char *name)
{
if (sqlTableSize(biConn, tableName) == 0) /* brand new table, return 0 */
return 0;
char query[256];
safef(query, sizeof(query),
"select DISTINCT %s from %s where %s = \"%s\";",
idField, tableName, sField, name);
if (sqlExists(biConn, query)) /* sample name found, use same id */
return sqlQuickNum(biConn, query);
/* Else, find maximum sample id and add one to it */
safef(query, sizeof(query),
"select max(%s) from %s;",
idField, tableName);
int maxId = sqlQuickNum(biConn, query);
return maxId + 1;
}
void createAnalysisFeaturesTable(struct sqlConnection *biConn, char *tableName)
{
struct dyString *dy = newDyString(1024);
dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
dyStringPrintf(dy, "id int unsigned not null,\n");
dyStringPrintf(dy, "feature_name varchar(255) not null,\n");
dyStringPrintf(dy, "KEY(id),\n");
dyStringPrintf(dy, "KEY(feature_name),\n");
dyStringPrintf(dy, "KEY(id, feature_name),\n");
dyStringPrintf(dy, "KEY(feature_name, id)\n");
dyStringPrintf(dy, ")\n");
sqlUpdate(biConn,dy->string);
dyStringFree(&dy);
}
void createAnalysisValsTable(struct sqlConnection *biConn, char *tableName)
{
struct dyString *dy = newDyString(1024);
dyStringPrintf(dy, "CREATE TABLE %s (\n", tableName);
dyStringPrintf(dy, "sample_id int unsigned not null,\n");
dyStringPrintf(dy, "feature_id int unsigned not null,\n");
dyStringPrintf(dy, "val float not null,\n");
dyStringPrintf(dy, "conf float not null,\n");
dyStringPrintf(dy, "KEY(feature_id, sample_id),\n");
dyStringPrintf(dy, "KEY(sample_id, feature_id)\n");
dyStringPrintf(dy, ")\n");
sqlUpdate(biConn,dy->string);
dyStringFree(&dy);
}
boolean analysisFeatureExists(struct sqlConnection *biConn, struct analysisFeatures *af)
{
char query[256];
safef(query, sizeof(query),
"select * from analysisFeatures where id = %d "
"and feature_name = \"%s\" ",
af->id, af->feature_name);
return sqlExists(biConn, query);
}
struct hash *getAnalysisFeaturesHash(struct sqlConnection *biConn)
{
fprintf(stdout, "getting analysis features hash.\n");
struct hash *hash = hashNew(0);
char query[128];
safef(query, sizeof(query), "select * from analysisFeatures");
struct analysisFeatures *af, *afList = analysisFeaturesLoadByQuery(biConn, query);
for (af = afList; af; af = af->next)
hashAddInt(hash, af->feature_name, af->id);
analysisFeaturesFreeList(&afList);
return hash;
}
struct hash *storeAnalysisFeaturesInDb(struct sqlConnection *biConn, struct analysisResult *arList)
{
fprintf(stdout, "storing analysis features\n");
char *tableName = "analysisFeatures";
if (!sqlTableExists(biConn, tableName))
createAnalysisFeaturesTable(biConn, tableName);
/* Get existing analysis features */
struct hash *hash = getAnalysisFeaturesHash(biConn);
struct analysisFeatures *af;
struct analysisResult *ar;
for (ar = arList; ar; ar = ar->next)
{
if (hashLookup(hash, ar->feature))
continue; // already visited, ignore
int feature_id = findIdInTable(biConn, tableName, "id", "feature_name", ar->feature);
AllocVar(af);
af->id = feature_id;
af->feature_name = cloneString(ar->feature);
if (!analysisFeatureExists(biConn, af))
analysisFeaturesSaveToDb(biConn, af, tableName, 10);
hashAddInt(hash, af->feature_name, af->id);
analysisFeaturesFree(&af);
}
return hash;
}
struct hash *getSampleIdHash(struct sqlConnection *biConn)
{
fprintf(stdout, "getting sample id hash.\n");
struct hash *hash = hashNew(0);
char query[128];
safef(query, sizeof(query), "select * from samples");
struct samples *sa, *saList = samplesLoadByQuery(biConn, query);
for (sa = saList; sa; sa = sa->next)
hashAddInt(hash, sa->name, sa->id);
samplesFreeList(&saList);
return hash;
}
void storeAnalysisResultsInDb(struct sqlConnection *biConn, struct biAnalysis *ba,
struct analysisResult *arList)
{
fprintf(stdout, "storing analysis in table %s in %s db\n",
ba->tableName, ba->db);
if (!sqlTableExists(biConn, ba->tableName))
createAnalysisValsTable(biConn, ba->tableName);
struct hash *featureIds = storeAnalysisFeaturesInDb(biConn, arList);
struct hash *sampleIds = getSampleIdHash(biConn);
struct analysisVals *av = AllocA(struct analysisVals);
struct analysisResult *ar;
for (ar = arList; ar; ar = ar->next)
{
int sample_id = hashIntValDefault(sampleIds, ar->sample, -1);
int feature_id = hashIntValDefault(featureIds, ar->feature, -1);
if (sample_id == -1 || feature_id == -1)
continue;
av->sample_id = sample_id;
av->feature_id = feature_id;
av->val = ar->val;
av->conf = ar->conf;
analysisValsSaveToDb(biConn, av, ba->tableName, 50);
}
analysisValsFree(&av);
}
boolean analysisExists(struct sqlConnection *biConn, struct biAnalysis *ba)
{
if (!sqlTableExists(biConn, ba->tableName))
return FALSE;
if (sqlTableSize(biConn, ba->tableName) == 0)
return FALSE;
return TRUE;
}
boolean analysisListExists(char *db, struct biAnalysis *baList)
{
struct sqlConnection *biConn = hAllocConnProfile("localDb", db);
boolean exists = TRUE;
struct biAnalysis *ba;
for (ba = baList; ba; ba = ba->next)
if (!analysisExists(biConn, ba))
exists = FALSE;
hFreeConn(&biConn);
return exists;
}
void geneLevelPipeline(struct biAnalysis *baList, struct biResults *br, struct slName *genes)
{
if (!baList)
return;
/* This assumes all results go to same db (which is safe for now) */
struct sqlConnection *biConn = hAllocConnProfile("localDb", baList->db);
struct biAnalysis *ba;
for (ba = baList; ba; ba = ba->next)
{
if (analysisExists(biConn, ba))
continue;
struct analysisResult *arList = geneLevelAnalysis(ba, br, genes);
fprintf(stdout, "storing in db\n");
uglyTime(NULL);
storeAnalysisResultsInDb(biConn, ba, arList);
uglyTime("done");
analysisResultFreeList(&arList);
}
hFreeConn(&biConn);
}
struct biResults *retrieveData(char *db, struct slName *datasets, boolean toLogP)
{
struct slName *dataset;
struct biQuery *bq, *bqList = NULL;
for (dataset = datasets; dataset; dataset = dataset->next)
{
bq = biQueryNew(db, dataset->name); // Add Dataset
bq->getAllProbes = TRUE; // Set flag to retrieve all probes
biQueryAppend(&bqList, bq); // Append query to query list
}
/* Get results from all queries in list */
struct biResults *br = biQueryResults(bqList);
/* Convert to log p */
if (toLogP)
br->toLogP(br);
return br;
}
-void runAnalysisPipeline(char *db, char *datasets, struct biAnalysis *baList)
+void runAnalysisPipeline(struct biAnalysis *baList)
{
-/* datasets is a comma-separated string, each a different dataset table */
-struct slName *slDatasets = slNameListFromComma(datasets);
+char *db = baList->db;
+struct slName *slDatasets = baList->inputTables;
if (!analysisListExists(db, baList))
{
/* Get raw gene/sample data for all overlapping samples in dataset list */
uglyTime(NULL);
struct biResults *br = retrieveData(db, slDatasets, TRUE);
uglyTime("retrieveData");
if (!br)
errAbort("No gene data!\n");
/* Run gene level analyses (meta-gene, pathlet, etc.) */
struct slName *genes = getAvailableGenes(db, br);
uglyTime("Num available genes = %d", slCount(genes));
geneLevelPipeline(baList, br, genes);
}
/* Run pathway/geneset level analyses */
}