src/hg/instinct/bioInt2/bioSetLevel.c 1.2
1.2 2009/04/27 06:15:48 jsanborn
updated lots of stuff, will break older implementation of database
Index: src/hg/instinct/bioInt2/bioSetLevel.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/instinct/bioInt2/bioSetLevel.c,v
retrieving revision 1.1
retrieving revision 1.2
diff -b -B -U 1000000 -r1.1 -r1.2
--- src/hg/instinct/bioInt2/bioSetLevel.c 5 Apr 2009 21:08:07 -0000 1.1
+++ src/hg/instinct/bioInt2/bioSetLevel.c 27 Apr 2009 06:15:48 -0000 1.2
@@ -1,251 +1,261 @@
/* mapProbesToGenes - Will maps probes in BED format to overlapping gene(s). */
#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "options.h"
#include "jksql.h"
#include "hPrint.h"
#include "hdb.h"
#include "dystring.h"
#include "bioIntDb.h"
#include "bioIntDriver.h"
#include "cprob.h"
#include "hgStatsLib.h"
#include "bioController.h"
/* Gene-level analysis functions */
struct genesetData {
struct slName *genes;
struct hash *data;
};
-struct analysisResult *metaGeneset(struct biAnalysis *ba, void *data,
- char *sample, char *gene)
+struct analysisVals *metaGeneset(struct biAnalysis *ba, void *data,
+ int sample_id, int feature_id)
{
if (!data)
return NULL;
struct genesetData *gd = data;
struct slName *sl, *genes = gd->genes;
struct hash *geneData = gd->data;
+double total = 0.0;
+double count = 0.0;
+
struct slDouble *sd, *sdList = NULL;
for (sl = genes; sl; sl = sl->next)
{
struct hashEl *el = hashLookup(geneData, sl->name);
if (!el)
continue;
- struct analysisResult *ar = el->val;
+ struct analysisVals *av = el->val;
- sd = slDoubleNew(ar->val);
+ total += av->val;
+ count += 1.0;
+ sd = slDoubleNew(av->conf);
slAddHead(&sdList, sd);
}
if (!sdList)
return NULL;
float chi2, metaP;
if (!fishersMetaSigned(sdList, &chi2, &metaP))
return NULL;
-struct analysisResult *ar;
-AllocVar(ar);
-ar->sample = cloneString(sample);
-ar->feature = cloneString(gene);
-ar->val = metaP;
-ar->conf = chi2;
-return ar;
+struct analysisVals *av;
+AllocVar(av);
+av->sample_id = sample_id;
+av->feature_id = feature_id;
+av->val = total / count;
+av->conf = metaP;
+return av;
}
/* Pipeline Stuff */
-struct slPair *getGenesets()
+struct slPair *getGenesets(struct sqlConnection *biConn)
{
char query[256];
-safef(query, sizeof(query), "select * from genesets");
+safef(query, sizeof(query), "select * from %s", GG_TABLE);
-struct sqlConnection *conn = hAllocConnProfile("localDb", "pathway");
-struct sqlResult *sr = sqlGetResult(conn, query);
+struct sqlResult *sr = sqlGetResult(biConn, query);
char **row = NULL;
+struct hash *hash = hashNew(0);
+
struct slPair *sp, *spList = NULL;
while ((row = sqlNextRow(sr)) != NULL)
{
+ char *gs_id = row[0]; // name
+ char *gene_id = row[1]; // members
+
+ struct hashEl *el = hashLookup(hash, gs_id);
+ if (!el)
+ {
AllocVar(sp);
- sp->name = cloneString(row[0]); // name
- sp->val = cloneString(row[1]); // members
+ sp->name = cloneString(gs_id);
+ sp->val = NULL;
slAddHead(&spList, sp);
+ hashAdd(hash, gs_id, sp);
}
-sqlFreeResult(&sr);
+ else
+ sp = el->val;
+ struct slName *sl = slNameNew(gene_id);
+ slAddTail(&sp->val, sl);
+ }
slReverse(&spList);
-hFreeConn(&conn);
+
+sqlFreeResult(&sr);
+hashFree(&hash);
return spList;
}
-struct analysisResult *genesetLevelAnalysis(struct biAnalysis *ba,
+struct analysisVals *genesetLevelAnalysis(struct sqlConnection *biConn, struct biAnalysis *ba,
struct slPair *spData, struct slPair *spGenesets)
{
if (!ba->analyze)
return NULL;
+//struct hash *featureHash = createIdHash(biConn, AF_TABLE, "feature_name");
+
fprintf(stdout, "starting geneset analysis.\n");
struct slPair *gs, *sp;
struct genesetData *gd;
AllocVar(gd);
-struct analysisResult *ar, *arList = NULL;
+int count = 0, numGenesets = slCount(spGenesets);
+
+struct analysisVals *av, *avList = NULL;
for (gs = spGenesets; gs; gs = gs->next)
{
- char *members = gs->val;
-
- struct slName *gList = slNameListFromComma(members);
- gd->genes = gList;
+ int feature_id = atoi(gs->name); //hashIntValDefault(featureHash, gs->name, -1);
+ struct slName *members = gs->val;
+ gd->genes = members;
for (sp = spData; sp; sp = sp->next)
{
gd->data = sp->val;
- ar = ba->analyze(ba, gd, sp->name, gs->name);
- if (!ar)
+ int sample_id = atoi(sp->name);
+ av = ba->analyze(ba, gd, sample_id, feature_id);
+ if (!av)
continue;
- slAddHead(&arList, ar);
+ slAddHead(&avList, av);
}
- fprintf(stdout, ".");
+ count++;
+ fprintf(stdout, "%d of %d genesets\n", count, numGenesets);
fflush(stdout);
- slNameFreeList(&gList);
gd->genes = NULL;
gd->data = NULL;
}
fprintf(stdout, "\n");
-return arList;
+return avList;
}
void slPairHashesFree(struct slPair **pEl)
{
struct slPair *el;
if ((el = *pEl) == NULL) return;
freeMem(el->name);
struct hash *hash = el->val;
-hashFreeWithVals(&hash, analysisResultFree);
+hashFreeWithVals(&hash, analysisValsFree);
freez(pEl);
}
void slPairHashesFreeList(struct slPair **pList)
{
struct slPair *el, *next;
for (el = *pList; el != NULL; el = next)
{
next = el->next;
slPairHashesFree(&el);
}
*pList = NULL;
}
void slPairStringFree(struct slPair **pEl)
{
struct slPair *el;
if ((el = *pEl) == NULL) return;
freeMem(el->name);
char *name = el->val;
freeMem(name);
freez(pEl);
}
void slPairStringFreeList(struct slPair **pList)
{
struct slPair *el, *next;
for (el = *pList; el != NULL; el = next)
{
next = el->next;
slPairStringFree(&el);
}
*pList = NULL;
}
struct slPair *analysisValsSamplesHashes(struct sqlConnection *biConn,
struct slName *dataset)
{
-struct hash *sampleHash = createHash(biConn, SA_TABLE, "id", "name");
-struct hash *featureHash = createHash(biConn, AF_TABLE, "id", "feature_name");
-
/* Currently only looks at first dataset in slName list passed in */
char query[128];
safef(query, sizeof(query), "select * from %s", dataset->name);
struct slPair *sp, *spList = NULL;
struct hash *hash = hashNew(0);
struct sqlResult *sr = sqlGetResult(biConn, query);
char **row = NULL;
while ((row = sqlNextRow(sr)) != NULL)
{
char *sample_id = row[0];
char *feature_id = row[1];
- char *sample_name = hashMustFindVal(sampleHash, sample_id);
- char *feature_name = hashMustFindVal(featureHash, feature_id);
-
- struct analysisResult *ar;
- AllocVar(ar);
- ar->sample = cloneString(sample_name);
- ar->feature = cloneString(feature_name);
- ar->val = atof(row[2]);
- ar->conf = atof(row[3]);
+ struct analysisVals *av = analysisValsLoad(row);
- struct hashEl *el = hashLookup(hash, ar->sample);
+ struct hashEl *el = hashLookup(hash, sample_id);
if (!el)
{
AllocVar(sp);
- sp->name = cloneString(ar->sample);
+ sp->name = cloneString(sample_id);
sp->val = hashNew(0);
- hashAdd(hash, ar->sample, sp);
+ hashAdd(hash, sample_id, sp);
slAddHead(&spList, sp);
}
else
sp = el->val;
struct hash *featureHash = sp->val;
- hashAdd(featureHash, ar->feature, ar);
+ hashAdd(featureHash, feature_id, av);
}
sqlFreeResult(&sr);
hashFree(&hash);
-hashFree(&sampleHash);
-hashFree(&featureHash);
return spList;
}
void genesetLevelPipeline(struct biAnalysis *ba)
{
uglyTime(NULL);
struct sqlConnection *biConn = hAllocConnProfile("localDb", ba->db);
struct slPair *spData = analysisValsSamplesHashes(biConn, ba->inputTables);
uglyTime("got sample hashes");
-struct slPair *spGenesets = getGenesets();
+struct slPair *spGenesets = getGenesets(biConn);
fprintf(stderr, "got %d genesets\n", slCount(spGenesets));
-struct analysisResult *arList = genesetLevelAnalysis(ba, spData, spGenesets);
+struct analysisVals *avList = genesetLevelAnalysis(biConn, ba, spData, spGenesets);
uglyTime("analyzed all genesets");
fprintf(stdout, "storing results...\n");
-storeAnalysisResultsInDb(biConn, ba, arList);
+storeAnalysisValsInDb(biConn, ba->tableName, avList);
uglyTime("analyzed all genesets");
hFreeConn(&biConn);
slPairHashesFreeList(&spData);
slPairStringFreeList(&spGenesets);
hFreeConn(&biConn);
}