src/hg/instinct/bioInt2/bioPathwayLevel.c 1.3
1.3 2009/08/16 02:19:17 sbenz
Updated pipeline to be compatible with new binary
Index: src/hg/instinct/bioInt2/bioPathwayLevel.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/instinct/bioInt2/bioPathwayLevel.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -b -B -U 1000000 -r1.2 -r1.3
--- src/hg/instinct/bioInt2/bioPathwayLevel.c 20 May 2009 22:42:13 -0000 1.2
+++ src/hg/instinct/bioInt2/bioPathwayLevel.c 16 Aug 2009 02:19:17 -0000 1.3
@@ -1,534 +1,676 @@
/* bioPathwayLevel.c -- code to run pathway pipeline */
#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "options.h"
#include "jksql.h"
#include "hPrint.h"
#include "hdb.h"
#include "dystring.h"
#include "bioIntDb.h"
#include "bioIntDriver.h"
#include "cprob.h"
#include "hgStatsLib.h"
#include "bioController.h"
char *fgDir = "factorGraph";
char *tmpDir = "tmp";
/* Gene-level analysis functions */
struct links {
struct links *next;
char *parent_name;
char *child_name;
char *link_type;
};
struct entities {
struct entities *next;
int id;
char *type;
char *name;
};
struct pathwayData {
int id;
struct entities *entities;
struct links *links;
struct typeHash *data;
struct hash *featureIds;
};
struct analysisVals *readAnalysisValsFromFile(char *filename,
struct entities *enList, int sample_id)
{
struct lineFile *lf = lineFileOpen(filename, TRUE);
if (!lf)
errAbort("File does not exist.");
struct analysisVals *av, *avList = NULL;
struct entities *en;
struct hash *hash = hashNew(0);
for (en = enList; en; en = en->next)
hashAddInt(hash, en->name, en->id);
+// skip the first
+lineFileSkip(lf,1);
+
char *row[2];
while (lineFileRowTab(lf, row))
{
char *name = row[0];
char *valStr = row[1];
if (sameString(valStr, "NA"))
continue;
double val = atof(valStr);
int entity_id = hashIntValDefault(hash, name, -1);
if (entity_id == -1)
{
fprintf(stderr, "entity %s not found\n", name);
continue;
}
AllocVar(av);
av->sample_id = sample_id;
av->feature_id = entity_id;
av->val = val;
av->conf = val;
slAddHead(&avList, av);
}
lineFileClose(&lf);
slReverse(&avList);
hashFree(&hash);
return avList;
}
struct pathwayVals *convertToPathwayVals(struct analysisVals *avList, int pathway_id)
{
struct pathwayVals *pv, *pvList = NULL;
struct analysisVals *av;
for (av = avList; av; av = av->next)
{
AllocVar(pv);
pv->pathway_id = pathway_id;
pv->sample_id = av->sample_id;
pv->feature_id = av->feature_id;
pv->val = av->val;
pv->conf = av->conf;
slAddHead(&pvList, pv);
}
slReverse(&pvList);
return pvList;
}
boolean writePathwayFile(char *filename, struct pathwayData *pd)
{
if (!pd)
return FALSE;
if (!pd->entities || !pd->links)
return FALSE;
FILE *f = mustOpen(filename, "w");
if (!f)
return FALSE;
struct entities *en;
for (en = pd->entities; en; en = en->next)
fprintf(f, "%s\t%s\n", en->type, en->name);
struct links *li;
for (li = pd->links; li; li = li->next)
fprintf(f, "%s\t%s\t%s\n", li->parent_name, li->child_name, li->link_type);
return carefulCloseWarn(&f);
}
boolean prepFactorGraph(struct biAnalysis *ba, void *data)
{
if (!data)
return FALSE;
struct pathwayData *pd = data;
/* make temporary file containing pathway info */
char tmpPathway[512];
-safef(tmpPathway, sizeof(tmpPathway), "%s/tmp_%s_path.tab", tmpDir, ba->tableName);
+//safef(tmpPathway, sizeof(tmpPathway), "%s/tmp_%s_path.tab", tmpDir, ba->tableName);
+safef(tmpPathway, sizeof(tmpPathway), "%s/tmp_%s_pid_%d_path.tab", tmpDir, ba->tableName, pd->id);
+
if (!writePathwayFile(tmpPathway, pd))
errAbort("Problem writing pathway file %s\n", tmpPathway);
-char command[512];
+/*char command[512];
safef(command, sizeof(command),
"%s/patient_2stage_prep.sh %s", fgDir, tmpPathway);
-
+*/
/* Run command (cross fingers!) */
-int ret = system(command);
-return (ret == 0);
+/*int ret = system(command);
+return (ret == 0);*/
+return TRUE;
}
+// this no longer gets called
boolean cleanupFactorGraph(struct biAnalysis *ba)
{
/* make temporary file containing pathway info */
char tmpPathway[512];
safef(tmpPathway, sizeof(tmpPathway), "%s/tmp_%s_path.tab", tmpDir, ba->tableName);
char command[512];
safef(command, sizeof(command),
"%s/patient_2stage_cleanup.sh %s", fgDir, tmpPathway);
/* Run command (cross fingers!) */
int ret = system(command);
return (ret == 0);
}
-boolean writeEvidenceFile(char *filename, struct pathwayData *pd)
+boolean writeEvidenceFile_OLD(char *filename, struct pathwayData *pd)
{
if (!pd)
return FALSE;
if (!pd->entities || !pd->links)
return FALSE;
FILE *f = mustOpen(filename, "w");
if (!f)
return FALSE;
struct entities *en, *enList = pd->entities;
char idStr[128];
for (en = enList; en; en = en->next)
{
int id = hashIntValDefault(pd->featureIds, en->name, -1);
if (id == -1)
continue;
safef(idStr, sizeof(idStr), "%d", id);
struct typeHash *th;
for (th = pd->data; th; th = th->next)
{
struct hashEl *el = hashLookup(th->hash, idStr);
if (!el)
continue;
struct analysisVals *av = el->val;
double val = av->conf;
char *type;
if (sameString(th->type, "Expression"))
type = "mRNA";
else if (sameString(th->type, "CNV"))
type = "genome";
else
continue;
fprintf(f, "%s\t%s\t%f\n", en->name, type, val);
}
}
return carefulCloseWarn(&f);
}
+boolean writeEvidenceFile(char *filenamePrefix, struct pathwayData *pd)
+{
+if (!pd)
+ return FALSE;
+if (!pd->entities || !pd->links)
+ return FALSE;
+
+char genomeEvidenceFile[512];
+safef(genomeEvidenceFile, sizeof(genomeEvidenceFile), "%s_genome.tab", filenamePrefix);
+FILE *genomeF = mustOpen(genomeEvidenceFile, "w");
+if (!genomeF)
+ return FALSE;
+
+char mrnaEvidenceFile[512];
+safef(mrnaEvidenceFile, sizeof(mrnaEvidenceFile), "%s_mRNA.tab", filenamePrefix);
+FILE *mrnaF = mustOpen(mrnaEvidenceFile, "w");
+if (!mrnaF)
+ return FALSE;
+
+// write the upper-left node
+fprintf(genomeF,"id");
+fprintf(mrnaF,"id");
+struct entities *en, *enList = pd->entities;
+char idStr[128];
+for (en = enList; en; en = en->next)
+ {
+ int id = hashIntValDefault(pd->featureIds, en->name, -1);
+ if (id == -1)
+ continue;
+ safef(idStr, sizeof(idStr), "%d", id);
+
+ struct typeHash *th;
+ for (th = pd->data; th; th = th->next)
+ {
+ struct hashEl *el = hashLookup(th->hash, idStr);
+ if (!el)
+ continue;
+
+ char *type;
+ if (sameString(th->type, "Expression"))
+ fprintf(mrnaF, "\t%s", en->name);
+ else if (sameString(th->type, "CNV"))
+ fprintf(genomeF, "\t%s", en->name);
+ else
+ continue;
+ }
+ }
+
+ fprintf(genomeF,"\nsample");
+ fprintf(mrnaF,"\nsample");
+ for (en = enList; en; en = en->next)
+ {
+ int id = hashIntValDefault(pd->featureIds, en->name, -1);
+ if (id == -1)
+ continue;
+ safef(idStr, sizeof(idStr), "%d", id);
+
+ struct typeHash *th;
+ for (th = pd->data; th; th = th->next)
+ {
+ struct hashEl *el = hashLookup(th->hash, idStr);
+ if (!el)
+ continue;
+
+ struct analysisVals *av = el->val;
+ double val = av->conf;
+
+ char *type;
+ if (sameString(th->type, "Expression"))
+ fprintf(mrnaF, "\t%f", val);
+ else if (sameString(th->type, "CNV"))
+ fprintf(genomeF, "\t%f", val);
+ else
+ continue;
+ }
+ }
+
+
+fprintf(genomeF,"\n");
+fprintf(mrnaF,"\n");
+boolean gClose = carefulCloseWarn(&genomeF);
+boolean mClose = carefulCloseWarn(&mrnaF);
+
+return (gClose && mClose);
+}
+
+// eventually this will be more dynamic, but for now...
+boolean writeConfigFile(char *filename, char *evidPrefix)
+{
+
+FILE *f = mustOpen(filename, "w");
+if (!f)
+ return FALSE;
+
+/*fprintf(f, "inference [pathway_match=_pid_66_,method=BP,updates=SEQFIX,tol=1e-9,maxiter=10000,logdomain=0]\n");
+fprintf(f, "inference [pathway_match=_pid_48_,method=BP,updates=SEQFIX,tol=1e-9,maxiter=10000,logdomain=0]\n");
+fprintf(f, "inference [pathway_match=_pid_3_,method=BP,updates=SEQFIX,tol=1e-9,maxiter=10000,logdomain=0]\n");
+fprintf(f, "inference [pathway_match=_pid_9_,method=BP,updates=SEQFIX,tol=1e-9,maxiter=10000,logdomain=0]\n");
+fprintf(f, "inference [method=JTREE,updates=HUGIN,verbose=1]\n");
+fprintf(f, "evidence [suffix=_genome.tab,node=genome,disc=-0.451727;0.467580,epsilon=0.01,epsilon0=0.2]\n");
+fprintf(f, "evidence [suffix=_mRNA.tab,node=mRNA,disc=-0.1817733;0.1824913,epsilon=0.01,epsilon0=0.2]\n");
+fprintf(f, "em_step [_mRNA.tab=-obs>,_genome.tab=-obs>]\n");
+fprintf(f, "em [max_iters=0,log_z_tol=0.001]\n");
+*/
+fprintf(f, "inference [pathway_match=_pid_66_,method=BP,updates=SEQFIX,tol=1e-9,maxiter=10000,logdomain=0]\n");
+fprintf(f, "inference [pathway_match=_pid_48_,method=BP,updates=SEQFIX,tol=1e-9,maxiter=10000,logdomain=0]\n");
+fprintf(f, "inference [pathway_match=_pid_3_,method=BP,updates=SEQFIX,tol=1e-9,maxiter=10000,logdomain=0]\n");
+fprintf(f, "inference [pathway_match=_pid_9_,method=BP,updates=SEQFIX,tol=1e-9,maxiter=10000,logdomain=0]\n");
+fprintf(f, "inference [method=JTREE,updates=HUGIN,verbose=1]\n");
+fprintf(f, "evidence [suffix=_genome.tab,node=genome,disc=-0.451727;0.467580,factorParams=0.680434527777778;0.188006489814815;0.131558978703704;0.241894870370370;0.55878737962963;0.199317706481481;0.127232687037037;0.246933030555556;0.625834314814815]\n");
+fprintf(f, "evidence [suffix=_mRNA.tab,node=mRNA,disc=-0.1817733;0.1824913,factorParams=0.6211706;0.250951192592593;0.127878243518519;0.259141574074074;0.488297342592593;0.252561083333333;0.137161049074074;0.213898711111111;0.648940222222222]\n");
+fprintf(f, "em [max_iters=0,log_z_tol=0.001]\n");
+
+return carefulCloseWarn(&f);
+}
+
struct analysisVals *factorGraph(struct biAnalysis *ba, void *data,
int sample_id, int feature_id)
{
if (!data)
return NULL;
struct pathwayData *pd = data;
/* make temporary file containing pathway info */
char tmpPathway[512];
-safef(tmpPathway, sizeof(tmpPathway), "%s/tmp_%s_path.tab", tmpDir, ba->tableName);
+safef(tmpPathway, sizeof(tmpPathway), "%s/tmp_%s_pid_%d_path.tab", tmpDir, ba->tableName, pd->id);
/* make temporary file containing evidence */
char tmpEvidence[512];
-safef(tmpEvidence, sizeof(tmpEvidence), "%s/tmp_%s_evid.tab", tmpDir, ba->tableName);
+//safef(tmpEvidence, sizeof(tmpEvidence), "%s/tmp_%s_evid.tab", tmpDir, ba->tableName);
+safef(tmpEvidence, sizeof(tmpEvidence), "%s/tmp_%s_evid", tmpDir, ba->tableName);
if (!writeEvidenceFile(tmpEvidence, pd))
errAbort("Problem writing evidence file %s\n", tmpEvidence);
+/* make a new tmp file containing the config */
+char tmpConfig[512];
+safef(tmpConfig, sizeof(tmpConfig), "%s/tmp_%s_config.txt", tmpDir, ba->tableName);
+
+if (!writeConfigFile(tmpConfig, tmpEvidence))
+ errAbort("Problem writing config file %s\n", tmpConfig);
+
+
/* build command */
char tmpOutput[128];
safef(tmpOutput, sizeof(tmpOutput), "%s/tmp.out", tmpDir);
char command[512];
safef(command, sizeof(command),
+ "%s/hgFactorGraph -p %s -c %s -b %s > %s",
+ fgDir, tmpPathway, tmpConfig, tmpEvidence, tmpOutput);
+
+/*safef(command, sizeof(command),
"%s/patient_2stage_exec.sh %s %s \"-d -1.3,1.3\" > %s",
fgDir, tmpPathway, tmpEvidence, tmpOutput);
+*/
+
/* Run command (cross fingers!) */
int ret = system(command);
if (ret)
{
fprintf(stderr, "Something went wrong!!");
return NULL;
}
/* read in data from output */
struct analysisVals *avList = readAnalysisValsFromFile(tmpOutput, pd->entities, sample_id);
return avList;
}
/* Pipeline Stuff */
struct pathwayVals *pathwayLevelAnalysis(struct sqlConnection *biConn, struct biAnalysis *ba,
struct slPair *spData, struct slPair *spPathways)
{
if (!ba->analyze)
return NULL;
-fprintf(stdout, "starting geneset analysis.\n");
+fprintf(stdout, "starting pathway analysis.\n");
struct hash *featureHash = createIdHash(biConn, AF_TABLE, "feature_name");
struct slPair *pa, *sp;
int count = 0, numPathways = slCount(spPathways);
struct pathwayVals *pvList = NULL;
for (pa = spPathways; pa; pa = pa->next)
{
struct pathwayData *pd = pa->val;
int feature_id = pd->id;
fprintf(stderr, "prepping pathway %s:\n", pa->name);
if (!prepFactorGraph(ba, pd))
{
fprintf(stderr, "problem with prep, skipping pathway.\n");
continue;
}
struct analysisVals *av, *avList = NULL;
for (sp = spData; sp; sp = sp->next)
{
pd->data = sp->val;
if (slCount(pd->data) < 2)
continue; // currently only consider samples with more than one type of evidence.
pd->featureIds = featureHash;
int sample_id = atoi(sp->name);
av = ba->analyze(ba, pd, sample_id, -1);
if (!av)
break;
avList = slCat(avList, av);
fprintf(stderr, ".");
fflush(stderr);
}
fprintf(stderr, "\n");
struct pathwayVals *newPvList = convertToPathwayVals(avList, feature_id);
pvList = slCat(pvList, newPvList);
count++;
- fprintf(stderr, "%d of %d pathways\n", count, numPathways);
+ fprintf(stderr, "%d of %d pathways complete!\n", count, numPathways);
fflush(stderr);
- if (!cleanupFactorGraph(ba))
+ char tmpPathway[512];
+ safef(tmpPathway, sizeof(tmpPathway), "%s/tmp_%s_pid_%d_path.tab", tmpDir, ba->tableName, pd->id);
+ if( remove( tmpPathway ) != 0 )
fprintf(stderr, "problem with cleanup.\n");
+ /*if (!cleanupFactorGraph(ba))
+ fprintf(stderr, "problem with cleanup.\n");*/
analysisValsFreeList(&avList);
}
fprintf(stdout, "\n");
return pvList;
}
void entitiesFree(struct entities **pEl)
{
struct entities *el;
if ((el = *pEl) == NULL) return;
freeMem(el->type);
freeMem(el->name);
freez(pEl);
}
void entitiesFreeList(struct entities **pList)
/* Free a list of dynamically allocated pathwayVals's */
{
struct entities *el, *next;
for (el = *pList; el != NULL; el = next)
{
next = el->next;
entitiesFree(&el);
}
*pList = NULL;
}
void linksFree(struct links **pEl)
{
struct links *el;
if ((el = *pEl) == NULL) return;
freeMem(el->parent_name);
freeMem(el->child_name);
freeMem(el->link_type);
freez(pEl);
}
void linksFreeList(struct links **pList)
/* Free a list of dynamically allocated link's */
{
struct links *el, *next;
for (el = *pList; el != NULL; el = next)
{
next = el->next;
linksFree(&el);
}
*pList = NULL;
}
void pathwayDataFree(struct pathwayData **pEl)
{
struct pathwayData *el;
if ((el = *pEl) == NULL) return;
linksFreeList(&el->links);
entitiesFreeList(&el->entities);
freez(pEl);
}
void slPairPathwayFree(struct slPair **pEl)
{
struct slPair *el;
if ((el = *pEl) == NULL) return;
freeMem(el->name);
struct pathwayData *pd = el->val;
pathwayDataFree(&pd);
freez(pEl);
}
void slPairPathwayFreeList(struct slPair **pList)
{
struct slPair *el, *next;
for (el = *pList; el != NULL; el = next)
{
next = el->next;
slPairPathwayFree(&el);
}
*pList = NULL;
}
void storePathwayValsInDb(struct sqlConnection *biConn, char *tableName,
struct pathwayVals *pvList)
{
if (!sqlTableExists(biConn, tableName))
createPathwayValsTable(biConn, tableName);
struct pathwayVals *pv;
for (pv = pvList; pv; pv = pv->next)
pathwayValsSaveToDb(biConn, pv, tableName, 50);
}
struct slPair *getPathways(struct sqlConnection *biConn)
{
char query[1024];
safef(query, sizeof(query),
"select %s.pathway_id,%s.pathway_name,entity_id,entity_name,entity_type from %s "
"join %s on %s.pathway_id=%s.pathway_id", // a join across a few tables
EN_TABLE, EP_TABLE, EN_TABLE,
EP_TABLE, EN_TABLE, EP_TABLE);
struct sqlResult *sr = sqlGetResult(biConn, query);
char **row = NULL;
struct hash *hash = hashNew(0);
struct pathwayData *pd;
struct slPair *sp, *spList = NULL;
while ((row = sqlNextRow(sr)) != NULL)
{
int pathway_id = atoi(row[0]);
char *pathway_name = row[1]; // pathway name
int entity_id = atoi(row[2]);
char *entity_name = row[3]; // entity name
char *entity_type = row[4]; // type (protein, abstract, whatever...)
struct hashEl *el = hashLookup(hash, pathway_name);
if (!el)
{
AllocVar(sp);
sp->name = cloneString(pathway_name);
AllocVar(pd);
pd->id = pathway_id;
pd->entities = NULL;
pd->links = NULL;
pd->data = NULL;
pd->featureIds = NULL;
sp->val = pd;
slAddHead(&spList, sp);
hashAdd(hash, pathway_name, sp);
}
else
sp = el->val;
pd = sp->val;
if (!pd)
continue;
struct entities *en;
AllocVar(en);
en->id = entity_id;
en->name = cloneString(entity_name);
en->type = cloneString(entity_type);
slAddHead(&pd->entities, en);
}
sqlFreeResult(&sr);
safef(query, sizeof(query),
"select pathway_name,t1.entity_name,t2.entity_name,link_name from %s "
"join %s on %s.pathway=%s.pathway_id "
"join %s on link_type=%s.id "
"join %s as t1 on %s.parent_entity=t1.entity_id "
"join %s as t2 on %s.child_entity=t2.entity_id ", // a join across a few tables
EL_TABLE, EP_TABLE, EL_TABLE, EP_TABLE,
ELT_TABLE, ELT_TABLE,
EN_TABLE, EL_TABLE,
EN_TABLE, EL_TABLE);
sr = sqlGetResult(biConn, query);
while ((row = sqlNextRow(sr)) != NULL)
{
char *pathway_name = row[0]; // name
char *parent_entity = row[1]; // parent name
char *child_entity = row[2]; // child name
char *link_type = row[3]; // link type, e.g. '->'
struct hashEl *el = hashLookup(hash, pathway_name);
if (!el)
continue;
sp = el->val;
pd = sp->val;
if (!pd)
continue;
struct links *li;
AllocVar(li);
li->parent_name = cloneString(parent_entity);
li->child_name = cloneString(child_entity);
li->link_type = cloneString(link_type);
slAddHead(&pd->links, li);
}
sqlFreeResult(&sr);
hashFree(&hash);
slReverse(&spList);
return spList;
}
void pathwayLevelPipeline(struct biAnalysis *ba)
{
uglyTime(NULL);
struct sqlConnection *biConn = hAllocConnProfile("localDb", ba->db);
struct slPair *spData = analysisValsSamplesHashesList(biConn, ba->inputTables);
uglyTime("got sample hashes");
struct slPair *spPathways = getPathways(biConn);
uglyTime("got pathways");
struct pathwayVals *pvList = pathwayLevelAnalysis(biConn, ba, spData, spPathways);
-uglyTime("analyzed all genesets");
+uglyTime("analyzed all pathways");
/* store analysis vals in database table */
fprintf(stdout, "storing pathway results...\n");
storePathwayValsInDb(biConn, ba->tableName, pvList);
uglyTime("stored all pathways\n");
hFreeConn(&biConn);
slPairHashesFreeList(&spData);
slPairPathwayFreeList(&spPathways);
hFreeConn(&biConn);
}