src/hg/instinct/bioInt2/pathwayNullDist.c 1.2
1.2 2009/05/20 20:34:36 jsanborn
initial commit
Index: src/hg/instinct/bioInt2/pathwayNullDist.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/instinct/bioInt2/pathwayNullDist.c,v
retrieving revision 1.1
retrieving revision 1.2
diff -b -B -U 1000000 -r1.1 -r1.2
--- src/hg/instinct/bioInt2/pathwayNullDist.c 14 May 2009 22:34:07 -0000 1.1
+++ src/hg/instinct/bioInt2/pathwayNullDist.c 20 May 2009 20:34:36 -0000 1.2
@@ -1,778 +1,848 @@
/* bioPathwayLevel.c -- code to run pathway pipeline */
#include "common.h"
#include "linefile.h"
#include "hash.h"
#include "options.h"
#include "jksql.h"
#include "hPrint.h"
#include "hdb.h"
#include "dystring.h"
#include "bioIntDb.h"
#include "bioIntDriver.h"
#include "cprob.h"
#include "hgStatsLib.h"
#define BAD_VAL 999999.9
void usage()
/* Explain usage and exit. */
{
errAbort(
"pathwayNullDist \n"
"usage:\n"
" pathwayNullDist db pathway_name cohort_id num_iterations\n"
);
}
char *fgDir = "factorGraph";
char *tmpDir = "tmpNull";
struct dataTuple {
struct dataTuple *next;
double cnv;
double exp;
};
/* Gene-level analysis functions */
struct links {
struct links *next;
char *parent_name;
char *child_name;
char *link_type;
};
struct entities {
struct entities *next;
int id;
char *type;
char *name;
};
struct pathwayData {
int id;
struct entities *entities;
struct links *links;
void *data;
struct hash *featureIds;
};
void dataTupleFree(struct dataTuple **pEl)
{
struct dataTuple *el;
if ((el = *pEl) == NULL) return;
freez(pEl);
}
void dataTupleFreeList(struct dataTuple **pList)
{
struct dataTuple *el, *next;
for (el = *pList; el != NULL; el = next)
{
next = el->next;
dataTupleFree(&el);
}
*pList = NULL;
}
struct analysisVals *readAnalysisValsFromFile(char *filename,
struct entities *enList, int sample_id)
{
struct lineFile *lf = lineFileOpen(filename, TRUE);
if (!lf)
errAbort("File does not exist.");
struct analysisVals *av, *avList = NULL;
struct entities *en;
struct hash *hash = hashNew(0);
for (en = enList; en; en = en->next)
hashAddInt(hash, en->name, en->id);
char *row[2];
while (lineFileRowTab(lf, row))
{
char *name = row[0];
double val = atof(row[1]);
int entity_id = hashIntValDefault(hash, name, -1);
if (entity_id == -1)
{
fprintf(stderr, "entity %s not found\n", name);
continue;
}
AllocVar(av);
av->sample_id = sample_id;
av->feature_id = entity_id;
av->val = val;
av->conf = val;
slAddHead(&avList, av);
}
lineFileClose(&lf);
slReverse(&avList);
hashFree(&hash);
return avList;
}
struct pathwayVals *convertToPathwayVals(struct analysisVals *avList, int pathway_id)
{
struct pathwayVals *pv, *pvList = NULL;
struct analysisVals *av;
for (av = avList; av; av = av->next)
{
AllocVar(pv);
pv->pathway_id = pathway_id;
pv->sample_id = av->sample_id;
pv->feature_id = av->feature_id;
pv->val = av->val;
pv->conf = av->conf;
slAddHead(&pvList, pv);
}
slReverse(&pvList);
return pvList;
}
boolean writePathwayFile(char *filename, struct pathwayData *pd)
{
if (!pd)
return FALSE;
if (!pd->entities || !pd->links)
return FALSE;
FILE *f = mustOpen(filename, "w");
if (!f)
return FALSE;
struct entities *en;
for (en = pd->entities; en; en = en->next)
fprintf(f, "%s\t%s\n", en->type, en->name);
struct links *li;
for (li = pd->links; li; li = li->next)
fprintf(f, "%s\t%s\t%s\n", li->parent_name, li->child_name, li->link_type);
return carefulCloseWarn(&f);
}
boolean prepFactorGraph(char *tableName, void *data)
{
if (!data)
return FALSE;
struct pathwayData *pd = data;
/* make temporary file containing pathway info */
char tmpPathway[512];
safef(tmpPathway, sizeof(tmpPathway), "%s/tmp_%s_path.tab", tmpDir, tableName);
if (!writePathwayFile(tmpPathway, pd))
errAbort("Problem writing pathway file %s\n", tmpPathway);
char command[512];
safef(command, sizeof(command),
"%s/patient_2stage_prep.sh %s", fgDir, tmpPathway);
/* Run command (cross fingers!) */
int ret = system(command);
return (ret == 0);
}
boolean cleanupFactorGraph(char *tableName)
{
/* make temporary file containing pathway info */
char tmpPathway[512];
safef(tmpPathway, sizeof(tmpPathway), "%s/tmp_%s_path.tab", tmpDir, tableName);
char command[512];
safef(command, sizeof(command),
"%s/patient_2stage_cleanup.sh %s", fgDir, tmpPathway);
/* Run command (cross fingers!) */
int ret = system(command);
return (ret == 0);
}
boolean writeEvidenceFile(char *filename, struct pathwayData *pd)
{
if (!pd)
return FALSE;
if (!pd->entities || !pd->links)
return FALSE;
FILE *f = mustOpen(filename, "w");
if (!f)
return FALSE;
struct entities *en, *enList = pd->entities;
char idStr[128];
for (en = enList; en; en = en->next)
{
int id = hashIntValDefault(pd->featureIds, en->name, -1);
if (id == -1)
continue;
safef(idStr, sizeof(idStr), "%d", id);
struct typeHash *th;
for (th = pd->data; th; th = th->next)
{
struct hashEl *el = hashLookup(th->hash, idStr);
if (!el)
continue;
struct analysisVals *av = el->val;
double val = av->conf;
char *type;
if (sameString(th->type, "Expression"))
type = "mRNA";
else if (sameString(th->type, "CNV"))
type = "genome";
else
continue;
fprintf(f, "%s\t%s\t%f\n", en->name, type, val);
}
}
return carefulCloseWarn(&f);
}
boolean writeNullEvidenceFile(char *filename, struct pathwayData *pd)
{
if (!pd)
return FALSE;
if (!pd->entities || !pd->links)
return FALSE;
FILE *f = mustOpen(filename, "w");
if (!f)
return FALSE;
struct dataTuple *dt, *dtList = pd->data;
int numTuples = slCount(dtList);
struct entities *en, *enList = pd->entities;
for (en = enList; en; en = en->next)
{
int id = hashIntValDefault(pd->featureIds, en->name, -1);
if (id == -1)
continue;
int randIndex = rand() % numTuples;
char *type;
dt = slElementFromIx(dtList, randIndex);
if (dt->cnv != BAD_VAL)
{
type = "genome";
fprintf(f, "%s\t%s\t%f\n", en->name, type, dt->cnv);
}
if (dt->exp != BAD_VAL)
{
type = "mRNA";
fprintf(f, "%s\t%s\t%f\n", en->name, type, dt->exp);
}
}
return carefulCloseWarn(&f);
}
struct analysisVals *factorGraph(char *tableName, void *data, int sample_id, int feature_id)
{
if (!data)
return NULL;
struct pathwayData *pd = data;
/* make temporary file containing pathway info */
char tmpPathway[512];
safef(tmpPathway, sizeof(tmpPathway), "%s/tmp_%s_path.tab", tmpDir, tableName);
/* make temporary file containing evidence */
char tmpEvidence[512];
-safef(tmpEvidence, sizeof(tmpEvidence), "%s/%s_pid_%d_sample_%d_evid.tab",
- tmpDir, tableName, pd->id, sample_id);
+safef(tmpEvidence, sizeof(tmpEvidence), "%s/tmp_%s_evid.tab", tmpDir, tableName);
+
if (!writeEvidenceFile(tmpEvidence, pd))
errAbort("Problem writing evidence file %s\n", tmpEvidence);
/* build command */
char tmpOutput[128];
safef(tmpOutput, sizeof(tmpOutput), "%s/%s_pid_%d_sample_%d_output.tab",
tmpDir, tableName, pd->id, sample_id);
char command[512];
safef(command, sizeof(command),
"%s/patient_2stage_exec.sh %s %s \"-d -1.3,1.3\" > %s",
fgDir, tmpPathway, tmpEvidence, tmpOutput);
/* Run command (cross fingers!) */
int ret = system(command);
if (ret)
errAbort("Something went wrong!!");
/* read in data from output */
struct analysisVals *avList = readAnalysisValsFromFile(tmpOutput, pd->entities, sample_id);
return avList;
}
-void factorGraphNull(char *tableName, void *data, int sample_id, int feature_id)
+void factorGraphNull(char *tableName, void *data, int sample_id, int feature_id,
+ char *nullType)
{
-
struct pathwayData *pd = data;
/* make temporary file containing pathway info */
char tmpPathway[512];
safef(tmpPathway, sizeof(tmpPathway),
"%s/tmp_%s_path.tab", tmpDir, tableName);
/* make temporary file containing evidence */
char tmpEvidence[512];
-safef(tmpEvidence, sizeof(tmpEvidence),
- "%s/null_%s_pid_%d_iter_%d_evid.tab",
- tmpDir, tableName, pd->id, sample_id);
+safef(tmpEvidence, sizeof(tmpEvidence), "%s/tmp_%s_evid.tab", tmpDir, tableName);
if (!writeNullEvidenceFile(tmpEvidence, pd))
errAbort("Problem writing evidence file %s\n", tmpEvidence);
/* build command */
char tmpOutput[128];
-safef(tmpOutput, sizeof(tmpOutput),
- "%s/null_%s_pid_%d_iter_%d_output.tab",
+if (sameString(nullType, "allData"))
+ safef(tmpOutput, sizeof(tmpOutput),
+ "%s/null_allData_%s_pid_%d_iter_%d_output.tab",
+ tmpDir, tableName, pd->id, sample_id);
+else
+ safef(tmpOutput, sizeof(tmpOutput),
+ "%s/null_within_%s_pid_%d_iter_%d_output.tab",
tmpDir, tableName, pd->id, sample_id);
char command[512];
safef(command, sizeof(command),
"%s/patient_2stage_exec.sh %s %s \"-d -1.3,1.3\" > %s",
fgDir, tmpPathway, tmpEvidence, tmpOutput);
/* Run command (cross fingers!) */
int ret = system(command);
if (ret)
errAbort("Something went wrong!!");
}
/* Pipeline Stuff */
struct pathwayVals *pathwayLevelAnalysis(struct sqlConnection *biConn, char *tableName,
struct slPair *spData, struct slPair *spPathways)
{
fprintf(stdout, "starting geneset analysis.\n");
struct hash *featureHash = createIdHash(biConn, AF_TABLE, "feature_name");
struct slPair *pa, *sp;
int count = 0, numPathways = slCount(spPathways);
for (pa = spPathways; pa; pa = pa->next)
{
struct pathwayData *pd = pa->val;
fprintf(stderr, "prepping pathway %s:\n", pa->name);
if (!prepFactorGraph(tableName, pd))
{
fprintf(stderr, "problem with prep, skipping pathway.\n");
continue;
}
for (sp = spData; sp; sp = sp->next)
{
pd->data = sp->val;
if (slCount(pd->data) < 2)
continue; // currently only consider samples with more than one type of evidence.
pd->featureIds = featureHash;
int sample_id = atoi(sp->name);
factorGraph(tableName, pd, sample_id, -1);
fprintf(stderr, ".");
fflush(stderr);
}
fprintf(stderr, "\n");
count++;
fprintf(stderr, "%d of %d pathways\n", count, numPathways);
fflush(stderr);
if (!cleanupFactorGraph(tableName))
fprintf(stderr, "problem with cleanup.\n");
}
fprintf(stdout, "\n");
return NULL;
}
struct slName *getAnalysisFeatureNames(struct sqlConnection *biConn, char *tableName,
char *type)
{
char query[256];
safef(query, sizeof(query),
"select feature_name from %s where type = \"%s\"",
tableName, type);
return sqlQuickList(biConn, query);
}
-struct dataTuple *getDataTuples(struct sqlConnection *biConn, struct hash *featureHash,
+struct dataTuple *getAllDataTuples(struct sqlConnection *biConn, struct hash *featureHash,
struct slPair *spData)
{
struct slName *sl, *featureNames = getAnalysisFeatureNames(biConn, AF_TABLE, "gene");
char idStr[256];
struct slName *featureIds = NULL;
for (sl = featureNames; sl; sl = sl->next)
{
int id = hashIntValDefault(featureHash, sl->name, -1);
if (id == -1)
continue;
safef(idStr, sizeof(idStr), "%d", id);
slNameAddHead(&featureIds, idStr);
}
slNameFreeList(&featureNames);
struct slPair *sp;
struct dataTuple *dt, *dtList = NULL;
for (sp = spData; sp; sp = sp->next)
{
struct typeHash *th, *thList = sp->val;
if (slCount(thList) < 2)
continue; // currently only consider samples with more than one type of evidence.
for (sl = featureIds; sl; sl = sl->next)
{
AllocVar(dt);
dt->cnv = BAD_VAL;
dt->exp = BAD_VAL;
for (th = thList; th; th = th->next)
{
struct hashEl *el = hashLookup(th->hash, sl->name);
if (!el)
continue;
struct analysisVals *av = el->val;
if (sameString(th->type, "Expression"))
dt->exp = av->conf;
else if (sameString(th->type, "CNV"))
dt->cnv = av->conf;
}
if (dt->cnv == BAD_VAL && dt->exp == BAD_VAL)
dataTupleFree(&dt);
else
slAddHead(&dtList, dt);
}
}
return dtList;
}
+struct dataTuple *getPathwayDataTuples(struct sqlConnection *biConn, struct hash *featureHash,
+ struct slPair *spData, struct slPair *spPathway)
+{
+struct pathwayData *pd = spPathway->val;
+struct entities *en, *enList = pd->entities;
+struct slName *featureIds = NULL;
+char idStr[128];
+for (en = enList; en; en = en->next)
+ {
+ int id = hashIntValDefault(featureHash, en->name, -1);
+ if (id == -1)
+ continue;
+
+ safef(idStr, sizeof(idStr), "%d", id);
+ slNameAddHead(&featureIds, idStr);
+ }
+
+struct slPair *sp;
+struct dataTuple *dt, *dtList = NULL;
+for (sp = spData; sp; sp = sp->next)
+ {
+ struct typeHash *th, *thList = sp->val;
+ if (slCount(thList) < 2)
+ continue; // currently only consider samples with more than one type of evidence.
+
+ struct slName *sl;
+ for (sl = featureIds; sl; sl = sl->next)
+ {
+ AllocVar(dt);
+ dt->cnv = BAD_VAL;
+ dt->exp = BAD_VAL;
+ for (th = thList; th; th = th->next)
+ {
+ struct hashEl *el = hashLookup(th->hash, sl->name);
+ if (!el)
+ continue;
+
+ struct analysisVals *av = el->val;
+ if (sameString(th->type, "Expression"))
+ dt->exp = av->conf;
+ else if (sameString(th->type, "CNV"))
+ dt->cnv = av->conf;
+ }
+ if (dt->cnv == BAD_VAL && dt->exp == BAD_VAL)
+ dataTupleFree(&dt);
+ else
+ slAddHead(&dtList, dt);
+ }
+ }
+
+return dtList;
+}
+
/* Pipeline Stuff */
struct pathwayVals *pathwayNullAnalysis(struct sqlConnection *biConn, char *tableName,
struct slPair *spData, struct slPair *spPathways,
- int numIters)
+ int numIters, char *nullType)
{
fprintf(stdout, "starting null analysis analysis.\n");
struct hash *featureHash = createIdHash(biConn, AF_TABLE, "feature_name");
-struct dataTuple *dtList = getDataTuples(biConn, featureHash, spData);
+
+struct dataTuple *dtList = NULL;
+if (sameString(nullType, "allData"))
+ dtList = getAllDataTuples(biConn, featureHash, spData);
+else if (sameString(nullType, "withinPathway"))
+ dtList = getPathwayDataTuples(biConn, featureHash, spData, spPathways);
+else
+ errAbort("%s not supported", nullType);
int count = 0, numPathways = slCount(spPathways);
struct slPair *pa;
for (pa = spPathways; pa; pa = pa->next)
{
struct pathwayData *pd = pa->val;
pd->featureIds = featureHash;
fprintf(stderr, "prepping pathway %s:\n", pa->name);
if (!prepFactorGraph(tableName, pd))
{
fprintf(stderr, "problem with prep, skipping pathway.\n");
continue;
}
+
pd->data = dtList;
int i;
for (i = 0; i < numIters; i++)
{
- factorGraphNull(tableName, pd, i, -1);
+ factorGraphNull(tableName, pd, i, -1, nullType);
fprintf(stderr, ".");
fflush(stderr);
}
fprintf(stderr, "\n");
count++;
fprintf(stderr, "%d of %d pathways\n", count, numPathways);
fflush(stderr);
}
fprintf(stdout, "\n");
return NULL;
}
void entitiesFree(struct entities **pEl)
{
struct entities *el;
if ((el = *pEl) == NULL) return;
freeMem(el->type);
freeMem(el->name);
freez(pEl);
}
void entitiesFreeList(struct entities **pList)
/* Free a list of dynamically allocated pathwayVals's */
{
struct entities *el, *next;
for (el = *pList; el != NULL; el = next)
{
next = el->next;
entitiesFree(&el);
}
*pList = NULL;
}
void linksFree(struct links **pEl)
{
struct links *el;
if ((el = *pEl) == NULL) return;
freeMem(el->parent_name);
freeMem(el->child_name);
freeMem(el->link_type);
freez(pEl);
}
void linksFreeList(struct links **pList)
/* Free a list of dynamically allocated link's */
{
struct links *el, *next;
for (el = *pList; el != NULL; el = next)
{
next = el->next;
linksFree(&el);
}
*pList = NULL;
}
void pathwayDataFree(struct pathwayData **pEl)
{
struct pathwayData *el;
if ((el = *pEl) == NULL) return;
linksFreeList(&el->links);
entitiesFreeList(&el->entities);
freez(pEl);
}
void slPairPathwayFree(struct slPair **pEl)
{
struct slPair *el;
if ((el = *pEl) == NULL) return;
freeMem(el->name);
struct pathwayData *pd = el->val;
pathwayDataFree(&pd);
freez(pEl);
}
void slPairPathwayFreeList(struct slPair **pList)
{
struct slPair *el, *next;
for (el = *pList; el != NULL; el = next)
{
next = el->next;
slPairPathwayFree(&el);
}
*pList = NULL;
}
struct slPair *getPathwaysByName(struct sqlConnection *biConn, char *pathwayName)
{
char query[2048];
if (!pathwayName)
safef(query, sizeof(query),
"select %s.pathway_id,%s.pathway_name,entity_id,entity_name,entity_type from %s "
"join %s on %s.pathway_id=%s.pathway_id", // a join across a few tables
EN_TABLE, EP_TABLE, EN_TABLE,
EP_TABLE, EN_TABLE, EP_TABLE);
else
safef(query, sizeof(query),
"select %s.pathway_id,%s.pathway_name,entity_id,entity_name,entity_type from %s "
"join %s on %s.pathway_id=%s.pathway_id where %s.pathway_name = \"%s\"",
EN_TABLE, EP_TABLE, EN_TABLE,
EP_TABLE, EN_TABLE, EP_TABLE,
EP_TABLE, pathwayName);
struct sqlResult *sr = sqlGetResult(biConn, query);
char **row = NULL;
struct hash *hash = hashNew(0);
struct pathwayData *pd;
struct slPair *sp, *spList = NULL;
while ((row = sqlNextRow(sr)) != NULL)
{
int pathway_id = atoi(row[0]);
char *pathway_name = row[1]; // pathway name
int entity_id = atoi(row[2]);
char *entity_name = row[3]; // entity name
char *entity_type = row[4]; // type (protein, abstract, whatever...)
struct hashEl *el = hashLookup(hash, pathway_name);
if (!el)
{
AllocVar(sp);
sp->name = cloneString(pathway_name);
AllocVar(pd);
pd->id = pathway_id;
pd->entities = NULL;
pd->links = NULL;
pd->data = NULL;
pd->featureIds = NULL;
sp->val = pd;
slAddHead(&spList, sp);
hashAdd(hash, pathway_name, sp);
}
else
sp = el->val;
pd = sp->val;
if (!pd)
continue;
struct entities *en;
AllocVar(en);
en->id = entity_id;
en->name = cloneString(entity_name);
en->type = cloneString(entity_type);
slAddHead(&pd->entities, en);
}
sqlFreeResult(&sr);
safef(query, sizeof(query),
"select pathway_name,t1.entity_name,t2.entity_name,link_name from %s "
"join %s on %s.pathway=%s.pathway_id "
"join %s on link_type=%s.id "
"join %s as t1 on %s.parent_entity=t1.entity_id "
"join %s as t2 on %s.child_entity=t2.entity_id ", // a join across a few tables
EL_TABLE, EP_TABLE, EL_TABLE, EP_TABLE,
ELT_TABLE, ELT_TABLE,
EN_TABLE, EL_TABLE,
EN_TABLE, EL_TABLE);
sr = sqlGetResult(biConn, query);
while ((row = sqlNextRow(sr)) != NULL)
{
char *pathway_name = row[0]; // name
char *parent_entity = row[1]; // parent name
char *child_entity = row[2]; // child name
char *link_type = row[3]; // link type, e.g. '->'
struct hashEl *el = hashLookup(hash, pathway_name);
if (!el)
continue;
sp = el->val;
pd = sp->val;
if (!pd)
continue;
struct links *li;
AllocVar(li);
li->parent_name = cloneString(parent_entity);
li->child_name = cloneString(child_entity);
li->link_type = cloneString(link_type);
slAddHead(&pd->links, li);
}
sqlFreeResult(&sr);
hashFree(&hash);
slReverse(&spList);
return spList;
}
char *getCohortName(struct sqlConnection *biConn, int cohort_id)
{
char query[256];
safef(query, sizeof(query),
"select name from %s where id = %d",
CO_TABLE, cohort_id);
return sqlQuickString(biConn, query);
}
struct slName *getCohortTables(struct sqlConnection *biConn, int cohort_id)
{
char query[256];
safef(query, sizeof(query),
"select data_table from %s join %s on id=dataset_id where cohort_id = %d",
DC_TABLE, DA_TABLE, cohort_id);
return sqlQuickList(biConn, query);
}
void pathwayLevelPipeline(char *db, char *pathway, int cohort_id, int numIters)
{
struct sqlConnection *biConn = hAllocConnProfile("localDb", db);
char *cohort_name = getCohortName(biConn, cohort_id);
struct slName *tableNames = getCohortTables(biConn, cohort_id);
uglyTime(NULL);
struct slPair *spPathways = getPathwaysByName(biConn, pathway);
uglyTime("got pathways");
+if (slCount(spPathways) == 0)
+ errAbort("No pathways by name of %s", pathway);
+
struct slPair *spData = analysisValsSamplesHashesList(biConn, tableNames);
uglyTime("got sample hashes");
-//pathwayLevelAnalysis(biConn, cohort_name, spData, spPathways);
-//uglyTime("analyzed all genesets");
+pathwayLevelAnalysis(biConn, cohort_name, spData, spPathways);
+uglyTime("analyzed all genesets");
+
+pathwayNullAnalysis(biConn, cohort_name, spData, spPathways, numIters, "withinPathway");
+uglyTime("produced null distribution");
-pathwayNullAnalysis(biConn, cohort_name, spData, spPathways, numIters);
+pathwayNullAnalysis(biConn, cohort_name, spData, spPathways, numIters, "allData");
uglyTime("produced null distribution");
slPairHashesFreeList(&spData);
slPairPathwayFreeList(&spPathways);
hFreeConn(&biConn);
}
int main(int argc, char *argv[])
/* Process command line. */
{
if (argc != 5)
usage();
srand ( time(NULL) );
char *db = argv[1];
char *pathway = argv[2];
int cohort_id = atoi(argv[3]);
int numIters = atoi(argv[4]);
pathwayLevelPipeline(db, pathway, cohort_id, numIters);
return 0;
}