src/hg/instinct/bioInt2/bioPathwayLevel.c 1.1
1.1 2009/05/20 20:34:36 jsanborn
initial commit
Index: src/hg/instinct/bioInt2/bioPathwayLevel.c
===================================================================
RCS file: src/hg/instinct/bioInt2/bioPathwayLevel.c
diff -N src/hg/instinct/bioInt2/bioPathwayLevel.c
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ src/hg/instinct/bioInt2/bioPathwayLevel.c 20 May 2009 20:34:36 -0000 1.1
@@ -0,0 +1,527 @@
+/* bioPathwayLevel.c -- code to run pathway pipeline */
+#include "common.h"
+#include "linefile.h"
+#include "hash.h"
+#include "options.h"
+#include "jksql.h"
+#include "hPrint.h"
+#include "hdb.h"
+#include "dystring.h"
+#include "bioIntDb.h"
+#include "bioIntDriver.h"
+#include "cprob.h"
+#include "hgStatsLib.h"
+#include "bioController.h"
+
+
+char *fgDir = "factorGraph";
+char *tmpDir = "tmp";
+
+/* Gene-level analysis functions */
+struct links {
+ struct links *next;
+ char *parent_name;
+ char *child_name;
+ char *link_type;
+};
+
+struct entities {
+ struct entities *next;
+ int id;
+ char *type;
+ char *name;
+};
+
+struct pathwayData {
+ int id;
+ struct entities *entities;
+ struct links *links;
+
+ struct typeHash *data;
+ struct hash *featureIds;
+};
+
+struct analysisVals *readAnalysisValsFromFile(char *filename,
+ struct entities *enList, int sample_id)
+{
+struct lineFile *lf = lineFileOpen(filename, TRUE);
+if (!lf)
+ errAbort("File does not exist.");
+struct analysisVals *av, *avList = NULL;
+
+struct entities *en;
+struct hash *hash = hashNew(0);
+for (en = enList; en; en = en->next)
+ hashAddInt(hash, en->name, en->id);
+
+char *row[2];
+while (lineFileRowTab(lf, row))
+ {
+ char *name = row[0];
+ double val = atof(row[1]);
+ int entity_id = hashIntValDefault(hash, name, -1);
+ if (entity_id == -1)
+ {
+ fprintf(stderr, "entity %s not found\n", name);
+ continue;
+ }
+ AllocVar(av);
+ av->sample_id = sample_id;
+ av->feature_id = entity_id;
+ av->val = val;
+ av->conf = val;
+ slAddHead(&avList, av);
+ }
+lineFileClose(&lf);
+slReverse(&avList);
+
+hashFree(&hash);
+return avList;
+}
+
+struct pathwayVals *convertToPathwayVals(struct analysisVals *avList, int pathway_id)
+{
+struct pathwayVals *pv, *pvList = NULL;
+
+struct analysisVals *av;
+
+for (av = avList; av; av = av->next)
+ {
+ AllocVar(pv);
+ pv->pathway_id = pathway_id;
+ pv->sample_id = av->sample_id;
+ pv->feature_id = av->feature_id;
+ pv->val = av->val;
+ pv->conf = av->conf;
+ slAddHead(&pvList, pv);
+ }
+slReverse(&pvList);
+
+return pvList;
+}
+
+boolean writePathwayFile(char *filename, struct pathwayData *pd)
+{
+if (!pd)
+ return FALSE;
+if (!pd->entities || !pd->links)
+ return FALSE;
+
+FILE *f = mustOpen(filename, "w");
+if (!f)
+ return FALSE;
+
+struct entities *en;
+for (en = pd->entities; en; en = en->next)
+ fprintf(f, "%s\t%s\n", en->type, en->name);
+
+struct links *li;
+for (li = pd->links; li; li = li->next)
+ fprintf(f, "%s\t%s\t%s\n", li->parent_name, li->child_name, li->link_type);
+
+return carefulCloseWarn(&f);
+}
+
+boolean prepFactorGraph(struct biAnalysis *ba, void *data)
+{
+if (!data)
+ return FALSE;
+
+struct pathwayData *pd = data;
+
+/* make temporary file containing pathway info */
+char tmpPathway[512];
+safef(tmpPathway, sizeof(tmpPathway), "%s/tmp_%s_path.tab", tmpDir, ba->tableName);
+
+if (!writePathwayFile(tmpPathway, pd))
+ errAbort("Problem writing pathway file %s\n", tmpPathway);
+
+char command[512];
+safef(command, sizeof(command),
+ "%s/patient_2stage_prep.sh %s", fgDir, tmpPathway);
+
+/* Run command (cross fingers!) */
+int ret = system(command);
+return (ret == 0);
+}
+
+boolean cleanupFactorGraph(struct biAnalysis *ba)
+{
+/* make temporary file containing pathway info */
+char tmpPathway[512];
+safef(tmpPathway, sizeof(tmpPathway), "%s/tmp_%s_path.tab", tmpDir, ba->tableName);
+
+char command[512];
+safef(command, sizeof(command),
+ "%s/patient_2stage_cleanup.sh %s", fgDir, tmpPathway);
+
+/* Run command (cross fingers!) */
+int ret = system(command);
+return (ret == 0);
+}
+
+
+boolean writeEvidenceFile(char *filename, struct pathwayData *pd)
+{
+if (!pd)
+ return FALSE;
+if (!pd->entities || !pd->links)
+ return FALSE;
+
+FILE *f = mustOpen(filename, "w");
+if (!f)
+ return FALSE;
+
+struct entities *en, *enList = pd->entities;
+char idStr[128];
+for (en = enList; en; en = en->next)
+ {
+ int id = hashIntValDefault(pd->featureIds, en->name, -1);
+ if (id == -1)
+ continue;
+ safef(idStr, sizeof(idStr), "%d", id);
+
+ struct typeHash *th;
+ for (th = pd->data; th; th = th->next)
+ {
+ struct hashEl *el = hashLookup(th->hash, idStr);
+ if (!el)
+ continue;
+
+ struct analysisVals *av = el->val;
+ double val = av->conf;
+
+ char *type;
+ if (sameString(th->type, "Expression"))
+ type = "mRNA";
+ else if (sameString(th->type, "CNV"))
+ type = "genome";
+ else
+ continue;
+
+ fprintf(f, "%s\t%s\t%f\n", en->name, type, val);
+ }
+ }
+
+return carefulCloseWarn(&f);
+}
+
+struct analysisVals *factorGraph(struct biAnalysis *ba, void *data,
+ int sample_id, int feature_id)
+{
+if (!data)
+ return NULL;
+
+struct pathwayData *pd = data;
+
+/* make temporary file containing pathway info */
+char tmpPathway[512];
+safef(tmpPathway, sizeof(tmpPathway), "%s/tmp_%s_path.tab", tmpDir, ba->tableName);
+
+/* make temporary file containing evidence */
+char tmpEvidence[512];
+safef(tmpEvidence, sizeof(tmpEvidence), "%s/tmp_%s_evid.tab", tmpDir, ba->tableName);
+
+if (!writeEvidenceFile(tmpEvidence, pd))
+ errAbort("Problem writing evidence file %s\n", tmpEvidence);
+
+/* build command */
+char tmpOutput[128];
+safef(tmpOutput, sizeof(tmpOutput), "%s/tmp.out", tmpDir);
+
+char command[512];
+safef(command, sizeof(command),
+ "%s/patient_2stage_exec.sh %s %s \"-d -1.3,1.3\" > %s",
+ fgDir, tmpPathway, tmpEvidence, tmpOutput);
+
+/* Run command (cross fingers!) */
+int ret = system(command);
+if (ret)
+ errAbort("Something went wrong!!");
+
+/* read in data from output */
+struct analysisVals *avList = readAnalysisValsFromFile(tmpOutput, pd->entities, sample_id);
+return avList;
+}
+
+/* Pipeline Stuff */
+struct pathwayVals *pathwayLevelAnalysis(struct sqlConnection *biConn, struct biAnalysis *ba,
+ struct slPair *spData, struct slPair *spPathways)
+{
+if (!ba->analyze)
+ return NULL;
+
+fprintf(stdout, "starting geneset analysis.\n");
+
+struct hash *featureHash = createIdHash(biConn, AF_TABLE, "feature_name");
+
+struct slPair *pa, *sp;
+
+int count = 0, numPathways = slCount(spPathways);
+
+struct pathwayVals *pvList = NULL;
+for (pa = spPathways; pa; pa = pa->next)
+ {
+ struct pathwayData *pd = pa->val;
+ int feature_id = pd->id;
+
+ fprintf(stderr, "prepping pathway %s:\n", pa->name);
+ if (!prepFactorGraph(ba, pd))
+ {
+ fprintf(stderr, "problem with prep, skipping pathway.\n");
+ continue;
+ }
+ struct analysisVals *av, *avList = NULL;
+ for (sp = spData; sp; sp = sp->next)
+ {
+ pd->data = sp->val;
+ if (slCount(pd->data) < 2)
+ continue; // currently only consider samples with more than one type of evidence.
+ pd->featureIds = featureHash;
+ int sample_id = atoi(sp->name);
+ av = ba->analyze(ba, pd, sample_id, -1);
+ if (!av)
+ break;
+ avList = slCat(avList, av);
+ fprintf(stderr, ".");
+ fflush(stderr);
+ }
+ fprintf(stderr, "\n");
+
+ struct pathwayVals *newPvList = convertToPathwayVals(avList, feature_id);
+ pvList = slCat(pvList, newPvList);
+ count++;
+ fprintf(stderr, "%d of %d pathways\n", count, numPathways);
+ fflush(stderr);
+
+ if (!cleanupFactorGraph(ba))
+ fprintf(stderr, "problem with cleanup.\n");
+ analysisValsFreeList(&avList);
+ }
+
+fprintf(stdout, "\n");
+
+return pvList;
+}
+
+void entitiesFree(struct entities **pEl)
+{
+struct entities *el;
+
+if ((el = *pEl) == NULL) return;
+
+freeMem(el->type);
+freeMem(el->name);
+freez(pEl);
+}
+
+void entitiesFreeList(struct entities **pList)
+/* Free a list of dynamically allocated pathwayVals's */
+{
+struct entities *el, *next;
+
+for (el = *pList; el != NULL; el = next)
+ {
+ next = el->next;
+ entitiesFree(&el);
+ }
+*pList = NULL;
+}
+
+
+void linksFree(struct links **pEl)
+{
+struct links *el;
+
+if ((el = *pEl) == NULL) return;
+
+freeMem(el->parent_name);
+freeMem(el->child_name);
+freeMem(el->link_type);
+freez(pEl);
+}
+
+void linksFreeList(struct links **pList)
+/* Free a list of dynamically allocated link's */
+{
+struct links *el, *next;
+
+for (el = *pList; el != NULL; el = next)
+ {
+ next = el->next;
+ linksFree(&el);
+ }
+*pList = NULL;
+}
+
+
+void pathwayDataFree(struct pathwayData **pEl)
+{
+struct pathwayData *el;
+
+if ((el = *pEl) == NULL) return;
+
+linksFreeList(&el->links);
+entitiesFreeList(&el->entities);
+
+freez(pEl);
+}
+
+void slPairPathwayFree(struct slPair **pEl)
+{
+struct slPair *el;
+
+if ((el = *pEl) == NULL) return;
+
+freeMem(el->name);
+
+struct pathwayData *pd = el->val;
+pathwayDataFree(&pd);
+freez(pEl);
+}
+
+void slPairPathwayFreeList(struct slPair **pList)
+{
+struct slPair *el, *next;
+
+for (el = *pList; el != NULL; el = next)
+ {
+ next = el->next;
+ slPairPathwayFree(&el);
+ }
+*pList = NULL;
+}
+
+void storePathwayValsInDb(struct sqlConnection *biConn, char *tableName,
+ struct pathwayVals *pvList)
+{
+if (!sqlTableExists(biConn, tableName))
+ createPathwayValsTable(biConn, tableName);
+
+struct pathwayVals *pv;
+for (pv = pvList; pv; pv = pv->next)
+ pathwayValsSaveToDb(biConn, pv, tableName, 50);
+}
+
+struct slPair *getPathways(struct sqlConnection *biConn)
+{
+char query[1024];
+safef(query, sizeof(query),
+ "select %s.pathway_id,%s.pathway_name,entity_id,entity_name,entity_type from %s "
+ "join %s on %s.pathway_id=%s.pathway_id", // a join across a few tables
+ EN_TABLE, EP_TABLE, EN_TABLE,
+ EP_TABLE, EN_TABLE, EP_TABLE);
+
+struct sqlResult *sr = sqlGetResult(biConn, query);
+char **row = NULL;
+
+struct hash *hash = hashNew(0);
+
+struct pathwayData *pd;
+struct slPair *sp, *spList = NULL;
+while ((row = sqlNextRow(sr)) != NULL)
+ {
+ int pathway_id = atoi(row[0]);
+ char *pathway_name = row[1]; // pathway name
+ int entity_id = atoi(row[2]);
+ char *entity_name = row[3]; // entity name
+ char *entity_type = row[4]; // type (protein, abstract, whatever...)
+
+ struct hashEl *el = hashLookup(hash, pathway_name);
+ if (!el)
+ {
+ AllocVar(sp);
+ sp->name = cloneString(pathway_name);
+ AllocVar(pd);
+ pd->id = pathway_id;
+ pd->entities = NULL;
+ pd->links = NULL;
+ pd->data = NULL;
+ pd->featureIds = NULL;
+ sp->val = pd;
+ slAddHead(&spList, sp);
+ hashAdd(hash, pathway_name, sp);
+ }
+ else
+ sp = el->val;
+
+ pd = sp->val;
+ if (!pd)
+ continue;
+
+ struct entities *en;
+ AllocVar(en);
+ en->id = entity_id;
+ en->name = cloneString(entity_name);
+ en->type = cloneString(entity_type);
+ slAddHead(&pd->entities, en);
+ }
+sqlFreeResult(&sr);
+
+safef(query, sizeof(query),
+ "select pathway_name,t1.entity_name,t2.entity_name,link_name from %s "
+ "join %s on %s.pathway=%s.pathway_id "
+ "join %s on link_type=%s.id "
+ "join %s as t1 on %s.parent_entity=t1.entity_id "
+ "join %s as t2 on %s.child_entity=t2.entity_id ", // a join across a few tables
+ EL_TABLE, EP_TABLE, EL_TABLE, EP_TABLE,
+ ELT_TABLE, ELT_TABLE,
+ EN_TABLE, EL_TABLE,
+ EN_TABLE, EL_TABLE);
+
+sr = sqlGetResult(biConn, query);
+while ((row = sqlNextRow(sr)) != NULL)
+ {
+ char *pathway_name = row[0]; // name
+ char *parent_entity = row[1]; // parent name
+ char *child_entity = row[2]; // child name
+ char *link_type = row[3]; // link type, e.g. '->'
+
+ struct hashEl *el = hashLookup(hash, pathway_name);
+ if (!el)
+ continue;
+
+ sp = el->val;
+ pd = sp->val;
+
+ if (!pd)
+ continue;
+
+ struct links *li;
+ AllocVar(li);
+ li->parent_name = cloneString(parent_entity);
+ li->child_name = cloneString(child_entity);
+ li->link_type = cloneString(link_type);
+ slAddHead(&pd->links, li);
+ }
+sqlFreeResult(&sr);
+
+hashFree(&hash);
+
+slReverse(&spList);
+return spList;
+}
+
+void pathwayLevelPipeline(struct biAnalysis *ba)
+{
+uglyTime(NULL);
+struct sqlConnection *biConn = hAllocConnProfile("localDb", ba->db);
+struct slPair *spData = analysisValsSamplesHashesList(biConn, ba->inputTables);
+uglyTime("got sample hashes");
+
+struct slPair *spPathways = getPathways(biConn);
+uglyTime("got pathways");
+
+struct pathwayVals *pvList = pathwayLevelAnalysis(biConn, ba, spData, spPathways);
+uglyTime("analyzed all genesets");
+
+/* store analysis vals in database table */
+fprintf(stdout, "storing pathway results...\n");
+storePathwayValsInDb(biConn, ba->tableName, pvList);
+uglyTime("stored all pathways\n");
+hFreeConn(&biConn);
+
+slPairHashesFreeList(&spData);
+slPairPathwayFreeList(&spPathways);
+hFreeConn(&biConn);
+}