src/hg/instinct/bioInt2/bioPathwayLevel.c 1.5

1.5 2009/09/05 01:12:01 sbenz
Added em to pipeline
Index: src/hg/instinct/bioInt2/bioPathwayLevel.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/instinct/bioInt2/bioPathwayLevel.c,v
retrieving revision 1.4
retrieving revision 1.5
diff -b -B -U 1000000 -r1.4 -r1.5
--- src/hg/instinct/bioInt2/bioPathwayLevel.c	1 Sep 2009 05:18:02 -0000	1.4
+++ src/hg/instinct/bioInt2/bioPathwayLevel.c	5 Sep 2009 01:12:01 -0000	1.5
@@ -1,602 +1,611 @@
 /* bioPathwayLevel.c -- code to run pathway pipeline */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "jksql.h"
 #include "hPrint.h"
 #include "hdb.h"  
 #include "dystring.h"
 #include "bioIntDb.h"
 #include "bioIntDriver.h"
 #include "cprob.h"
 #include "hgStatsLib.h"
 #include "bioController.h"
 #include "bioPathway.h"
 
 
 char *fgDir = "factorGraph";
 char *tmpDir = "tmp";
 
 struct analysisVals *readAnalysisValsFromFile(char *filename, 
 					      struct entities *enList, int sample_id_old)
 {
 struct lineFile *lf = lineFileOpen(filename, TRUE);
 if (!lf)
     errAbort("File does not exist.");
 struct analysisVals *av, *avList = NULL;
 
 struct entities *en;
 struct hash *hash = hashNew(0);
 for (en = enList; en; en = en->next)
     hashAddInt(hash, en->name, en->id);
 
 // skip the first
 //lineFileSkip(lf,1);
 
 //char *row[2];
 char *line = NULL;
 int sample_id = -1;
 fprintf(stderr,"Starting result parsing...");
 fflush(stderr);
 while (lineFileNext(lf, &line, NULL))
     {
 	if(startsWith(">",line))
 		{
 		char *row[3];		
 		chopString(line," ",row,3);
 		sample_id = atoi(row[1]);
 		}
 	else
 		{
 		char *row[2];
 		chopString(line,"\t",row,2);
 		char *name = row[0];
 		char *valStr = row[1];
 		if (sameString(valStr, "NA"))
 		continue;
 	
 		double val = atof(valStr);
 		int entity_id = hashIntValDefault(hash, name, -1);
 
 		if (entity_id == -1)
 		{
 		fprintf(stderr, "entity %s not found\n", name);
 		continue;
 		}
 		AllocVar(av);
 		av->sample_id = sample_id;
 		av->feature_id = entity_id;
 		av->val = val;
 		av->conf = val;
 		slAddHead(&avList, av);
 		}
     }
 lineFileClose(&lf);
 slReverse(&avList);
 fprintf(stderr,"done!\n");
 
 hashFree(&hash);
 return avList;  
 }
 
 struct pathwayVals *convertToPathwayVals(struct analysisVals *avList, int pathway_id)
 {
 struct pathwayVals *pv, *pvList = NULL;
 
 struct analysisVals *av;
 
 for (av = avList; av; av = av->next)
     {
     AllocVar(pv);
     pv->pathway_id = pathway_id;
     pv->sample_id  = av->sample_id;
     pv->feature_id = av->feature_id;
     pv->val        = av->val;
     pv->conf       = av->conf;
     slAddHead(&pvList, pv);
     }
 slReverse(&pvList);
 
 return pvList;
 }
 
 boolean writePathwayFile(char *filename, struct pathwayData *pd)
 {
 if (!pd)
     return FALSE;
 if (!pd->entities || !pd->links)
     return FALSE;
 
 FILE *f = mustOpen(filename, "w");
 if (!f)
     return FALSE;
 
 struct entities *en;
 for (en = pd->entities; en; en = en->next)
     fprintf(f, "%s\t%s\n", en->type, en->name);
 
 struct links *li;
 for (li = pd->links; li; li = li->next)
     fprintf(f, "%s\t%s\t%s\n", li->parent_name, li->child_name, li->link_type);
  
 return carefulCloseWarn(&f);
 }
 
 boolean prepFactorGraph(struct biAnalysis *ba, void *data)
 {
 if (!data)
     return FALSE;
 
 struct pathwayData *pd = data;
 
 /* make temporary file containing pathway info */
 char tmpPathway[512];
 //safef(tmpPathway, sizeof(tmpPathway), "%s/tmp_%s_path.tab", tmpDir, ba->tableName);
 safef(tmpPathway, sizeof(tmpPathway), "%s/tmp_%s_pid_%d_path.tab", tmpDir, ba->tableName, pd->id);
 
 
 if (!writePathwayFile(tmpPathway, pd))
     errAbort("Problem writing pathway file %s\n", tmpPathway);
 
 return TRUE;
 }
 
 boolean writeEvidenceFile(char *filenamePrefix, struct pathwayData *pd)
 {
 if (!pd)
     return FALSE;
 if (!pd->entities || !pd->links)
     return FALSE;
 
 fprintf(stderr, "Writing evidence file");
 fflush(stderr);
 
 char genomeEvidenceFile[512];
 safef(genomeEvidenceFile, sizeof(genomeEvidenceFile), "%s_genome.tab", filenamePrefix);
 FILE *genomeF = mustOpen(genomeEvidenceFile, "w");
 if (!genomeF)
     return FALSE;
 
 char mrnaEvidenceFile[512];
 safef(mrnaEvidenceFile, sizeof(mrnaEvidenceFile), "%s_mRNA.tab", filenamePrefix);
 FILE *mrnaF = mustOpen(mrnaEvidenceFile, "w");
 if (!mrnaF)
     return FALSE;
 
 // write the upper-left node
 fprintf(genomeF,"id");
 fprintf(mrnaF,"id");
 struct entities *en, *enList = pd->entities;
 char idStr[128];
 // and the header
 for (en = enList; en; en = en->next)
     {
     int id = hashIntValDefault(pd->featureIds, en->name, -1);
     if (id == -1)
 	continue;
     safef(idStr, sizeof(idStr), "%d", id);
 
 	fprintf(mrnaF, "\t%s", en->name);
 	fprintf(genomeF, "\t%s", en->name);
     }
 fprintf(genomeF,"\n");
 fprintf(mrnaF,"\n");
 
 // now write out the data
 struct slPair *sp;
 for (sp = pd->data; sp; sp = sp->next)
 	{
 	struct typeHash *data = sp->val;
 	if (slCount(data) < 2) 
 		continue;  // currently only consider samples with more than one type of evidence.
 	int sample_id = atoi(sp->name);
 	
 	fprintf(genomeF,"%d",sample_id);
 	fprintf(mrnaF,"%d",sample_id);
 	for (en = enList; en; en = en->next)
 		{
 		int id = hashIntValDefault(pd->featureIds, en->name, -1);
 		if (id == -1)
 			continue;
 		safef(idStr, sizeof(idStr), "%d", id);
 		
 		struct typeHash *th;
 		for (th = data; th; th = th->next)
 			{
 			struct hashEl *el = hashLookup(th->hash, idStr);
 			if (!el)
 				continue;
 			
 			struct analysisVals *av = el->val;
 			double val = av->conf;
 			
 			char *type;
 			if (sameString(th->type, "Expression"))
 				fprintf(mrnaF, "\t%f", val);
 			else if (sameString(th->type, "CNV"))
 				fprintf(genomeF, "\t%f", val);
 			else
 				continue;
 			}
 		}
 
 
 	fprintf(genomeF,"\n");
 	fprintf(mrnaF,"\n");
 	fprintf(stderr, ".");
 	fflush(stderr);
 	}
 fprintf(stderr, "done.\n");
 
 boolean gClose = carefulCloseWarn(&genomeF);
 boolean mClose = carefulCloseWarn(&mrnaF);
 
 return (gClose && mClose);
 }
 
 // eventually this will be more dynamic, but for now...
-boolean writeConfigFile(char *filename, char *evidPrefix)
+boolean writeConfigFile(char *filename, char *evidPrefix, struct emParams *emParameters)
 {
 
 FILE *f = mustOpen(filename, "w");
 if (!f)
     return FALSE;
 
 fprintf(f, "inference [pathway_match=_pid_66_,method=BP,updates=SEQFIX,tol=1e-9,maxiter=10000,logdomain=0]\n");
 fprintf(f, "inference [pathway_match=_pid_48_,method=BP,updates=SEQFIX,tol=1e-9,maxiter=10000,logdomain=0]\n");
 //fprintf(f, "inference [pathway_match=_pid_3_,method=BP,updates=SEQFIX,tol=1e-9,maxiter=10000,logdomain=0]\n");
 //fprintf(f, "inference [pathway_match=_pid_9_,method=BP,updates=SEQFIX,tol=1e-9,maxiter=10000,logdomain=0]\n");
 fprintf(f, "inference [method=JTREE,updates=HUGIN,verbose=1]\n");
-fprintf(f, "evidence [suffix=_genome.tab,node=genome,disc=-0.451727;0.467580,epsilon=0.01,epsilon0=0.2]\n");
-fprintf(f, "evidence [suffix=_mRNA.tab,node=mRNA,disc=-0.1817733;0.1824913,epsilon=0.01,epsilon0=0.2]\n");
+
+// print out the EM parameters, if they exist... otherwise use the default epsilon vals
+struct dyString *genomeParamString = dyStringNew(256);
+struct dyString *mrnaParamString = dyStringNew(256);
+if(emParameters != NULL)
+	{
+	int i,j;
+	dyStringPrintf(genomeParamString, "factorParams=");
+	dyStringPrintf(mrnaParamString, "factorParams=");
+	for(i = 0; i < 3; i++)
+		{
+		for(j = 0; j < 3; j++)
+			{
+			dyStringPrintf(genomeParamString, "%f;",emParameters->genome[i][j]);
+			dyStringPrintf(mrnaParamString, "%f;",emParameters->mRNA[i][j]);
+			}
+		}
+	// trim the last ; off
+	dyStringResize(genomeParamString,dyStringLen(genomeParamString)-1);
+	dyStringResize(mrnaParamString,dyStringLen(mrnaParamString)-1);	
+	}
+else
+	{
+	dyStringPrintf(genomeParamString, "epsilon=0.01,epsilon0=0.2");
+	dyStringPrintf(mrnaParamString, "epsilon=0.01,epsilon0=0.2");
+	}
+fprintf(f, "evidence [suffix=_genome.tab,node=genome,disc=-0.451727;0.467580,%s]\n",genomeParamString->string);
+fprintf(f, "evidence [suffix=_mRNA.tab,node=mRNA,disc=-0.1817733;0.1824913,%s]\n",mrnaParamString->string);
 fprintf(f, "em_step [_mRNA.tab=-obs>,_genome.tab=-obs>]\n");
 fprintf(f, "em [max_iters=0,log_z_tol=0.001]\n");
 
 return carefulCloseWarn(&f);
 }
 
 struct analysisVals *factorGraph(struct biAnalysis *ba, void *data,
 				 int sample_id, int feature_id)
 { 
 if (!data)
     return NULL;
 
 struct pathwayData *pd = data;
 
 /* make temporary file containing pathway info */
 char tmpPathway[512];
 safef(tmpPathway, sizeof(tmpPathway), "%s/tmp_%s_pid_%d_path.tab", tmpDir, ba->tableName, pd->id);
 
 /* make temporary file containing evidence */
 char tmpEvidence[512];
 //safef(tmpEvidence, sizeof(tmpEvidence), "%s/tmp_%s_evid.tab", tmpDir, ba->tableName);
 safef(tmpEvidence, sizeof(tmpEvidence), "%s/tmp_%s_evid", tmpDir, ba->tableName);
 
 if (!writeEvidenceFile(tmpEvidence, pd))
     errAbort("Problem writing evidence file %s\n", tmpEvidence);
 
 /* make a new tmp file containing the config */
 char tmpConfig[512];
 safef(tmpConfig, sizeof(tmpConfig), "%s/tmp_%s_config.txt", tmpDir, ba->tableName);
 
-if (!writeConfigFile(tmpConfig, tmpEvidence))
+if (!writeConfigFile(tmpConfig, tmpEvidence, pd->emParameters))
     errAbort("Problem writing config file %s\n", tmpConfig);
 
 /* build command */
 char tmpOutput[128];
 safef(tmpOutput, sizeof(tmpOutput), "%s/tmp.out", tmpDir);
 
 fprintf(stderr, "Running inference...");
 fflush(stderr);
 char command[512];
 safef(command, sizeof(command), 
     "%s/hgFactorGraph -p %s -c %s -b %s > %s", 
     fgDir, tmpPathway, tmpConfig, tmpEvidence, tmpOutput);
 
 /* Run command (cross fingers!) */
 int ret = system(command);
 if (ret)
     {
     fprintf(stderr, "Something went wrong!!\n");
     return NULL;
     }
 fprintf(stderr, "Done!\n");
 
 /* read in data from output */
 struct analysisVals *avList = readAnalysisValsFromFile(tmpOutput, pd->entities, -1); 
 return avList;
 }
 
 /* Pipeline Stuff */
 struct pathwayVals *pathwayLevelAnalysis(struct sqlConnection *biConn, struct biAnalysis *ba, 
 					 struct slPair *spData, struct slPair *spPathways)
 {
 if (!ba->analyze)
     return NULL; 
 
 fprintf(stdout, "starting pathway analysis.\n");
 
 struct hash *featureHash = createIdHash(biConn, AF_TABLE, "feature_name");
 
 struct slPair *pa, *sp;
 
 int count = 0, numPathways = slCount(spPathways);
 
 struct pathwayVals *pvList = NULL;
 for (pa = spPathways; pa; pa = pa->next)
     {
     struct pathwayData *pd = pa->val;
     int feature_id = pd->id;
 
     fprintf(stderr, "prepping pathway %s:\n", pa->name);
     if (!prepFactorGraph(ba, pd))
 	{
 	fprintf(stderr, "problem with prep, skipping pathway.\n");
 	continue;
 	}
 
 	// redesigned to handle all data
 	pd->data = spData;
 	pd->featureIds = featureHash;
+	pd->emParameters = NULL; // null this out for this method
 	struct analysisVals *avList = ba->analyze(ba, pd, -1, feature_id);
 	
-    /* old, single value method
-	struct analysisVals *av, *avList = NULL;
-    for (sp = spData; sp; sp = sp->next)
-	{
-	pd->data = sp->val;
-	if (slCount(pd->data) < 2) 
-	    continue;  // currently only consider samples with more than one type of evidence.
-	pd->featureIds = featureHash;
-	int sample_id = atoi(sp->name);
-	av = ba->analyze(ba, pd, sample_id, -1);
-	if (!av)
-	    break;
-	avList = slCat(avList, av);
-	fprintf(stderr, ".");
-	fflush(stderr);
-	}
-    fprintf(stderr, "\n");*/
-    
     struct pathwayVals *newPvList = convertToPathwayVals(avList, feature_id);
     pvList = slCat(pvList, newPvList);
     count++;
     fprintf(stderr, "%d of %d pathways complete!\n", count, numPathways);
     fflush(stderr);
 
 	char tmpPathway[512];
 	safef(tmpPathway, sizeof(tmpPathway), "%s/tmp_%s_pid_%d_path.tab", tmpDir, ba->tableName, pd->id);
 	if( remove( tmpPathway ) != 0 )
 		fprintf(stderr, "problem with cleanup.\n");
     /*if (!cleanupFactorGraph(ba))
 	fprintf(stderr, "problem with cleanup.\n");*/
     analysisValsFreeList(&avList);
     }
 
 fprintf(stdout, "\n");
 
 return pvList; 
 }            
 
 void entitiesFree(struct entities **pEl)
 {
 struct entities *el;
 
 if ((el = *pEl) == NULL) return;
 
 freeMem(el->type);
 freeMem(el->name);
 freez(pEl);
 }
 
 void entitiesFreeList(struct entities **pList)
 /* Free a list of dynamically allocated pathwayVals's */
 {
 struct entities *el, *next;
 
 for (el = *pList; el != NULL; el = next)
     {
     next = el->next;
     entitiesFree(&el);
     }
 *pList = NULL;
 }    
 
 
 void linksFree(struct links **pEl)
 {
 struct links *el;
 
 if ((el = *pEl) == NULL) return;
 
 freeMem(el->parent_name);
 freeMem(el->child_name);
 freeMem(el->link_type);
 freez(pEl);
 }
 
 void linksFreeList(struct links **pList)
 /* Free a list of dynamically allocated link's */
 {
 struct links *el, *next;
 
 for (el = *pList; el != NULL; el = next)
     {
     next = el->next;
     linksFree(&el);
     }
 *pList = NULL;
 }    
 
 
 void pathwayDataFree(struct pathwayData **pEl)
 {
 struct pathwayData *el;
 
 if ((el = *pEl) == NULL) return;
 
 linksFreeList(&el->links);
 entitiesFreeList(&el->entities);
 
 freez(pEl);
 }
 
 void slPairPathwayFree(struct slPair **pEl)
 {
 struct slPair *el;
 
 if ((el = *pEl) == NULL) return;
 
 freeMem(el->name);
 
 struct pathwayData *pd = el->val;
 pathwayDataFree(&pd);
 freez(pEl);
 }
 
 void slPairPathwayFreeList(struct slPair **pList)
 {
 struct slPair *el, *next;
 
 for (el = *pList; el != NULL; el = next)
     {
     next = el->next;
     slPairPathwayFree(&el);
     }
 *pList = NULL;
 } 
 
 void storePathwayValsInDb(struct sqlConnection *biConn, char *tableName,
 			  struct pathwayVals *pvList)
 {
 if (!sqlTableExists(biConn, tableName))
     createPathwayValsTable(biConn, tableName);
 
 struct pathwayVals *pv;
 for (pv = pvList; pv; pv = pv->next)
     pathwayValsSaveToDb(biConn, pv, tableName, 50);
 }
     
 struct slPair *getPathways(struct sqlConnection *biConn)
 {
 char query[1024];
 safef(query, sizeof(query), 
       "select %s.pathway_id,%s.pathway_name,entity_id,entity_name,entity_type from %s "
       "join %s on %s.pathway_id=%s.pathway_id", // a join across a few tables
       EN_TABLE, EP_TABLE, EN_TABLE, 
       EP_TABLE, EN_TABLE, EP_TABLE);
 
 struct sqlResult *sr = sqlGetResult(biConn, query);
 char **row = NULL;
 
 struct hash *hash = hashNew(0);
 
 struct pathwayData *pd;
 struct slPair *sp, *spList = NULL;
 while ((row = sqlNextRow(sr)) != NULL)
     {
     int pathway_id     = atoi(row[0]);
     char *pathway_name = row[1];   // pathway name
     int entity_id      = atoi(row[2]);
     char *entity_name  = row[3];   // entity name
     char *entity_type  = row[4];   // type (protein, abstract, whatever...)
 
     struct hashEl *el = hashLookup(hash, pathway_name);
     if (!el)
 	{
 	AllocVar(sp);
 	sp->name = cloneString(pathway_name);
 	AllocVar(pd);
 	pd->id = pathway_id;
 	pd->entities = NULL;
 	pd->links = NULL;
 	pd->data = NULL;
 	pd->featureIds = NULL;
 	sp->val = pd;
 	slAddHead(&spList, sp);
 	hashAdd(hash, pathway_name, sp);
 	}
     else
 	sp = el->val;
 
     pd = sp->val;
     if (!pd)
 	continue;
 
     struct entities *en;
     AllocVar(en);
     en->id   = entity_id;
     en->name = cloneString(entity_name);
     en->type = cloneString(entity_type);
     slAddHead(&pd->entities, en);
     }
 sqlFreeResult(&sr);
 
 safef(query, sizeof(query), 
       "select pathway_name,t1.entity_name,t2.entity_name,link_name from %s "
       "join %s on %s.pathway=%s.pathway_id "
       "join %s on link_type=%s.id "
       "join %s as t1 on %s.parent_entity=t1.entity_id "
       "join %s as t2 on %s.child_entity=t2.entity_id ", // a join across a few tables
       EL_TABLE, EP_TABLE, EL_TABLE, EP_TABLE,
       ELT_TABLE, ELT_TABLE, 
       EN_TABLE, EL_TABLE, 
       EN_TABLE, EL_TABLE);
 
 sr = sqlGetResult(biConn, query);
 while ((row = sqlNextRow(sr)) != NULL)
     {
     char *pathway_name  = row[0];   // name
     char *parent_entity = row[1];   // parent name
     char *child_entity  = row[2];   // child name
     char *link_type     = row[3];   // link type, e.g. '->'
     
     struct hashEl *el = hashLookup(hash, pathway_name);
     if (!el)
 	continue;  
 
     sp = el->val;
     pd = sp->val;
 
     if (!pd)
 	continue;
 
     struct links *li;
     AllocVar(li);
     li->parent_name = cloneString(parent_entity);
     li->child_name  = cloneString(child_entity);
     li->link_type   = cloneString(link_type);
     slAddHead(&pd->links, li);
     }
 sqlFreeResult(&sr);
 
 hashFree(&hash);
 
 slReverse(&spList);
 return spList;
 }
 
 void pathwayLevelPipeline(struct biAnalysis *ba)
 {
 uglyTime(NULL);
 struct sqlConnection *biConn = hAllocConnProfile("localDb", ba->db);
 struct slPair *spData = analysisValsSamplesHashesList(biConn, ba->inputTables);
 uglyTime("got sample hashes");   
 
 struct slPair *spPathways = getPathways(biConn);
 uglyTime("got pathways");
 
 struct pathwayVals *pvList = pathwayLevelAnalysis(biConn, ba, spData, spPathways);
 uglyTime("analyzed all pathways");
 
 /* store analysis vals in database table */
 fprintf(stdout, "storing pathway results...\n");
 storePathwayValsInDb(biConn, ba->tableName, pvList);
 uglyTime("stored all pathways\n");
 hFreeConn(&biConn);   
 
 slPairHashesFreeList(&spData);
 slPairPathwayFreeList(&spPathways);
 hFreeConn(&biConn);
 }