src/hg/instinct/bioInt2/bioPathwayLevel.c 1.4

1.4 2009/09/01 05:18:02 sbenz
Added beginnings of EM pathway analysis
Index: src/hg/instinct/bioInt2/bioPathwayLevel.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/instinct/bioInt2/bioPathwayLevel.c,v
retrieving revision 1.3
retrieving revision 1.4
diff -b -B -U 4 -r1.3 -r1.4
--- src/hg/instinct/bioInt2/bioPathwayLevel.c	16 Aug 2009 02:19:17 -0000	1.3
+++ src/hg/instinct/bioInt2/bioPathwayLevel.c	1 Sep 2009 05:18:02 -0000	1.4
@@ -11,39 +11,16 @@
 #include "bioIntDriver.h"
 #include "cprob.h"
 #include "hgStatsLib.h"
 #include "bioController.h"
+#include "bioPathway.h"
 
 
 char *fgDir = "factorGraph";
 char *tmpDir = "tmp";
 
-/* Gene-level analysis functions */
-struct links {
-    struct links *next;
-    char *parent_name;
-    char *child_name;
-    char *link_type;
-};
-
-struct entities {
-    struct entities *next;
-    int id;
-    char *type;
-    char *name;
-};
-
-struct pathwayData {
-    int id;
-    struct entities *entities;
-    struct links *links;
-    
-    struct typeHash *data;
-    struct hash *featureIds;
-};
-
 struct analysisVals *readAnalysisValsFromFile(char *filename, 
-					      struct entities *enList, int sample_id)
+					      struct entities *enList, int sample_id_old)
 {
 struct lineFile *lf = lineFileOpen(filename, TRUE);
 if (!lf)
     errAbort("File does not exist.");
@@ -54,13 +31,27 @@
 for (en = enList; en; en = en->next)
     hashAddInt(hash, en->name, en->id);
 
 // skip the first
-lineFileSkip(lf,1);
+//lineFileSkip(lf,1);
 
-char *row[2];
-while (lineFileRowTab(lf, row))
+//char *row[2];
+char *line = NULL;
+int sample_id = -1;
+fprintf(stderr,"Starting result parsing...");
+fflush(stderr);
+while (lineFileNext(lf, &line, NULL))
+    {
+	if(startsWith(">",line))
+		{
+		char *row[3];		
+		chopString(line," ",row,3);
+		sample_id = atoi(row[1]);
+		}
+	else
     {
+		char *row[2];
+		chopString(line,"\t",row,2);
     char *name = row[0];
     char *valStr = row[1];
     if (sameString(valStr, "NA"))
 	continue;
@@ -78,10 +70,12 @@
     av->val = val;
     av->conf = val;
     slAddHead(&avList, av);
     }
+    }
 lineFileClose(&lf);
 slReverse(&avList);
+fprintf(stderr,"done!\n");
 
 hashFree(&hash);
 return avList;  
 }
@@ -144,87 +138,21 @@
 
 if (!writePathwayFile(tmpPathway, pd))
     errAbort("Problem writing pathway file %s\n", tmpPathway);
 
-/*char command[512];
-safef(command, sizeof(command), 
-      "%s/patient_2stage_prep.sh %s", fgDir, tmpPathway);
-*/
-/* Run command (cross fingers!) */
-/*int ret = system(command);
-return (ret == 0);*/
 return TRUE;
 }
 
-// this no longer gets called
-boolean cleanupFactorGraph(struct biAnalysis *ba)
-{
-/* make temporary file containing pathway info */
-char tmpPathway[512];
-safef(tmpPathway, sizeof(tmpPathway), "%s/tmp_%s_path.tab", tmpDir, ba->tableName);
-
-char command[512];
-safef(command, sizeof(command), 
-      "%s/patient_2stage_cleanup.sh %s", fgDir, tmpPathway);
-
-/* Run command (cross fingers!) */
-int ret = system(command);
-return (ret == 0);
-}
-
-
-boolean writeEvidenceFile_OLD(char *filename, struct pathwayData *pd)
-{
-if (!pd)
-    return FALSE;
-if (!pd->entities || !pd->links)
-    return FALSE;
-
-FILE *f = mustOpen(filename, "w");
-if (!f)
-    return FALSE;
-
-struct entities *en, *enList = pd->entities;
-char idStr[128];
-for (en = enList; en; en = en->next)
-    {
-    int id = hashIntValDefault(pd->featureIds, en->name, -1);
-    if (id == -1)
-	continue;
-    safef(idStr, sizeof(idStr), "%d", id);
-
-    struct typeHash *th;
-    for (th = pd->data; th; th = th->next)
-	{
-	struct hashEl *el = hashLookup(th->hash, idStr);
-	if (!el)
-	    continue;
-
-	struct analysisVals *av = el->val;
-	double val = av->conf;
-
-	char *type;
-	if (sameString(th->type, "Expression"))
-	    type = "mRNA";
-	else if (sameString(th->type, "CNV"))
-	    type = "genome";
-	else
-	    continue;
-
-	fprintf(f, "%s\t%s\t%f\n", en->name, type, val);
-	}
-    }
-
-return carefulCloseWarn(&f);
-}
-
 boolean writeEvidenceFile(char *filenamePrefix, struct pathwayData *pd)
 {
 if (!pd)
     return FALSE;
 if (!pd->entities || !pd->links)
     return FALSE;
 
+fprintf(stderr, "Writing evidence file");
+fflush(stderr);
+
 char genomeEvidenceFile[512];
 safef(genomeEvidenceFile, sizeof(genomeEvidenceFile), "%s_genome.tab", filenamePrefix);
 FILE *genomeF = mustOpen(genomeEvidenceFile, "w");
 if (!genomeF)
@@ -240,43 +168,42 @@
 fprintf(genomeF,"id");
 fprintf(mrnaF,"id");
 struct entities *en, *enList = pd->entities;
 char idStr[128];
+// and the header
 for (en = enList; en; en = en->next)
     {
     int id = hashIntValDefault(pd->featureIds, en->name, -1);
     if (id == -1)
 	continue;
     safef(idStr, sizeof(idStr), "%d", id);
 
-    struct typeHash *th;
-    for (th = pd->data; th; th = th->next)
-		{
-		struct hashEl *el = hashLookup(th->hash, idStr);
-		if (!el)
-			continue;
-	
-		char *type;
-		if (sameString(th->type, "Expression"))
 			fprintf(mrnaF, "\t%s", en->name);
-		else if (sameString(th->type, "CNV"))
 			fprintf(genomeF, "\t%s", en->name);
-		else
-			continue;	
-		}
     }
+fprintf(genomeF,"\n");
+fprintf(mrnaF,"\n");
+
+// now write out the data
+struct slPair *sp;
+for (sp = pd->data; sp; sp = sp->next)
+	{
+	struct typeHash *data = sp->val;
+	if (slCount(data) < 2) 
+		continue;  // currently only consider samples with more than one type of evidence.
+	int sample_id = atoi(sp->name);
 
- 	fprintf(genomeF,"\nsample");
-	fprintf(mrnaF,"\nsample");
+	fprintf(genomeF,"%d",sample_id);
+	fprintf(mrnaF,"%d",sample_id);
  for (en = enList; en; en = en->next)
     {
     int id = hashIntValDefault(pd->featureIds, en->name, -1);
     if (id == -1)
 	continue;
     safef(idStr, sizeof(idStr), "%d", id);
 
     struct typeHash *th;
-	for (th = pd->data; th; th = th->next)
+		for (th = data; th; th = th->next)
 		{
 		struct hashEl *el = hashLookup(th->hash, idStr);
 		if (!el)
 			continue;
@@ -294,10 +221,15 @@
 		}
    }
 
 
-fprintf(genomeF,"\n");
-fprintf(mrnaF,"\n");
+	fprintf(genomeF,"\n");
+	fprintf(mrnaF,"\n");
+	fprintf(stderr, ".");
+	fflush(stderr);
+	}
+fprintf(stderr, "done.\n");
+
 boolean gClose = carefulCloseWarn(&genomeF);
 boolean mClose = carefulCloseWarn(&mrnaF);
 
 return (gClose && mClose);
@@ -310,26 +242,17 @@
 FILE *f = mustOpen(filename, "w");
 if (!f)
     return FALSE;
 
-/*fprintf(f, "inference [pathway_match=_pid_66_,method=BP,updates=SEQFIX,tol=1e-9,maxiter=10000,logdomain=0]\n");
+fprintf(f, "inference [pathway_match=_pid_66_,method=BP,updates=SEQFIX,tol=1e-9,maxiter=10000,logdomain=0]\n");
 fprintf(f, "inference [pathway_match=_pid_48_,method=BP,updates=SEQFIX,tol=1e-9,maxiter=10000,logdomain=0]\n");
-fprintf(f, "inference [pathway_match=_pid_3_,method=BP,updates=SEQFIX,tol=1e-9,maxiter=10000,logdomain=0]\n");
-fprintf(f, "inference [pathway_match=_pid_9_,method=BP,updates=SEQFIX,tol=1e-9,maxiter=10000,logdomain=0]\n");
+//fprintf(f, "inference [pathway_match=_pid_3_,method=BP,updates=SEQFIX,tol=1e-9,maxiter=10000,logdomain=0]\n");
+//fprintf(f, "inference [pathway_match=_pid_9_,method=BP,updates=SEQFIX,tol=1e-9,maxiter=10000,logdomain=0]\n");
 fprintf(f, "inference [method=JTREE,updates=HUGIN,verbose=1]\n");
 fprintf(f, "evidence [suffix=_genome.tab,node=genome,disc=-0.451727;0.467580,epsilon=0.01,epsilon0=0.2]\n");
 fprintf(f, "evidence [suffix=_mRNA.tab,node=mRNA,disc=-0.1817733;0.1824913,epsilon=0.01,epsilon0=0.2]\n");
 fprintf(f, "em_step [_mRNA.tab=-obs>,_genome.tab=-obs>]\n");
 fprintf(f, "em [max_iters=0,log_z_tol=0.001]\n");
-*/
-fprintf(f, "inference [pathway_match=_pid_66_,method=BP,updates=SEQFIX,tol=1e-9,maxiter=10000,logdomain=0]\n");
-fprintf(f, "inference [pathway_match=_pid_48_,method=BP,updates=SEQFIX,tol=1e-9,maxiter=10000,logdomain=0]\n");
-fprintf(f, "inference [pathway_match=_pid_3_,method=BP,updates=SEQFIX,tol=1e-9,maxiter=10000,logdomain=0]\n");
-fprintf(f, "inference [pathway_match=_pid_9_,method=BP,updates=SEQFIX,tol=1e-9,maxiter=10000,logdomain=0]\n");
-fprintf(f, "inference [method=JTREE,updates=HUGIN,verbose=1]\n");
-fprintf(f, "evidence [suffix=_genome.tab,node=genome,disc=-0.451727;0.467580,factorParams=0.680434527777778;0.188006489814815;0.131558978703704;0.241894870370370;0.55878737962963;0.199317706481481;0.127232687037037;0.246933030555556;0.625834314814815]\n");
-fprintf(f, "evidence [suffix=_mRNA.tab,node=mRNA,disc=-0.1817733;0.1824913,factorParams=0.6211706;0.250951192592593;0.127878243518519;0.259141574074074;0.488297342592593;0.252561083333333;0.137161049074074;0.213898711111111;0.648940222222222]\n");
-fprintf(f, "em [max_iters=0,log_z_tol=0.001]\n");
 
 return carefulCloseWarn(&f);
 }
 
@@ -364,29 +286,26 @@
 /* build command */
 char tmpOutput[128];
 safef(tmpOutput, sizeof(tmpOutput), "%s/tmp.out", tmpDir);
 
+fprintf(stderr, "Running inference...");
+fflush(stderr);
 char command[512];
 safef(command, sizeof(command), 
       "%s/hgFactorGraph -p %s -c %s -b %s > %s", 
       fgDir, tmpPathway, tmpConfig, tmpEvidence, tmpOutput);
 
-/*safef(command, sizeof(command), 
-      "%s/patient_2stage_exec.sh %s %s \"-d -1.3,1.3\" > %s", 
-      fgDir, tmpPathway, tmpEvidence, tmpOutput);
-*/
-
-
 /* Run command (cross fingers!) */
 int ret = system(command);
 if (ret)
     {
-    fprintf(stderr, "Something went wrong!!");
+    fprintf(stderr, "Something went wrong!!\n");
     return NULL;
     }
+fprintf(stderr, "Done!\n");
 
 /* read in data from output */
-struct analysisVals *avList = readAnalysisValsFromFile(tmpOutput, pd->entities, sample_id); 
+struct analysisVals *avList = readAnalysisValsFromFile(tmpOutput, pd->entities, -1); 
 return avList;
 }
 
 /* Pipeline Stuff */
@@ -415,8 +334,15 @@
 	{
 	fprintf(stderr, "problem with prep, skipping pathway.\n");
 	continue;
 	}
+
+	// redesigned to handle all data
+	pd->data = spData;
+	pd->featureIds = featureHash;
+	struct analysisVals *avList = ba->analyze(ba, pd, -1, feature_id);
+	
+    /* old, single value method
     struct analysisVals *av, *avList = NULL;
     for (sp = spData; sp; sp = sp->next)
 	{
 	pd->data = sp->val;
@@ -430,9 +356,9 @@
 	avList = slCat(avList, av);
 	fprintf(stderr, ".");
 	fflush(stderr);
 	}
-    fprintf(stderr, "\n");
+    fprintf(stderr, "\n");*/
     
     struct pathwayVals *newPvList = convertToPathwayVals(avList, feature_id);
     pvList = slCat(pvList, newPvList);
     count++;