src/hg/instinct/bioInt2/makeClusterFiles.c 1.1

1.1 2009/06/15 17:45:23 jsanborn
initial commit
Index: src/hg/instinct/bioInt2/makeClusterFiles.c
===================================================================
RCS file: src/hg/instinct/bioInt2/makeClusterFiles.c
diff -N src/hg/instinct/bioInt2/makeClusterFiles.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/hg/instinct/bioInt2/makeClusterFiles.c	15 Jun 2009 17:45:23 -0000	1.1
@@ -0,0 +1,974 @@
+/* makeClusterFiles.c -- write pathway and evidence files (observed and null) for the pathway pipeline */
+#include "common.h"
+#include "linefile.h"
+#include "hash.h"
+#include "options.h"
+#include "jksql.h"
+#include "hPrint.h"
+#include "hdb.h"  
+#include "dystring.h"
+#include "bioIntDb.h"
+#include "bioIntDriver.h"
+#include "cprob.h"
+#include "hgStatsLib.h"
+
+/* Sentinel stored in dataTuple.cnv / dataTuple.exp when no value was found. */
+#define BAD_VAL 999999.9
+/* Default number of null iterations written per batch of evidence files. */
+#define NUM_IN_BATCH 500
+
+void usage()
+/* Report the command-line synopsis through errAbort. */
+{
+errAbort("makeClusterFiles \n"
+         "usage:\n"
+         "  makeClusterFiles db pathway_name directory cohort_id num_iterations\n");
+}
+
+/* Directory names.  fgDir is not referenced anywhere else in the visible code.
+ * tmpDir is overwritten in main() with the directory command-line argument and
+ * is the leading path component of every file this program writes. */
+char *fgDir = "factorGraph";
+char *tmpDir = "tmpNull";
+
+/* One (CNV, expression) observation pair; either member holds BAD_VAL when
+ * that kind of evidence was not found. */
+struct dataTuple {
+    struct dataTuple *next;  /* next element in singly-linked list */
+    double cnv;              /* value taken from a "CNV" typeHash, or BAD_VAL */
+    double exp;              /* value taken from an "Expression" typeHash, or BAD_VAL */
+};
+
+/* Gene-level analysis functions */
+/* One link between two named entities of a pathway.  The three strings are
+ * cloned when the list is built and released by linksFree(). */
+struct links {
+    struct links *next;   /* next element in singly-linked list */
+    char *parent_name;    /* entity name of the parent end of the link */
+    char *child_name;     /* entity name of the child end of the link */
+    char *link_type;      /* link type name, e.g. '->' */
+};
+
+/* One entity belonging to a pathway.  type and name are cloned strings
+ * released by entitiesFree(). */
+struct entities {
+    struct entities *next;  /* next element in singly-linked list */
+    int id;                 /* entity id */
+    char *type;             /* entity type (protein, abstract, ...) */
+    char *name;             /* entity name; also used as key into feature-id hashes */
+};
+
+/* Everything known about one pathway plus scratch pointers used while
+ * writing its files. */
+struct pathwayData {
+    int id;                     /* pathway id, used in output file names */
+    struct entities *entities;  /* entities in this pathway (owned) */
+    struct links *links;        /* links in this pathway (owned) */
+    
+    /* Not owned.  Set by the callers to either a struct typeHash list
+     * (observed data for one sample) or a hash of dataTuples (null data). */
+    void *data;
+    struct hash *featureIds;    /* feature name -> integer id; not freed by pathwayDataFree */
+};
+
+void dataTupleFree(struct dataTuple **pEl)
+/* Release a single dataTuple through freez.  Does nothing when *pEl is
+ * already NULL. */
+{
+if (*pEl != NULL)
+    freez(pEl);
+}
+
+void dataTupleFreeList(struct dataTuple **pList)
+/* Free every element of a dataTuple list and set *pList to NULL. */
+{
+struct dataTuple *cur = *pList;
+while (cur != NULL)
+    {
+    struct dataTuple *following = cur->next;  /* read before cur is released */
+    dataTupleFree(&cur);
+    cur = following;
+    }
+*pList = NULL;
+}  
+
+struct analysisVals *readAnalysisValsFromFile(char *filename, 
+                                              struct entities *enList, int sample_id)
+/* Read a two-column tab-separated file of (entity name, value) rows and
+ * return an analysisVals list in file order.  Each row whose name matches
+ * an entity in enList yields one element with feature_id set to that entity's
+ * id and both val and conf set to the parsed value; rows with unknown names
+ * are reported on stderr and skipped. */
+{
+struct lineFile *lf = lineFileOpen(filename, TRUE);
+if (lf == NULL)
+    errAbort("File does not exist.");
+
+/* Build an entity name -> entity id lookup. */
+struct hash *idByName = hashNew(0);
+struct entities *en = enList;
+while (en != NULL)
+    {
+    hashAddInt(idByName, en->name, en->id);
+    en = en->next;
+    }
+
+struct analysisVals *result = NULL;
+char *row[2];
+while (lineFileRowTab(lf, row))
+    {
+    int entity_id = hashIntValDefault(idByName, row[0], -1);
+    if (entity_id != -1)
+        {
+        struct analysisVals *av;
+        double parsed = atof(row[1]);
+        AllocVar(av);
+        av->sample_id = sample_id;
+        av->feature_id = entity_id;
+        av->val = parsed;
+        av->conf = parsed;
+        slAddHead(&result, av);
+        }
+    else
+        fprintf(stderr, "entity %s not found\n", row[0]);
+    }
+lineFileClose(&lf);
+slReverse(&result);   /* slAddHead built the list back to front */
+
+hashFree(&idByName);
+return result;  
+}
+
+struct pathwayVals *convertToPathwayVals(struct analysisVals *avList, int pathway_id)
+/* Return a newly allocated pathwayVals list with one element per element of
+ * avList, in the same order, copying sample_id, feature_id, val and conf and
+ * tagging each with pathway_id.  avList itself is left untouched. */
+{
+struct pathwayVals *converted = NULL;
+struct analysisVals *src = avList;
+
+while (src != NULL)
+    {
+    struct pathwayVals *pv;
+    AllocVar(pv);
+    pv->pathway_id = pathway_id;
+    pv->sample_id  = src->sample_id;
+    pv->feature_id = src->feature_id;
+    pv->val        = src->val;
+    pv->conf       = src->conf;
+    slAddHead(&converted, pv);
+    src = src->next;
+    }
+slReverse(&converted);   /* restore input order */
+
+return converted;
+}
+
+boolean writePathwayFile(char *filename, struct pathwayData *pd)
+/* Write the pathway description to filename: one "type<TAB>name" line per
+ * entity followed by one "parent<TAB>child<TAB>linkType" line per link.
+ * Returns FALSE without writing anything when pd is NULL or has no entities
+ * or no links; otherwise returns the result of carefulCloseWarn. */
+{
+if (pd == NULL || pd->entities == NULL || pd->links == NULL)
+    return FALSE;
+
+FILE *out = mustOpen(filename, "w");
+if (out == NULL)
+    return FALSE;
+
+struct entities *entity = pd->entities;
+while (entity != NULL)
+    {
+    fprintf(out, "%s\t%s\n", entity->type, entity->name);
+    entity = entity->next;
+    }
+
+struct links *link = pd->links;
+while (link != NULL)
+    {
+    fprintf(out, "%s\t%s\t%s\n", link->parent_name, link->child_name, link->link_type);
+    link = link->next;
+    }
+
+return carefulCloseWarn(&out);
+}
+
+boolean prepFactorGraph(char *tableName, void *data)
+/* Write the pathway description file
+ * <tmpDir>/<tableName>_pid_<pathwayId>_pathway.tab for the pathwayData
+ * passed in data.  Returns FALSE when data is NULL, aborts through errAbort
+ * when the file cannot be written, and returns TRUE otherwise. */
+{
+struct pathwayData *pd = data;
+if (pd == NULL)
+    return FALSE;
+
+char pathwayPath[512];
+safef(pathwayPath, sizeof(pathwayPath), "%s/%s_pid_%d_pathway.tab", 
+      tmpDir, tableName, pd->id); 
+
+boolean written = writePathwayFile(pathwayPath, pd);
+if (!written)
+    errAbort("Problem writing pathway file %s\n", pathwayPath);
+
+return TRUE;
+}
+
+boolean prepEvidenceFiles(char *prefix, void *data)
+/* Create (mode "w") <prefix>_genome.tab and <prefix>_mRNA.tab and write the
+ * same header row to both: "id" followed by the name of every entity of the
+ * pathway that has an entry in pd->featureIds.  data is a struct pathwayData *.
+ * Returns FALSE when data is NULL, otherwise TRUE only if carefulCloseWarn
+ * succeeds for both files.
+ * NOTE(review): the separating tab depends only on whether the entity has a
+ * successor, so the row ends in a tab when the trailing entities are skipped.
+ * The row writers use the same rule, so header and rows stay aligned. */
+{
+struct pathwayData *pd = data;
+if (pd == NULL)
+    return FALSE;
+
+char path[1024];
+safef(path, sizeof(path), "%s_genome.tab", prefix);
+FILE *cnvF = mustOpen(path, "w");
+if (cnvF == NULL)
+    return FALSE;
+
+safef(path, sizeof(path), "%s_mRNA.tab", prefix);
+FILE *expF = mustOpen(path, "w");
+if (expF == NULL)
+    return FALSE;
+
+fputs("id\t", cnvF);
+fputs("id\t", expF);
+
+struct entities *entity;
+for (entity = pd->entities; entity != NULL; entity = entity->next)
+    {
+    /* Only entities known to the feature hash get a column. */
+    if (hashIntValDefault(pd->featureIds, entity->name, -1) == -1)
+        continue;
+
+    fprintf(cnvF, "%s", entity->name);
+    fprintf(expF, "%s", entity->name);
+    if (entity->next != NULL)
+        {
+        fputc('\t', cnvF);
+        fputc('\t', expF);
+        }
+    }
+fputc('\n', cnvF);
+fputc('\n', expF);
+
+boolean expOk = carefulCloseWarn(&expF);
+boolean cnvOk = carefulCloseWarn(&cnvF);
+return (expOk && cnvOk);
+}
+
+boolean writeEvidenceFile(char *prefix, char *sampleName, struct pathwayData *pd)
+/* Append one row for sampleName to <prefix>_genome.tab (CNV) and
+ * <prefix>_mRNA.tab (expression).  The row starts with sampleName and has one
+ * cell per pathway entity found in pd->featureIds: the conf value looked up
+ * (by the feature id rendered as a decimal string) in the struct typeHash
+ * list held in pd->data, or "NA" when no typeHash of that type has the id.
+ * Exactly one value is written per cell; if several typeHash entries of the
+ * same type contain the feature, the last one in the list wins (the same
+ * rule getAllDataTuples applies).  Previously every match was printed, which
+ * ran several numbers together in a single cell.
+ * Returns FALSE when pd is NULL or has no entities or no links, otherwise
+ * TRUE only if carefulCloseWarn succeeds for both files.
+ * NOTE(review): the tab separator follows the same "has a successor" rule as
+ * prepEvidenceFiles so that rows line up with the header. */
+{
+if (!pd)
+    return FALSE;
+if (!pd->entities || !pd->links)
+    return FALSE;
+
+char cnvFile[1024];
+safef(cnvFile, sizeof(cnvFile), "%s_genome.tab", prefix);
+FILE *cnvF = mustOpen(cnvFile, "a");
+if (!cnvF)
+    return FALSE;
+
+char expFile[1024];
+safef(expFile, sizeof(expFile), "%s_mRNA.tab", prefix);
+FILE *expF = mustOpen(expFile, "a");
+if (!expF)
+    return FALSE;
+
+fprintf(cnvF, "%s\t", sampleName); 
+fprintf(expF, "%s\t", sampleName);
+ 
+struct entities *en, *enList = pd->entities;
+char idStr[128];
+for (en = enList; en; en = en->next)
+    {
+    int id = hashIntValDefault(pd->featureIds, en->name, -1);
+    if (id == -1)
+        continue;
+
+    /* The per-type hashes are keyed by the decimal feature id. */
+    safef(idStr, sizeof(idStr), "%d", id);
+
+    boolean hadExp = FALSE;
+    boolean hadCNV = FALSE;
+    double expVal = 0.0;
+    double cnvVal = 0.0;
+
+    /* Collect first, print afterwards, so each cell gets a single value. */
+    struct typeHash *th;
+    for (th = pd->data; th; th = th->next)
+        {
+        struct hashEl *el = hashLookup(th->hash, idStr);
+        if (!el)
+            continue;
+
+        struct analysisVals *av = el->val;
+        if (sameString(th->type, "Expression"))
+            {
+            expVal = av->conf;
+            hadExp = TRUE;
+            }
+        else if (sameString(th->type, "CNV"))
+            {
+            cnvVal = av->conf;
+            hadCNV = TRUE;
+            }
+        }
+
+    if (hadExp)
+        fprintf(expF, "%f", expVal);
+    else
+        fprintf(expF, "NA");
+
+    if (hadCNV)
+        fprintf(cnvF, "%f", cnvVal);
+    else
+        fprintf(cnvF, "NA");
+
+    if (en->next)
+        {
+        fprintf(cnvF, "\t");
+        fprintf(expF, "\t");
+        }
+    }
+
+fprintf(cnvF, "\n");
+fprintf(expF, "\n");
+
+boolean closedExp = carefulCloseWarn(&expF);
+boolean closedCNV = carefulCloseWarn(&cnvF);
+
+return (closedExp && closedCNV);
+}
+
+boolean writeNullEvidenceFile(char *prefix, char *sampleName, struct pathwayData *pd)
+/* Append one row of randomly drawn null data for sampleName to
+ * <prefix>_genome.tab (CNV) and <prefix>_mRNA.tab (expression).
+ * pd->data must be a hash mapping decimal index strings "0".."n-1" to
+ * struct dataTuple (as built by hashDataTuples).  For every pathway entity
+ * found in pd->featureIds one tuple is drawn with rand() and its cnv / exp
+ * members are written, with "NA" in place of BAD_VAL.
+ * Returns FALSE, without touching the files, when pd is NULL, has no entities
+ * or links, or when the tuple hash is NULL or empty (which previously caused
+ * a NULL dereference or a modulo-by-zero).  Otherwise returns TRUE only if
+ * carefulCloseWarn succeeds for both files.
+ * NOTE(review): the tab separator follows the same "has a successor" rule as
+ * prepEvidenceFiles so that rows line up with the header. */
+{
+if (!pd)
+    return FALSE;
+if (!pd->entities || !pd->links)
+    return FALSE;
+
+/* Validate the sampling pool before creating or appending to any file. */
+struct hash *dtHash = pd->data;
+if (!dtHash)
+    return FALSE;
+int numTuples = hashNumEntries(dtHash);
+if (numTuples <= 0)
+    return FALSE;
+
+char cnvFile[1024];
+safef(cnvFile, sizeof(cnvFile), "%s_genome.tab", prefix);
+FILE *cnvF = mustOpen(cnvFile, "a");
+if (!cnvF)
+    return FALSE;
+
+char expFile[1024];
+safef(expFile, sizeof(expFile), "%s_mRNA.tab", prefix);
+FILE *expF = mustOpen(expFile, "a");
+if (!expF)
+    return FALSE;
+
+fprintf(cnvF, "%s\t", sampleName); 
+fprintf(expF, "%s\t", sampleName);
+
+struct entities *en, *enList = pd->entities;
+
+for (en = enList; en; en = en->next)
+    {
+    int id = hashIntValDefault(pd->featureIds, en->name, -1);
+    if (id == -1)
+        continue;
+
+    int randIndex = rand() % numTuples;
+    char randStr[128];
+    safef(randStr, sizeof(randStr), "%d", randIndex);
+
+    struct hashEl *el = hashLookup(dtHash, randStr);
+    struct dataTuple *dt = NULL;
+    if (el)
+        dt = el->val;
+    else
+        fprintf(stderr, "uh-oh!");
+
+    /* A missing tuple still produces a cell ("NA") so that the columns
+     * stay aligned with the header instead of shifting left. */
+    if (dt && dt->cnv != BAD_VAL)
+        fprintf(cnvF, "%f", dt->cnv);
+    else
+        fprintf(cnvF, "NA");
+
+    if (dt && dt->exp != BAD_VAL)
+        fprintf(expF, "%f", dt->exp);
+    else
+        fprintf(expF, "NA");
+
+    if (en->next)
+        {
+        fprintf(cnvF, "\t");
+        fprintf(expF, "\t");
+        }
+    }
+
+fprintf(cnvF, "\n");
+fprintf(expF, "\n");
+
+boolean closeExp = carefulCloseWarn(&expF);
+boolean closeCNV = carefulCloseWarn(&cnvF);
+
+return (closeExp && closeCNV);
+}
+
+void factorGraph(char *tableName, void *data, int sample_id, int feature_id)
+/* Append the observed evidence row "sample_<sample_id>" for the pathway in
+ * data (a struct pathwayData *) to the evidence files whose prefix is
+ * <tmpDir>/<tableName>_pid_<pathwayId>.  Returns silently when data is NULL
+ * and aborts through errAbort when the row cannot be written.  feature_id is
+ * not used. */
+{ 
+struct pathwayData *pd = data;
+if (pd == NULL)
+    return;
+
+char evidencePrefix[512];
+safef(evidencePrefix, sizeof(evidencePrefix), "%s/%s_pid_%d", 
+      tmpDir, tableName, pd->id); 
+
+char rowLabel[512];
+safef(rowLabel, sizeof(rowLabel), "sample_%d", sample_id);
+
+boolean written = writeEvidenceFile(evidencePrefix, rowLabel, pd);
+if (!written)
+    errAbort("Problem writing evidence file %s\n", evidencePrefix);
+}
+
+void factorGraphNull(char *tableName, void *data, int sample_id, int batchNum, 
+                     char *nullType)
+/* Append the null evidence row "iter_<sample_id>" for the pathway in data
+ * (a struct pathwayData *) to the batch evidence files.  The file prefix is
+ * <tmpDir>/na_<tableName>_pid_<pathwayId>_batch_<batchNum> when nullType is
+ * "allData" and the same with "nw_" in place of "na_" for any other
+ * nullType.  Aborts through errAbort when the row cannot be written. */
+{ 
+struct pathwayData *pd = data;
+
+/* Pick the prefix pattern for this kind of null distribution. */
+char *prefixFormat = sameString(nullType, "allData")
+    ? "%s/na_%s_pid_%d_batch_%d"
+    : "%s/nw_%s_pid_%d_batch_%d";
+
+char evidencePrefix[512];
+safef(evidencePrefix, sizeof(evidencePrefix), prefixFormat,
+      tmpDir, tableName, pd->id, batchNum); 
+
+char rowLabel[512];
+safef(rowLabel, sizeof(rowLabel), "iter_%d", sample_id);
+
+boolean written = writeNullEvidenceFile(evidencePrefix, rowLabel, pd);
+if (!written)
+    errAbort("Problem writing evidence file %s\n", evidencePrefix);
+}
+
+/* Pipeline Stuff */
+struct pathwayVals *pathwayLevelAnalysis(struct sqlConnection *biConn, char *tableName, 
+                                         struct slPair *spData, struct slPair *spPathways)
+/* For every pathway in spPathways (name -> struct pathwayData *) write the
+ * pathway description file and the observed evidence files with prefix
+ * <tmpDir>/<tableName>_pid_<pathwayId>: a header row, then one row per
+ * sample in spData (sample id string -> per-sample data list) that has at
+ * least two list elements.  Progress is reported on stderr.  The results are
+ * the files on disk; the function always returns NULL.
+ * Side effect: every pathwayData gets its featureIds set to a feature-name
+ * hash created here (never freed) and its data left pointing at the last
+ * sample visited. */
+{
+fprintf(stdout, "starting geneset analysis.\n");
+
+struct hash *featureHash = createIdHash(biConn, AF_TABLE, "feature_name");
+
+struct slPair *pathway;
+for (pathway = spPathways; pathway != NULL; pathway = pathway->next)
+    {
+    struct pathwayData *pd = pathway->val;
+    pd->featureIds = featureHash; 
+
+    fprintf(stderr, "prepping pathway %s:\n", pathway->name);
+    if (!prepFactorGraph(tableName, pd))
+        {
+        fprintf(stderr, "problem with prep, skipping pathway.\n");
+        continue;
+        }
+
+    char evidencePrefix[512];
+    safef(evidencePrefix, sizeof(evidencePrefix), "%s/%s_pid_%d", 
+          tmpDir, tableName, pd->id);
+    if (!prepEvidenceFiles(evidencePrefix, pd))
+        {
+        fprintf(stderr, "problem with evidence prep, skipping pathway.\n");
+        continue;
+        }
+
+    struct slPair *sample;
+    for (sample = spData; sample != NULL; sample = sample->next)
+        {
+        pd->data = sample->val;
+        /* currently only consider samples with more than one type of evidence. */
+        if (slCount(pd->data) >= 2)
+            {
+            factorGraph(tableName, pd, atoi(sample->name), -1);
+            fprintf(stderr, ".");
+            fflush(stderr);
+            }
+        }
+    fprintf(stderr, "\n");
+    }
+
+fprintf(stderr, "\n");
+
+return NULL; 
+}            
+
+struct slName *getAnalysisFeatureNames(struct sqlConnection *biConn, char *tableName, 
+                                       char *type)
+/* Return the feature_name column of tableName for rows whose type column
+ * equals type, as the list produced by sqlQuickList. */
+{
+char sql[256];
+safef(sql, sizeof(sql), 
+      "select feature_name from %s where type = \"%s\"",
+      tableName, type);
+
+struct slName *names = sqlQuickList(biConn, sql);
+return names;
+}
+
+
+struct dataTuple *getAllDataTuples(struct sqlConnection *biConn, struct hash *featureHash, 
+                                   struct slPair *spData)
+/* Build the pool of (cnv, exp) tuples used for the "allData" null
+ * distribution.  The features considered are all rows of AF_TABLE with type
+ * "gene" that have an id in featureHash.  For every sample in spData whose
+ * struct typeHash list has at least two elements, and every such feature, a
+ * tuple is created from the conf values of the "CNV" and "Expression"
+ * hashes (BAD_VAL where absent); tuples with neither value are dropped.
+ * Returns the list (NULL when empty); the caller owns it and can release it
+ * with dataTupleFreeList.  The temporary feature-id list is freed before
+ * returning (it used to be leaked). */
+{
+struct slName *sl, *featureNames = getAnalysisFeatureNames(biConn, AF_TABLE, "gene");
+
+/* Translate feature names to decimal id strings, the key format of the
+ * per-type hashes. */
+char idStr[256];
+struct slName *featureIds = NULL;
+for (sl = featureNames; sl; sl = sl->next)
+    {
+    int id = hashIntValDefault(featureHash, sl->name, -1);
+    if (id == -1)
+        continue;
+    
+    safef(idStr, sizeof(idStr), "%d", id);
+    slNameAddHead(&featureIds, idStr);
+    }
+
+slNameFreeList(&featureNames);
+
+struct slPair *sp;
+struct dataTuple *dt, *dtList = NULL;
+for (sp = spData; sp; sp = sp->next)
+    {
+    struct typeHash *th, *thList = sp->val;
+    if (slCount(thList) < 2) 
+        continue;  // currently only consider samples with more than one type of evidence.
+    
+    for (sl = featureIds; sl; sl = sl->next)
+        {
+        AllocVar(dt);
+        dt->cnv = BAD_VAL;
+        dt->exp = BAD_VAL;
+        for (th = thList; th; th = th->next)
+            {
+            struct hashEl *el = hashLookup(th->hash, sl->name);
+            if (!el)
+                continue;
+            
+            struct analysisVals *av = el->val;
+            if (sameString(th->type, "Expression"))
+                dt->exp = av->conf;
+            else if (sameString(th->type, "CNV"))
+                dt->cnv = av->conf;
+            }
+        if (dt->cnv == BAD_VAL && dt->exp == BAD_VAL)
+            dataTupleFree(&dt);
+        else
+            slAddHead(&dtList, dt);
+        }
+    }
+
+slNameFreeList(&featureIds);
+
+return dtList;
+}
+
+struct dataTuple *getPathwayDataTuples(struct sqlConnection *biConn, struct hash *featureHash, 
+                                       struct slPair *spData, struct slPair *spPathway)
+/* Build the pool of (cnv, exp) tuples used for the "withinPathway" null
+ * distribution.  The features considered are the entities of the pathway in
+ * spPathway->val that have an id in featureHash.  For every sample in spData
+ * whose struct typeHash list has at least two elements, and every such
+ * feature, a tuple is created from the conf values of the "CNV" and
+ * "Expression" hashes (BAD_VAL where absent); tuples with neither value are
+ * dropped.  Returns the list (NULL when empty); the caller owns it and can
+ * release it with dataTupleFreeList.  The temporary feature-id list is freed
+ * before returning (it used to be leaked).
+ * NOTE(review): only the first element of spPathway is examined, so when a
+ * whole pathway list is passed the pool comes from its first pathway only --
+ * confirm that this is intended.  biConn is not used. */
+{
+struct pathwayData *pd = spPathway->val;
+struct entities *en, *enList = pd->entities;
+struct slName *featureIds = NULL;
+char idStr[128];
+for (en = enList; en; en = en->next)
+    {
+    int id = hashIntValDefault(featureHash, en->name, -1);
+    if (id == -1)
+        continue;
+    
+    /* Decimal id strings are the key format of the per-type hashes. */
+    safef(idStr, sizeof(idStr), "%d", id);
+    slNameAddHead(&featureIds, idStr);
+    }
+
+struct slPair *sp;
+struct dataTuple *dt, *dtList = NULL;
+for (sp = spData; sp; sp = sp->next)
+    {
+    struct typeHash *th, *thList = sp->val;
+    if (slCount(thList) < 2) 
+        continue;  // currently only consider samples with more than one type of evidence.
+    
+    struct slName *sl;
+    for (sl = featureIds; sl; sl = sl->next)
+        {
+        AllocVar(dt);
+        dt->cnv = BAD_VAL;
+        dt->exp = BAD_VAL;
+        for (th = thList; th; th = th->next)
+            {
+            struct hashEl *el = hashLookup(th->hash, sl->name);
+            if (!el)
+                continue;
+            
+            struct analysisVals *av = el->val;
+            if (sameString(th->type, "Expression"))
+                dt->exp = av->conf;
+            else if (sameString(th->type, "CNV"))
+                dt->cnv = av->conf;
+            }
+        if (dt->cnv == BAD_VAL && dt->exp == BAD_VAL)
+            dataTupleFree(&dt);
+        else
+            slAddHead(&dtList, dt);
+        }
+    }
+
+slNameFreeList(&featureIds);
+
+return dtList;
+}
+
+struct hash *hashDataTuples(struct dataTuple *dtList)
+/* Return a new hash that maps the decimal position ("0", "1", ...) of each
+ * element of dtList to that element, or NULL when dtList is empty.  The
+ * tuples are referenced, not copied. */
+{
+if (dtList == NULL)
+    return NULL;
+
+struct hash *byIndex = hashNew(0);
+
+char key[128];
+int position;
+struct dataTuple *tuple;
+for (tuple = dtList, position = 0; tuple != NULL; tuple = tuple->next, position++)
+    {
+    safef(key, sizeof(key), "%d", position);
+    hashAdd(byIndex, key, tuple);
+    }
+
+return byIndex;
+}
+
+/* Pipeline Stuff */
+struct pathwayVals *pathwayNullAnalysis(struct sqlConnection *biConn, char *tableName, 
+                                        struct slPair *spData, struct slPair *spPathways,
+                                        int numIters, char *nullType)
+/* Write null-distribution evidence files for every pathway in spPathways.
+ * nullType selects the sampling pool: "allData" (getAllDataTuples) or
+ * "withinPathway" (getPathwayDataTuples); anything else aborts.  For each
+ * pathway the pathway description file is rewritten and numIters random rows
+ * are appended to batch files (see factorGraphNull for the naming), starting
+ * a new batch with a fresh header every numInBatch iterations.
+ * The results are the files on disk; the function always returns NULL.
+ *
+ * Changes from the original behavior:
+ *  - aborts with a message when iterations are requested but the sampling
+ *    pool is empty (this used to crash further down);
+ *  - a failed batch header now really skips the rest of the pathway, as the
+ *    message says, instead of only skipping one iteration;
+ *  - the tuple list and its index hash are released before returning, and
+ *    pd->data pointers that referenced the hash are reset to NULL.
+ * The feature-name hash stored in pd->featureIds is still not freed. */
+{
+fprintf(stderr, "starting null analysis analysis.\n");
+
+struct hash *featureHash = createIdHash(biConn, AF_TABLE, "feature_name");
+
+struct dataTuple *dtList = NULL;
+if (sameString(nullType, "allData"))
+    dtList = getAllDataTuples(biConn, featureHash, spData);
+else if (sameString(nullType, "withinPathway"))
+    dtList = getPathwayDataTuples(biConn, featureHash, spData, spPathways);
+else
+    errAbort("%s not supported", nullType);
+
+fprintf(stderr, "hashing data tuples...\n");
+struct hash *dtHash = hashDataTuples(dtList);
+if (dtHash == NULL && numIters > 0)
+    errAbort("no data tuples available for null type %s", nullType);
+
+struct slPair *pa;
+for (pa = spPathways; pa; pa = pa->next)
+    {
+    struct pathwayData *pd = pa->val;
+    pd->featureIds = featureHash;
+
+    fprintf(stderr, "prepping pathway %s:\n", pa->name);
+    if (!prepFactorGraph(tableName, pd))
+        {
+        fprintf(stderr, "problem with prep, skipping pathway.\n");
+        continue;
+        }
+
+    pd->data = dtHash;
+    
+    // Tune batch size to get around ~20 minutes per batch.
+    // .. Pathways listed below take forever.
+    int numInBatch = 0;
+    if (sameString(pa->name, "LPA receptor mediated events"))
+        numInBatch = 100;
+    else if (sameString(pa->name, "PDGFR-beta signaling pathway"))
+        numInBatch = 250;
+    else if (sameString(pa->name, "TCGA08_rtk_signaling"))
+        numInBatch = 250;
+    else if (sameString(pa->name, "p38 MAPK signaling pathway"))
+        numInBatch = 250;
+    else if (sameString(pa->name, "Canonical Wnt signaling pathway"))
+        numInBatch = 250;
+    else if (sameString(pa->name, "RXR and RAR heterodimerization with other nuclear receptor"))
+        numInBatch = 250;
+    else
+        numInBatch = NUM_IN_BATCH;  // default batch size
+    
+    int i;
+    int batchNum = 0;
+    for (i = 0; i < numIters; i++)
+        {
+        if ((i % numInBatch) == 0)
+            {
+            /* First iteration of a batch: create its files and header. */
+            char prefix[512];
+            batchNum++;
+            if (sameString(nullType, "allData"))
+                {
+                safef(prefix, sizeof(prefix),
+                      "%s/na_%s_pid_%d_batch_%d",
+                      tmpDir, tableName, pd->id, batchNum);
+                }
+            else
+                {
+                safef(prefix, sizeof(prefix),
+                      "%s/nw_%s_pid_%d_batch_%d",
+                      tmpDir, tableName, pd->id, batchNum);
+                } 
+            
+            if (!prepEvidenceFiles(prefix, pd))
+                {
+                fprintf(stderr, "problem with evidence prep, skipping pathway.\n");
+                break;   /* abandon the remaining iterations of this pathway */
+                }
+            }
+        factorGraphNull(tableName, pd, i, batchNum, nullType);
+        fprintf(stderr, ".");
+        fflush(stderr);
+        }
+    fprintf(stderr, "\n");
+    }
+
+fprintf(stderr, "\n");
+
+/* Drop references to the pool before releasing it. */
+if (dtHash != NULL)
+    {
+    for (pa = spPathways; pa; pa = pa->next)
+        {
+        struct pathwayData *pd = pa->val;
+        if (pd->data == dtHash)
+            pd->data = NULL;
+        }
+    hashFree(&dtHash);          /* frees the index only, not the tuples */
+    }
+dataTupleFreeList(&dtList);
+
+return NULL; 
+}            
+
+void entitiesFree(struct entities **pEl)
+/* Free one entities element together with its type and name strings.
+ * Does nothing when *pEl is NULL. */
+{
+struct entities *entity = *pEl;
+if (entity != NULL)
+    {
+    freeMem(entity->type);
+    freeMem(entity->name);
+    freez(pEl);
+    }
+}
+
+void entitiesFreeList(struct entities **pList)
+/* Free a list of dynamically allocated entities and set *pList to NULL. */
+{
+struct entities *cur = *pList;
+while (cur != NULL)
+    {
+    struct entities *following = cur->next;  /* read before cur is released */
+    entitiesFree(&cur);
+    cur = following;
+    }
+*pList = NULL;
+}    
+
+
+void linksFree(struct links **pEl)
+/* Free one links element together with its three strings.  Does nothing
+ * when *pEl is NULL. */
+{
+struct links *link = *pEl;
+if (link != NULL)
+    {
+    freeMem(link->parent_name);
+    freeMem(link->child_name);
+    freeMem(link->link_type);
+    freez(pEl);
+    }
+}
+
+void linksFreeList(struct links **pList)
+/* Free a list of dynamically allocated links and set *pList to NULL. */
+{
+struct links *cur = *pList;
+while (cur != NULL)
+    {
+    struct links *following = cur->next;  /* read before cur is released */
+    linksFree(&cur);
+    cur = following;
+    }
+*pList = NULL;
+}    
+
+
+void pathwayDataFree(struct pathwayData **pEl)
+/* Free a pathwayData together with its links and entities lists.  The data
+ * and featureIds members are not touched.  Does nothing when *pEl is NULL. */
+{
+struct pathwayData *pathway = *pEl;
+if (pathway != NULL)
+    {
+    linksFreeList(&pathway->links);
+    entitiesFreeList(&pathway->entities);
+    freez(pEl);
+    }
+}
+
+void slPairPathwayFree(struct slPair **pEl)
+/* Free one slPair whose val is a struct pathwayData *: the name string, the
+ * pathwayData (via pathwayDataFree) and the pair itself.  Does nothing when
+ * *pEl is NULL. */
+{
+struct slPair *pair = *pEl;
+if (pair == NULL)
+    return;
+
+freeMem(pair->name);
+
+struct pathwayData *payload = pair->val;
+pathwayDataFree(&payload);
+
+freez(pEl);
+}
+
+void slPairPathwayFreeList(struct slPair **pList)
+/* Free a list of slPairs holding pathwayData values and set *pList to NULL. */
+{
+struct slPair *cur = *pList;
+while (cur != NULL)
+    {
+    struct slPair *following = cur->next;  /* read before cur is released */
+    slPairPathwayFree(&cur);
+    cur = following;
+    }
+*pList = NULL;
+} 
+    
+struct slPair *getPathwaysByName(struct sqlConnection *biConn, char *pathwayName)
+/* Load pathways from the database and return them as an slPair list of
+ * pathway name -> struct pathwayData *, in the order each pathway name first
+ * appears in the entity query.  With pathwayName == NULL every pathway is
+ * loaded, otherwise only the one whose pathway_name equals pathwayName.
+ * Each pathwayData gets its id, entities and links filled in; data and
+ * featureIds are left NULL.  The entities and links lists are built with
+ * slAddHead and not reversed, so they are in reverse query order.
+ * Free the result with slPairPathwayFreeList.
+ * NOTE(review): pathwayName is spliced into the SQL text unescaped. */
+{
+char query[2048];
+
+/* First query: one row per (pathway, entity). */
+if (!pathwayName)
+    safef(query, sizeof(query), 
+	  "select %s.pathway_id,%s.pathway_name,entity_id,entity_name,entity_type from %s "
+	  "join %s on %s.pathway_id=%s.pathway_id", // a join across a few tables
+	  EN_TABLE, EP_TABLE, EN_TABLE, 
+	  EP_TABLE, EN_TABLE, EP_TABLE);
+else
+    safef(query, sizeof(query), 
+	  "select %s.pathway_id,%s.pathway_name,entity_id,entity_name,entity_type from %s "
+	  "join %s on %s.pathway_id=%s.pathway_id where %s.pathway_name = \"%s\"", 
+	  EN_TABLE, EP_TABLE, EN_TABLE, 
+	  EP_TABLE, EN_TABLE, EP_TABLE, 
+	  EP_TABLE, pathwayName);
+
+struct sqlResult *sr = sqlGetResult(biConn, query);
+char **row = NULL;
+
+/* pathway name -> slPair, so rows of the same pathway share one pathwayData */
+struct hash *hash = hashNew(0);
+
+struct pathwayData *pd;
+struct slPair *sp, *spList = NULL;
+while ((row = sqlNextRow(sr)) != NULL)
+    {
+    int pathway_id     = atoi(row[0]);
+    char *pathway_name = row[1];   // pathway name
+    int entity_id      = atoi(row[2]);
+    char *entity_name  = row[3];   // entity name
+    char *entity_type  = row[4];   // type (protein, abstract, whatever...)
+
+    struct hashEl *el = hashLookup(hash, pathway_name);
+    if (!el)
+	{
+	/* first row seen for this pathway: create its pair and pathwayData */
+	AllocVar(sp);
+	sp->name = cloneString(pathway_name);
+	AllocVar(pd);
+	pd->id = pathway_id;
+	pd->entities = NULL;
+	pd->links = NULL;
+	pd->data = NULL;
+	pd->featureIds = NULL;
+	sp->val = pd;
+	slAddHead(&spList, sp);
+	hashAdd(hash, pathway_name, sp);
+	}
+    else
+	sp = el->val;
+
+    pd = sp->val;
+    if (!pd)
+	continue;
+
+    struct entities *en;
+    AllocVar(en);
+    en->id   = entity_id;
+    en->name = cloneString(entity_name);
+    en->type = cloneString(entity_type);
+    slAddHead(&pd->entities, en);
+    }
+sqlFreeResult(&sr);
+
+/* Second query: one row per link, for all pathways in the database; rows
+ * of pathways not loaded above are discarded by the hash lookup below. */
+safef(query, sizeof(query), 
+      "select pathway_name,t1.entity_name,t2.entity_name,link_name from %s "
+      "join %s on %s.pathway=%s.pathway_id "
+      "join %s on link_type=%s.id "
+      "join %s as t1 on %s.parent_entity=t1.entity_id "
+      "join %s as t2 on %s.child_entity=t2.entity_id ", // a join across a few tables
+      EL_TABLE, EP_TABLE, EL_TABLE, EP_TABLE,
+      ELT_TABLE, ELT_TABLE, 
+      EN_TABLE, EL_TABLE, 
+      EN_TABLE, EL_TABLE);
+
+sr = sqlGetResult(biConn, query);
+while ((row = sqlNextRow(sr)) != NULL)
+    {
+    char *pathway_name  = row[0];   // name
+    char *parent_entity = row[1];   // parent name
+    char *child_entity  = row[2];   // child name
+    char *link_type     = row[3];   // link type, e.g. '->'
+    
+    struct hashEl *el = hashLookup(hash, pathway_name);
+    if (!el)
+	continue;  
+
+    sp = el->val;
+    pd = sp->val;
+
+    if (!pd)
+	continue;
+
+    struct links *li;
+    AllocVar(li);
+    li->parent_name = cloneString(parent_entity);
+    li->child_name  = cloneString(child_entity);
+    li->link_type   = cloneString(link_type);
+    slAddHead(&pd->links, li);
+    }
+
+sqlFreeResult(&sr);
+
+hashFree(&hash);
+
+/* slAddHead built the pathway list back to front */
+slReverse(&spList);
+return spList;
+}
+
+char *getCohortName(struct sqlConnection *biConn, int cohort_id)
+/* Return the result of sqlQuickString for the name column of the CO_TABLE
+ * row whose id is cohort_id. */
+{
+char sql[256];
+safef(sql, sizeof(sql), 
+      "select name from %s where id = %d", 
+      CO_TABLE, cohort_id);
+
+char *name = sqlQuickString(biConn, sql);
+return name;
+}
+
+struct slName *getCohortTables(struct sqlConnection *biConn, int cohort_id)
+/* Return the data_table values of the datasets joined to cohort_id
+ * (DC_TABLE joined with DA_TABLE on id=dataset_id), as the list produced by
+ * sqlQuickList. */
+{
+char sql[256];
+safef(sql, sizeof(sql), 
+      "select data_table from %s join %s on id=dataset_id where cohort_id = %d",
+      DC_TABLE, DA_TABLE, cohort_id);
+
+struct slName *tables = sqlQuickList(biConn, sql);
+return tables;
+}
+
+void pathwayLevelPipeline(char *db, char *pathway, int cohort_id, int numIters)
+/* Run the whole file-generation pipeline for one cohort of database db:
+ * load the pathways, load the per-sample data of the cohort's data tables,
+ * write the observed evidence files, then write numIters null iterations
+ * for the "withinPathway" and for the "allData" null distributions.  The
+ * cohort name is used as the table-name component of the output file names.
+ * NOTE: the pathway argument is currently ignored -- all pathways in the
+ * database are processed (the by-name lookup was disabled by the author).
+ *
+ * Changes from the original behavior: aborts with a clear message when the
+ * cohort id has no name (the NULL name was previously passed on to the file
+ * name formatting), the "no pathways" message no longer blames the unused
+ * pathway argument, and the cohort name and table-name list are released. */
+{
+struct sqlConnection *biConn = hAllocConnProfile("localDb", db);
+
+char *cohort_name = getCohortName(biConn, cohort_id);
+if (cohort_name == NULL)
+    errAbort("No cohort with id %d", cohort_id);
+struct slName *tableNames = getCohortTables(biConn, cohort_id);
+
+uglyTime(NULL);
+/* Passing NULL loads every pathway; pass 'pathway' to restrict to one. */
+struct slPair *spPathways = getPathwaysByName(biConn, NULL);
+uglyTime("got pathways");
+
+if (slCount(spPathways) == 0)
+    errAbort("No pathways found in database %s", db);
+
+struct slPair *spData = analysisValsSamplesHashesList(biConn, tableNames);
+uglyTime("got sample hashes");   
+
+pathwayLevelAnalysis(biConn, cohort_name, spData, spPathways);
+uglyTime("analyzed all genesets");
+
+pathwayNullAnalysis(biConn, cohort_name, spData, spPathways, numIters, "withinPathway");
+uglyTime("produced null distribution");
+
+pathwayNullAnalysis(biConn, cohort_name, spData, spPathways, numIters, "allData");
+uglyTime("produced null distribution");
+
+slPairHashesFreeList(&spData);
+slPairPathwayFreeList(&spPathways);
+slNameFreeList(&tableNames);
+freeMem(cohort_name);
+hFreeConn(&biConn);
+} 
+
+int main(int argc, char *argv[])
+/* Process command line: db pathway_name directory cohort_id num_iterations.
+ * Seeds rand() from the clock, makes the directory argument the output
+ * directory (tmpDir) and runs the pipeline.  The numeric arguments are
+ * parsed with atoi, so non-numeric text becomes 0. */
+{
+if (argc != 6)
+    usage();
+
+srand(time(NULL));
+
+tmpDir = cloneString(argv[3]);
+pathwayLevelPipeline(argv[1], argv[2], atoi(argv[4]), atoi(argv[5]));
+
+return 0;
+}