src/hg/instinct/bioInt2/makeClusterFiles.c 1.4

1.4 2009/12/18 21:43:13 cvaske
Deal with an arbitrary number of data types, rather than just CGH expression; Changed output names; fixed bug with withinPathway permutation
Index: src/hg/instinct/bioInt2/makeClusterFiles.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/instinct/bioInt2/makeClusterFiles.c,v
retrieving revision 1.3
retrieving revision 1.4
diff -b -B -U 4 -r1.3 -r1.4
--- src/hg/instinct/bioInt2/makeClusterFiles.c	5 Sep 2009 01:12:01 -0000	1.3
+++ src/hg/instinct/bioInt2/makeClusterFiles.c	18 Dec 2009 21:43:13 -0000	1.4
@@ -29,10 +29,9 @@
 char *tmpDir = "tmpNull";
 
 struct dataTuple {
     struct dataTuple *next;
-    double cnv;
-    double exp;
+    double *data;
 };
 
 /* Gene-level analysis functions */
 struct links {
@@ -60,10 +59,11 @@
 
 void dataTupleFree(struct dataTuple **pEl)
 {
 struct dataTuple *el;
-
+if (!pEl) return;
 if ((el = *pEl) == NULL) return;
+freez(&el->data);
 freez(pEl);
 }
 
 void dataTupleFreeList(struct dataTuple **pList)
@@ -77,8 +77,33 @@
     }
 *pList = NULL;
 }  
 
+/* If we have data in all data types, return the tuple, otherwise NULL */
+struct dataTuple* dataTupleAlloc(struct typeHash* data, struct slName* sl) {
+struct dataTuple* dt;
+AllocVar(dt);
+AllocArray(dt->data, slCount(data));
+struct typeHash* th;
+double *val = dt->data;
+for (th = data; th; th = th->next)
+    {
+    struct hashEl *el = hashLookup(th->hash, sl->name);
+    if (el)
+	{
+	struct analysisVals *av = el->val;
+	*val = av->conf;
+	}
+    else
+	{
+	dataTupleFree(&dt);
+	return NULL;
+	}
+    ++val;
+    }
+return dt;
+}
+
 struct analysisVals *readAnalysisValsFromFile(char *filename, 
 					      struct entities *enList, int sample_id)
 {
 struct lineFile *lf = lineFileOpen(filename, TRUE);
@@ -183,44 +208,38 @@
     return FALSE;
 
 struct pathwayData *pd = data;
 
-char cnvFile[1024];
-safef(cnvFile, sizeof(cnvFile), "%s_genome.tab", prefix);
-FILE *cnvF = mustOpen(cnvFile, "w");
-if (!cnvF)
-    return FALSE;
-
-char expFile[1024];
-safef(expFile, sizeof(expFile), "%s_mRNA.tab", prefix);
-FILE *expF = mustOpen(expFile, "w");
-if (!expF)
+boolean closedAll = 1;
+struct typeHash *th;
+for (th = pd->data; th; th = th->next)
+    {
+    char fileName[1024];
+    safef(fileName, sizeof(fileName), "%s_%s.tab", prefix, th->type);
+    FILE *evidF = mustOpen(fileName, "a");
+    if (!evidF)
     return FALSE;
 
-fprintf(cnvF, "id\t");
-fprintf(expF, "id\t");
-
-struct entities *en, *enList = pd->entities;
-for (en = enList; en; en = en->next)
+    fprintf(evidF, "id\t");
+    struct entities *en, *enList = pd->entities;
+    for (en = enList; en; en = en->next)
     {
     int id = hashIntValDefault(pd->featureIds, en->name, -1);
     if (id == -1)
 	continue;
 
-    fprintf(cnvF, "%s", en->name);
-    fprintf(expF, "%s", en->name);
+	fprintf(evidF, "%s", en->name);
     if (en->next)
 	{
-	fprintf(cnvF, "\t");
-	fprintf(expF, "\t");
+	    fprintf(evidF, "\t");
 	}
     }
-fprintf(cnvF, "\n");
-fprintf(expF, "\n");
+    fprintf(evidF, "\n");
 
-boolean closedExp = carefulCloseWarn(&expF);
-boolean closedCNV = carefulCloseWarn(&cnvF);
-return (closedExp && closedCNV);
+    closedAll = closedAll &&  carefulCloseWarn(&evidF);
+    }
+    
+return closedAll;
 }
 
 boolean writeEvidenceFile(char *prefix, char *sampleName, struct pathwayData *pd)
 {
@@ -228,104 +247,75 @@
     return FALSE;
 if (!pd->entities || !pd->links)
     return FALSE;
 
-char cnvFile[1024];
-safef(cnvFile, sizeof(cnvFile), "%s_genome.tab", prefix);
-FILE *cnvF = mustOpen(cnvFile, "a");
-if (!cnvF)
-    return FALSE;
+boolean closedAll = 1;
 
-char expFile[1024];
-safef(expFile, sizeof(expFile), "%s_mRNA.tab", prefix);
-FILE *expF = mustOpen(expFile, "a");
-if (!expF)
+struct typeHash *th;
+for (th = pd->data; th; th = th->next)
+    {
+    struct entities *en, *enList = pd->entities;
+    char idStr[128];
+    char fileName[1024];
+    safef(fileName, sizeof(fileName), "%s_%s.tab", prefix, th->type);
+    FILE *evidF = mustOpen(fileName, "a");
+    if (!evidF)
     return FALSE;
+    fprintf(evidF, "%s\t", sampleName);
 
-fprintf(cnvF, "%s\t", sampleName); 
-fprintf(expF, "%s\t", sampleName);
- 
-struct entities *en, *enList = pd->entities;
-char idStr[128];
-for (en = enList; en; en = en->next)
+    for (en = enList; en; en = en->next)
     {
     int id = hashIntValDefault(pd->featureIds, en->name, -1);
     if (id == -1)
 	continue;
 
     safef(idStr, sizeof(idStr), "%d", id);
 	
-    boolean hadExp = FALSE;
-    boolean hadCNV = FALSE;
-    
-    struct typeHash *th;
-    for (th = pd->data; th; th = th->next)
-	{
 	struct hashEl *el = hashLookup(th->hash, idStr);
 	if (!el)
-	    continue;
-
+	    fprintf(evidF, "NA");
+	else 
+	    {
 	struct analysisVals *av = el->val;
 	double val = av->conf;
-	
-	if (sameString(th->type, "Expression"))
-	    {
-	    fprintf(expF, "%f", val);
-	    hadExp = TRUE;
+	    fprintf(evidF, "%f", val);
 	    }
-	else if (sameString(th->type, "CNV"))
-	    {
-	    fprintf(cnvF, "%f", val);
-	    hadCNV = TRUE;
-	    }
-	else
-	    continue;
-	}
-    if (!hadExp)
-	fprintf(expF, "NA");
-    
-    if (!hadCNV)
-	fprintf(cnvF, "NA");
 
     if (en->next)
 	{
-	fprintf(cnvF, "\t");
-	fprintf(expF, "\t");
+	    fprintf(evidF, "\t");
 	}
     }
-
-fprintf(cnvF, "\n");
-fprintf(expF, "\n");
-
-boolean closedExp = carefulCloseWarn(&expF);
-boolean closedCNV = carefulCloseWarn(&cnvF);
-
-return (closedExp && closedCNV);
+    fprintf(evidF, "\n");
+    closedAll = closedAll && carefulCloseWarn(&evidF);
+    }
+return closedAll;
 }
 
-boolean writeNullEvidenceFile(char *prefix, char *sampleName, struct pathwayData *pd)
+boolean writeNullEvidenceFile(char *prefix, char *sampleName, struct pathwayData *pd, struct typeHash* thList)
 {
 if (!pd)
     return FALSE;
 if (!pd->entities || !pd->links)
     return FALSE;
-
-char cnvFile[1024];
-safef(cnvFile, sizeof(cnvFile), "%s_genome.tab", prefix);
-FILE *cnvF = mustOpen(cnvFile, "a");
-if (!cnvF)
+if (!thList)
     return FALSE;
 
-char expFile[1024];
-safef(expFile, sizeof(expFile), "%s_mRNA.tab", prefix);
-FILE *expF = mustOpen(expFile, "a");
-if (!expF)
-    return FALSE;
+int numTypes = slCount(thList);
+FILE** evidFiles;
+AllocArray(evidFiles, numTypes);
+int i = 0;
+struct typeHash* th = thList;
+for (i = 0; i < numTypes; ++i)
+    {
+    char fileName[1024];
+    safef(fileName, sizeof(fileName), "%s_%s.tab", prefix, th->type);
+    evidFiles[i] = mustOpen(fileName, "a");
 
-fprintf(cnvF, "%s\t", sampleName); 
-fprintf(expF, "%s\t", sampleName);
+    fprintf(evidFiles[i], "%s\t", sampleName); 
+    th = th->next;
+    }
 
-struct dataTuple *dt;
 struct hash *dtHash = pd->data;
 int numTuples = hashNumEntries(dtHash);
 
 struct entities *en, *enList = pd->entities;
@@ -345,35 +335,38 @@
 	{
 	fprintf(stderr, "uh-oh!");
 	continue;
 	}
-    dt = el->val; 
+    struct dataTuple* dt = el->val; 
 
-    if (dt->cnv != BAD_VAL)
-	fprintf(cnvF, "%f", dt->cnv);
-    else
-	fprintf(cnvF, "NA");
-
-
-    if (dt->exp != BAD_VAL)
-	fprintf(expF, "%f", dt->exp);
+    for (i = 0; i < numTypes; ++i) 
+	{
+	if (dt->data[i] != BAD_VAL)
+	    {
+	    fprintf(evidFiles[i], "%f", dt->data[i]);
+	    }
     else
-	fprintf(expF, "NA");
+	    {
+	    fprintf(evidFiles[i], "NA");
+	    }
 
     if (en->next)
 	{
-	fprintf(cnvF, "\t");
-	fprintf(expF, "\t");
+	    fprintf(evidFiles[i], "\t");
+	    }
 	}
     }
 
-fprintf(cnvF, "\n");
-fprintf(expF, "\n");
+boolean closedAll = 1;
+for (i = 0; i < numTypes; ++i)
+    {
+    fprintf(evidFiles[i], "\n");
+    closedAll = closedAll & carefulCloseWarn(&evidFiles[i]);
+    }
 
-boolean closeExp = carefulCloseWarn(&expF);
-boolean closeCNV = carefulCloseWarn(&cnvF);
+freez(evidFiles);
 
-return (closeExp && closeCNV);
+return closedAll;
 }
 
 void factorGraph(char *tableName, void *data, int sample_id, int feature_id)
 { 
@@ -392,9 +385,10 @@
 if (!writeEvidenceFile(prefix, sampleName, pd))
     errAbort("Problem writing evidence file %s\n", prefix);
 }
 
-void factorGraphNull(char *tableName, void *data, int sample_id, int batchNum, 
+void factorGraphNull(char *tableName, void *data, struct typeHash* thList,
+		     int sample_id, int batchNum, 
 		     char *nullType)
 { 
 struct pathwayData *pd = data;
 
@@ -415,15 +409,16 @@
 
 char sampleName[512];
 safef(sampleName, sizeof(sampleName), "iter_%d", sample_id);
 
-if (!writeNullEvidenceFile(prefix, sampleName, pd))
+if (!writeNullEvidenceFile(prefix, sampleName, pd, thList))
     errAbort("Problem writing evidence file %s\n", prefix);
 }
 
 /* Pipeline Stuff */
 struct pathwayVals *pathwayLevelAnalysis(struct sqlConnection *biConn, char *tableName, 
-					 struct slPair *spData, struct slPair *spPathways)
+					 struct slPair *spData, struct slPair *spPathways,
+					 int numDataTypes)
 {
 fprintf(stdout, "starting geneset analysis.\n");
 
 struct hash *featureHash = createIdHash(biConn, AF_TABLE, "feature_name");
@@ -446,8 +441,9 @@
     char prefix[512];
     safef(prefix, sizeof(prefix), "%s/%s_pid_%d", 
 	  tmpDir, tableName, pd->id);
 
+    pd->data = spData->val;
     if (!prepEvidenceFiles(prefix, pd))
 	{
 	fprintf(stderr, "problem with evidence prep, skipping pathway.\n");
 	continue;
@@ -455,9 +451,9 @@
 
     for (sp = spData; sp; sp = sp->next)
 	{
 	pd->data = sp->val;
-	if (slCount(pd->data) < 2) 
+	if (slCount(pd->data) < numDataTypes) 
 	    continue;  // currently only consider samples with more than one type of evidence.
 
 	int sample_id = atoi(sp->name);
 	factorGraph(tableName, pd, sample_id, -1);
@@ -486,9 +482,9 @@
 }
 
 
 struct dataTuple *getAllDataTuples(struct sqlConnection *biConn, struct hash *featureHash, 
-				   struct slPair *spData)
+				   struct slPair *spData, int numDataTypes)
 {
 struct slName *sl, *featureNames = getAnalysisFeatureNames(biConn, AF_TABLE, "gene");
 
 char idStr[256];
@@ -508,41 +504,25 @@
 struct slPair *sp;
 struct dataTuple *dt, *dtList = NULL;
 for (sp = spData; sp; sp = sp->next)
     {
-    struct typeHash *th, *thList = sp->val;
-    if (slCount(thList) < 2) 
+    struct typeHash *thList = sp->val;
+    if (slCount(thList) < numDataTypes) 
 	continue;  // currently only consider samples with more than one type of evidence.
     
     for (sl = featureIds; sl; sl = sl->next)
 	{
-	AllocVar(dt);
-	dt->cnv = BAD_VAL;
-	dt->exp = BAD_VAL;
-	for (th = thList; th; th = th->next)
-	    {
-	    struct hashEl *el = hashLookup(th->hash, sl->name);
-	    if (!el)
-		continue;
-	    
-	    struct analysisVals *av = el->val;
-	    if (sameString(th->type, "Expression"))
-		dt->exp = av->conf;
-	    else if (sameString(th->type, "CNV"))
-		dt->cnv = av->conf;	    
-	    }
-	if (dt->cnv == BAD_VAL && dt->exp == BAD_VAL)
-	    dataTupleFree(&dt);
-	else
+	dt = dataTupleAlloc(thList, sl);
+	if (dt)
 	    slAddHead(&dtList, dt);
 	}
     }
 
 return dtList;
 }
 
 struct dataTuple *getPathwayDataTuples(struct sqlConnection *biConn, struct hash *featureHash, 
-				       struct slPair *spData, struct slPair *spPathway)
+				       struct slPair *spData, struct slPair *spPathway, int numDataTypes)
 {
 struct pathwayData *pd = spPathway->val;
 struct entities *en, *enList = pd->entities;
 struct slName *featureIds = NULL;
@@ -560,33 +540,17 @@
 struct slPair *sp;
 struct dataTuple *dt, *dtList = NULL;
 for (sp = spData; sp; sp = sp->next)
     {
-    struct typeHash *th, *thList = sp->val;
-    if (slCount(thList) < 2) 
+    struct typeHash *thList = sp->val;
+    if (slCount(thList) != numDataTypes) 
 	continue;  // currently only consider samples with more than one type of evidence.
     
     struct slName *sl;
     for (sl = featureIds; sl; sl = sl->next)
 	{
-	AllocVar(dt);
-	dt->cnv = BAD_VAL;
-	dt->exp = BAD_VAL;
-	for (th = thList; th; th = th->next)
-	    {
-	    struct hashEl *el = hashLookup(th->hash, sl->name);
-	    if (!el)
-		continue;
-	    
-	    struct analysisVals *av = el->val;
-	    if (sameString(th->type, "Expression"))
-		dt->exp = av->conf;
-	    else if (sameString(th->type, "CNV"))
-		dt->cnv = av->conf;	    
-	    }
-	if (dt->cnv == BAD_VAL && dt->exp == BAD_VAL)
-	    dataTupleFree(&dt);
-	else
+	dt = dataTupleAlloc(thList, sl);
+	if (dt)
 	    slAddHead(&dtList, dt);
 	}
     }
 
@@ -615,31 +579,44 @@
 
 /* Pipeline Stuff */
 struct pathwayVals *pathwayNullAnalysis(struct sqlConnection *biConn, char *tableName, 
 					struct slPair *spData, struct slPair *spPathways,
-					int numIters, char *nullType)
+					int numIters, int numDataTypes, char *nullType)
 {
+if (!spData)
+    return NULL;
+
 fprintf(stderr, "starting null analysis analysis.\n");
 
 struct hash *featureHash = createIdHash(biConn, AF_TABLE, "feature_name");
-
+struct hash *dtHash = NULL;
 struct dataTuple *dtList = NULL;
 if (sameString(nullType, "allData"))
-    dtList = getAllDataTuples(biConn, featureHash, spData);
+    {
+    dtList = getAllDataTuples(biConn, featureHash, spData, numDataTypes);
+    fprintf(stderr, "hashing data tuples...\n");
+    dtHash = hashDataTuples(dtList);
+    }
 else if (sameString(nullType, "withinPathway"))
-    dtList = getPathwayDataTuples(biConn, featureHash, spData, spPathways);
+    {/* set dtList on each invofaction below*/}
 else
     errAbort("%s not supported", nullType);
 
-fprintf(stderr, "hashing data tuples...\n");
-struct hash *dtHash = hashDataTuples(dtList);
 
 struct slPair *pa;
 for (pa = spPathways; pa; pa = pa->next)
     {
     struct pathwayData *pd = pa->val;
     pd->featureIds = featureHash;
 
+    if (sameString(nullType, "withinPathway"))
+	{
+	dtList = getPathwayDataTuples(biConn, featureHash, spData, pa,
+				      numDataTypes);
+	fprintf(stderr, "hashing data tuples...\n");
+	dtHash = hashDataTuples(dtList);
+	}
+
     fprintf(stderr, "prepping pathway %s:\n", pa->name);
     if (!prepFactorGraph(tableName, pd))
 	{
 	fprintf(stderr, "problem with prep, skipping pathway.\n");
@@ -686,15 +663,17 @@
 		      "%s/nw_%s_pid_%d_batch_%d",
 		      tmpDir, tableName, pd->id, batchNum);
 		} 
 	    
+	    pd->data = spData->val;
 	    if (!prepEvidenceFiles(prefix, pd))
 		{
 		fprintf(stderr, "problem with evidence prep, skipping pathway.\n");
 		continue;
 		}
 	    }
-	factorGraphNull(tableName, pd, i, batchNum, nullType);
+	pd->data = dtHash;
+	factorGraphNull(tableName, pd, spData->val, i, batchNum, nullType);
 	fprintf(stderr, ".");
 	fflush(stderr);
 	}
     fprintf(stderr, "\n");
@@ -921,8 +900,29 @@
 
 return sqlQuickList(biConn, query);
 }
 
+struct slPair * filterAnalysisValsSampleHashesFullData(struct slPair* spData,
+						       int dataCount)
+{
+struct slPair *sp, **prevSp = &spData;
+for (sp = spData; sp;)
+    {
+    if (slCount(sp->val) != dataCount) 
+	{
+	*prevSp = sp->next;
+	slPairHashesFree(&sp);
+	sp = *prevSp;
+	} 
+    else
+	{
+	prevSp = &(sp->next);
+	sp = sp->next;
+	}
+    }
+return spData;
+}
+
 void pathwayLevelPipeline(char *db, char *pathway, int cohort_id, int numIters)
 {
 struct sqlConnection *biConn = hAllocConnProfile("localDb", db);
 
@@ -942,15 +942,22 @@
 
 struct slPair *spData = analysisValsSamplesHashesList(biConn, tableNames);
 uglyTime("got sample hashes");   
 
-pathwayLevelAnalysis(biConn, cohort_name, spData, spPathways);
+/* Only consider samples with full data.  The following output */
+/* functions require this. */
+spData = filterAnalysisValsSampleHashesFullData(spData, slCount(tableNames));
+
+pathwayLevelAnalysis(biConn, cohort_name, spData, spPathways, 
+		     slCount(tableNames));
 uglyTime("analyzed all genesets");
 
-pathwayNullAnalysis(biConn, cohort_name, spData, spPathways, numIters, "withinPathway");
+pathwayNullAnalysis(biConn, cohort_name, spData, spPathways, numIters, 
+		    slCount(tableNames), "withinPathway");
 uglyTime("produced null distribution");
 
-pathwayNullAnalysis(biConn, cohort_name, spData, spPathways, numIters, "allData");
+pathwayNullAnalysis(biConn, cohort_name, spData, spPathways, numIters, 
+		    slCount(tableNames), "allData");
 uglyTime("produced null distribution");
 
 slPairHashesFreeList(&spData);
 slPairPathwayFreeList(&spPathways);