src/hg/instinct/bioInt2/makeClusterFiles.c 1.4
1.4 2009/12/18 21:43:13 cvaske
Deal with an arbitrary number of data types, rather than just CGH expression; Changed output names; fixed bug with withinPathway permutation
Index: src/hg/instinct/bioInt2/makeClusterFiles.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/instinct/bioInt2/makeClusterFiles.c,v
retrieving revision 1.3
retrieving revision 1.4
diff -b -B -U 4 -r1.3 -r1.4
--- src/hg/instinct/bioInt2/makeClusterFiles.c 5 Sep 2009 01:12:01 -0000 1.3
+++ src/hg/instinct/bioInt2/makeClusterFiles.c 18 Dec 2009 21:43:13 -0000 1.4
@@ -29,10 +29,9 @@
char *tmpDir = "tmpNull";
struct dataTuple {
struct dataTuple *next;
- double cnv;
- double exp;
+ double *data;
};
/* Gene-level analysis functions */
struct links {
@@ -60,10 +59,11 @@
void dataTupleFree(struct dataTuple **pEl)
{
struct dataTuple *el;
-
+if (!pEl) return;
if ((el = *pEl) == NULL) return;
+freez(&el->data);
freez(pEl);
}
void dataTupleFreeList(struct dataTuple **pList)
@@ -77,8 +77,33 @@
}
*pList = NULL;
}
+/* If we have data in all data types, return the tuple, otherwise NULL */
+struct dataTuple* dataTupleAlloc(struct typeHash* data, struct slName* sl) {
+struct dataTuple* dt;
+AllocVar(dt);
+AllocArray(dt->data, slCount(data));
+struct typeHash* th;
+double *val = dt->data;
+for (th = data; th; th = th->next)
+ {
+ struct hashEl *el = hashLookup(th->hash, sl->name);
+ if (el)
+ {
+ struct analysisVals *av = el->val;
+ *val = av->conf;
+ }
+ else
+ {
+ dataTupleFree(&dt);
+ return NULL;
+ }
+ ++val;
+ }
+return dt;
+}
+
struct analysisVals *readAnalysisValsFromFile(char *filename,
struct entities *enList, int sample_id)
{
struct lineFile *lf = lineFileOpen(filename, TRUE);
@@ -183,44 +208,38 @@
return FALSE;
struct pathwayData *pd = data;
-char cnvFile[1024];
-safef(cnvFile, sizeof(cnvFile), "%s_genome.tab", prefix);
-FILE *cnvF = mustOpen(cnvFile, "w");
-if (!cnvF)
- return FALSE;
-
-char expFile[1024];
-safef(expFile, sizeof(expFile), "%s_mRNA.tab", prefix);
-FILE *expF = mustOpen(expFile, "w");
-if (!expF)
+boolean closedAll = 1;
+struct typeHash *th;
+for (th = pd->data; th; th = th->next)
+ {
+ char fileName[1024];
+ safef(fileName, sizeof(fileName), "%s_%s.tab", prefix, th->type);
+ FILE *evidF = mustOpen(fileName, "a");
+ if (!evidF)
return FALSE;
-fprintf(cnvF, "id\t");
-fprintf(expF, "id\t");
-
-struct entities *en, *enList = pd->entities;
-for (en = enList; en; en = en->next)
+ fprintf(evidF, "id\t");
+ struct entities *en, *enList = pd->entities;
+ for (en = enList; en; en = en->next)
{
int id = hashIntValDefault(pd->featureIds, en->name, -1);
if (id == -1)
continue;
- fprintf(cnvF, "%s", en->name);
- fprintf(expF, "%s", en->name);
+ fprintf(evidF, "%s", en->name);
if (en->next)
{
- fprintf(cnvF, "\t");
- fprintf(expF, "\t");
+ fprintf(evidF, "\t");
}
}
-fprintf(cnvF, "\n");
-fprintf(expF, "\n");
+ fprintf(evidF, "\n");
-boolean closedExp = carefulCloseWarn(&expF);
-boolean closedCNV = carefulCloseWarn(&cnvF);
-return (closedExp && closedCNV);
+ closedAll = closedAll && carefulCloseWarn(&evidF);
+ }
+
+return closedAll;
}
boolean writeEvidenceFile(char *prefix, char *sampleName, struct pathwayData *pd)
{
@@ -228,104 +247,75 @@
return FALSE;
if (!pd->entities || !pd->links)
return FALSE;
-char cnvFile[1024];
-safef(cnvFile, sizeof(cnvFile), "%s_genome.tab", prefix);
-FILE *cnvF = mustOpen(cnvFile, "a");
-if (!cnvF)
- return FALSE;
+boolean closedAll = 1;
-char expFile[1024];
-safef(expFile, sizeof(expFile), "%s_mRNA.tab", prefix);
-FILE *expF = mustOpen(expFile, "a");
-if (!expF)
+struct typeHash *th;
+for (th = pd->data; th; th = th->next)
+ {
+ struct entities *en, *enList = pd->entities;
+ char idStr[128];
+ char fileName[1024];
+ safef(fileName, sizeof(fileName), "%s_%s.tab", prefix, th->type);
+ FILE *evidF = mustOpen(fileName, "a");
+ if (!evidF)
return FALSE;
+ fprintf(evidF, "%s\t", sampleName);
-fprintf(cnvF, "%s\t", sampleName);
-fprintf(expF, "%s\t", sampleName);
-
-struct entities *en, *enList = pd->entities;
-char idStr[128];
-for (en = enList; en; en = en->next)
+ for (en = enList; en; en = en->next)
{
int id = hashIntValDefault(pd->featureIds, en->name, -1);
if (id == -1)
continue;
safef(idStr, sizeof(idStr), "%d", id);
- boolean hadExp = FALSE;
- boolean hadCNV = FALSE;
-
- struct typeHash *th;
- for (th = pd->data; th; th = th->next)
- {
struct hashEl *el = hashLookup(th->hash, idStr);
if (!el)
- continue;
-
+ fprintf(evidF, "NA");
+ else
+ {
struct analysisVals *av = el->val;
double val = av->conf;
-
- if (sameString(th->type, "Expression"))
- {
- fprintf(expF, "%f", val);
- hadExp = TRUE;
+ fprintf(evidF, "%f", val);
}
- else if (sameString(th->type, "CNV"))
- {
- fprintf(cnvF, "%f", val);
- hadCNV = TRUE;
- }
- else
- continue;
- }
- if (!hadExp)
- fprintf(expF, "NA");
-
- if (!hadCNV)
- fprintf(cnvF, "NA");
if (en->next)
{
- fprintf(cnvF, "\t");
- fprintf(expF, "\t");
+ fprintf(evidF, "\t");
}
}
-
-fprintf(cnvF, "\n");
-fprintf(expF, "\n");
-
-boolean closedExp = carefulCloseWarn(&expF);
-boolean closedCNV = carefulCloseWarn(&cnvF);
-
-return (closedExp && closedCNV);
+ fprintf(evidF, "\n");
+ closedAll = closedAll && carefulCloseWarn(&evidF);
+ }
+return closedAll;
}
-boolean writeNullEvidenceFile(char *prefix, char *sampleName, struct pathwayData *pd)
+boolean writeNullEvidenceFile(char *prefix, char *sampleName, struct pathwayData *pd, struct typeHash* thList)
{
if (!pd)
return FALSE;
if (!pd->entities || !pd->links)
return FALSE;
-
-char cnvFile[1024];
-safef(cnvFile, sizeof(cnvFile), "%s_genome.tab", prefix);
-FILE *cnvF = mustOpen(cnvFile, "a");
-if (!cnvF)
+if (!thList)
return FALSE;
-char expFile[1024];
-safef(expFile, sizeof(expFile), "%s_mRNA.tab", prefix);
-FILE *expF = mustOpen(expFile, "a");
-if (!expF)
- return FALSE;
+int numTypes = slCount(thList);
+FILE** evidFiles;
+AllocArray(evidFiles, numTypes);
+int i = 0;
+struct typeHash* th = thList;
+for (i = 0; i < numTypes; ++i)
+ {
+ char fileName[1024];
+ safef(fileName, sizeof(fileName), "%s_%s.tab", prefix, th->type);
+ evidFiles[i] = mustOpen(fileName, "a");
-fprintf(cnvF, "%s\t", sampleName);
-fprintf(expF, "%s\t", sampleName);
+ fprintf(evidFiles[i], "%s\t", sampleName);
+ th = th->next;
+ }
-struct dataTuple *dt;
struct hash *dtHash = pd->data;
int numTuples = hashNumEntries(dtHash);
struct entities *en, *enList = pd->entities;
@@ -345,35 +335,38 @@
{
fprintf(stderr, "uh-oh!");
continue;
}
- dt = el->val;
+ struct dataTuple* dt = el->val;
- if (dt->cnv != BAD_VAL)
- fprintf(cnvF, "%f", dt->cnv);
- else
- fprintf(cnvF, "NA");
-
-
- if (dt->exp != BAD_VAL)
- fprintf(expF, "%f", dt->exp);
+ for (i = 0; i < numTypes; ++i)
+ {
+ if (dt->data[i] != BAD_VAL)
+ {
+ fprintf(evidFiles[i], "%f", dt->data[i]);
+ }
else
- fprintf(expF, "NA");
+ {
+ fprintf(evidFiles[i], "NA");
+ }
if (en->next)
{
- fprintf(cnvF, "\t");
- fprintf(expF, "\t");
+ fprintf(evidFiles[i], "\t");
+ }
}
}
-fprintf(cnvF, "\n");
-fprintf(expF, "\n");
+boolean closedAll = 1;
+for (i = 0; i < numTypes; ++i)
+ {
+ fprintf(evidFiles[i], "\n");
+ closedAll = closedAll & carefulCloseWarn(&evidFiles[i]);
+ }
-boolean closeExp = carefulCloseWarn(&expF);
-boolean closeCNV = carefulCloseWarn(&cnvF);
+freez(evidFiles);
-return (closeExp && closeCNV);
+return closedAll;
}
void factorGraph(char *tableName, void *data, int sample_id, int feature_id)
{
@@ -392,9 +385,10 @@
if (!writeEvidenceFile(prefix, sampleName, pd))
errAbort("Problem writing evidence file %s\n", prefix);
}
-void factorGraphNull(char *tableName, void *data, int sample_id, int batchNum,
+void factorGraphNull(char *tableName, void *data, struct typeHash* thList,
+ int sample_id, int batchNum,
char *nullType)
{
struct pathwayData *pd = data;
@@ -415,15 +409,16 @@
char sampleName[512];
safef(sampleName, sizeof(sampleName), "iter_%d", sample_id);
-if (!writeNullEvidenceFile(prefix, sampleName, pd))
+if (!writeNullEvidenceFile(prefix, sampleName, pd, thList))
errAbort("Problem writing evidence file %s\n", prefix);
}
/* Pipeline Stuff */
struct pathwayVals *pathwayLevelAnalysis(struct sqlConnection *biConn, char *tableName,
- struct slPair *spData, struct slPair *spPathways)
+ struct slPair *spData, struct slPair *spPathways,
+ int numDataTypes)
{
fprintf(stdout, "starting geneset analysis.\n");
struct hash *featureHash = createIdHash(biConn, AF_TABLE, "feature_name");
@@ -446,8 +441,9 @@
char prefix[512];
safef(prefix, sizeof(prefix), "%s/%s_pid_%d",
tmpDir, tableName, pd->id);
+ pd->data = spData->val;
if (!prepEvidenceFiles(prefix, pd))
{
fprintf(stderr, "problem with evidence prep, skipping pathway.\n");
continue;
@@ -455,9 +451,9 @@
for (sp = spData; sp; sp = sp->next)
{
pd->data = sp->val;
- if (slCount(pd->data) < 2)
+ if (slCount(pd->data) < numDataTypes)
continue; // currently only consider samples with more than one type of evidence.
int sample_id = atoi(sp->name);
factorGraph(tableName, pd, sample_id, -1);
@@ -486,9 +482,9 @@
}
struct dataTuple *getAllDataTuples(struct sqlConnection *biConn, struct hash *featureHash,
- struct slPair *spData)
+ struct slPair *spData, int numDataTypes)
{
struct slName *sl, *featureNames = getAnalysisFeatureNames(biConn, AF_TABLE, "gene");
char idStr[256];
@@ -508,41 +504,25 @@
struct slPair *sp;
struct dataTuple *dt, *dtList = NULL;
for (sp = spData; sp; sp = sp->next)
{
- struct typeHash *th, *thList = sp->val;
- if (slCount(thList) < 2)
+ struct typeHash *thList = sp->val;
+ if (slCount(thList) < numDataTypes)
continue; // currently only consider samples with more than one type of evidence.
for (sl = featureIds; sl; sl = sl->next)
{
- AllocVar(dt);
- dt->cnv = BAD_VAL;
- dt->exp = BAD_VAL;
- for (th = thList; th; th = th->next)
- {
- struct hashEl *el = hashLookup(th->hash, sl->name);
- if (!el)
- continue;
-
- struct analysisVals *av = el->val;
- if (sameString(th->type, "Expression"))
- dt->exp = av->conf;
- else if (sameString(th->type, "CNV"))
- dt->cnv = av->conf;
- }
- if (dt->cnv == BAD_VAL && dt->exp == BAD_VAL)
- dataTupleFree(&dt);
- else
+ dt = dataTupleAlloc(thList, sl);
+ if (dt)
slAddHead(&dtList, dt);
}
}
return dtList;
}
struct dataTuple *getPathwayDataTuples(struct sqlConnection *biConn, struct hash *featureHash,
- struct slPair *spData, struct slPair *spPathway)
+ struct slPair *spData, struct slPair *spPathway, int numDataTypes)
{
struct pathwayData *pd = spPathway->val;
struct entities *en, *enList = pd->entities;
struct slName *featureIds = NULL;
@@ -560,33 +540,17 @@
struct slPair *sp;
struct dataTuple *dt, *dtList = NULL;
for (sp = spData; sp; sp = sp->next)
{
- struct typeHash *th, *thList = sp->val;
- if (slCount(thList) < 2)
+ struct typeHash *thList = sp->val;
+ if (slCount(thList) != numDataTypes)
continue; // currently only consider samples with more than one type of evidence.
struct slName *sl;
for (sl = featureIds; sl; sl = sl->next)
{
- AllocVar(dt);
- dt->cnv = BAD_VAL;
- dt->exp = BAD_VAL;
- for (th = thList; th; th = th->next)
- {
- struct hashEl *el = hashLookup(th->hash, sl->name);
- if (!el)
- continue;
-
- struct analysisVals *av = el->val;
- if (sameString(th->type, "Expression"))
- dt->exp = av->conf;
- else if (sameString(th->type, "CNV"))
- dt->cnv = av->conf;
- }
- if (dt->cnv == BAD_VAL && dt->exp == BAD_VAL)
- dataTupleFree(&dt);
- else
+ dt = dataTupleAlloc(thList, sl);
+ if (dt)
slAddHead(&dtList, dt);
}
}
@@ -615,31 +579,44 @@
/* Pipeline Stuff */
struct pathwayVals *pathwayNullAnalysis(struct sqlConnection *biConn, char *tableName,
struct slPair *spData, struct slPair *spPathways,
- int numIters, char *nullType)
+ int numIters, int numDataTypes, char *nullType)
{
+if (!spData)
+ return NULL;
+
fprintf(stderr, "starting null analysis analysis.\n");
struct hash *featureHash = createIdHash(biConn, AF_TABLE, "feature_name");
-
+struct hash *dtHash = NULL;
struct dataTuple *dtList = NULL;
if (sameString(nullType, "allData"))
- dtList = getAllDataTuples(biConn, featureHash, spData);
+ {
+ dtList = getAllDataTuples(biConn, featureHash, spData, numDataTypes);
+ fprintf(stderr, "hashing data tuples...\n");
+ dtHash = hashDataTuples(dtList);
+ }
else if (sameString(nullType, "withinPathway"))
- dtList = getPathwayDataTuples(biConn, featureHash, spData, spPathways);
+ {/* set dtList on each invofaction below*/}
else
errAbort("%s not supported", nullType);
-fprintf(stderr, "hashing data tuples...\n");
-struct hash *dtHash = hashDataTuples(dtList);
struct slPair *pa;
for (pa = spPathways; pa; pa = pa->next)
{
struct pathwayData *pd = pa->val;
pd->featureIds = featureHash;
+ if (sameString(nullType, "withinPathway"))
+ {
+ dtList = getPathwayDataTuples(biConn, featureHash, spData, pa,
+ numDataTypes);
+ fprintf(stderr, "hashing data tuples...\n");
+ dtHash = hashDataTuples(dtList);
+ }
+
fprintf(stderr, "prepping pathway %s:\n", pa->name);
if (!prepFactorGraph(tableName, pd))
{
fprintf(stderr, "problem with prep, skipping pathway.\n");
@@ -686,15 +663,17 @@
"%s/nw_%s_pid_%d_batch_%d",
tmpDir, tableName, pd->id, batchNum);
}
+ pd->data = spData->val;
if (!prepEvidenceFiles(prefix, pd))
{
fprintf(stderr, "problem with evidence prep, skipping pathway.\n");
continue;
}
}
- factorGraphNull(tableName, pd, i, batchNum, nullType);
+ pd->data = dtHash;
+ factorGraphNull(tableName, pd, spData->val, i, batchNum, nullType);
fprintf(stderr, ".");
fflush(stderr);
}
fprintf(stderr, "\n");
@@ -921,8 +900,29 @@
return sqlQuickList(biConn, query);
}
+struct slPair * filterAnalysisValsSampleHashesFullData(struct slPair* spData,
+ int dataCount)
+{
+struct slPair *sp, **prevSp = &spData;
+for (sp = spData; sp;)
+ {
+ if (slCount(sp->val) != dataCount)
+ {
+ *prevSp = sp->next;
+ slPairHashesFree(&sp);
+ sp = *prevSp;
+ }
+ else
+ {
+ prevSp = &(sp->next);
+ sp = sp->next;
+ }
+ }
+return spData;
+}
+
void pathwayLevelPipeline(char *db, char *pathway, int cohort_id, int numIters)
{
struct sqlConnection *biConn = hAllocConnProfile("localDb", db);
@@ -942,15 +942,22 @@
struct slPair *spData = analysisValsSamplesHashesList(biConn, tableNames);
uglyTime("got sample hashes");
-pathwayLevelAnalysis(biConn, cohort_name, spData, spPathways);
+/* Only consider samples with full data. The following output */
+/* functions require this. */
+spData = filterAnalysisValsSampleHashesFullData(spData, slCount(tableNames));
+
+pathwayLevelAnalysis(biConn, cohort_name, spData, spPathways,
+ slCount(tableNames));
uglyTime("analyzed all genesets");
-pathwayNullAnalysis(biConn, cohort_name, spData, spPathways, numIters, "withinPathway");
+pathwayNullAnalysis(biConn, cohort_name, spData, spPathways, numIters,
+ slCount(tableNames), "withinPathway");
uglyTime("produced null distribution");
-pathwayNullAnalysis(biConn, cohort_name, spData, spPathways, numIters, "allData");
+pathwayNullAnalysis(biConn, cohort_name, spData, spPathways, numIters,
+ slCount(tableNames), "allData");
uglyTime("produced null distribution");
slPairHashesFreeList(&spData);
slPairPathwayFreeList(&spPathways);