src/hg/instinct/bioInt2/makeClusterFiles.c 1.5
1.5 2009/12/22 14:37:44 sbenz
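Added per-pathway batch size support: batch_size is now read from the database (falling back to NUM_IN_BATCH when it is 0), and output file prefixes now include the batch number.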
Index: src/hg/instinct/bioInt2/makeClusterFiles.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/instinct/bioInt2/makeClusterFiles.c,v
retrieving revision 1.4
retrieving revision 1.5
diff -b -B -U 4 -r1.4 -r1.5
--- src/hg/instinct/bioInt2/makeClusterFiles.c 18 Dec 2009 21:43:13 -0000 1.4
+++ src/hg/instinct/bioInt2/makeClusterFiles.c 22 Dec 2009 14:37:44 -0000 1.5
@@ -51,8 +51,9 @@
struct pathwayData {
int id;
struct entities *entities;
struct links *links;
+ int batchSize;
void *data;
struct hash *featureIds;
};
@@ -367,18 +368,22 @@
return closedAll;
}
-void factorGraph(char *tableName, void *data, int sample_id, int feature_id)
+void factorGraph(char *tableName, void *data, int sample_id, int feature_id, int batchNum)
{
if (!data)
return;
struct pathwayData *pd = data;
char prefix[512];
-safef(prefix, sizeof(prefix), "%s/%s_pid_%d",
+/*safef(prefix, sizeof(prefix), "%s/%s_pid_%d",
tmpDir, tableName, pd->id);
+*/
+safef(prefix, sizeof(prefix),
+ "%s/%s_pid_%d_batch_%d",
+ tmpDir, tableName, pd->id, batchNum);
char sampleName[512];
safef(sampleName, sizeof(sampleName), "sample_%d", sample_id);
@@ -437,9 +442,43 @@
{
fprintf(stderr, "problem with prep, skipping pathway.\n");
continue;
}
+
+ int numInBatch = pd->batchSize;
+ int i=0;
+ int batchNum = 0;
+
+ for (sp = spData; sp; sp = sp->next)
+ {
+ if ((i % numInBatch) == 0)
+ {
char prefix[512];
+ batchNum++;
+ safef(prefix, sizeof(prefix),
+ "%s/%s_pid_%d_batch_%d",
+ tmpDir, tableName, pd->id, batchNum);
+
+
+ pd->data = spData->val;
+ if (!prepEvidenceFiles(prefix, pd))
+ {
+ fprintf(stderr, "problem with evidence prep, skipping pathway.\n");
+ continue;
+ }
+ }
+ pd->data = sp->val;
+ if (slCount(pd->data) < numDataTypes)
+ continue; // currently only consider samples with more than one type of evidence.
+
+ int sample_id = atoi(sp->name);
+ factorGraph(tableName, pd, sample_id, -1, batchNum);
+ fprintf(stderr, ".");
+ fflush(stderr);
+ i++;
+ }
+
+ /* char prefix[512];
safef(prefix, sizeof(prefix), "%s/%s_pid_%d",
tmpDir, tableName, pd->id);
pd->data = spData->val;
@@ -459,8 +497,9 @@
factorGraph(tableName, pd, sample_id, -1);
fprintf(stderr, ".");
fflush(stderr);
}
+ */
fprintf(stderr, "\n");
count++;
}
@@ -626,10 +665,10 @@
pd->data = dtHash;
// Tune batch size to get around ~20 minutes per batch.
// .. Pathways listed below take forever.
- int numInBatch = 0;
- if (sameString(pa->name, "LPA receptor mediated events"))
+ int numInBatch = pd->batchSize;
+ /*if (sameString(pa->name, "LPA receptor mediated events"))
numInBatch = 100;
else if (sameString(pa->name, "PDGFR-beta signaling pathway"))
numInBatch = 250;
else if (sameString(pa->name, "TCGA08_rtk_signaling"))
@@ -640,9 +679,9 @@
numInBatch = 250;
else if (sameString(pa->name, "RXR and RAR heterodimerization with other nuclear receptor"))
numInBatch = 250;
else
- numInBatch = NUM_IN_BATCH; // default batch size
+ numInBatch = NUM_IN_BATCH; // default batch size*/
int i;
int batchNum = 0;
for (i = 0; i < numIters; i++)
@@ -777,15 +816,15 @@
char query[2048];
if (!pathwayName)
safef(query, sizeof(query),
- "select %s.pathway_id,%s.pathway_name,entity_id,entity_name,entity_type from %s "
+ "select %s.pathway_id,%s.pathway_name,entity_id,entity_name,entity_type,batch_size from %s "
"join %s on %s.pathway_id=%s.pathway_id", // a join across a few tables
EN_TABLE, EP_TABLE, EN_TABLE,
EP_TABLE, EN_TABLE, EP_TABLE);
else
safef(query, sizeof(query),
- "select %s.pathway_id,%s.pathway_name,entity_id,entity_name,entity_type from %s "
+ "select %s.pathway_id,%s.pathway_name,entity_id,entity_name,entity_type,batch_size from %s "
"join %s on %s.pathway_id=%s.pathway_id where %s.pathway_name = \"%s\"",
EN_TABLE, EP_TABLE, EN_TABLE,
EP_TABLE, EN_TABLE, EP_TABLE,
EP_TABLE, pathwayName);
@@ -803,8 +842,9 @@
char *pathway_name = row[1]; // pathway name
int entity_id = atoi(row[2]);
char *entity_name = row[3]; // entity name
char *entity_type = row[4]; // type (protein, abstract, whatever...)
+ int batchSize = atoi(row[5]); // optimal batch size for sending to the cluster
struct hashEl *el = hashLookup(hash, pathway_name);
if (!el)
{
@@ -815,8 +855,12 @@
pd->entities = NULL;
pd->links = NULL;
pd->data = NULL;
pd->featureIds = NULL;
+ if(batchSize == 0)
+ pd->batchSize = NUM_IN_BATCH;
+ else
+ pd->batchSize = batchSize;
sp->val = pd;
slAddHead(&spList, sp);
hashAdd(hash, pathway_name, sp);
}