src/hg/instinct/bioInt2/makeClusterFiles.c 1.5

1.5 2009/12/22 14:37:44 sbenz
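Added db batch number support: per-pathway batch size is read from the batch_size column, and output file prefixes now carry a batch number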
Index: src/hg/instinct/bioInt2/makeClusterFiles.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/instinct/bioInt2/makeClusterFiles.c,v
retrieving revision 1.4
retrieving revision 1.5
diff -b -B -U 4 -r1.4 -r1.5
--- src/hg/instinct/bioInt2/makeClusterFiles.c	18 Dec 2009 21:43:13 -0000	1.4
+++ src/hg/instinct/bioInt2/makeClusterFiles.c	22 Dec 2009 14:37:44 -0000	1.5
@@ -51,8 +51,9 @@
 struct pathwayData {
     int id;
     struct entities *entities;
     struct links *links;
+    int batchSize;  /* batch size for cluster jobs: taken from the batch_size query column, or NUM_IN_BATCH when that value is 0 */
     
     void *data;
     struct hash *featureIds;
 };
@@ -367,18 +368,22 @@
 
 return closedAll;
 }
 
-void factorGraph(char *tableName, void *data, int sample_id, int feature_id)
+void factorGraph(char *tableName, void *data, int sample_id, int feature_id, int batchNum)
 { 
 if (!data)
     return;
 
 struct pathwayData *pd = data;
 
 char prefix[512];
-safef(prefix, sizeof(prefix), "%s/%s_pid_%d", 
+/*safef(prefix, sizeof(prefix), "%s/%s_pid_%d", 
       tmpDir, tableName, pd->id); 
+*/
+safef(prefix, sizeof(prefix),
+	  "%s/%s_pid_%d_batch_%d",
+	  tmpDir, tableName, pd->id, batchNum);
 
 char sampleName[512];
 safef(sampleName, sizeof(sampleName), "sample_%d", sample_id);
 
@@ -437,9 +442,43 @@
 	{
 	fprintf(stderr, "problem with prep, skipping pathway.\n");
 	continue;
 	}
+	
+    int numInBatch = pd->batchSize;
+    int i=0;  /* counts samples actually handed to factorGraph (skipped samples are not counted) */
+    int batchNum = 0;  /* incremented before first use, so batch numbers start at 1 */
+	/* Walk the samples in groups of numInBatch; each group gets its own batch-numbered file prefix. */
+    for (sp = spData; sp; sp = sp->next)
+	{
+	if ((i % numInBatch) == 0)  /* batch boundary; NOTE(review): numInBatch must be nonzero here */
+	    {
     char prefix[512];
+	    batchNum++;
+		safef(prefix, sizeof(prefix),
+		      "%s/%s_pid_%d_batch_%d",
+		      tmpDir, tableName, pd->id, batchNum);
+		/* NOTE(review): i only advances for non-skipped samples, so if a sample is skipped while i sits on a batch boundary this branch runs again and batchNum is bumped again -- confirm intended */
+	    /* Evidence files are prepared from the first sample's data (spData->val), not from the current sp. */
+	    pd->data = spData->val;
+	    if (!prepEvidenceFiles(prefix, pd))
+		{
+		fprintf(stderr, "problem with evidence prep, skipping pathway.\n");
+		continue;  /* moves on to the next sample in this loop (not the next pathway) */
+		}
+	    }
+	pd->data = sp->val;
+	if (slCount(pd->data) < numDataTypes) 
+	    continue;  // currently only consider samples with more than one type of evidence.
+
+	int sample_id = atoi(sp->name);
+	factorGraph(tableName, pd, sample_id, -1, batchNum);
+	fprintf(stderr, ".");
+	fflush(stderr);
+	i++;
+	}
+
+ /*   char prefix[512];
     safef(prefix, sizeof(prefix), "%s/%s_pid_%d", 
 	  tmpDir, tableName, pd->id);
 
     pd->data = spData->val;
@@ -459,8 +497,9 @@
 	factorGraph(tableName, pd, sample_id, -1);
 	fprintf(stderr, ".");
 	fflush(stderr);
 	}
+ */
     fprintf(stderr, "\n");
     
     count++;
     }
@@ -626,10 +665,10 @@
     pd->data = dtHash;
     
     // Tune batch size to get around ~20 minutes per batch.
     // .. Pathways listed below take forever.
-    int numInBatch = 0;
-    if (sameString(pa->name, "LPA receptor mediated events"))
+    int numInBatch = pd->batchSize;
+    /*if (sameString(pa->name, "LPA receptor mediated events"))
 	numInBatch = 100;
     else if (sameString(pa->name, "PDGFR-beta signaling pathway"))
 	numInBatch = 250;
     else if (sameString(pa->name, "TCGA08_rtk_signaling"))
@@ -640,9 +679,9 @@
 	numInBatch = 250;
     else if (sameString(pa->name, "RXR and RAR heterodimerization with other nuclear receptor"))
 	numInBatch = 250;
     else
-	numInBatch = NUM_IN_BATCH;  // default batch size
+	numInBatch = NUM_IN_BATCH;  // default batch size*/
     
     int i;
     int batchNum = 0;
     for (i = 0; i < numIters; i++)
@@ -777,15 +816,15 @@
 char query[2048];
 
 if (!pathwayName)
     safef(query, sizeof(query), 
-	  "select %s.pathway_id,%s.pathway_name,entity_id,entity_name,entity_type from %s "
+	  "select %s.pathway_id,%s.pathway_name,entity_id,entity_name,entity_type,batch_size from %s "
 	  "join %s on %s.pathway_id=%s.pathway_id", // a join across a few tables
 	  EN_TABLE, EP_TABLE, EN_TABLE, 
 	  EP_TABLE, EN_TABLE, EP_TABLE);
 else
     safef(query, sizeof(query), 
-	  "select %s.pathway_id,%s.pathway_name,entity_id,entity_name,entity_type from %s "
+	  "select %s.pathway_id,%s.pathway_name,entity_id,entity_name,entity_type,batch_size from %s "
 	  "join %s on %s.pathway_id=%s.pathway_id where %s.pathway_name = \"%s\"", 
 	  EN_TABLE, EP_TABLE, EN_TABLE, 
 	  EP_TABLE, EN_TABLE, EP_TABLE, 
 	  EP_TABLE, pathwayName);
@@ -803,8 +842,9 @@
     char *pathway_name = row[1];   // pathway name
     int entity_id      = atoi(row[2]);
     char *entity_name  = row[3];   // entity name
     char *entity_type  = row[4];   // type (protein, abstract, whatever...)
+	int batchSize = atoi(row[5]);  // optimal batch size for sending to the cluster
 
     struct hashEl *el = hashLookup(hash, pathway_name);
     if (!el)
 	{
@@ -815,8 +855,12 @@
 	pd->entities = NULL;
 	pd->links = NULL;
 	pd->data = NULL;
 	pd->featureIds = NULL;
+	if(batchSize == 0)
+		pd->batchSize = NUM_IN_BATCH;
+	else
+		pd->batchSize = batchSize;
 	sp->val = pd;
 	slAddHead(&spList, sp);
 	hashAdd(hash, pathway_name, sp);
 	}