src/parasol/paraHub/paraHub.c 1.133

1.133 2010/01/12 09:09:15 markd
add option to set minimum log level, move some logging to debug
Index: src/parasol/paraHub/paraHub.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/parasol/paraHub/paraHub.c,v
retrieving revision 1.132
retrieving revision 1.133
diff -b -B -U 4 -r1.132 -r1.133
--- src/parasol/paraHub/paraHub.c	21 Nov 2009 01:07:12 -0000	1.132
+++ src/parasol/paraHub/paraHub.c	12 Jan 2010 09:09:15 -0000	1.133
@@ -78,8 +78,9 @@
     {"machineCheckPeriod", OPTION_INT},
     {"subnet", OPTION_STRING},
     {"nextJobId", OPTION_INT},
     {"logFacility", OPTION_STRING},
+    {"logMinPriority", OPTION_STRING},
     {"log", OPTION_STRING},
     {"debug", OPTION_BOOLEAN},
     {"noResume", OPTION_BOOLEAN},
     {NULL, 0}
@@ -124,8 +125,10 @@
 	 "   -machineCheckPeriod=N  Minutes between checking on machine - default %d.\n"
 	 "   -subnet=XXX.YYY.ZZZ  Only accept connections from subnet (example 192.168).\n"
 	 "   -nextJobId=N  Starting job ID number.\n"
 	 "   -logFacility=facility  Log to the specified syslog facility - default local0.\n"
+         "   -logMinPriority=pri  Minimum syslog priority to log, also filters file logging.\n"
+         "    Defaults to \"info\".\n"
          "   -log=file  Log to file instead of syslog.\n"
          "   -debug  Don't daemonize\n"
 	 "   -noResume  Don't try to reconnect with jobs running on nodes.\n"
 	               ,
@@ -675,9 +678,9 @@
 void plan(struct paraMessage *pm) 
 /* Make a new plan allocating resources to batches */
 {
 
-logInfo("executing new plan");
+logDebug("executing new plan");
 
 if (pm)
     {
     pmClear(pm);
@@ -960,9 +963,9 @@
     pmSendString(pm, rudpOut, "");
     }
 
 needsPlanning = FALSE;
-logInfo("plan finished");
+logDebug("plan finished");
 
 }
 
 
@@ -1272,9 +1275,11 @@
 {
 struct machine *mach;
 if ((mach = findMachine(machName)))
     {
-    logInfo("hub: user %s removed machine %s because: %s",user,machName,reason);
+    // logged as an error because it's important for admins to know that there is an
+    // error with this machine
+    logError("hub: user %s removed machine %s because: %s",user,machName,reason);
     requeueAllJobs(mach, FALSE);
     dlRemove(mach->node);
     slRemoveEl(&machineList, mach);
     hashRemove(machineHash, mach->name);
@@ -1282,11 +1287,11 @@
     return TRUE;
     }
 else
     {
-    logInfo("hub: user %s wanted to removed machine %s because: %s but machine was not found",user,machName,reason);
+    logDebug("hub: user %s wanted to remove machine %s because: %s but machine was not found",user,machName,reason);
+    return FALSE;
     }
-return FALSE;
 }
 
 
 void removeMachineAcknowledge(char *line, struct paraMessage *pm)
@@ -1982,9 +1987,9 @@
 needsPlanning = TRUE;
 batch->maxJob = maxJob;
 updateUserMaxJob(user);
 if (maxJob>=-1)
-    logInfo("paraHub: User %s set maxJob=%d for batch %s", userName, maxJob, dir);
+    logDebug("paraHub: User %s set maxJob=%d for batch %s", userName, maxJob, dir);
 return maxJob;
 }
 
 
@@ -2023,9 +2028,9 @@
 if (batch == NULL) return -2;
 batch->doneCount = 0;
 batch->doneTime = 0;
 batch->crashCount = 0;
-logInfo("paraHub: User %s reset done and crashed counts for batch %s", userName, dir);
+logDebug("paraHub: User %s reset done and crashed counts for batch %s", userName, dir);
 return 0;
 }
 
 int resetCountsFromMessage(char *line)
@@ -2065,9 +2070,9 @@
 /* make sure nothing running and queue empty */
 if (batch->runningCount > 0) return -1;
 if (!dlEnd(batch->jobQueue->head)) return -1;
 sweepResultsWithRemove(name);
-logInfo("paraHub: User %s freed batch %s", userName, batchName);
+logDebug("paraHub: User %s freed batch %s", userName, batchName);
 /* remove batch from batchList */
 slRemoveEl(&batchList, batch);
 /* remove from user cur/old batches */
 dlRemove(batch->node);
@@ -2113,9 +2118,9 @@
 batch->sickNodes = newHashExt(6, FALSE);
 batch->continuousCrashCount = 0;  /* reset so user can retry */
 needsPlanning = TRUE;
 updateUserSickNodes(user);
-logInfo("paraHub: User %s cleared sick nodes for batch %s", userName, dir);
+logDebug("paraHub: User %s cleared sick nodes for batch %s", userName, dir);
 return 0;
 }
 
 int clearSickNodesFromMessage(char *line)
@@ -2147,9 +2152,9 @@
 struct user *user = findUser(userName);
 struct batch *batch = findBatch(user, dir, TRUE);
 if (user == NULL) return -2;
 if (batch == NULL) return -2;
-logInfo("paraHub: User %s ran showSickNodes for batch %s", userName, dir);
+logDebug("paraHub: User %s ran showSickNodes for batch %s", userName, dir);
 struct hashEl *el, *list = hashElListHash(batch->sickNodes);
 slSort(&list, hashElCmp);
 for (el = list; el != NULL; el = el->next)
     {
@@ -2202,9 +2207,9 @@
 needsPlanning = TRUE;
 batch->priority = priority;
 updateUserPriority(user);
 if ((priority>=1)&&(priority<NORMAL_PRIORITY))
-    logInfo("paraHub: User %s set priority=%d for batch %s", userName, priority, dir);
+    logDebug("paraHub: User %s set priority=%d for batch %s", userName, priority, dir);
 return priority;
 }
 
 
@@ -3177,9 +3182,9 @@
 int running = 0, finished = 0;
 struct hash *erHash = newHashExt(8, FALSE); /* A hash of existingResults */
 struct existingResults *erList = NULL;
 
-logInfo("Checking for jobs already running on nodes");
+logDebug("Checking for jobs already running on nodes");
 for (mach = machineList; mach != NULL; mach = mach->next)
     {
     struct paraMessage pm;
     struct rudp *ru = rudpNew(rudpOut->socket);	/* Get own resend timing */
@@ -3200,9 +3205,9 @@
 hashFree(&erHash);
 needsPlanning = TRUE;
 
 /* Report results. */
-logInfo("%d running jobs, %d jobs that finished while hub was down",
+logDebug("%d running jobs, %d jobs that finished while hub was down",
 	running, finished);
 }
 
 void startHub(char *machineList)
@@ -3221,8 +3226,9 @@
 if (optionExists("log"))
     logOpenFile("paraHub", optionVal("log", NULL));
 else    
     logOpenSyslog("paraHub", optionVal("logFacility", NULL));
+logSetMinPriority(optionVal("logMinPriority", "info"));
 logInfo("starting paraHub on %s", hubHost);
 
 /* Set up various lists. */
 hubMessageQueueInit();
@@ -3231,10 +3237,9 @@
 machineHash = newHash(0);
 startMachines(machineList);
 
 openJobId();
-logInfo("----------------------------------------------------------------");
-logInfo("Starting paraHub. Next job ID is %d.", nextJobId);
+logInfo("next job ID is %d.", nextJobId);
 
 rudpOut = rudpMustOpen();
 if (!optionExists("noResume"))
     checkForJobsOnNodes();
@@ -3250,9 +3255,9 @@
 sockSuckStart(rudpIn);
 startHeartbeat();
 startSpokes();
 
-logInfo("sockSuck,Heartbeat,Spokes have been started");
+logDebug("sockSuck,Heartbeat,Spokes have been started");
 
 /* Bump up our priority to just shy of real-time. */
 nice(-40);