src/parasol/paraHub/paraHub.c 1.133
1.133 2010/01/12 09:09:15 markd
Add -logMinPriority option to set the minimum log priority; move routine logging from info to debug and log machine removal as an error.
Index: src/parasol/paraHub/paraHub.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/parasol/paraHub/paraHub.c,v
retrieving revision 1.132
retrieving revision 1.133
diff -b -B -U 4 -r1.132 -r1.133
--- src/parasol/paraHub/paraHub.c 21 Nov 2009 01:07:12 -0000 1.132
+++ src/parasol/paraHub/paraHub.c 12 Jan 2010 09:09:15 -0000 1.133
@@ -78,8 +78,9 @@
{"machineCheckPeriod", OPTION_INT},
{"subnet", OPTION_STRING},
{"nextJobId", OPTION_INT},
{"logFacility", OPTION_STRING},
+ {"logMinPriority", OPTION_STRING},
{"log", OPTION_STRING},
{"debug", OPTION_BOOLEAN},
{"noResume", OPTION_BOOLEAN},
{NULL, 0}
@@ -124,8 +125,10 @@
" -machineCheckPeriod=N Minutes between checking on machine - default %d.\n"
" -subnet=XXX.YYY.ZZZ Only accept connections from subnet (example 192.168).\n"
" -nextJobId=N Starting job ID number.\n"
" -logFacility=facility Log to the specified syslog facility - default local0.\n"
+ " -logMinPriority=pri minimum syslog priority to log, also filters file logging.\n"
+ " defaults to \"info\"\n"
" -log=file Log to file instead of syslog.\n"
" -debug Don't daemonize\n"
" -noResume Don't try to reconnect with jobs running on nodes.\n"
,
@@ -675,9 +678,9 @@
void plan(struct paraMessage *pm)
/* Make a new plan allocating resources to batches */
{
-logInfo("executing new plan");
+logDebug("executing new plan");
if (pm)
{
pmClear(pm);
@@ -960,9 +963,9 @@
pmSendString(pm, rudpOut, "");
}
needsPlanning = FALSE;
-logInfo("plan finished");
+logDebug("plan finished");
}
@@ -1272,9 +1275,11 @@
{
struct machine *mach;
if ((mach = findMachine(machName)))
{
- logInfo("hub: user %s removed machine %s because: %s",user,machName,reason);
+ // logged as an error because it's important for admins to know that there is an
+ // error with this machine
+ logError("hub: user %s removed machine %s because: %s",user,machName,reason);
requeueAllJobs(mach, FALSE);
dlRemove(mach->node);
slRemoveEl(&machineList, mach);
hashRemove(machineHash, mach->name);
@@ -1282,11 +1287,11 @@
return TRUE;
}
else
{
- logInfo("hub: user %s wanted to removed machine %s because: %s but machine was not found",user,machName,reason);
+ logDebug("hub: user %s wanted to removed machine %s because: %s but machine was not found",user,machName,reason);
+ return FALSE;
}
-return FALSE;
}
void removeMachineAcknowledge(char *line, struct paraMessage *pm)
@@ -1982,9 +1987,9 @@
needsPlanning = TRUE;
batch->maxJob = maxJob;
updateUserMaxJob(user);
if (maxJob>=-1)
- logInfo("paraHub: User %s set maxJob=%d for batch %s", userName, maxJob, dir);
+ logDebug("paraHub: User %s set maxJob=%d for batch %s", userName, maxJob, dir);
return maxJob;
}
@@ -2023,9 +2028,9 @@
if (batch == NULL) return -2;
batch->doneCount = 0;
batch->doneTime = 0;
batch->crashCount = 0;
-logInfo("paraHub: User %s reset done and crashed counts for batch %s", userName, dir);
+logDebug("paraHub: User %s reset done and crashed counts for batch %s", userName, dir);
return 0;
}
int resetCountsFromMessage(char *line)
@@ -2065,9 +2070,9 @@
/* make sure nothing running and queue empty */
if (batch->runningCount > 0) return -1;
if (!dlEnd(batch->jobQueue->head)) return -1;
sweepResultsWithRemove(name);
-logInfo("paraHub: User %s freed batch %s", userName, batchName);
+logDebug("paraHub: User %s freed batch %s", userName, batchName);
/* remove batch from batchList */
slRemoveEl(&batchList, batch);
/* remove from user cur/old batches */
dlRemove(batch->node);
@@ -2113,9 +2118,9 @@
batch->sickNodes = newHashExt(6, FALSE);
batch->continuousCrashCount = 0; /* reset so user can retry */
needsPlanning = TRUE;
updateUserSickNodes(user);
-logInfo("paraHub: User %s cleared sick nodes for batch %s", userName, dir);
+logDebug("paraHub: User %s cleared sick nodes for batch %s", userName, dir);
return 0;
}
int clearSickNodesFromMessage(char *line)
@@ -2147,9 +2152,9 @@
struct user *user = findUser(userName);
struct batch *batch = findBatch(user, dir, TRUE);
if (user == NULL) return -2;
if (batch == NULL) return -2;
-logInfo("paraHub: User %s ran showSickNodes for batch %s", userName, dir);
+logDebug("paraHub: User %s ran showSickNodes for batch %s", userName, dir);
struct hashEl *el, *list = hashElListHash(batch->sickNodes);
slSort(&list, hashElCmp);
for (el = list; el != NULL; el = el->next)
{
@@ -2202,9 +2207,9 @@
needsPlanning = TRUE;
batch->priority = priority;
updateUserPriority(user);
if ((priority>=1)&&(priority<NORMAL_PRIORITY))
- logInfo("paraHub: User %s set priority=%d for batch %s", userName, priority, dir);
+ logDebug("paraHub: User %s set priority=%d for batch %s", userName, priority, dir);
return priority;
}
@@ -3177,9 +3182,9 @@
int running = 0, finished = 0;
struct hash *erHash = newHashExt(8, FALSE); /* A hash of existingResults */
struct existingResults *erList = NULL;
-logInfo("Checking for jobs already running on nodes");
+logDebug("Checking for jobs already running on nodes");
for (mach = machineList; mach != NULL; mach = mach->next)
{
struct paraMessage pm;
struct rudp *ru = rudpNew(rudpOut->socket); /* Get own resend timing */
@@ -3200,9 +3205,9 @@
hashFree(&erHash);
needsPlanning = TRUE;
/* Report results. */
-logInfo("%d running jobs, %d jobs that finished while hub was down",
+logDebug("%d running jobs, %d jobs that finished while hub was down",
running, finished);
}
void startHub(char *machineList)
@@ -3221,8 +3226,9 @@
if (optionExists("log"))
logOpenFile("paraHub", optionVal("log", NULL));
else
logOpenSyslog("paraHub", optionVal("logFacility", NULL));
+logSetMinPriority(optionVal("logMinPriority", "info"));
logInfo("starting paraHub on %s", hubHost);
/* Set up various lists. */
hubMessageQueueInit();
@@ -3231,10 +3237,9 @@
machineHash = newHash(0);
startMachines(machineList);
openJobId();
-logInfo("----------------------------------------------------------------");
-logInfo("Starting paraHub. Next job ID is %d.", nextJobId);
+logInfo("next job ID is %d.", nextJobId);
rudpOut = rudpMustOpen();
if (!optionExists("noResume"))
checkForJobsOnNodes();
@@ -3250,9 +3255,9 @@
sockSuckStart(rudpIn);
startHeartbeat();
startSpokes();
-logInfo("sockSuck,Heartbeat,Spokes have been started");
+logDebug("sockSuck,Heartbeat,Spokes have been started");
/* Bump up our priority to just shy of real-time. */
nice(-40);