62116e0476f1e663d37de8477766f5cdee572862 galt Fri Oct 12 18:37:08 2012 -0700 v12.16 - fixed bug in planner which had r and c reversed so that -ramUnit option now works diff --git src/parasol/paraHub/paraHub.c src/parasol/paraHub/paraHub.c index 548aed5..7d18ae4 100644 --- src/parasol/paraHub/paraHub.c +++ src/parasol/paraHub/paraHub.c @@ -173,31 +173,31 @@ struct resultQueue *resultQueues; /* Result files. */ int finishedJobCount = 0; /* Number of finished jobs. */ int crashedJobCount = 0; /* Number of crashed jobs. */ char *jobIdFileName = "parasol.jid"; /* File name where jobId file is. */ FILE *jobIdFile = NULL; /* Handle to jobId file. */ char *hubHost; /* Name of machine running this. */ struct rudp *rudpOut; /* Our rUDP socket. */ /* Variables for new scheduler */ // TODO make commandline param options to override defaults for unit sizes? /* using machines list spec info for defaults */ -int cpuUnit = 1; /* 1 CPU */ +int cpuUnit = 1; /* 1 CPU */ /* someday this could be float 0.5 */ long long ramUnit = 512 * 1024 * 1024; /* 500 MB */ int defaultJobCpu = 1; /* number of cpuUnits in default job usage */ int defaultJobRam = 1; /* number of ramUnits in default job usage */ /* for the resource array dimensions */ int maxCpuInCluster = 0; /* node with largest number of cpu units */ int maxRamInCluster = 0; /* node with largest number of ram units */ struct slRef ***perCpu = NULL; /* an array of resources sharing the same cpu units free units count */ boolean needsPlanning = FALSE; /* remember if situation changed, need new plan */ void setupLists() /* Make up machine, spoke, user and job lists - all doubly linked * so it is fast to remove items from one list and put them * on another. */ { @@ -736,31 +736,31 @@ struct dlNode *jobNode = NULL; for (jobNode = mach->jobs->head; !dlEnd(jobNode); jobNode = jobNode->next) { struct job *job = jobNode->val; struct batch *batch = job->batch; struct user *user = batch->user; job->oldPlan = TRUE; if (batch->planning && (batch->maxJob != -1)) { if (pm) { //pmClear(pm); //pmPrintf(pm, "preserving batch %s on machine %s", batch->name, mach->name); //pmSend(pm, rudpOut); } - allocateResourcesToMachine(mach, batch, user, &r, &c); + allocateResourcesToMachine(mach, batch, user, &c, &r); } } if (pm) { //pmClear(pm); //pmPrintf(pm, "machSpec (%s) cpus:%d ramSize=%d" //, mach->name, mach->machSpec->cpus, mach->machSpec->ramSize); //pmSend(pm, rudpOut); } if (c < 1 || r < 1) { if (pm) @@ -851,31 +851,31 @@ /* allocate plan, reduce resources, calc new resources and pos. * move machine from old array pos to new pos. (slPopHead, slAddHead) * update its stats, and if heaps, update heaps. */ if (pm) { //pmClear(pm); //pmPrintf(pm, "found hardware cpu %d ram %d in machine %s c=%d r=%d batch=%s", //batch->cpu, batch->ram, mach->name, c, r, batch->name); //pmSend(pm, rudpOut); } - allocateResourcesToMachine(mach, batch, user, &r, &c); + allocateResourcesToMachine(mach, batch, user, &c, &r); if (pm) { //pmClear(pm); //pmPrintf(pm, "remaining hardware c=%d r=%d", c, r); //pmSend(pm, rudpOut); } if (c < 1 || r < 1) freeMem(el); /* this node has insufficient resources remaining */ else slAddHead(&perCpu[c][r], el); } else @@ -1067,32 +1067,30 @@ job = jNode->val; dlAddTail(hangJobs, job->hangNode); ++batch->runningCount; --batch->queuedCount; ++user->runningCount; unactivateBatchIfEmpty(batch); /* Tell machine, job, and spoke about each other. */ dlAddTail(machine->jobs, job->jobNode); /* just put it back on the ready list, it will get looked at again */ dlAddTail(readyMachines, mNode); job->machine = machine; job->lastChecked = job->startTime = job->lastClockIn = now; - if (!(job->ram)) /* if no ram size specified, use the default */ - job->ram = batch->ram * ramUnit; spokeSendJob(spoke, machine, job); return TRUE; } } void runner(int count) /* Try to run a couple of jobs. */ { while (--count >= 0) if (!runNextJob()) break; } struct machine *machineNew(char *name, char *tempDir, struct machSpec *m) /* Create a new machine structure. */ @@ -1908,35 +1906,44 @@ job = jobNew(command, userName, dir, in, out, cpus, ram, results, TRUE); if (!job) { return 0; } batch = job->batch; dlAddTail(batch->jobQueue, job->node); ++batch->queuedCount; int oldCpu = batch->cpu; int oldRam = batch->ram; if (job->cpus) batch->cpu = (job->cpus + 0.5) / cpuUnit; /* rounding */ else + { + /* if no cpus specified, use the default */ batch->cpu = defaultJobCpu; + job->cpus = defaultJobCpu * cpuUnit; + } if (job->ram) - batch->ram = (job->ram + (0.5*ramUnit)) / ramUnit; /* rounding */ + batch->ram = 1 + (job->ram - 1) / ramUnit; /* any remainder will be rounded upwards + e.g. 1 to 1024m --> 1G but 1025m --> 2G if unit is 1G. 0m would just cause default ram usage. */ else + { + /* if no ram size specified, use the default */ batch->ram = defaultJobRam; + job->ram = defaultJobRam * ramUnit; + } if (oldCpu != batch->cpu || oldRam != batch->ram) { needsPlanning = TRUE; } if (batch->planCount == 0) { needsPlanning = TRUE; } user = batch->user; dlRemove(user->node); dlAddTail(queuedUsers, user->node); job->submitTime = time(NULL); return job->id;