5674b62fd90df90955867e674a8f157a58a0e953 galt Wed Jun 18 14:33:02 2025 -0700 Fixes packet overflow issue in Parasol paraNode resurrect. refs #34883 diff --git src/parasol/paraHub/paraHub.c src/parasol/paraHub/paraHub.c index e07e43f08ea..386fc4f7b40 100644 --- src/parasol/paraHub/paraHub.c +++ src/parasol/paraHub/paraHub.c @@ -1762,48 +1762,69 @@ * In this case we will have restarted the job elsewhere, and * that other copy could be conflicting with the copy of * the job the node is still running. */ { char *name = nextWord(&line), *jobIdString; int jobId; struct machine *mach; struct dlNode *node; boolean hostFound = FALSE; for (node = deadMachines->head; !dlEnd(node); node = node->next) { mach = node->val; if (sameString(mach->name, name) && mach->isDead) { hostFound = TRUE; + + jobIdString = nextWord(&line); + if (!jobIdString) + { + logWarn("unexpected blank jobId in nodeAlive"); + return; // not expected to happen. + } + if (!sameString(jobIdString, "done")) + { + jobId = atoi(jobIdString); + if (!slIntFind(mach->resurrectJobIds, jobId)) + { + struct slInt *i = slIntNew(jobId); + slAddHead( &mach->resurrectJobIds, i ); + } + break; + } + slReverse(&mach->resurrectJobIds); + dlRemove(node); dlAddTail(freeMachines, node); needsPlanning = TRUE; mach->isDead = FALSE; if (mach->deadJobIds != NULL) { struct dyString *dy = dyStringNew(0); struct slInt *i = mach->deadJobIds; dyStringPrintf(dy, "hub: node %s assigned ", name); for(i = mach->deadJobIds; i; i = i->next) dyStringPrintf(dy, "%d ", i->val); dyStringPrintf(dy, "came back."); logWarn("%s", dy->string); dyStringFree(&dy); - while ((jobIdString = nextWord(&line)) != NULL) + + struct slInt *slResJob; + for (slResJob = mach->resurrectJobIds; slResJob; slResJob = slResJob->next) { - jobId = atoi(jobIdString); + jobId = slResJob->val; if ((i = slIntFind(mach->deadJobIds, jobId))) { struct job *job; warn("hub: Looks like %s is still keeping track of %d", name, jobId); if ((job = findWaitingJob(jobId)) != NULL) { warn("hub: Luckily rerun of job %d has not yet happened.", jobId); job->machine = mach; dlAddTail(mach->jobs, job->jobNode); job->lastChecked = mach->lastChecked = job->lastClockIn = now; dlRemove(job->node); dlAddTail(runningJobs, job->node); dlRemove(mach->node); dlAddTail(busyMachines, mach->node); @@ -1828,30 +1849,32 @@ } else { /* This case should be very rare. It should happen when * a node is out of touch for 2 hours, but when it comes * back is running a job that we reran to completion * on another node. */ warn("hub: Job %d has finished running, there is a conflict. " "Data may be corrupted, and it will take a lot of logic to fix.", jobId); } } } } slFreeList(&mach->deadJobIds); + slFreeList(&mach->resurrectJobIds); + runner(1); break; } } if (!hostFound) { warn("hub 'alive $HOST' msg handler: unable to resurrect host %s, " "not find in deadMachines list.", name); } } void recycleMachine(struct machine *mach) /* Recycle machine into free list. */ { dlRemove(mach->node);