5674b62fd90df90955867e674a8f157a58a0e953
galt
  Wed Jun 18 14:33:02 2025 -0700
Fixes packet overflow issue in Parasol paraNode resurrect. refs #34883

diff --git src/parasol/paraHub/paraHub.c src/parasol/paraHub/paraHub.c
index e07e43f08ea..386fc4f7b40 100644
--- src/parasol/paraHub/paraHub.c
+++ src/parasol/paraHub/paraHub.c
@@ -1762,48 +1762,69 @@
  * In this case we will have restarted the job elsewhere, and
  * that other copy could be conflicting with the copy of
  * the job the node is still running. */
 {
 char *name = nextWord(&line), *jobIdString;
 int jobId;
 struct machine *mach;
 struct dlNode *node;
 boolean hostFound = FALSE;
 for (node = deadMachines->head; !dlEnd(node); node = node->next)
     {
     mach = node->val;
     if (sameString(mach->name, name) && mach->isDead)
         {
 	hostFound = TRUE;
+
+	jobIdString = nextWord(&line);
+	if (!jobIdString)
+	    {
+            logWarn("unexpected blank jobId in nodeAlive");
+	    return; // not expected to happen.
+	    }
+	if (!sameString(jobIdString, "done"))
+	    {
+	    jobId = atoi(jobIdString);
+	    if (!slIntFind(mach->resurrectJobIds, jobId))
+		{ 
+		struct slInt *i = slIntNew(jobId);
+		slAddHead( &mach->resurrectJobIds, i ); 
+		}
+	    break;
+	    }
+	slReverse(&mach->resurrectJobIds); 
+
 	dlRemove(node);
 	dlAddTail(freeMachines, node);
 	needsPlanning = TRUE;
 	mach->isDead = FALSE;
 
 	if (mach->deadJobIds != NULL)
 	    {
 	    struct dyString *dy = dyStringNew(0);
 	    struct slInt *i = mach->deadJobIds;
 	    dyStringPrintf(dy, "hub: node %s assigned ", name); 
 	    for(i = mach->deadJobIds; i; i = i->next)
 		dyStringPrintf(dy, "%d ", i->val);
 	    dyStringPrintf(dy, "came back.");
 	    logWarn("%s", dy->string);
 	    dyStringFree(&dy);
-	    while ((jobIdString = nextWord(&line)) != NULL)
+
+            struct slInt *slResJob;
+            for (slResJob = mach->resurrectJobIds; slResJob; slResJob = slResJob->next)
 	        {
-		jobId = atoi(jobIdString);
+		jobId = slResJob->val;
                 if ((i = slIntFind(mach->deadJobIds, jobId)))
 		    {
 		    struct job *job;
 		    warn("hub: Looks like %s is still keeping track of %d", name, jobId);
 		    if ((job = findWaitingJob(jobId)) != NULL)
 			{
 			warn("hub: Luckily rerun of job %d has not yet happened.", 
                              jobId);
 			job->machine = mach;
 			dlAddTail(mach->jobs, job->jobNode);
 			job->lastChecked = mach->lastChecked = job->lastClockIn = now;
 			dlRemove(job->node);
 			dlAddTail(runningJobs, job->node);
 			dlRemove(mach->node);
 			dlAddTail(busyMachines, mach->node);
@@ -1828,30 +1849,32 @@
 			}
 		    else
 		        {
 			/* This case should be very rare.  It should happen when
 			 * a node is out of touch for 2 hours, but when it comes
 			 * back is running a job that we reran to completion
 			 * on another node. */
 			warn("hub: Job %d has finished running, there is a conflict. "
 			     "Data may be corrupted, and it will take a lot of logic to fix.", 
                              jobId);
 			}
 		    }
 		}
 	    }
 	slFreeList(&mach->deadJobIds);
+	slFreeList(&mach->resurrectJobIds);
+
 	runner(1);
 	break;
 	}
     }
 if (!hostFound)
     {
     warn("hub 'alive $HOST' msg handler: unable to resurrect host %s, "
 	 "not find in deadMachines list.",  name);
     }
 }
 
 void recycleMachine(struct machine *mach)
 /* Recycle machine into free list. */
 {
 dlRemove(mach->node);