#!/usr/bin/env perl

# Provenance (recovered from the commit header this script was embedded in):
#   commit 84de43168bbf2952470fb8757c79a585c8668a8a
#   author hiram, Fri Mar 20 14:28:31 2020 -0700
#   "useful script to run a list of commands in a fashion similar to a
#    parasol batch" refs #24547
#   path   src/hg/utils/automation/perlPara.pl
#          (new file, mode 100755, index 0000000..d79edba)

# a quick and dirty implementation of a look-alike parasol job management
# system
# given a jobList of commands, and N number specification,
# run N jobs at a time from the jobList, waiting while N jobs are running,
# when one finishes, start the next one.  After all jobs have been started,
# then wait for them all to finish.  Finally, show all exit codes for all jobs.
#
# usage: perlPara.pl N cmd.list
#   N        - maximum number of commands to run concurrently, integer > 0
#   cmd.list - file with one shell command per line; lines starting with '#'
#              and blank lines are ignored
# All progress and statistics output goes to STDERR.

use strict;
use warnings;

my $argc = scalar(@ARGV);

if ($argc != 2) {
    printf STDERR "usage: perlPara.pl N cmd.list\n";
    printf STDERR "will run commands from the cmd.list file up to N at a time\n";
    exit 255;
}

my $nProcs = shift;
my $cmdList = shift;

# N must be a plain positive integer; anything else would only produce a
# numeric-conversion warning and a misleading value in the error message
if ($nProcs !~ m/^\d+$/ || $nProcs < 1) {
    printf STDERR "# ERROR: given N '%s' must be > 0\n", $nProcs;
    exit 255;
}

my @commands;       # the job commands to run, read in from the cmd.list
my @pidList;        # corresponding pid for each command
my %cmdQueue;       # key is pid, value is command string
my %exitCodes;      # key is pid, value is wait status ($?) of the child
my %startTimes;     # key is pid, value is epoch seconds start time
my %endTimes;       # key is pid, value is epoch seconds at command exit
my %childPids;      # key is child PID, value is 1 running or 0 for done
my $longestRunningJob = 0;
my $totalRunTime = 0;   # sum of elapsed seconds over all finished jobs
my $jobsDone = 0;       # number of children reaped so far
my $processCount = 0;   # number of children currently running

############################################################################
### read in the jobList, save in @commands array
############################################################################
# three-argument open with a lexical handle: the file name can not be
# misread as an open mode, and the handle does not leak as a global
open(my $fh, '<', $cmdList) or die "can not read $cmdList: $!\n";
while (my $cmd = <$fh>) {
    chomp $cmd;
    next if ($cmd =~ m/^#/);
    next if ($cmd =~ m/^\s*$/);   # a blank line is not a job
    push @commands, $cmd;
}
close($fh);

my $cmdCount = scalar(@commands);
printf STDERR "# counted %d commands in %s\n", $cmdCount, $cmdList;
my $overallTimeStart = time();

############################################################################
### helper functions
############################################################################

# Convert the return value of system() into an exit code for the child.
#   -1             the command could not be started    -> 255
#   low 7 bits set the command was killed by a signal  -> 128 + signal
#   otherwise      the command's own exit code (status >> 8)
# A plain '>> 8' would report a signal-killed command as exit 0.
sub childExitCode {
    my ($status) = @_;
    return 255 if ($status == -1);
    my $signal = $status & 127;
    return 128 + $signal if ($signal != 0);
    return $status >> 8;
}

# Print the running statistics block.  $etaDone (seconds) is optional, the
# 'Estimated complete' line is left out when it is not given.
sub printRunStats {
    my ($avgTime, $etaDone) = @_;
    printf STDERR "# Completed: %d of %d jobs\n", $jobsDone, $cmdCount;
    printf STDERR "# CPU time in finished jobs: %10ds %10.2fm %8.2fh %7.2fd %6.3fy\n",
        $totalRunTime, $totalRunTime / 60, $totalRunTime / 3600,
        $totalRunTime / 86400, $totalRunTime / (86400 * 365.2524);
    printf STDERR "# Average job time: %10ds %10.2fm %8.2fh %7.2fd\n",
        $avgTime, $avgTime / 60, $avgTime / 3600, $avgTime / 86400;
    printf STDERR "# Longest finished job: %10ds %10.2fm %8.2fh %7.2fd\n",
        $longestRunningJob, $longestRunningJob / 60,
        $longestRunningJob / 3600, $longestRunningJob / 86400;
    if (defined($etaDone)) {
        printf STDERR "# Estimated complete: %10ds %10.2fm %8.2fh %7.2fd\n",
            $etaDone, $etaDone / 60, $etaDone / 3600, $etaDone / 86400;
    }
}

# Wait for any one child to exit, record its end time and wait status,
# update the totals and print the progress report.
# Returns the pid that was reaped, or a negative value when wait() found no
# child at all; in that case nothing is recorded and $processCount is reset
# to 0 since no children exist.
sub reapChild {
    my $childPid = wait;
    my $ret = $?;
    if ($childPid < 0) {
        printf STDERR "# odd childPid '%s' ret '%s'\n", $childPid, $ret;
        $processCount = 0;
        return $childPid;
    }
    my $endTime = time();
    $endTimes{$childPid} = $endTime;
    $exitCodes{$childPid} = $ret;
    my $elapsedSeconds = $endTime - $startTimes{$childPid};
    $longestRunningJob = $elapsedSeconds if ($elapsedSeconds > $longestRunningJob);
    $totalRunTime += $elapsedSeconds;
    $jobsDone += 1;
    my $avgTime = $totalRunTime / $jobsDone;
    # remaining jobs at the average time, spread over N concurrent slots
    my $etaDone = ($avgTime * ($cmdCount - $jobsDone)) / $nProcs;
    my $etaHours = $etaDone / 3600;
    printf STDERR "%d sec %d avgSec %d sec (%.1f hr) eta %s pid %s ret '%s'\n",
        $elapsedSeconds, $avgTime, $etaDone, $etaHours,
        $cmdQueue{$childPid}, $childPid, $ret;
    printRunStats($avgTime, $etaDone);
    $childPids{$childPid} = 0;  # done
    --$processCount;
    return $childPid;
}

############################################################################
### while loop to start N jobs, wait for one to finish, start the next one
### keeping N jobs running until all the jobs have started
############################################################################
my $nextCmd = 0;
while ($nextCmd < $cmdCount) {
    ### keep starting jobs until N number processes are running
    if ($processCount < $nProcs) {
        my $pid = fork();
        if (!defined($pid)) {
            # fork returned undef, so unsuccessful
            # (die does not interpret printf formats, hence the sprintf)
            die sprintf("Cannot fork process %d '%s' %s",
                $nextCmd, $commands[$nextCmd], $!);
        } elsif ($pid == 0) {   # in child process: run the command and exit
            exit childExitCode(system($commands[$nextCmd]));
        } else {                # in parent, pid is the pid of the child
            $startTimes{$pid} = time();
            printf STDERR "process %d '%s' pid: %d\n",
                $nextCmd, $commands[$nextCmd], $pid;
            $childPids{$pid} = 1;       # running
            push @pidList, $pid;
            $cmdQueue{$pid} = $commands[$nextCmd];
            ++$nextCmd;
            ++$processCount;
        }
    } else {
        ######### N jobs are running, wait for a job to finish to start a new one
        reapChild();
    }
}

#############################################################################
### all jobs have been submitted, show the currently running jobs
#############################################################################
printf STDERR "# done running %d processes\n", $cmdCount;
printf STDERR "# processCount is at: %d\n", $processCount;
my $stillRunning = 0;
my $nowTime = time();
# pids are numbers, sort them numerically
foreach my $childPid (sort { $a <=> $b } keys %childPids) {
    if ($childPids{$childPid} > 0) {
        my $elapsedSeconds = $nowTime - $startTimes{$childPid};
        $longestRunningJob = $elapsedSeconds if ($elapsedSeconds > $longestRunningJob);
        printf STDERR "# child %s running %d seconds %s\n",
            $childPid, $elapsedSeconds, $cmdQueue{$childPid};
        ++$stillRunning;
    }
}

#############################################################################
### all jobs have been submitted, now wait for the last ones to finish
#############################################################################
for (my $i = 0; $i < $stillRunning; ++$i) {
    printf STDERR "# waiting for processes\n";
    last if (reapChild() < 0);  # no children left to wait for
}
printf STDERR "# processCount is at: %d\n", $processCount;

#############################################################################
### show all exit codes for all jobs
#############################################################################
printf STDERR "# process exit codes:\n";
for (my $i = 0; $i < $cmdCount; ++$i) {
    my $childPid = $pidList[$i];
    if (!defined($exitCodes{$childPid})) {
        # only possible when wait() lost track of a child
        printf STDERR "# cmd %d pid %s no exit status recorded %s\n",
            $i, $childPid, $cmdQueue{$childPid};
        next;
    }
    my $elapsedSeconds = $endTimes{$childPid} - $startTimes{$childPid};
    my $exitCode = $exitCodes{$childPid};
    # shows the raw wait status followed by the exit code proper (status >> 8)
    printf STDERR "# cmd %s pid %s exit code '%s' %d %d seconds %s\n",
        $i, $childPid, $exitCode, $exitCode >> 8, $elapsedSeconds,
        $cmdQueue{$childPid};
}

# an empty job list, or a batch that finishes within one second, must not
# die here with 'Illegal division by zero'
my $avgTime = ($jobsDone > 0) ? $totalRunTime / $jobsDone : 0;
my $overallTimeDone = time();
my $wallClockTime = $overallTimeDone - $overallTimeStart;
my $speedUpMultiple = ($wallClockTime > 0) ? $totalRunTime / $wallClockTime : 0;

printf STDERR "# %d jobs %d totalSec %d sec/job %d wallClock %.2f X speedup multiple\n",
    $jobsDone, $totalRunTime, $avgTime, $wallClockTime, $speedUpMultiple;

printRunStats($avgTime);