a14381a65d94efd21600039ed9fb723fd8d5f51a hiram Thu Sep 30 12:14:45 2021 -0700 prevent divide by zero no redmine diff --git src/hg/utils/automation/perlPara.pl src/hg/utils/automation/perlPara.pl index d79edba..6fb0e46 100755 --- src/hg/utils/automation/perlPara.pl +++ src/hg/utils/automation/perlPara.pl @@ -1,196 +1,197 @@ #!/usr/bin/env perl # a quick and dirty implementation of a look-alike parasol job management # system # given a jobList of commands, and N number specification, # run N jobs at a time from the jobList, waiting while N jobs are running, # when one finishes, start the next one. After all jobs have been started, # then wait for them all to finish. Finally, show all exit codes for all jobs. use strict; use warnings; my $argc = scalar(@ARGV); if ($argc != 2) { printf STDERR "usage: perlPara.pl N cmd.list\n"; printf STDERR "will run commands from the cmd.list file up to N at a time\n"; exit 255; } my $nProcs = shift; my $cmdList = shift; if ($nProcs < 1) { printf STDERR "# ERROR: given N '%d' must be > 0\n", $nProcs; exit 255; } my @commands; # the job commands to run, read in from the cmd.list my @pidList; # corresponding pid for each command my %cmdQueue; # key is pid, value is command string my %exitCodes; # key is pid, value is exit code from child my %startTimes; # key is pid, value is epoch seconds start time my %endTimes; # key is pid, value is epoch seconds at command exit my $longestRunningJob = 0; ############################################################################ ### read in the jobList, save in @commands array ############################################################################ open (FH, "<$cmdList") or die "can not read $cmdList\n"; while (my $cmd = <FH>) { chomp $cmd; next if ($cmd =~ m/^#/); push @commands, $cmd; } close (FH); my $cmdCount = scalar(@commands); printf STDERR "# counted $cmdCount commands in $cmdList\n"; my $totalRunTime = 0; my $jobsDone = 0; my $overallTimeStart = `date "+%s"`; chomp $overallTimeStart; ############################################################################ ### while loop to start N jobs, wait for one to finish, start the next one ### keeping N jobs running until all the jobs have started ############################################################################ my %childPids; # key is child PID, value is 1 running or 0 for done my $processCount = 0; my $nextCmd = 0; while ($nextCmd < $cmdCount) { ### keep starting jobs until N number processes are running if ($processCount < $nProcs) { # printf STDERR "# starting process $nextCmd '%s'\n", $commands[$nextCmd]; my $pid = -1; if(!defined($pid = fork())) { # fork returned undef, so unsuccessful die "Cannot fork process $nextCmd '%s' $!", $commands[$nextCmd];; } elsif ($pid == 0) { # in child process return # printf STDERR "# Child $nextCmd '%s' pid: $$\n", $commands[$nextCmd]; my $ret = system($commands[$nextCmd]); $ret >>= 8; exit $ret; } else { # return to parent, pid is the pid of the child my $startTime = `date "+%s"`; chomp $startTime; $startTimes{$pid} = $startTime; # fork returned 0 nor undef # so this branch is parent printf STDERR "process $nextCmd '%s' pid: $pid\n", $commands[$nextCmd]; $childPids{$pid} = 1; # running push @pidList, $pid; $cmdQueue{$pid} = $commands[$nextCmd]; ++$nextCmd; ++$processCount; } } else { # printf STDERR "# waiting for processes\n"; ######### N jobs are running, wait for a job to finish to start a new one my $childPid = wait; my $ret = $?; printf STDERR "# odd childPid '$childPid' ret '$ret'" if ($childPid < 0); my $endTime = `date "+%s"`; chomp $endTime; $endTimes{$childPid} = $endTime; $exitCodes{$childPid} = $ret; my $elapsedSeconds = $endTimes{$childPid} - $startTimes{$childPid}; $longestRunningJob = $elapsedSeconds if ($elapsedSeconds > $longestRunningJob); $totalRunTime += $elapsedSeconds; $jobsDone += 1; my $avgTime = $totalRunTime / $jobsDone; my $etaDone = ($avgTime * ($cmdCount - $jobsDone)) / $nProcs; my $etaHours = $etaDone / 3600; printf STDERR "%d sec %d avgSec %d sec (%.1f hr) eta %s pid $childPid ret '$ret'\n", $elapsedSeconds, $avgTime, $etaDone, $etaHours, $cmdQueue{$childPid}; printf STDERR "# Completed: %d of %d jobs\n", $jobsDone, $cmdCount; printf STDERR "# CPU time in finished jobs: %10ds %10.2fm %8.2fh %7.2fd %6.3fy\n", $totalRunTime, $totalRunTime / 60, $totalRunTime / 3600, $totalRunTime / 86400, $totalRunTime / (86400 * 365.2524); printf STDERR "# Average job time: %10ds %10.2fm %8.2fh %7.2fd\n", $avgTime, $avgTime / 60, $avgTime / 3600, $avgTime / 86400; printf STDERR "# Longest finished job: %10ds %10.2fm %8.2fh %7.2fd\n", $longestRunningJob, $longestRunningJob / 60, $longestRunningJob / 3600, $longestRunningJob / 86400; printf STDERR "# Estimated complete: %10ds %10.2fm %8.2fh %7.2fd\n", $etaDone, $etaDone / 60, $etaDone / 3600, $etaDone / 86400; $childPids{$childPid} = 0; # done --$processCount; } } ############################################################################# ### all jobs have been submitted, show the currently running jobs ############################################################################# printf STDERR "# done running $cmdCount processes\n"; printf STDERR "# processCount is at: %d\n", $processCount; my $stillRunning = 0; foreach my $childPid (sort keys %childPids) { if ($childPids{$childPid} > 0) { my $nowTime = `date "+%s"`; chomp $nowTime; my $elapsedSeconds = $nowTime - $startTimes{$childPid}; $longestRunningJob = $elapsedSeconds if ($elapsedSeconds > $longestRunningJob); printf STDERR "# child $childPid running %d seconds %s\n", $elapsedSeconds, $cmdQueue{$childPid}; ++$stillRunning; } } ############################################################################# ### all jobs have been submitted, now wait for the last ones to finish ############################################################################# for (my $i = 0; $i < $stillRunning; ++$i) { printf STDERR "# waiting for processes\n"; my $childPid = wait; my $ret = $?; printf STDERR "# odd childPid '$childPid' ret '$ret'" if ($childPid < 0); my $endTime = `date "+%s"`; chomp $endTime; $endTimes{$childPid} = $endTime; $exitCodes{$childPid} = $ret; my $elapsedSeconds = $endTimes{$childPid} - $startTimes{$childPid}; $longestRunningJob = $elapsedSeconds if ($elapsedSeconds > $longestRunningJob); $totalRunTime += $elapsedSeconds; $jobsDone += 1; my $avgTime = $totalRunTime / $jobsDone; my $etaDone = ($avgTime * ($cmdCount - $jobsDone)) / $nProcs; my $etaHours = $etaDone / 3600; printf STDERR "%d sec %d avgSec %d sec (%.1f hr) eta %s pid $childPid ret '$ret'\n", $elapsedSeconds, $avgTime, $etaDone, $etaHours, $cmdQueue{$childPid}; printf STDERR "# Completed: %d of %d jobs\n", $jobsDone, $cmdCount; printf STDERR "# CPU time in finished jobs: %10ds %10.2fm %8.2fh %7.2fd %6.3fy\n", $totalRunTime, $totalRunTime / 60, $totalRunTime / 3600, $totalRunTime / 86400, $totalRunTime / (86400 * 365.2524); printf STDERR "# Average job time: %10ds %10.2fm %8.2fh %7.2fd\n", $avgTime, $avgTime / 60, $avgTime / 3600, $avgTime / 86400; printf STDERR "# Longest finished job: %10ds %10.2fm %8.2fh %7.2fd\n", $longestRunningJob, $longestRunningJob / 60, $longestRunningJob / 3600, $longestRunningJob / 86400; printf STDERR "# Estimated complete: %10ds %10.2fm %8.2fh %7.2fd\n", $etaDone, $etaDone / 60, $etaDone / 3600, $etaDone / 86400; $childPids{$childPid} = 0; # done --$processCount; } printf STDERR "# processCount is at: %d\n", $processCount; ############################################################################# ### show all exit codes for all jobs ############################################################################# printf STDERR "# process exit codes:\n"; for (my $i = 0; $i < $cmdCount; ++$i) { my $childPid = $pidList[$i]; my $elapsedSeconds = $endTimes{$childPid} - $startTimes{$childPid}; my $exitCode = $exitCodes{$childPid}; printf STDERR "# cmd $i pid $childPid exit code '$exitCode' %d %d seconds %s\n", $exitCode >> 8, $elapsedSeconds, $cmdQueue{$childPid}; } my $avgTime = $totalRunTime / $jobsDone; my $overallTimeDone = `date "+%s"`; chomp $overallTimeDone; my $wallClockTime = $overallTimeDone - $overallTimeStart; -my $speedUpMultiple = $totalRunTime / $wallClockTime; +my $speedUpMultiple = 1; +$speedUpMultiple = $totalRunTime / $wallClockTime if ($wallClockTime > 0); printf STDERR "# %d jobs %d totalSec %d sec/job %d wallClock %.2f X speedup multiple\n", $jobsDone, $totalRunTime, $avgTime, $wallClockTime, $speedUpMultiple; printf STDERR "# Completed: %d of %d jobs\n", $jobsDone, $cmdCount; printf STDERR "# CPU time in finished jobs: %10ds %10.2fm %8.2fh %7.2fd %6.3fy\n", $totalRunTime, $totalRunTime / 60, $totalRunTime / 3600, $totalRunTime / 86400, $totalRunTime / (86400 * 365.2524); printf STDERR "# Average job time: %10ds %10.2fm %8.2fh %7.2fd\n", $avgTime, $avgTime / 60, $avgTime / 3600, $avgTime / 86400; printf STDERR "# Longest finished job: %10ds %10.2fm %8.2fh %7.2fd\n", $longestRunningJob, $longestRunningJob / 60, $longestRunningJob / 3600, $longestRunningJob / 86400;