6d8144be131a358037daea03610b22315527f26f hiram Thu Apr 23 15:12:32 2026 -0700
better recognition of errors in the workflow refs #31811

diff --git src/hg/utils/otto/userRequests/workflowMonitor.sh src/hg/utils/otto/userRequests/workflowMonitor.sh
index bb333d2f4aa..dd53ed9338e 100755
--- src/hg/utils/otto/userRequests/workflowMonitor.sh
+++ src/hg/utils/otto/userRequests/workflowMonitor.sh
@@ -89,65 +89,99 @@
 state=$(printf "%s" "${stateJson}" | jq -r '.state // "unknown"')
 printf "# invocation state: %s\n" "${state}" 1>&2
 
 case "${state}" in
   "cancelled"|"failed")
     printf "ERROR: workflow %s -- invocation %s\n" "${state}" "${invocationId}" 1>&2
     exit 1
     ;;
   "new"|"ready")
     printf "# workflow still starting up, will check again later\n" 1>&2
     exit 0
     ;;
   "scheduled")
     # all steps dispatched -- fall through to check individual jobs
     ;;
+  "completed")
+    # invocation finished -- still fall through to verify individual job results
+    ;;
   *)
     printf "# unexpected state '%s', will check again later\n" "${state}" 1>&2
     exit 0
     ;;
 esac
 
 # "scheduled" means all steps dispatched, but jobs may still be running.
 # use the jobs_summary endpoint to get aggregate job state counts.
 summaryJson=$(curl -s -H "x-api-key: ${galaxyApiKey}" \
   "${galaxyUrl}/api/invocations/${invocationId}/jobs_summary")
 
 # count jobs in non-terminal states (new, queued, running, waiting, upload,
 # setting_metadata, resubmitted). Terminal states: ok, error, deleted,
 # skipped, paused.
 nonTerminalCount=$(printf "%s" "${summaryJson}" | jq \
   '[.states | to_entries[]
     | select(.key != "ok" and .key != "error" and .key != "deleted"
         and .key != "skipped" and .key != "paused") | .value] | add // 0')
 
 if [ "${nonTerminalCount}" -gt 0 ]; then
   activeStates=$(printf "%s" "${summaryJson}" | jq -r \
     '[.states | to_entries[]
       | select(.key != "ok" and .key != "error" and .key != "deleted"
           and .key != "skipped" and .key != "paused")
       | "\(.value) \(.key)"] | join(", ")')
   printf "# %d jobs still active (%s), will check again later\n" \
     "${nonTerminalCount}" "${activeStates}" 1>&2
   exit 0
 fi
 
-# warn about errored jobs but proceed to download what we can
+# check for errored jobs -- a "completed" invocation can still contain
+# individual jobs that failed
 errorCount=$(printf "%s" "${summaryJson}" | jq '.states.error // 0')
 if [ "${errorCount}" -gt 0 ]; then
-  printf "WARNING: %d jobs had errors in invocation %s\n" \
+  printf "ERROR: %d job(s) had errors in invocation %s\n" \
     "${errorCount}" "${invocationId}" 1>&2
+
+  # the invocation detail (stateJson) embeds steps with job_ids but not
+  # job states; query each job individually to find which step(s) failed
+  printf "%s" "${stateJson}" | jq -r '
+    .steps[] | select(.job_id != null) |
+    "\(.order_index)\t\(.workflow_step_label // "unlabeled")\t\(.job_id)"
+  ' | while IFS=$'\t' read -r stepIdx stepLabel jobId; do
+    jobState=$(curl -s -H "x-api-key: ${galaxyApiKey}" \
+      "${galaxyUrl}/api/jobs/${jobId}" | jq -r '.state // "unknown"')
+    if [ "${jobState}" = "error" ]; then
+      printf "  FAILED step %s: %s (job %s)\n" \
+        "${stepIdx}" "${stepLabel}" "${jobId}" 1>&2
+    fi
+  done
+
+  # check sub-workflow invocations for errors
+  printf "%s" "${stateJson}" | jq -r '
+    .steps[] | select(.subworkflow_invocation_id != null) |
+    "\(.order_index)\t\(.workflow_step_label // "unlabeled")\t\(.subworkflow_invocation_id)"
+  ' | while IFS=$'\t' read -r stepIdx stepLabel subInvId; do
+    subErrors=$(curl -s -H "x-api-key: ${galaxyApiKey}" \
+      "${galaxyUrl}/api/invocations/${subInvId}/jobs_summary" \
+      | jq '.states.error // 0')
+    if [ "${subErrors}" -gt 0 ]; then
+      printf "  FAILED step %s: %s (sub-workflow %s, %d error(s))\n" \
+        "${stepIdx}" "${stepLabel}" "${subInvId}" "${subErrors}" 1>&2
+    fi
+  done
+
+  exit 1
 fi
 
 printf "# all jobs complete, downloading results\n" 1>&2
 
 ############################################################################
 # download results via planemo
 ############################################################################
 mkdir -p "result/${DS}"
 ${PM} invocation_download "${invocationId}" --profile vgp \
   --output_directory "result/${DS}"
 
 ############################################################################
 # chainBigBedFb - convert chain to bigBed and compute featureBits
 # args: db chainName chainGz sizesFile fbFile
 ############################################################################
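
Note: the three Galaxy API endpoints the monitor now touches can be probed by
hand when a run goes wrong. A minimal sketch, assuming galaxyUrl, galaxyApiKey
and invocationId are exported with the same values the script uses; the probe
itself is illustrative, not part of the commit:

    #!/bin/sh
    # standalone probe of the endpoints used by workflowMonitor.sh
    # assumes galaxyUrl, galaxyApiKey and invocationId in the environment
    set -eu

    # overall invocation state: "scheduled", "completed", "failed", ...
    curl -s -H "x-api-key: ${galaxyApiKey}" \
      "${galaxyUrl}/api/invocations/${invocationId}" | jq -r '.state'

    # aggregate job-state counts, e.g. {"ok": 12, "error": 1}
    curl -s -H "x-api-key: ${galaxyApiKey}" \
      "${galaxyUrl}/api/invocations/${invocationId}/jobs_summary" | jq '.states'

    # per-step job ids and sub-workflow invocation ids, the same fields the
    # new error report walks
    curl -s -H "x-api-key: ${galaxyApiKey}" \
      "${galaxyUrl}/api/invocations/${invocationId}" | jq -r '.steps[] |
        "\(.order_index)\t\(.workflow_step_label // "unlabeled")\t\(.job_id // "-")\t\(.subworkflow_invocation_id // "-")"'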
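
One limitation worth noting: the committed check descends exactly one level of
sub-workflows. If workflows ever nest deeper, the same two API calls could be
applied recursively; a sketch under that assumption, with countInvocationErrors
as a hypothetical helper (command substitution runs each recursive call in a
subshell, so the helper's variables do not clobber the caller's):

    # countInvocationErrors invocationId -- print the total error count for
    # an invocation plus all of its sub-workflow invocations (hypothetical
    # helper, not part of the commit)
    countInvocationErrors() {
      invId="$1"
      errs=$(curl -s -H "x-api-key: ${galaxyApiKey}" \
        "${galaxyUrl}/api/invocations/${invId}/jobs_summary" \
        | jq '.states.error // 0')
      # recurse into any sub-workflow invocations and accumulate their errors
      for subId in $(curl -s -H "x-api-key: ${galaxyApiKey}" \
          "${galaxyUrl}/api/invocations/${invId}" \
          | jq -r '.steps[] | .subworkflow_invocation_id // empty'); do
        errs=$(( errs + $(countInvocationErrors "${subId}") ))
      done
      printf "%d\n" "${errs}"
    }

    errorCount=$(countInvocationErrors "${invocationId}")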