6d8144be131a358037daea03610b22315527f26f
hiram
  Thu Apr 23 15:12:32 2026 -0700
better recognition of errors in the workflow refs #31811

diff --git src/hg/utils/otto/userRequests/workflowMonitor.sh src/hg/utils/otto/userRequests/workflowMonitor.sh
index bb333d2f4aa..dd53ed9338e 100755
--- src/hg/utils/otto/userRequests/workflowMonitor.sh
+++ src/hg/utils/otto/userRequests/workflowMonitor.sh
@@ -89,65 +89,99 @@
 state=$(printf "%s" "${stateJson}" | jq -r '.state // "unknown"')
 printf "# invocation state: %s\n" "${state}" 1>&2
 
 case "${state}" in
   "cancelled"|"failed")
     printf "ERROR: workflow %s -- invocation %s\n" "${state}" "${invocationId}" 1>&2
     exit 1
     ;;
   "new"|"ready")
     printf "# workflow still starting up, will check again later\n" 1>&2
     exit 0
     ;;
   "scheduled")
     # all steps dispatched -- fall through to check individual jobs
     ;;
+  "completed")
+    # invocation complete, but individual jobs may still have failed --
+    # fall through to verify the per-job states
+    ;;
   *)
     printf "# unexpected state '%s', will check again later\n" "${state}" 1>&2
     exit 0
     ;;
 esac
 
 # "scheduled" means all steps dispatched, but jobs may still be running.
 # use the jobs_summary endpoint to get aggregate job state counts.
 summaryJson=$(curl -s -H "x-api-key: ${galaxyApiKey}" \
   "${galaxyUrl}/api/invocations/${invocationId}/jobs_summary")
 
 # count jobs in non-terminal states (new, queued, running, waiting, upload,
 # setting_metadata, resubmitted).  Terminal states: ok, error, deleted,
 # skipped, paused.
 nonTerminalCount=$(printf "%s" "${summaryJson}" | jq \
   '[.states | to_entries[] | select(.key != "ok" and .key != "error"
       and .key != "deleted" and .key != "skipped"
       and .key != "paused") | .value] | add // 0')
 
 if [ "${nonTerminalCount}" -gt 0 ]; then
   activeStates=$(printf "%s" "${summaryJson}" | jq -r \
     '[.states | to_entries[] | select(.key != "ok" and .key != "error"
         and .key != "deleted" and .key != "skipped"
         and .key != "paused") | "\(.value) \(.key)"] | join(", ")')
   printf "# %d jobs still active (%s), will check again later\n" \
     "${nonTerminalCount}" "${activeStates}" 1>&2
   exit 0
 fi
 
-# warn about errored jobs but proceed to download what we can
+# check for errored jobs -- a "completed" invocation can still contain
+# individual jobs that failed
 errorCount=$(printf "%s" "${summaryJson}" | jq '.states.error // 0')
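+# ".states.error" is absent when no jobs failed; "// 0" defaults it so the
+# numeric comparison below always has a value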
 
 if [ "${errorCount}" -gt 0 ]; then
-  printf "WARNING: %d jobs had errors in invocation %s\n" \
+  printf "ERROR: %d job(s) had errors in invocation %s\n" \
     "${errorCount}" "${invocationId}" 1>&2
+
+  # the invocation detail (stateJson) embeds steps with job_ids but not
+  # job states; query each job individually to find which step(s) failed
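+  # each steps[] entry is assumed to carry at least:
+  #   { "order_index": 3, "workflow_step_label": "...", "job_id": "...",
+  #     "subworkflow_invocation_id": null, ... }
+  # which is what the two extractions below rely on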
+  printf "%s" "${stateJson}" | jq -r '
+    .steps[] | select(.job_id != null) |
+    "\(.order_index)\t\(.workflow_step_label // "unlabeled")\t\(.job_id)"
+  ' | while IFS=$'\t' read -r stepIdx stepLabel jobId; do
+    jobState=$(curl -s -H "x-api-key: ${galaxyApiKey}" \
+      "${galaxyUrl}/api/jobs/${jobId}" | jq -r '.state // "unknown"')
+    if [ "${jobState}" = "error" ]; then
+      printf "  FAILED step %s: %s (job %s)\n" \
+        "${stepIdx}" "${stepLabel}" "${jobId}" 1>&2
+    fi
+  done
+
+  # check sub-workflow invocations for errors
+  printf "%s" "${stateJson}" | jq -r '
+    .steps[] | select(.subworkflow_invocation_id != null) |
+    "\(.order_index)\t\(.workflow_step_label // "unlabeled")\t\(.subworkflow_invocation_id)"
+  ' | while IFS=$'\t' read -r stepIdx stepLabel subInvId; do
+    subErrors=$(curl -s -H "x-api-key: ${galaxyApiKey}" \
+      "${galaxyUrl}/api/invocations/${subInvId}/jobs_summary" \
+      | jq '.states.error // 0')
+    if [ "${subErrors}" -gt 0 ]; then
+      printf "  FAILED step %s: %s (sub-workflow %s, %d error(s))\n" \
+        "${stepIdx}" "${stepLabel}" "${subInvId}" "${subErrors}" 1>&2
+    fi
+  done
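+  # note: only sub-workflows referenced directly by this invocation are
+  # inspected; deeper nesting would require recursing into each
+  # sub-invocation's own steps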
+
+  exit 1
 fi
 
 printf "# all jobs complete, downloading results\n" 1>&2
 
 ############################################################################
 # download results via planemo
 ############################################################################
 mkdir -p "result/${DS}"
 ${PM} invocation_download "${invocationId}" --profile vgp \
   --output_directory "result/${DS}"
 
 ############################################################################
 # chainBigBedFb - convert chain to bigBed and compute featureBits
 # args: db chainName chainGz sizesFile fbFile
 ############################################################################