Skip to content

Commit

Permalink
feat: report action timeout as failed with timeout message (#165)
Browse files Browse the repository at this point in the history
Problem:

Previously, if an OpenJD Action was canceled due to a timeout being
reached then we'd report the action as just canceled. The change to
update to openjd-sessions 0.5.0
( #160 )
made it so that timeout actions would report as FAILED, but didn't
change the failure message to make it clear that the reason for the
failure was a timeout.

Solution:

We mutate the action status when we recieve it to override the failure
message with one that indicates that the action has reached its runtime
limit.

Signed-off-by: Daniel Neilson <53624638+ddneilson@users.noreply.github.com>
  • Loading branch information
ddneilson authored Feb 21, 2024
1 parent 858f621 commit ff36123
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 5 deletions.
20 changes: 17 additions & 3 deletions src/deadline_worker_agent/sessions/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -1033,9 +1033,10 @@ def _handle_action_update(
now: datetime,
):
if is_unsuccessful:
fail_message = (
action_status.fail_message
or f"Action {current_action.definition.human_readable()} failed"
fail_message = action_status.fail_message or (
f"TIMEOUT - Previous action exceeded runtime limit: {current_action.definition.human_readable()}"
if action_status.state == ActionState.TIMEOUT
else f"Previous action failed: {current_action.definition.human_readable()}"
)

# If the current action failed, we mark future actions assigned to the session as
Expand All @@ -1052,6 +1053,19 @@ def _handle_action_update(
# UpdateWorkerSchedule request if so.
self._current_action = None

if action_status.state == ActionState.TIMEOUT:
# If the action ended via timeout, then we're reporting this as a failed action
# but also surface the timeout was the reason for the fail so that they don't have
# to go log diving.
action_status = ActionStatus(
# Preserve properties
state=action_status.state,
progress=action_status.progress,
exit_code=action_status.exit_code,
# Replace the message to let the customer know that the action reached its runtime limit.
fail_message="TIMEOUT - Exceeded the allotted runtime limit.",
)

completed_status = OPENJD_ACTION_STATE_TO_DEADLINE_COMPLETED_STATUS.get(
action_status.state, None
)
Expand Down
38 changes: 36 additions & 2 deletions test/unit/sessions/test_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -1028,6 +1028,40 @@ def mock_action_updated_impl_side_effect(
current_action_lock_exit.assert_called_once()
mock_report_action_update.assert_not_called()

def test_timeout_messaging(
self,
session: Session,
# We don't use the value of this fixture, but requiring it has the side-effect of assigning
# it as the current action of the session
current_action: CurrentAction,
) -> None:
"""Test that when an action is reported as TIMEOUT then we:
1) Cancel all subsequent tasks as NEVER_ATTEMPTED; and
2) Have an appropriate failure message on the action.
"""
# GIVEN
status = ActionStatus(state=ActionState.TIMEOUT, exit_code=-1, progress=24.4)
with (
patch.object(session._queue, "cancel_all") as mock_cancel_all,
patch.object(session, "_report_action_update") as mock_report_action_update,
):
# WHEN
session.update_action(status)

# THEN
mock_cancel_all.assert_called_once_with(
cancel_outcome="NEVER_ATTEMPTED", message=ANY, ignore_env_exits=True
)
assert "TIMEOUT" in mock_cancel_all.call_args.kwargs["message"]
mock_report_action_update.assert_called_once()
session_status = mock_report_action_update.call_args.args[0]
assert session_status.completed_status == "FAILED"
called_with_status = session_status.status
assert called_with_status.state == ActionState.TIMEOUT
assert "TIMEOUT" in called_with_status.fail_message
assert called_with_status.exit_code == status.exit_code
assert called_with_status.progress == status.progress


class TestSessionActionUpdatedImpl:
"""Test cases for Session._action_updated_impl()"""
Expand Down Expand Up @@ -1075,7 +1109,7 @@ def test_failed_enter_env(
session._current_action = current_action
queue_cancel_all: MagicMock = session_action_queue.cancel_all
expected_next_action_message = failed_action_status.fail_message or (
f"Action {current_action.definition.human_readable()} failed"
f"Previous action failed: {current_action.definition.human_readable()}"
)
expected_action_update = SessionActionStatus(
id=action_id,
Expand Down Expand Up @@ -1143,7 +1177,7 @@ def test_failed_task_run(
session._current_action = current_action
queue_cancel_all: MagicMock = session_action_queue.cancel_all
expected_next_action_message = failed_action_status.fail_message or (
f"Action {current_action.definition.human_readable()} failed"
f"Previous action failed: {current_action.definition.human_readable()}"
)
expected_action_update = SessionActionStatus(
id=action_id,
Expand Down

0 comments on commit ff36123

Please sign in to comment.