Skip to content

Commit

Permalink
feat: Add telemetry event for uncaught exceptions (#203)
Browse files Browse the repository at this point in the history
Signed-off-by: Caden Marofke <marofke@amazon.com>
  • Loading branch information
marofke authored Mar 20, 2024
1 parent 985b437 commit 9a17a07
Show file tree
Hide file tree
Showing 6 changed files with 64 additions and 2 deletions.
12 changes: 12 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,18 @@ to interact with AWS Deadline Cloud and perform tasks.

See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information.

## Telemetry

This library collects telemetry data by default. Telemetry events contain non-personally-identifiable information that helps us understand how users interact with our software, so that we know which features our customers use and what pain points they encounter.

You can opt out of telemetry data collection in any of the following ways:

1. Setting the environment variable: `DEADLINE_CLOUD_TELEMETRY_OPT_OUT=true`
2. Providing the installer flag: `--telemetry-opt-out`
3. Setting the config file: `deadline config set telemetry.opt_out true`

Note that setting the environment variable supersedes the config file setting.

## License

This project is licensed under the Apache-2.0 License.
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ dynamic = ["version"]
dependencies = [
"requests ~= 2.31",
"boto3 >= 1.28.80",
"deadline == 0.40.*",
"deadline == 0.41.*",
"openjd-sessions == 0.7.*",
# tomli became tomllib in standard library in Python 3.11
"tomli == 2.0.* ; python_version<'3.11'",
Expand Down
24 changes: 23 additions & 1 deletion src/deadline_worker_agent/aws/deadline/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@

from deadline.client.api import get_telemetry_client, TelemetryClient
from deadline.job_attachments.progress_tracker import SummaryStatistics
from openjd.model import version as openjd_model_version
from openjd.sessions import version as openjd_sessions_version

from ..._version import __version__ as version # noqa
from ...startup.config import Configuration
Expand All @@ -35,6 +37,8 @@
LOG_CONFIG_OPTION_STREAM_NAME_KEY,
)

__cached_telemetry_client: Optional[TelemetryClient] = None

_logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -792,7 +796,18 @@ def update_worker_schedule(

def _get_deadline_telemetry_client() -> TelemetryClient:
    """Return the shared telemetry client, creating and configuring it on first use.

    Wrapper around the Deadline Client Library telemetry client, in order to set
    package-specific information. On the first call the client is obtained for the
    "deadline-cloud-worker-agent" package and the openjd-sessions and openjd-model
    versions are added to its common details. Later calls return the cached
    instance stored in the module-level ``__cached_telemetry_client``.

    Every version string is truncated to its first three dot-separated components
    before it is handed to the client.

    Returns:
        The cached telemetry client.
    """
    global __cached_telemetry_client
    # Compare against None explicitly: only a missing client should trigger
    # creation, regardless of how the client object evaluates in a boolean context.
    if __cached_telemetry_client is None:
        __cached_telemetry_client = get_telemetry_client(
            "deadline-cloud-worker-agent", ".".join(version.split(".")[:3])
        )
        __cached_telemetry_client.update_common_details(
            {"openjd-sessions-version": ".".join(openjd_sessions_version.split(".")[:3])}
        )
        __cached_telemetry_client.update_common_details(
            {"openjd-model-version": ".".join(openjd_model_version.split(".")[:3])}
        )
    return __cached_telemetry_client


def record_worker_start_telemetry_event(capabilities: Capabilities) -> None:
Expand All @@ -802,6 +817,13 @@ def record_worker_start_telemetry_event(capabilities: Capabilities) -> None:
)


def record_uncaught_exception_telemetry_event(exception_type: str) -> None:
    """Report an uncaught exception through the telemetry client.

    The event is sent with the client's record_error() call and carries an
    "exception_scope" detail set to "uncaught".

    Args:
        exception_type: Text identifying the type of the exception that was raised.
    """
    telemetry_client = _get_deadline_telemetry_client()
    scope_details = {"exception_scope": "uncaught"}
    telemetry_client.record_error(event_details=scope_details, exception_type=exception_type)


def record_sync_inputs_telemetry_event(queue_id: str, summary: SummaryStatistics) -> None:
"""Calls the telemetry client to record an event capturing the sync-inputs summary."""
details: Dict[str, Any] = asdict(summary)
Expand Down
2 changes: 2 additions & 0 deletions src/deadline_worker_agent/startup/entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
update_worker,
update_worker_schedule,
record_worker_start_telemetry_event,
record_uncaught_exception_telemetry_event,
)

__all__ = ["entrypoint"]
Expand Down Expand Up @@ -161,6 +162,7 @@ def filter(self, record: logging.LogRecord) -> bool:
raise
else:
_logger.critical(e, exc_info=True)
record_uncaught_exception_telemetry_event(exception_type=str(type(e)))
sys.exit(1)
finally:
_logger.info("Worker Agent exiting")
Expand Down
23 changes: 23 additions & 0 deletions test/unit/aws/deadline/test_client_telemetry.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
record_worker_start_telemetry_event,
record_sync_inputs_telemetry_event,
record_sync_outputs_telemetry_event,
record_uncaught_exception_telemetry_event,
)
from deadline_worker_agent.startup.capabilities import Capabilities
from deadline.job_attachments.progress_tracker import SummaryStatistics
Expand Down Expand Up @@ -125,3 +126,25 @@ def test_record_sync_outputs_telemetry_event():
"queue_id": "queue-test",
},
)


def test_record_uncaught_exception_telemetry_event():
    """
    Tests that when record_uncaught_exception_telemetry_event() is called, the exception
    type and the "uncaught" scope details are passed to the telemetry client's
    record_error() method exactly once.
    """
    # GIVEN
    mock_telemetry_client = MagicMock()

    with patch.object(deadline_mod, "_get_deadline_telemetry_client") as mock_get_telemetry_client:
        mock_get_telemetry_client.return_value = mock_telemetry_client

        # WHEN
        error = ValueError()
        record_uncaught_exception_telemetry_event(str(type(error)))

        # THEN
        # assert_called_once_with also fails if the error is reported more than once,
        # which assert_called_with would let through.
        mock_telemetry_client.record_error.assert_called_once_with(
            exception_type="<class 'ValueError'>",
            event_details={"exception_scope": "uncaught"},
        )
3 changes: 3 additions & 0 deletions test/unit/startup/test_entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,9 +186,11 @@ def test_calls_worker_run(
mock_worker_run.assert_called_once_with()


@patch.object(entrypoint_mod, "record_uncaught_exception_telemetry_event")
@patch.object(entrypoint_mod.sys, "exit")
def test_worker_run_exception(
sys_exit_mock: MagicMock,
telemetry_mock: MagicMock,
mock_worker_run: MagicMock,
) -> None:
"""Tests that exceptions raised by Worker.run() are logged and the program exits with a non-zero exit code"""
Expand All @@ -205,6 +207,7 @@ def test_worker_run_exception(
logger_exception: MagicMock = logger.exception
logger_exception.assert_called_once_with("Failed running worker: %s", exception)
sys_exit_mock.assert_called_once_with(1)
telemetry_mock.assert_called_once_with(exception_type=str(type(exception)))


def test_configuration_load(
Expand Down

0 comments on commit 9a17a07

Please sign in to comment.