From 90053b03c2c25235dff7b94013ac1711ba6260c6 Mon Sep 17 00:00:00 2001 From: Daniel Neilson <53624638+ddneilson@users.noreply.github.com> Date: Wed, 8 Nov 2023 20:51:16 +0000 Subject: [PATCH] fix: avoid truncating sessionaction logs Summary: In the service we use a SessionAction's reported startedAt/endedAt time to filter the cloudwatch logs stream for a Session so that we can display only the logs relevant to that particular SessionAction. There is an edge case that occationally shows up where the very last log line of a SessionAction will not show up in this filtered view. This is because the timestamp on that log line is *after* the endedAt time of the SessionAction; see below for a sample: In the portal UI I only see the following in the sync attachments log: ``` 2023/11/02 13:29:10-05:00 Syncing inputs using Job Attachments 2023/11/02 13:29:11-05:00 Downloaded 445.0 B / 445.0 B of 1 file (Transfer rate: 0.0 B/s) ``` The full log shows: ``` 2023-11-02T13:29:10.839-05:00 Syncing inputs using Job Attachments 2023-11-02T13:29:11.042-05:00 Downloaded 445.0 B / 445.0 B of 1 file (Transfer rate: 0.0 B/s) 2023-11-02T13:29:11.044-05:00 Summary Statistics for file downloads: Processed 1 file totaling 445.0 B. Skipped re-processing 0 files totaling 0.0 B. Total processing time of 0.05191 seconds at 8.57 KB/s. ``` The session action details are: ``` % aws deadline get-session-action --farm-id ... { "sessionActionId": "sessionaction-d0f37f306f93484699ceda74bb6c15ef-0", "status": "SUCCEEDED", "startedAt": "2023-11-02T18:29:10.838000+00:00", "endedAt": "2023-11-02T18:29:11.043000+00:00", "progressPercent": 100.0, "sessionId": "session-d0f37f306f93484699ceda74bb6c15ef", "definition": { "syncInputJobAttachments": {} } } ``` Notice that the timestamp on the line of logging that's excluded is 1 millisecond after the endedAt time of the SessionAction. Resolution: This comes down to a difference in rounding. We are rounding when converting a timestamp in to the integer number of milliseconds after epoch-utc to report to CloudWatch. However, the Deadline Cloud service is truncating the startedAt/endedAt times to the number of milliseconds. Truncation and rounding produce different results half of the time (statistically speaking). So, this fix modifies the timestamp that we report to CloudWatch to truncate to the number of milliseconds rather than round. Signed-off-by: Daniel Neilson <53624638+ddneilson@users.noreply.github.com> --- src/deadline_worker_agent/log_sync/cloudwatch.py | 9 +++++++-- test/unit/log_sync/test_cloudwatch.py | 4 ++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/deadline_worker_agent/log_sync/cloudwatch.py b/src/deadline_worker_agent/log_sync/cloudwatch.py index c1499604..5f615620 100644 --- a/src/deadline_worker_agent/log_sync/cloudwatch.py +++ b/src/deadline_worker_agent/log_sync/cloudwatch.py @@ -635,8 +635,13 @@ def emit(self, record: LogRecord) -> None: message = " " # record.created is expressed in seconds (floating-point) since # January 1, 1970, 00:00:00 (UTC) but CloudWatch expects an integer value expressed in - # microseconds. Convert to microseconds and round to nearest integer. - timestamp = int(round(record.created * 1000)) + # microseconds since utc-epoch. + # Our service uses a SessionAction's startedAt/endedAt times to determine which part of a + # log belongs to that particular SessionAction. So, we need to take some care to ensure that + # the time that we report to CloudWatch is rounded in the same way that the service will round + # the startedAt/endedAt time that it receives. Our service truncates, rather than rounds, times + # to microseconds so we do the same here. + timestamp = int(record.created * 1000) self._log_event_queue.append( FormattedLogEntry( timestamp=timestamp, diff --git a/test/unit/log_sync/test_cloudwatch.py b/test/unit/log_sync/test_cloudwatch.py index 9740ed01..4a77d550 100644 --- a/test/unit/log_sync/test_cloudwatch.py +++ b/test/unit/log_sync/test_cloudwatch.py @@ -1345,7 +1345,7 @@ def test_emit( # THEN log_queue_append_mock.assert_called_once_with( FormattedLogEntry( - timestamp=int(round(record.created * 1000)), + timestamp=int(record.created * 1000), message=record.message, ) ) @@ -1386,7 +1386,7 @@ def test_emit_exception( # THEN log_queue_append_mock.assert_called_once_with( FormattedLogEntry( - timestamp=int(round(record.created * 1000)), + timestamp=int(record.created * 1000), message=record.message, ) )