Skip to content

Commit

Permalink
Redriven Step Functions Trace Merging (#545)
Browse files Browse the repository at this point in the history
Adds support for Step Functions trace merging in Redrive cases

We previously used `hash(ExecutionId # StateName # StateEnteredTime)` for spanID calculation but these values are identical across redrives for a Lambda task state. The new approach also adds a `RedriveCount` to the end of the hash but omits this value when it is 0 to have easy backwards compatability.
  • Loading branch information
avedmala authored Jan 2, 2025
1 parent ed8c462 commit b6edd91
Show file tree
Hide file tree
Showing 2 changed files with 76 additions and 10 deletions.
13 changes: 12 additions & 1 deletion datadog_lambda/tracing.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,12 +384,23 @@ def _parse_high_64_bits(trace_tags: str) -> str:


def _generate_sfn_parent_id(context: dict) -> int:
"""
The upstream Step Function can propagate its execution context to downstream Lambdas. The
Lambda can use these details to share the same traceID and infer its parent's spanID.
Excluding redriveCount when its 0 to account for cases where customers are using an old
version of the Lambda layer that doesn't use this value for its parentID generation.
"""
execution_id = context.get("Execution").get("Id")
redrive_count = context.get("Execution").get("RedriveCount", 0)
state_name = context.get("State").get("Name")
state_entered_time = context.get("State").get("EnteredTime")

redrive_postfix = "" if redrive_count == 0 else f"#{redrive_count}"

return _deterministic_sha256_hash(
f"{execution_id}#{state_name}#{state_entered_time}", HIGHER_64_BITS
f"{execution_id}#{state_name}#{state_entered_time}{redrive_postfix}",
HIGHER_64_BITS,
)


Expand Down
73 changes: 64 additions & 9 deletions tests/test_tracing.py
Original file line number Diff line number Diff line change
Expand Up @@ -619,30 +619,83 @@ def test_step_function_trace_data(self):
lambda_ctx = get_mock_context()
sfn_event = {
"Execution": {
"Id": "665c417c-1237-4742-aaca-8b3becbb9e75",
"Id": "arn:aws:states:sa-east-1:425362996713:execution:abhinav-activity-state-machine:72a7ca3e-901c-41bb-b5a3-5f279b92a316",
"Name": "72a7ca3e-901c-41bb-b5a3-5f279b92a316",
"RoleArn": "arn:aws:iam::425362996713:role/service-role/StepFunctions-abhinav-activity-state-machine-role-22jpbgl6j",
"StartTime": "2024-12-04T19:38:04.069Z",
},
"StateMachine": {},
"State": {
"Name": "my-awesome-state",
"EnteredTime": "Mon Nov 13 12:43:33 PST 2023",
"Name": "Lambda Invoke",
"EnteredTime": "2024-12-04T19:38:04.118Z",
"RetryCount": 0,
},
"StateMachine": {
"Id": "arn:aws:states:sa-east-1:425362996713:stateMachine:abhinav-activity-state-machine",
"Name": "abhinav-activity-state-machine",
},
}
ctx, source, event_source = extract_dd_trace_context(sfn_event, lambda_ctx)
self.assertEqual(source, "event")
expected_context = Context(
trace_id=3675572987363469717,
span_id=6880978411788117524,
trace_id=435175499815315247,
span_id=3929055471293792800,
sampling_priority=1,
meta={"_dd.p.tid": "3e7a89d1b7310603"},
)
self.assertEqual(ctx, expected_context)
self.assertEqual(
get_dd_trace_context(),
{
TraceHeader.TRACE_ID: "435175499815315247",
TraceHeader.PARENT_ID: "10713633173203262661",
TraceHeader.SAMPLING_PRIORITY: "1",
TraceHeader.TAGS: "_dd.p.tid=3e7a89d1b7310603",
},
)
create_dd_dummy_metadata_subsegment(ctx, XraySubsegment.TRACE_KEY)
self.mock_send_segment.assert_called_with(
XraySubsegment.TRACE_KEY,
expected_context,
)

# https://github.com/DataDog/logs-backend/blob/c17618cb552fc369ca40282bae0a65803f82f694/domains/serverless/apps/logs-to-traces-reducer/src/test/resources/test-json-files/stepfunctions/RedriveTest/snapshots/RedriveLambdaSuccessTraceMerging.json#L46
@with_trace_propagation_style("datadog")
def test_step_function_trace_data_redrive(self):
lambda_ctx = get_mock_context()
sfn_event = {
"Execution": {
"Id": "arn:aws:states:sa-east-1:425362996713:execution:abhinav-activity-state-machine:72a7ca3e-901c-41bb-b5a3-5f279b92a316",
"Name": "72a7ca3e-901c-41bb-b5a3-5f279b92a316",
"RoleArn": "arn:aws:iam::425362996713:role/service-role/StepFunctions-abhinav-activity-state-machine-role-22jpbgl6j",
"StartTime": "2024-12-04T19:38:04.069Z",
"RedriveCount": 1,
},
"State": {
"Name": "Lambda Invoke",
"EnteredTime": "2024-12-04T19:38:04.118Z",
"RetryCount": 0,
},
"StateMachine": {
"Id": "arn:aws:states:sa-east-1:425362996713:stateMachine:abhinav-activity-state-machine",
"Name": "abhinav-activity-state-machine",
},
}
ctx, source, event_source = extract_dd_trace_context(sfn_event, lambda_ctx)
self.assertEqual(source, "event")
expected_context = Context(
trace_id=435175499815315247,
span_id=5063839446130725204,
sampling_priority=1,
meta={"_dd.p.tid": "e987c84b36b11ab"},
meta={"_dd.p.tid": "3e7a89d1b7310603"},
)
self.assertEqual(ctx, expected_context)
self.assertEqual(
get_dd_trace_context(),
{
TraceHeader.TRACE_ID: "3675572987363469717",
TraceHeader.TRACE_ID: "435175499815315247",
TraceHeader.PARENT_ID: "10713633173203262661",
TraceHeader.SAMPLING_PRIORITY: "1",
TraceHeader.TAGS: "_dd.p.tid=e987c84b36b11ab",
TraceHeader.TAGS: "_dd.p.tid=3e7a89d1b7310603",
},
)
create_dd_dummy_metadata_subsegment(ctx, XraySubsegment.TRACE_KEY)
Expand All @@ -658,6 +711,7 @@ def test_step_function_trace_data_lambda_root(self):
"_datadog": {
"Execution": {
"Id": "665c417c-1237-4742-aaca-8b3becbb9e75",
"RedriveCount": 0,
},
"StateMachine": {},
"State": {
Expand Down Expand Up @@ -700,6 +754,7 @@ def test_step_function_trace_data_sfn_root(self):
"_datadog": {
"Execution": {
"Id": "665c417c-1237-4742-aaca-8b3becbb9e75",
"RedriveCount": 0,
},
"StateMachine": {},
"State": {
Expand Down

0 comments on commit b6edd91

Please sign in to comment.