Skip to content

Commit

Permalink
feat: Add dag_id when generating OpenLineage run_id for task instance. (
Browse files Browse the repository at this point in the history
  • Loading branch information
kacpermuda authored Jan 9, 2024
1 parent a818a8c commit 95a8310
Show file tree
Hide file tree
Showing 8 changed files with 149 additions and 51 deletions.
5 changes: 4 additions & 1 deletion airflow/providers/dbt/cloud/utils/openlineage.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,10 @@ async def get_artifacts_for_steps(steps, artifacts):

# Generate the same run id as the current task instance.
parent_run_id = OpenLineageAdapter.build_task_instance_run_id(
operator.task_id, task_instance.execution_date, task_instance.try_number - 1
dag_id=task_instance.dag_id,
task_id=operator.task_id,
execution_date=task_instance.execution_date,
try_number=task_instance.try_number - 1,
)

parent_job = ParentRunMetadata(
Expand Down
4 changes: 2 additions & 2 deletions airflow/providers/openlineage/plugins/adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,11 +102,11 @@ def build_dag_run_id(dag_id, dag_run_id):
return str(uuid.uuid3(uuid.NAMESPACE_URL, f"{_DAG_NAMESPACE}.{dag_id}.{dag_run_id}"))

@staticmethod
def build_task_instance_run_id(task_id, execution_date, try_number):
def build_task_instance_run_id(dag_id, task_id, execution_date, try_number):
return str(
uuid.uuid3(
uuid.NAMESPACE_URL,
f"{_DAG_NAMESPACE}.{task_id}.{execution_date}.{try_number}",
f"{_DAG_NAMESPACE}.{dag_id}.{task_id}.{execution_date}.{try_number}",
)
)

Expand Down
27 changes: 18 additions & 9 deletions airflow/providers/openlineage/plugins/listener.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,10 @@ def on_running():
parent_run_id = self.adapter.build_dag_run_id(dag.dag_id, dagrun.run_id)

task_uuid = self.adapter.build_task_instance_run_id(
task.task_id, task_instance.execution_date, task_instance.try_number
dag_id=dag.dag_id,
task_id=task.task_id,
execution_date=task_instance.execution_date,
try_number=task_instance.try_number,
)

task_metadata = self.extractor_manager.extract_metadata(dagrun, task)
Expand Down Expand Up @@ -116,14 +119,17 @@ def on_task_instance_success(self, previous_state, task_instance: TaskInstance,
task = task_instance.task
dag = task.dag

task_uuid = OpenLineageAdapter.build_task_instance_run_id(
task.task_id, task_instance.execution_date, task_instance.try_number - 1
)

@print_warning(self.log)
def on_success():
parent_run_id = OpenLineageAdapter.build_dag_run_id(dag.dag_id, dagrun.run_id)

task_uuid = OpenLineageAdapter.build_task_instance_run_id(
dag_id=dag.dag_id,
task_id=task.task_id,
execution_date=task_instance.execution_date,
try_number=task_instance.try_number - 1,
)

task_metadata = self.extractor_manager.extract_metadata(
dagrun, task, complete=True, task_instance=task_instance
)
Expand All @@ -149,14 +155,17 @@ def on_task_instance_failed(self, previous_state, task_instance: TaskInstance, s
task = task_instance.task
dag = task.dag

task_uuid = OpenLineageAdapter.build_task_instance_run_id(
task.task_id, task_instance.execution_date, task_instance.try_number
)

@print_warning(self.log)
def on_failure():
parent_run_id = OpenLineageAdapter.build_dag_run_id(dag.dag_id, dagrun.run_id)

task_uuid = OpenLineageAdapter.build_task_instance_run_id(
dag_id=dag.dag_id,
task_id=task.task_id,
execution_date=task_instance.execution_date,
try_number=task_instance.try_number,
)

task_metadata = self.extractor_manager.extract_metadata(
dagrun, task, complete=True, task_instance=task_instance
)
Expand Down
10 changes: 8 additions & 2 deletions airflow/providers/openlineage/plugins/macros.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,10 @@ def lineage_run_id(task_instance: TaskInstance):
:ref:`howto/macros:openlineage`
"""
return OpenLineageAdapter.build_task_instance_run_id(
task_instance.task.task_id, task_instance.execution_date, task_instance.try_number
dag_id=task_instance.dag_id,
task_id=task_instance.task.task_id,
execution_date=task_instance.execution_date,
try_number=task_instance.try_number,
)


Expand All @@ -55,6 +58,9 @@ def lineage_parent_id(run_id: str, task_instance: TaskInstance):
:ref:`howto/macros:openlineage`
"""
job_name = OpenLineageAdapter.build_task_instance_run_id(
task_instance.task.task_id, task_instance.execution_date, task_instance.try_number
dag_id=task_instance.dag_id,
task_id=task_instance.task.task_id,
execution_date=task_instance.execution_date,
try_number=task_instance.try_number,
)
return f"{_JOB_NAMESPACE}/{job_name}/{run_id}"
1 change: 0 additions & 1 deletion tests/always/test_project_structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,6 @@ def test_providers_modules_should_have_tests(self):
"tests/providers/openlineage/extractors/test_manager.py",
"tests/providers/openlineage/plugins/test_adapter.py",
"tests/providers/openlineage/plugins/test_facets.py",
"tests/providers/openlineage/plugins/test_macros.py",
"tests/providers/openlineage/test_sqlparser.py",
"tests/providers/redis/operators/test_redis_publish.py",
"tests/providers/redis/sensors/test_redis_key.py",
Expand Down
57 changes: 44 additions & 13 deletions tests/providers/openlineage/plugins/test_listener.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,10 @@ def _create_listener_and_task_instance() -> tuple[OpenLineageListener, TaskInsta
listener, task_instance = _create_listener_and_task_instance()
# Now you can use listener and task_instance in your tests to simulate their interaction.
"""

def mock_task_id(dag_id, task_id, execution_date, try_number):
return f"{dag_id}.{task_id}.{execution_date}.{try_number}"

listener = OpenLineageListener()
listener.log = mock.Mock()
listener.extractor_manager = mock.Mock()
Expand All @@ -161,7 +165,7 @@ def _create_listener_and_task_instance() -> tuple[OpenLineageListener, TaskInsta

adapter = mock.Mock()
adapter.build_dag_run_id.side_effect = lambda x, y: f"{x}.{y}"
adapter.build_task_instance_run_id.side_effect = lambda x, y, z: f"{x}.{y}.{z}"
adapter.build_task_instance_run_id.side_effect = mock_task_id
adapter.start_task = mock.Mock()
adapter.fail_task = mock.Mock()
adapter.complete_task = mock.Mock()
Expand Down Expand Up @@ -211,7 +215,7 @@ def test_adapter_start_task_is_called_with_proper_arguments(

listener.on_task_instance_running(None, task_instance, None)
listener.adapter.start_task.assert_called_once_with(
run_id="task_id.execution_date.1",
run_id="dag_id.task_id.execution_date.1",
job_name="job_name",
job_description="Test DAG Description",
event_time="2023-01-01T13:01:01",
Expand Down Expand Up @@ -239,9 +243,13 @@ def test_adapter_fail_task_is_called_with_proper_arguments(mock_get_job_name, mo
the test verifies the integrity and consistency of the data passed to the adapter during task
failure events, thus confirming that the adapter's failure handling is functioning as expected.
"""

def mock_task_id(dag_id, task_id, execution_date, try_number):
return f"{dag_id}.{task_id}.{execution_date}.{try_number}"

listener, task_instance = _create_listener_and_task_instance()
mock_get_job_name.return_value = "job_name"
mocked_adapter.build_task_instance_run_id.side_effect = lambda x, y, z: f"{x}.{y}.{z}"
mocked_adapter.build_task_instance_run_id.side_effect = mock_task_id
mocked_adapter.build_dag_run_id.side_effect = lambda x, y: f"{x}.{y}"

listener.on_task_instance_failed(None, task_instance, None)
Expand All @@ -250,7 +258,7 @@ def test_adapter_fail_task_is_called_with_proper_arguments(mock_get_job_name, mo
job_name="job_name",
parent_job_name="dag_id",
parent_run_id="dag_id.dag_run_run_id",
run_id="task_id.execution_date.1",
run_id="dag_id.task_id.execution_date.1",
task=listener.extractor_manager.extract_metadata(),
)

Expand All @@ -266,9 +274,13 @@ def test_adapter_complete_task_is_called_with_proper_arguments(mock_get_job_name
accordingly. This helps confirm the consistency and correctness of the data passed to the adapter
during the task's lifecycle events.
"""

def mock_task_id(dag_id, task_id, execution_date, try_number):
return f"{dag_id}.{task_id}.{execution_date}.{try_number}"

listener, task_instance = _create_listener_and_task_instance()
mock_get_job_name.return_value = "job_name"
mocked_adapter.build_task_instance_run_id.side_effect = lambda x, y, z: f"{x}.{y}.{z}"
mocked_adapter.build_task_instance_run_id.side_effect = mock_task_id
mocked_adapter.build_dag_run_id.side_effect = lambda x, y: f"{x}.{y}"

listener.on_task_instance_success(None, task_instance, None)
Expand All @@ -279,7 +291,7 @@ def test_adapter_complete_task_is_called_with_proper_arguments(mock_get_job_name
job_name="job_name",
parent_job_name="dag_id",
parent_run_id="dag_id.dag_run_run_id",
run_id="task_id.execution_date.0",
run_id="dag_id.task_id.execution_date.0",
task=listener.extractor_manager.extract_metadata(),
)

Expand All @@ -292,7 +304,7 @@ def test_adapter_complete_task_is_called_with_proper_arguments(mock_get_job_name
job_name="job_name",
parent_job_name="dag_id",
parent_run_id="dag_id.dag_run_run_id",
run_id="task_id.execution_date.1",
run_id="dag_id.task_id.execution_date.1",
task=listener.extractor_manager.extract_metadata(),
)

Expand All @@ -305,20 +317,24 @@ def test_run_id_is_constant_across_all_methods(mocked_adapter):
reflecting the task's identity and execution context. The test also simulates the change in the
try_number attribute, as it would occur in Airflow, to verify that the run_id updates accordingly.
"""

def mock_task_id(dag_id, task_id, execution_date, try_number):
return f"{dag_id}.{task_id}.{execution_date}.{try_number}"

listener, task_instance = _create_listener_and_task_instance()
mocked_adapter.build_task_instance_run_id.side_effect = lambda x, y, z: f"{x}.{y}.{z}"
mocked_adapter.build_task_instance_run_id.side_effect = mock_task_id

listener.on_task_instance_running(None, task_instance, None)
expected_run_id = listener.adapter.start_task.call_args.kwargs["run_id"]
assert expected_run_id == "task_id.execution_date.1"
assert expected_run_id == "dag_id.task_id.execution_date.1"

listener.on_task_instance_failed(None, task_instance, None)
assert listener.adapter.fail_task.call_args.kwargs["run_id"] == expected_run_id

# This run_id will be different as we did NOT simulate increase of the try_number attribute,
# which happens in Airflow.
listener.on_task_instance_success(None, task_instance, None)
assert listener.adapter.complete_task.call_args.kwargs["run_id"] == "task_id.execution_date.0"
assert listener.adapter.complete_task.call_args.kwargs["run_id"] == "dag_id.task_id.execution_date.0"

# Now we simulate the increase of try_number, and the run_id should reflect that change.
# This is how airflow works, and that's why we expect the run_id to remain constant across all methods.
Expand All @@ -336,7 +352,12 @@ def test_running_task_correctly_calls_openlineage_adapter_run_id_method():
"""
listener, task_instance = _create_listener_and_task_instance()
listener.on_task_instance_running(None, task_instance, None)
listener.adapter.build_task_instance_run_id.assert_called_once_with("task_id", "execution_date", 1)
listener.adapter.build_task_instance_run_id.assert_called_once_with(
dag_id="dag_id",
task_id="task_id",
execution_date="execution_date",
try_number=1,
)


@mock.patch("airflow.providers.openlineage.plugins.listener.OpenLineageAdapter")
Expand All @@ -349,7 +370,12 @@ def test_failed_task_correctly_calls_openlineage_adapter_run_id_method(mock_adap
"""
listener, task_instance = _create_listener_and_task_instance()
listener.on_task_instance_failed(None, task_instance, None)
mock_adapter.build_task_instance_run_id.assert_called_with("task_id", "execution_date", 1)
mock_adapter.build_task_instance_run_id.assert_called_once_with(
dag_id="dag_id",
task_id="task_id",
execution_date="execution_date",
try_number=1,
)


@mock.patch("airflow.providers.openlineage.plugins.listener.OpenLineageAdapter")
Expand All @@ -362,7 +388,12 @@ def test_successful_task_correctly_calls_openlineage_adapter_run_id_method(mock_
"""
listener, task_instance = _create_listener_and_task_instance()
listener.on_task_instance_success(None, task_instance, None)
mock_adapter.build_task_instance_run_id.assert_called_with("task_id", "execution_date", 0)
mock_adapter.build_task_instance_run_id.assert_called_once_with(
dag_id="dag_id",
task_id="task_id",
execution_date="execution_date",
try_number=0,
)


@mock.patch("airflow.models.taskinstance.get_listener_manager")
Expand Down
52 changes: 52 additions & 0 deletions tests/providers/openlineage/plugins/test_macros.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from __future__ import annotations

import uuid
from unittest import mock

from airflow.providers.openlineage.plugins.adapter import _DAG_NAMESPACE
from airflow.providers.openlineage.plugins.macros import lineage_parent_id, lineage_run_id


def test_lineage_run_id():
    """lineage_run_id hashes dag_id, task_id, execution_date and try_number into a UUID3 string."""
    inner_task = mock.MagicMock(task_id="task_id")
    task_instance = mock.MagicMock(
        dag_id="dag_id",
        execution_date="execution_date",
        try_number=1,
        task=inner_task,
    )

    # Same seed layout the adapter uses: namespace.dag_id.task_id.execution_date.try_number
    seed = f"{_DAG_NAMESPACE}.dag_id.task_id.execution_date.1"
    expected = str(uuid.uuid3(uuid.NAMESPACE_URL, seed))

    assert lineage_run_id(task_instance) == expected


def test_lineage_parent_id():
    """lineage_parent_id returns ``<job namespace>/<task-instance run id>/<run_id>``."""
    # Local import: the macro prefixes its result with its own ``_JOB_NAMESPACE``, so the
    # expectation must be built from that constant rather than the adapter's ``_DAG_NAMESPACE``
    # (otherwise the test only passes while the two constants happen to be equal).
    from airflow.providers.openlineage.plugins.macros import _JOB_NAMESPACE

    task_instance = mock.MagicMock(
        dag_id="dag_id",
        execution_date="execution_date",
        try_number=1,
        task=mock.MagicMock(task_id="task_id"),
    )
    actual = lineage_parent_id(run_id="run_id", task_instance=task_instance)

    # The middle segment is the task-instance run id, which the adapter seeds with ``_DAG_NAMESPACE``.
    job_name = str(
        uuid.uuid3(
            uuid.NAMESPACE_URL,
            f"{_DAG_NAMESPACE}.dag_id.task_id.execution_date.1",
        )
    )
    expected = f"{_JOB_NAMESPACE}/{job_name}/run_id"
    assert actual == expected
Loading

0 comments on commit 95a8310

Please sign in to comment.