-
Notifications
You must be signed in to change notification settings - Fork 1.6k
/
Copy pathlauncher.py
277 lines (238 loc) · 10.3 KB
/
launcher.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
import os
import typing
from dataclasses import dataclass
import boto3
import dagster
import requests
from dagster import Field, check
from dagster.core.launcher.base import LaunchRunContext, RunLauncher
from dagster.grpc.types import ExecuteRunArgs
from dagster.serdes import ConfigurableClass, serialize_dagster_namedtuple
from dagster.utils.backcompat import experimental
@dataclass
class TaskMetadata:
    """Introspected facts about the currently running ECS task.

    Built by ``EcsRunLauncher._task_metadata()`` from the ECS task metadata
    endpoint plus ``describe_tasks``/``describe_task_definition`` calls, and
    reused so a launched run task shares this task's network configuration.
    """

    # Cluster the current task is running in (metadata endpoint "Cluster").
    cluster: str
    # Subnet ids attached to the task's ElasticNetworkInterface attachments.
    subnets: typing.List[str]
    # Security group ids gathered from those network interfaces.
    security_groups: typing.List[str]
    # The "taskDefinition" dict returned by ecs.describe_task_definition.
    task_definition: typing.Dict[str, typing.Any]
    # The entry from task_definition["containerDefinitions"] whose name
    # matches this container (per the metadata endpoint's "Name").
    container_definition: typing.Dict[str, typing.Any]
@experimental
class EcsRunLauncher(RunLauncher, ConfigurableClass):
    """RunLauncher that launches each Dagster run in its own ECS Fargate task.

    If ``task_definition`` is configured, that (validated) task definition is
    reused for every run; otherwise a fresh revision of the ``dagster-run``
    family is registered per run, derived from the launching process's own
    task definition with the image swapped for the pipeline origin's image.

    Runs and tasks are cross-tagged: the ECS task gets a ``dagster/run_id``
    tag and the Dagster run gets ``ecs/task_arn`` and ``ecs/cluster`` tags,
    which ``can_terminate``/``terminate`` later use to find the task.
    """

    def __init__(self, inst_data=None, task_definition=None, container_name="run"):
        self._inst_data = inst_data
        self.ecs = boto3.client("ecs")
        self.ec2 = boto3.resource("ec2")
        self.task_definition = task_definition
        self.container_name = container_name

        if self.task_definition:
            # Fail fast if the configured task definition doesn't define the
            # container whose command we override at launch time.
            task_definition = self.ecs.describe_task_definition(taskDefinition=task_definition)
            container_names = [
                container.get("name")
                for container in task_definition["taskDefinition"]["containerDefinitions"]
            ]
            check.invariant(
                container_name in container_names,
                f"Cannot override container '{container_name}' in task definition "
                f"'{self.task_definition}' because the container is not defined.",
            )
            # Pin the fully-resolved ARN so every launch uses the exact
            # revision we just validated.
            self.task_definition = task_definition["taskDefinition"]["taskDefinitionArn"]

    @property
    def inst_data(self):
        return self._inst_data

    @classmethod
    def config_type(cls):
        return {
            "task_definition": Field(
                dagster.String,
                is_required=False,
                description=(
                    "The task definition to use when launching new tasks. "
                    "If none is provided, each run will create its own task "
                    "definition."
                ),
            ),
            "container_name": Field(
                dagster.String,
                is_required=False,
                default_value="run",
                description=(
                    "The container name to use when launching new tasks. Defaults to 'run'."
                ),
            ),
        }

    @staticmethod
    def from_config_value(inst_data, config_value):
        return EcsRunLauncher(inst_data=inst_data, **config_value)

    def _ecs_tags(self, run_id):
        # Tags applied to the ECS task so the task can be traced back to a run.
        return [{"key": "dagster/run_id", "value": run_id}]

    def _run_tags(self, task_arn):
        # Tags applied to the Dagster run so the run can be traced to its task.
        cluster = self._task_metadata().cluster
        return {"ecs/task_arn": task_arn, "ecs/cluster": cluster}

    def launch_run(self, context: LaunchRunContext) -> None:
        """
        Launch a run in an ECS task.

        Currently, Fargate is the only supported launchType and awsvpc is the
        only supported networkMode. These are the defaults that are set up by
        docker-compose when you use the Dagster ECS reference deployment.
        """
        run = context.pipeline_run
        metadata = self._task_metadata()
        pipeline_origin = context.pipeline_code_origin
        image = pipeline_origin.repository_origin.container_image
        task_definition = self._task_definition(metadata, image)["family"]

        # The serialized run args become the command the container executes.
        input_json = serialize_dagster_namedtuple(
            ExecuteRunArgs(
                pipeline_origin=pipeline_origin,
                pipeline_run_id=run.run_id,
                instance_ref=self._instance.get_ref(),
            )
        )
        command = ["dagster", "api", "execute_run", input_json]

        # Run a task using the same network configuration as this process's
        # task.
        response = self.ecs.run_task(
            taskDefinition=task_definition,
            cluster=metadata.cluster,
            overrides={"containerOverrides": [{"name": self.container_name, "command": command}]},
            networkConfiguration={
                "awsvpcConfiguration": {
                    "subnets": metadata.subnets,
                    "assignPublicIp": "ENABLED",
                    "securityGroups": metadata.security_groups,
                }
            },
            launchType="FARGATE",
        )

        arn = response["tasks"][0]["taskArn"]
        # Cross-tag run and task so each can be located from the other.
        self._instance.add_run_tags(run.run_id, self._run_tags(task_arn=arn))
        self.ecs.tag_resource(resourceArn=arn, tags=self._ecs_tags(run.run_id))
        self._instance.report_engine_event(
            message=f"Launching run in task {arn} on cluster {metadata.cluster}",
            pipeline_run=run,
            cls=self.__class__,
        )

    def can_terminate(self, run_id):
        """Return True iff the run has a known ECS task that isn't STOPPED."""
        tags = self._instance.get_run_by_id(run_id).tags
        arn = tags.get("ecs/task_arn")
        cluster = tags.get("ecs/cluster")
        if not (arn and cluster):
            return False
        # Fix: a task that ECS has already reaped comes back as an empty
        # "tasks" list; indexing [0] unconditionally raised IndexError.
        tasks = self.ecs.describe_tasks(tasks=[arn], cluster=cluster).get("tasks")
        if not tasks:
            return False
        return tasks[0].get("lastStatus") != "STOPPED"

    def terminate(self, run_id):
        """Stop the run's ECS task; return True iff a stop was issued."""
        tags = self._instance.get_run_by_id(run_id).tags
        arn = tags.get("ecs/task_arn")
        cluster = tags.get("ecs/cluster")
        # Fix: without these tags, describe_tasks(tasks=[None]) raised a
        # botocore ParamValidationError instead of reporting "can't stop".
        if not (arn and cluster):
            return False
        tasks = self.ecs.describe_tasks(tasks=[arn], cluster=cluster).get("tasks") or [{}]
        if tasks[0].get("lastStatus") == "STOPPED":
            return False
        self.ecs.stop_task(task=arn, cluster=cluster)
        return True

    def _task_definition(self, metadata, image):
        """
        Return the launcher's default task definition if it's configured.

        Otherwise, a new task definition revision is registered for every run.
        First, the process that calls this method finds its own task
        definition. Next, it creates a new task definition based on its own
        but it overrides the image with the pipeline origin's image.
        """
        if self.task_definition:
            task_definition = self.ecs.describe_task_definition(taskDefinition=self.task_definition)
            return task_definition["taskDefinition"]

        # Start with the current process's task's definition but remove
        # extra keys that aren't useful for creating a new task definition
        # (status, revision, etc.). The botocore service model tells us
        # exactly which keys register_task_definition accepts.
        expected_keys = self.ecs.meta.service_model.shape_for(
            "RegisterTaskDefinitionRequest"
        ).members
        task_definition = {
            key: metadata.task_definition[key]
            for key in expected_keys
            if key in metadata.task_definition
        }

        # The current process might not be running in a container that has the
        # pipeline's code installed. Inherit most of the process's container
        # definition (things like environment, dependencies, etc.) but replace
        # the image with the pipeline origin's image and give it a new name.
        # Also remove entryPoint. We plan to set containerOverrides. If both
        # entryPoint and containerOverrides are specified, they're concatenated
        # and the command will fail
        # https://aws.amazon.com/blogs/opensource/demystifying-entrypoint-cmd-docker/
        # Fix: copy the list before mutating — .remove() previously edited
        # metadata.task_definition["containerDefinitions"] in place.
        container_definitions = list(task_definition["containerDefinitions"])
        container_definitions.remove(metadata.container_definition)
        container_definitions.append(
            {
                **metadata.container_definition,
                "name": self.container_name,
                "image": image,
                "entryPoint": [],
            }
        )
        task_definition = {
            **task_definition,
            "family": "dagster-run",
            "containerDefinitions": container_definitions,
        }

        # Register the overridden task definition as a revision to the
        # "dagster-run" family.
        # TODO: Only register the task definition if a matching one doesn't
        # already exist. Otherwise, we risk exhausting the revisions limit
        # (1,000,000 per family) with unnecessary revisions:
        # https://docs.aws.amazon.com/AmazonECS/latest/developerguide/service-quotas.html
        self.ecs.register_task_definition(**task_definition)

        return task_definition

    def _task_metadata(self):
        """
        Introspect the current task's cluster, networking, and definitions.

        ECS injects an environment variable into each Fargate task. The value
        of this environment variable is a url that can be queried to introspect
        information about the current process's running task:

        https://docs.aws.amazon.com/AmazonECS/latest/userguide/task-metadata-endpoint-v4-fargate.html
        """
        container_metadata_uri = os.environ.get("ECS_CONTAINER_METADATA_URI_V4")
        # The container endpoint reports our own container's name; the /task
        # endpoint describes the enclosing task.
        name = requests.get(container_metadata_uri).json()["Name"]

        task_metadata_uri = container_metadata_uri + "/task"
        response = requests.get(task_metadata_uri).json()
        cluster = response.get("Cluster")
        task_arn = response.get("TaskARN")

        task = self.ecs.describe_tasks(tasks=[task_arn], cluster=cluster)["tasks"][0]

        # Collect the subnets and ENIs attached to this task so the launched
        # run task can reuse the same awsvpc network configuration.
        enis = []
        subnets = []
        for attachment in task["attachments"]:
            if attachment["type"] == "ElasticNetworkInterface":
                for detail in attachment["details"]:
                    if detail["name"] == "subnetId":
                        subnets.append(detail["value"])
                    if detail["name"] == "networkInterfaceId":
                        enis.append(self.ec2.NetworkInterface(detail["value"]))

        security_groups = []
        for eni in enis:
            for group in eni.groups:
                security_groups.append(group["GroupId"])

        task_definition_arn = task["taskDefinitionArn"]
        task_definition = self.ecs.describe_task_definition(taskDefinition=task_definition_arn)[
            "taskDefinition"
        ]

        # Our own container definition, matched by the name the metadata
        # endpoint reported. Raises StopIteration if absent (as before).
        container_definition = next(
            container
            for container in task_definition["containerDefinitions"]
            if container["name"] == name
        )

        return TaskMetadata(
            cluster=cluster,
            subnets=subnets,
            security_groups=security_groups,
            task_definition=task_definition,
            container_definition=container_definition,
        )