Skip to content

Commit 53847a6

Browse files
authored
Decouple ray up and user's file_mounts / setup (#407)
* wip: Add setup in provision pipeline * Fix gcp/azure * remove useless variables * minor fix * Add some TODOs * Fix comments * Fix comments * Fix gcp/azure initialization_commands * Remove setup from template * Fix setup directory * Change rsync back to -Pavz * Remove unused argument * fix file_mount/dir_mount
1 parent d19a6fd commit 53847a6

11 files changed

+140
-290
lines changed

examples/horovod_distributed_tf_app.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# TODO (zhwu): Fix this for #272.
1+
# TODO(zhwu): Fix this for #272.
22
import json
33
from typing import Dict, List
44

sky/backends/backend.py

+3
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,9 @@ def sync_file_mounts(
4242
) -> None:
4343
raise NotImplementedError
4444

45+
def setup(self, handle: ResourceHandle, task: Task) -> None:
46+
raise NotImplementedError
47+
4548
def add_storage_objects(self, task: Task) -> None:
4649
raise NotImplementedError
4750

sky/backends/backend_utils.py

+2-61
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,11 @@
66
import pathlib
77
import shlex
88
import subprocess
9-
import tempfile
109
import textwrap
1110
import time
1211
from typing import Dict, List, Optional, Tuple, Union
1312
import uuid
1413
import yaml
15-
import zlib
1614

1715
import jinja2
1816

@@ -22,7 +20,6 @@
2220
from sky import clouds
2321
from sky import sky_logging
2422
from sky import resources
25-
from sky import task as task_lib
2623
from sky.backends import wheel_utils
2724
from sky.adaptors import azure
2825
from sky.skylet import log_lib
@@ -90,12 +87,7 @@ def wrap_file_mount(cls, path: str) -> str:
9087
return os.path.join(_SKY_REMOTE_FILE_MOUNTS_DIR, path.lstrip('/'))
9188

9289
@classmethod
93-
def make_safe_symlink_command(
94-
cls,
95-
*,
96-
source: str,
97-
target: str,
98-
download_target_commands: Optional[List[str]] = None) -> str:
90+
def make_safe_symlink_command(cls, *, source: str, target: str) -> str:
9991
"""Returns a command that safely symlinks 'source' to 'target'.
10092
10193
All intermediate directories of 'source' will be owned by $USER,
@@ -147,8 +139,6 @@ def make_safe_symlink_command(
147139
f'(echo "!!! Failed mounting because path exists ({source})"; '
148140
'exit 1)))',
149141
]
150-
if download_target_commands is not None:
151-
commands += download_target_commands
152142
commands += [
153143
# Link.
154144
f'sudo ln -s {target} {source}',
@@ -311,8 +301,7 @@ def remove_cluster(cls, ip: str, auth_config: Dict[str, str]):
311301

312302

313303
# TODO: too many things happening here - leaky abstraction. Refactor.
314-
def write_cluster_config(task: task_lib.Task,
315-
to_provision: Resources,
304+
def write_cluster_config(to_provision: Resources,
316305
num_nodes: int,
317306
cluster_config_template: str,
318307
cluster_name: str,
@@ -373,48 +362,6 @@ def write_cluster_config(task: task_lib.Task,
373362

374363
assert cluster_name is not None
375364

376-
setup_sh_path = None
377-
if task.setup is not None:
378-
codegen = textwrap.dedent(f"""#!/bin/bash
379-
. $(conda info --base)/etc/profile.d/conda.sh
380-
{task.setup}
381-
""")
382-
# Use a stable path, /<tempdir>/sky_setup_<checksum>.sh, because
383-
# rerunning the same task without any changes to the content of the
384-
# setup command should skip the setup step. Using NamedTemporaryFile()
385-
# would generate a random path every time, hence re-triggering setup.
386-
checksum = zlib.crc32(codegen.encode())
387-
tempdir = tempfile.gettempdir()
388-
# TODO: file lock on this path, in case tasks have the same setup cmd.
389-
with open(os.path.join(tempdir, f'sky_setup_{checksum}.sh'), 'w') as f:
390-
f.write(codegen)
391-
setup_sh_path = f.name
392-
393-
# File mounts handling for remote paths possibly without write access:
394-
# (1) in 'file_mounts' sections, add <prefix> to these target paths.
395-
# (2) then, create symlinks from '/.../file' to '<prefix>/.../file'.
396-
# We need to do these since as of Ray 1.8, this is not supported natively
397-
# (Docker works though, of course):
398-
# https://github.com/ray-project/ray/pull/9332
399-
# https://github.com/ray-project/ray/issues/9326
400-
mounts = task.get_local_to_remote_file_mounts()
401-
wrapped_file_mounts = {}
402-
initialization_commands = []
403-
if mounts is not None:
404-
for remote, local in mounts.items():
405-
if not os.path.isabs(remote) and not remote.startswith('~/'):
406-
remote = f'~/{remote}'
407-
if remote.startswith('~/') or remote.startswith('/tmp/'):
408-
# Skip as these should be writable locations.
409-
wrapped_file_mounts[remote] = local
410-
continue
411-
assert os.path.isabs(remote), (remote, local)
412-
wrapped_remote = FileMountHelper.wrap_file_mount(remote)
413-
wrapped_file_mounts[wrapped_remote] = local
414-
command = FileMountHelper.make_safe_symlink_command(
415-
source=remote, target=wrapped_remote)
416-
initialization_commands.append(command)
417-
418365
# TODO(suquark): once we have sky on PYPI, we should directly install sky
419366
# from PYPI
420367
local_wheel_path = wheel_utils.build_sky_wheel()
@@ -424,14 +371,8 @@ def write_cluster_config(task: task_lib.Task,
424371
resources_vars,
425372
**{
426373
'cluster_name': cluster_name,
427-
'setup_sh_path': setup_sh_path,
428-
'workdir': task.workdir,
429-
'docker_image': task.docker_image,
430374
'num_nodes': num_nodes,
431375
'disk_size': to_provision.disk_size,
432-
# File mounts handling.
433-
'file_mounts': wrapped_file_mounts,
434-
'initialization_commands': initialization_commands or None,
435376
# Region/zones.
436377
'region': region,
437378
'zones': ','.join(zones),

0 commit comments

Comments
 (0)