Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Core] Install SkyPilot runtime in separate env #2801

Closed
wants to merge 34 commits into from
Closed
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
9bcabb4
fix
cblmemo Oct 3, 2023
57fd5be
Moves skypilot/ray installation to skypilot-runtime env
Michaelvll Nov 17, 2023
7824745
Fix conda run commands and backward compatibility
Michaelvll Nov 17, 2023
c8a7052
format
Michaelvll Nov 17, 2023
7ddcf75
Fix GCP setup
Michaelvll Nov 18, 2023
22d9860
fix dependency cloud
Michaelvll Nov 18, 2023
f3e6df0
Adopt changes to all other clouds
Michaelvll Nov 18, 2023
6c6ea73
fix ray command to be used
Michaelvll Nov 18, 2023
ed9866e
fix azure case
Michaelvll Nov 18, 2023
6491629
Change to venv implementation
Michaelvll Nov 19, 2023
01eea55
format
Michaelvll Nov 19, 2023
6753b2d
format
Michaelvll Nov 19, 2023
db60092
Fix venv
Michaelvll Nov 19, 2023
5fb31e0
remove deactivate's as all the commands are run alone
Michaelvll Nov 19, 2023
952bc80
Fix activate and multiple nodes
Michaelvll Nov 19, 2023
b39fa19
Fix worker ray command
Michaelvll Nov 20, 2023
27dc07b
format
Michaelvll Nov 20, 2023
97ad588
fix runtime for controller
Michaelvll Nov 20, 2023
dbcfff5
deactivate for run command
Michaelvll Nov 20, 2023
cc267c0
fix comment
Michaelvll Nov 20, 2023
c353d3b
deactivate before enabling conda
Michaelvll Nov 21, 2023
465cd77
Fix deactivate
Michaelvll Nov 21, 2023
a280503
longer job time
Michaelvll Nov 21, 2023
bedd6e4
longer timeout
Michaelvll Nov 21, 2023
141ac76
longer wait time
Michaelvll Nov 21, 2023
cf7c315
Merge branch 'fix-ports-on-azure' of github.com:skypilot-org/skypilot…
Michaelvll Nov 21, 2023
e98788f
longer sleep time for autostop
Michaelvll Nov 21, 2023
06c325a
install ray not equal to 2.8
Michaelvll Nov 22, 2023
fe9b5da
Merge branch 'master' of github.com:skypilot-org/skypilot into depend…
Michaelvll Dec 3, 2023
71d9594
Merge branch 'master' of github.com:skypilot-org/skypilot into depend…
Michaelvll Dec 27, 2023
841f1ff
use constants for envs
Michaelvll Dec 27, 2023
ba38eea
Merge branch 'master' of https://github.com/skypilot-org/skypilot int…
Michaelvll Mar 18, 2024
93afbc6
fix var
Michaelvll Mar 18, 2024
ae84211
format
Michaelvll Mar 18, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions sky/backends/backend_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,6 @@
SKY_RAY_YAML_REMOTE_PATH = '~/.sky/sky_ray.yml'
# Exclude subnet mask from IP address regex.
IP_ADDR_REGEX = r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}(?!/\d{1,2})\b'
SKY_REMOTE_PATH = '~/.sky/wheels'
SKY_USER_FILE_PATH = '~/.sky/generated'

BOLD = '\033[1m'
Expand Down Expand Up @@ -1042,6 +1041,12 @@ def write_cluster_config(
# Conda setup
'conda_installation_commands':
constants.CONDA_INSTALLATION_COMMANDS,
# Ray and SkyPilot setup
'ray_and_skypilot_setup_commands':
constants.RAY_AND_SKYPILOT_SETUP_COMMANDS.format(
cloud=str(cloud).lower(),
sky_wheel_hash=wheel_hash,
sky_version=str(version.parse(sky.__version__))),

# Port of Ray (GCS server).
# Ray's default port 6379 is conflicted with Redis.
Expand All @@ -1054,7 +1059,7 @@ def write_cluster_config(
# Cloud credentials for cloud storage.
'credentials': credentials,
# Sky remote utils.
'sky_remote_path': SKY_REMOTE_PATH,
'sky_remote_path': constants.SKY_REMOTE_WHEEL_PATH,
'sky_local_path': str(local_wheel_path),
# Add yaml file path to the template variables.
'sky_ray_yaml_remote_path': SKY_RAY_YAML_REMOTE_PATH,
Expand Down
9 changes: 6 additions & 3 deletions sky/backends/cloud_vm_ray_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,8 @@
'monkey_patches' / 'monkey_patch_ray_up.py')

# Restart skylet when the version does not match to keep the skylet up-to-date.
_MAYBE_SKYLET_RESTART_CMD = 'python3 -m sky.skylet.attempt_skylet'
_MAYBE_SKYLET_RESTART_CMD = (f'{constants.ACTIVATE_PYTHON_ENV} '
'python -m sky.skylet.attempt_skylet')


def _get_cluster_config_template(cloud):
Expand Down Expand Up @@ -1814,7 +1815,7 @@ def _tpu_pod_setup(self, cluster_yaml: str,
# Get the private IP of head node for connecting Ray cluster.
head_runner = command_runner.SSHCommandRunner(
all_ips[0], port=cluster_handle.head_ssh_port, **ssh_credentials)
cmd_str = 'python3 -c \"import ray; print(ray._private.services.get_node_ip_address())\"' # pylint: disable=line-too-long
cmd_str = f'{constants.ACTIVATE_PYTHON_ENV} python -c \"import ray; print(ray._private.services.get_node_ip_address())\"' # pylint: disable=line-too-long
rc, stdout, stderr = head_runner.run(cmd_str,
require_outputs=True,
stream_logs=False)
Expand Down Expand Up @@ -3354,7 +3355,8 @@ def _exec_code_on_head(
handle, ray_command, ray_job_id)
else:
job_submit_cmd = (
'RAY_DASHBOARD_PORT=$(python -c "from sky.skylet import job_lib; print(job_lib.get_job_submission_port())" 2> /dev/null || echo 8265);' # pylint: disable=line-too-long
f'{constants.ACTIVATE_PYTHON_ENV}'
f'RAY_DASHBOARD_PORT=$(python -c "from sky.skylet import job_lib; print(job_lib.get_job_submission_port())" 2> /dev/null || echo 8265);' # pylint: disable=line-too-long
f'{cd} && ray job submit '
'--address=http://127.0.0.1:$RAY_DASHBOARD_PORT '
f'--submission-id {ray_job_id} --no-wait '
Expand Down Expand Up @@ -3476,6 +3478,7 @@ def _setup_and_create_job_cmd_on_local_head(
ssh_user, remote_run_file)
switch_user_cmd = ' '.join(switch_user_cmd)
job_submit_cmd = (
f'{constants.ACTIVATE_PYTHON_ENV} '
'ray job submit '
'--address='
f'http://127.0.0.1:{constants.SKY_REMOTE_RAY_DASHBOARD_PORT} '
Expand Down
4 changes: 2 additions & 2 deletions sky/backends/wheel_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,12 @@

import sky
from sky import sky_logging
from sky.backends import backend_utils
from sky.skylet import constants

logger = sky_logging.init_logger(__name__)

# Local wheel path is same as the remote path.
WHEEL_DIR = pathlib.Path(os.path.expanduser(backend_utils.SKY_REMOTE_PATH))
WHEEL_DIR = pathlib.Path(os.path.expanduser(constants.SKY_REMOTE_WHEEL_PATH))
_WHEEL_LOCK_PATH = WHEEL_DIR.parent / '.wheels_lock'
SKY_PACKAGE_PATH = pathlib.Path(sky.__file__).parent.parent / 'sky'

Expand Down
24 changes: 15 additions & 9 deletions sky/provision/instance_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,19 +33,23 @@
_DUMP_RAY_PORTS = (
'python -c \'import json, os; '
f'json.dump({constants.SKY_REMOTE_RAY_PORT_DICT_STR}, '
f'open(os.path.expanduser("{constants.SKY_REMOTE_RAY_PORT_FILE}"), "w"))\'')
f'open(os.path.expanduser("{constants.SKY_REMOTE_RAY_PORT_FILE}"), "w"))\';'
)

_RAY_PORT_COMMAND = (
'RAY_PORT=$(python -c "from sky.skylet import job_lib; '
'print(job_lib.get_ray_port())" 2> /dev/null || echo 6379)')

# Command that calls `ray status` with SkyPilot's Ray port set.
RAY_STATUS_WITH_SKY_RAY_PORT_COMMAND = (
f'{constants.ACTIVATE_PYTHON_ENV} '
f'{_RAY_PORT_COMMAND}; '
'RAY_ADDRESS=127.0.0.1:$RAY_PORT ray status')
'export RAY_ADDRESS=127.0.0.1:$RAY_PORT; '
f'ray status')

# Restart skylet when the version does not match to keep the skylet up-to-date.
_MAYBE_SKYLET_RESTART_CMD = 'python3 -m sky.skylet.attempt_skylet'
_MAYBE_SKYLET_RESTART_CMD = (f'{constants.ACTIVATE_PYTHON_ENV} '
'python -m sky.skylet.attempt_skylet')


def _auto_retry(func):
Expand Down Expand Up @@ -220,10 +224,11 @@ def start_ray_on_head_node(cluster_name: str, custom_resource: Optional[str],
# the same credentials. Otherwise, `ray status` will fail to fetch the
# available nodes.
# Reference: https://github.com/skypilot-org/skypilot/issues/2441
cmd = ('ray stop; unset AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; '
'RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 '
f'ray start --head {ray_options} || exit 1;' + _RAY_PRLIMIT +
_DUMP_RAY_PORTS)
cmd = (f'{constants.ACTIVATE_PYTHON_ENV} ray stop; '
'unset AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; '
'export RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0; '
f'ray start --head {ray_options} || '
'exit 1;' + _RAY_PRLIMIT + _DUMP_RAY_PORTS)
logger.info(f'Running command on head node: {cmd}')
# TODO(zhwu): add the output to log files.
returncode, stdout, stderr = ssh_runner.run(cmd,
Expand Down Expand Up @@ -267,7 +272,7 @@ def start_ray_on_worker_nodes(cluster_name: str, no_restart: bool,
# Unset AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY, see the comment in
# `start_ray_on_head_node`.
cmd = (f'unset AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY; '
'RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 '
'export RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0; '
f'ray start --disable-usage-stats {ray_options} || exit 1;' +
_RAY_PRLIMIT + _DUMP_RAY_PORTS)
if no_restart:
Expand All @@ -278,9 +283,10 @@ def start_ray_on_worker_nodes(cluster_name: str, no_restart: bool,
# that is connected to the head with the correct port.
cmd = (f'{_RAY_PORT_COMMAND}; ps aux | grep "ray/raylet/raylet" | '
f'grep "gcs-address={head_private_ip}:${{RAY_PORT}}" || '
f'{{ {cmd}; }}')
f'{{ {cmd} }}')
else:
cmd = 'ray stop; ' + cmd
cmd = constants.ACTIVATE_PYTHON_ENV + cmd

logger.info(f'Running command on worker nodes: {cmd}')

Expand Down
4 changes: 3 additions & 1 deletion sky/serve/serve_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from sky import status_lib
from sky.serve import constants
from sky.serve import serve_state
from sky.skylet import constants as skylet_constants
from sky.skylet import job_lib
from sky.utils import common_utils
from sky.utils import log_utils
Expand Down Expand Up @@ -825,4 +826,5 @@ def stream_serve_process_logs(cls, service_name: str,
def _build(cls, code: List[str]) -> str:
code = cls._PREFIX + code
generated_code = '; '.join(code)
return f'python3 -u -c {shlex.quote(generated_code)}'
return (f'{skylet_constants.ACTIVATE_PYTHON_ENV} '
f'python -u -c {shlex.quote(generated_code)}')
7 changes: 4 additions & 3 deletions sky/skylet/attempt_skylet.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,13 @@ def restart_skylet():
# TODO(zhwu): make the killing graceful, e.g., use a signal to tell
# skylet to exit, instead of directly killing it.
subprocess.run(
'ps aux | grep "sky.skylet.skylet" | grep "python3 -m"'
'ps aux | grep "sky.skylet.skylet" | grep "python -m"'
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just curious, is there any reason to make this change?

'| awk \'{print $2}\' | xargs kill >> ~/.sky/skylet.log 2>&1',
shell=True,
check=False)
subprocess.run(
'nohup python3 -m sky.skylet.skylet'
f'{constants.ACTIVATE_PYTHON_ENV} '
f'nohup python -m sky.skylet.skylet'
' >> ~/.sky/skylet.log 2>&1 &',
shell=True,
check=True)
Expand All @@ -27,7 +28,7 @@ def restart_skylet():


proc = subprocess.run(
'ps aux | grep -v "grep" | grep "sky.skylet.skylet" | grep "python3 -m"',
'ps aux | grep -v "grep" | grep "sky.skylet.skylet" | grep "python -m"',
shell=True,
check=False)

Expand Down
4 changes: 3 additions & 1 deletion sky/skylet/autostop_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from sky import sky_logging
from sky.skylet import configs
from sky.skylet import constants
from sky.utils import common_utils

logger = sky_logging.init_logger(__name__)
Expand Down Expand Up @@ -123,4 +124,5 @@ def is_autostopping(cls) -> str:
def _build(cls, code: List[str]) -> str:
code = cls._PREFIX + code
code = ';'.join(code)
return f'python3 -u -c {shlex.quote(code)}'
return (f'{constants.ACTIVATE_PYTHON_ENV} '
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, why not use sky/skylet/constants.py::run_in_python_env to replace all of those constants.ACTIVATE_PYTHON_ENV usages?

f'python -u -c {shlex.quote(code)}')
47 changes: 44 additions & 3 deletions sky/skylet/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
SKY_REMOTE_RAY_PORT_FILE = '~/.sky/ray_port.json'
SKY_REMOTE_RAY_TEMPDIR = '/tmp/ray_skypilot'
SKY_REMOTE_RAY_VERSION = '2.4.0'
SKY_REMOTE_WHEEL_PATH = '~/.sky/wheels'
SKY_REMOTE_PYTHON_ENV = '~/skypilot-runtime'

# TODO(mluo): Make explicit `sky launch -c <name> ''` optional.
UNINITIALIZED_ONPREM_CLUSTER_MESSAGE = (
Expand Down Expand Up @@ -64,22 +66,61 @@
DOCKER_SERVER_ENV_VAR,
}

ACTIVATE_PYTHON_ENV = (f'[ -d {SKY_REMOTE_PYTHON_ENV} ] && '
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In what case will the remote python env not exist? Should we print some warning here?

f'source {SKY_REMOTE_PYTHON_ENV}/bin/activate;')


def run_in_python_env(command):
    """Returns a shell command that runs `command` in SkyPilot's python env.

    The returned string wraps `command` in a `( ... )` subshell that first
    executes ACTIVATE_PYTHON_ENV and then `command`.

    Args:
        command: Shell command text to execute after the activation prefix.

    Returns:
        A single shell command string of the form `( <activate> <command> )`.
    """
    # No trailing `deactivate`: everything runs inside a subshell, so the
    # activated environment never leaks into the calling shell. Keeping
    # `command` as the last statement lets the subshell's exit status be the
    # exit status of `command`, instead of being replaced by the status of
    # `deactivate` (which is also undefined when activation was skipped).
    return f'( {ACTIVATE_PYTHON_ENV} {command} )'


# Install conda on the remote cluster if it is not already installed.
# We do not install the latest conda with python 3.11 because ray has not
# officially supported it yet.
# https://github.com/ray-project/ray/issues/31606
# We use python 3.10 to be consistent with the python version of the
# AWS's Deep Learning AMI's default conda environment.
_RUN_PYTHON = run_in_python_env('python \\$@')
_RUN_PIP = run_in_python_env('pip \\$@')
_RUN_RAY = run_in_python_env('ray \\$@')
CONDA_INSTALLATION_COMMANDS = (
'which conda > /dev/null 2>&1 || '
'(wget -nc https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-Linux-x86_64.sh -O Miniconda3-Linux-x86_64.sh && ' # pylint: disable=line-too-long
'{ wget -nc https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-Linux-x86_64.sh -O Miniconda3-Linux-x86_64.sh && ' # pylint: disable=line-too-long
'bash Miniconda3-Linux-x86_64.sh -b && '
'eval "$(~/miniconda3/bin/conda shell.bash hook)" && conda init && '
'conda config --set auto_activate_base true); '
'conda config --set auto_activate_base true && source ~/.bashrc; }; '
# Only run `conda init` if the conda is not installed under /opt/conda,
# which is the case for VMs created on GCP, and running `conda init` will
# cause error and waiting for the error to be reported: #2273.
'which conda | grep /opt/conda || conda init > /dev/null;')
'which conda | grep /opt/conda || conda init > /dev/null;'
# Create a separate Python virtual environment (venv) for SkyPilot dependencies.
f'[ -d {SKY_REMOTE_PYTHON_ENV} ] || '
f'python -m venv {SKY_REMOTE_PYTHON_ENV}; '
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Have we considered using a conda env instead? What are the pros & cons of the two implementations?

f'source {SKY_REMOTE_PYTHON_ENV}/bin/activate; '
f'echo "function skypy () {{ {_RUN_PYTHON} }}" >> ~/.bashrc;'
f'echo "function skypip () {{ {_RUN_PIP} }}" >> ~/.bashrc;'
f'echo "function skyray () {{ {_RUN_RAY} }}" >> ~/.bashrc;')

RAY_AND_SKYPILOT_SETUP_COMMANDS = (
'(type -a python | grep -q python3) || '
'echo "alias python=python3" >> ~/.bashrc;'
'(type -a pip | grep -q pip3) || echo "alias pip=pip3" >> ~/.bashrc;'
'mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app && '
'touch ~/.sudo_as_admin_successful;'
f'(pip list | grep "ray " | grep "{SKY_REMOTE_RAY_VERSION}" '
'2>&1 > /dev/null || '
f'pip install --exists-action w -U '
f'ray[default]=={SKY_REMOTE_RAY_VERSION});'
f'(pip list | grep "skypilot " && '
f'[ "$(cat {SKY_REMOTE_WHEEL_PATH}/current_sky_wheel_hash)" == '
f'"{{sky_wheel_hash}}" ]) || (pip uninstall skypilot -y; '
f'pip install "$(echo {SKY_REMOTE_WHEEL_PATH}/'
f'{{sky_wheel_hash}}/skypilot-{{sky_version}}*.whl)[{{cloud}}, remote]" && '
f'echo "{{sky_wheel_hash}}" '
f'> {SKY_REMOTE_WHEEL_PATH}/current_sky_wheel_hash || exit 1);'
f'python -c '
'"from sky.skylet.ray_patches import patch; patch()" || exit 1;')

# The name for the environment variable that stores SkyPilot user hash, which
# is mainly used to make sure sky commands run on a VM launched by SkyPilot
Expand Down
3 changes: 2 additions & 1 deletion sky/skylet/job_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -920,4 +920,5 @@ def get_run_timestamp_with_globbing(cls,
def _build(cls, code: List[str]) -> str:
code = cls._PREFIX + code
code = ';'.join(code)
return f'python3 -u -c {shlex.quote(code)}'
return (f'{constants.ACTIVATE_PYTHON_ENV} '
f'python -u -c {shlex.quote(code)}')
3 changes: 2 additions & 1 deletion sky/spot/spot_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -744,7 +744,8 @@ def set_pending(cls, job_id: int, spot_dag: 'dag_lib.Dag') -> str:
def _build(cls, code: List[str]) -> str:
code = cls._PREFIX + code
generated_code = '; '.join(code)
return f'python3 -u -c {shlex.quote(generated_code)}'
return (f'{constants.ACTIVATE_PYTHON_ENV} '
f'python -u -c {shlex.quote(generated_code)}')


def dump_job_table_cache(job_table: str):
Expand Down
8 changes: 1 addition & 7 deletions sky/templates/aws-ray.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -156,18 +156,12 @@ setup_commands:
# Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
- mkdir -p ~/.ssh; touch ~/.ssh/config;
{{ conda_installation_commands }}
(type -a python | grep -q python3) || echo 'alias python=python3' >> ~/.bashrc;
(type -a pip | grep -q pip3) || echo 'alias pip=pip3' >> ~/.bashrc;
source ~/.bashrc;
mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app;
(pip3 list | grep "ray " | grep {{ray_version}} 2>&1 > /dev/null || pip3 install --exists-action w -U ray[default]=={{ray_version}});
(pip3 list | grep "skypilot " && [ "$(cat {{sky_remote_path}}/current_sky_wheel_hash)" == "{{sky_wheel_hash}}" ]) || (pip3 uninstall skypilot -y; pip3 install "$(echo {{sky_remote_path}}/{{sky_wheel_hash}}/skypilot-{{sky_version}}*.whl)[aws, remote]" && echo "{{sky_wheel_hash}}" > {{sky_remote_path}}/current_sky_wheel_hash || exit 1);
{{ ray_and_skypilot_setup_commands }}
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
{%- if docker_image is none %}
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
{%- endif %}
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
python3 -c "from sky.skylet.ray_patches import patch; patch()" || exit 1;
[ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); # This is needed for `-o allow_other` option for `goofys`;

# Commands to start ray clusters are now placed in `sky.provision.instance_setup`.
Expand Down
16 changes: 5 additions & 11 deletions sky/templates/azure-ray.yml.j2
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should add this for newly added clouds as well, e.g., RunPod.

Original file line number Diff line number Diff line change
Expand Up @@ -140,12 +140,7 @@ setup_commands:
# This also kills the service that is holding the lock on dpkg (problem only exists on aws/azure, not gcp)
- mkdir -p ~/.ssh; touch ~/.ssh/config;
{{ conda_installation_commands }}
(type -a python | grep -q python3) || echo 'alias python=python3' >> ~/.bashrc;
(type -a pip | grep -q pip3) || echo 'alias pip=pip3' >> ~/.bashrc;
source ~/.bashrc;
mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app && touch ~/.sudo_as_admin_successful;
(pip3 list | grep "ray " | grep {{ray_version}} 2>&1 > /dev/null || pip3 install --exists-action w -U ray[default]=={{ray_version}});
(pip3 list | grep "skypilot " && [ "$(cat {{sky_remote_path}}/current_sky_wheel_hash)" == "{{sky_wheel_hash}}" ]) || (pip3 uninstall skypilot -y; pip3 install "$(echo {{sky_remote_path}}/{{sky_wheel_hash}}/skypilot-{{sky_version}}*.whl)[azure, remote]" && echo "{{sky_wheel_hash}}" > {{sky_remote_path}}/current_sky_wheel_hash || exit 1);
{{ ray_and_skypilot_setup_commands }}
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
{%- if docker_image is none %}
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
Expand All @@ -155,7 +150,6 @@ setup_commands:
sudo systemctl disable jupyterhub > /dev/null 2>&1 || true;
{%- endif %}
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
python3 -c "from sky.skylet.ray_patches import patch; patch()" || exit 1;
[ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');

# Command to start ray on the head node. You don't need to change this.
Expand All @@ -167,14 +161,14 @@ setup_commands:
# current num items (num SSH connections): 2
head_start_ray_commands:
# NOTE: --disable-usage-stats in `ray start` saves 10 seconds of idle wait.
- ray stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 ray start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
- source ~/skypilot-runtime/bin/activate; ray stop; export RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0; ray start --disable-usage-stats --head --port={{ray_port}} --dashboard-port={{ray_dashboard_port}} --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
{{dump_port_command}};
{{dump_port_command}}

{%- if num_nodes > 1 %}
worker_start_ray_commands:
- ray stop; RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0 ray start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done;
- source ~/skypilot-runtime/bin/activate; ray stop; export RAY_SCHEDULER_EVENTS=0 RAY_DEDUP_LOGS=0; ray start --disable-usage-stats --address=$RAY_HEAD_IP:{{ray_port}} --object-manager-port=8076 {{"--resources='%s'" % custom_resources if custom_resources}} --temp-dir {{ray_temp_dir}} || exit 1;
which prlimit && for id in $(pgrep -f raylet/raylet); do sudo prlimit --nofile=1048576:1048576 --pid=$id || true; done
{%- else %}
worker_start_ray_commands: []
{%- endif %}
Expand Down
Loading