From f03c6acd7d6e1341b0754c81fac158af0675f9ce Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Tue, 22 Feb 2022 17:40:26 -0800 Subject: [PATCH 01/14] Add conda activate support to bashrc --- sky/backends/cloud_vm_ray_backend.py | 11 ++++------- sky/skylet/log_lib.py | 1 - sky/templates/aws-ray.yml.j2 | 2 ++ 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index a9d4e0d6ffd..4b1604a1cce 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -734,7 +734,7 @@ def _retry_region_zones(self, to_provision: Resources, num_nodes: int, plural = '' if num_nodes == 1 else 's' logger.info( f'{style.BRIGHT}Successfully provisioned or found' - f' existing VM{plural}. Setup completed.{style.RESET_ALL}') + f' existing VM{plural}.{style.RESET_ALL}') return config_dict message = ('Failed to acquire resources in all regions/zones' f' (requested {to_provision}).' @@ -1219,13 +1219,10 @@ def setup(self, handle: ResourceHandle, task: Task) -> None: if task.setup is None: return - codegen = textwrap.dedent(f"""\ - #!/bin/bash - # TODO(zhwu): Move this to bashrc - . $(conda info --base)/etc/profile.d/conda.sh - {task.setup}""") + + setup_script = log_lib.make_task_bash_script(task.setup) with tempfile.NamedTemporaryFile('w', prefix='sky_setup_') as f: - f.write(codegen) + f.write(setup_script) f.flush() setup_sh_path = f.name setup_file = os.path.basename(setup_sh_path) diff --git a/sky/skylet/log_lib.py b/sky/skylet/log_lib.py index f44214c2c04..386aa7747fc 100644 --- a/sky/skylet/log_lib.py +++ b/sky/skylet/log_lib.py @@ -155,7 +155,6 @@ def make_task_bash_script(codegen: str) -> str: textwrap.dedent(f"""\ #!/bin/bash source ~/.bashrc - . $(conda info --base)/etc/profile.d/conda.sh 2> /dev/null || true cd {SKY_REMOTE_WORKDIR}"""), codegen, ] diff --git a/sky/templates/aws-ray.yml.j2 b/sky/templates/aws-ray.yml.j2 index 795309cb93a..a8185964f98 100644 --- a/sky/templates/aws-ray.yml.j2 +++ b/sky/templates/aws-ray.yml.j2 @@ -73,6 +73,8 @@ setup_commands: # This AMI's system Python is version 2+. - pip3 install -U ray[default]=={{ray_version}} && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app - pip3 uninstall sky -y &> /dev/null; pip3 install {{sky_remote_path}}/*.whl && python3 -c "from sky.skylet.ray_patches import patch; patch()" # patch the buggy ray file + # Make `conda activate` available + - grep -qxF '. $(conda info --base)/etc/profile.d/conda.sh' ~/.bashrc || echo '. $(conda info --base)/etc/profile.d/conda.sh' >> ~/.bashrc # Command to start ray on the head node. You don't need to change this. 
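The `grep -qxF ... || echo ... >> ~/.bashrc` setup command added to `aws-ray.yml.j2` above uses a common idempotent-append pattern: the conda hook line is written to `~/.bashrc` only if an identical line is not already present, so re-running the setup commands never duplicates it. A minimal standalone sketch of the same pattern (the file and line below simply restate what the template writes; they are illustrative, not additional patch content):

```bash
#!/bin/bash
# Append a line to a file only if that exact line is not already there.
# grep flags: -q quiet, -x match the whole line, -F treat the pattern as a fixed string.
line='. $(conda info --base)/etc/profile.d/conda.sh'
file="$HOME/.bashrc"
grep -qxF "$line" "$file" || echo "$line" >> "$file"

# Running this snippet repeatedly leaves exactly one copy of the line in ~/.bashrc.
```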
head_start_ray_commands: From ae96c9387ddd59f0b73074f6e5803075adbb154a Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Tue, 22 Feb 2022 19:38:35 -0800 Subject: [PATCH 02/14] Add doc and make sure conda activate works --- docs/source/getting-started/tutorial.rst | 8 ++++++++ sky/backends/cloud_vm_ray_backend.py | 3 ++- sky/skylet/log_lib.py | 1 - sky/templates/aws-ray.yml.j2 | 2 -- 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/docs/source/getting-started/tutorial.rst b/docs/source/getting-started/tutorial.rst index b1dda49a028..ea61b566be5 100644 --- a/docs/source/getting-started/tutorial.rst +++ b/docs/source/getting-started/tutorial.rst @@ -17,6 +17,10 @@ and run command: accelerators: V100:4 setup: | + # If using a my_setup.sh script to setup, please use + # `bash -i my_setup.sh` to capture the environment + # variable and make sure `conda activate` works + git clone https://github.com/huggingface/transformers/ cd transformers pip3 install . @@ -24,6 +28,10 @@ and run command: pip3 install -r requirements.txt run: | + # If using a my_run.sh script to run commands, please use + # `bash -i my_run.sh` to capture the environment variable + # and make sure `conda activate` works + cd transformers/examples/pytorch/text-classification python3 run_glue.py \ --model_name_or_path bert-base-cased \ diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 4b1604a1cce..b9a66c04705 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -1246,7 +1246,8 @@ def setup(self, handle: ResourceHandle, task: Task) -> None: with_outputs=False) backend_utils.run_command_on_ip_via_ssh( ip, - f'cd {SKY_REMOTE_WORKDIR}; /bin/bash /tmp/{setup_file}', + # -i will make sure `conda activate` works + f'/bin/bash -i /tmp/{setup_file}', ssh_user=ssh_user, ssh_private_key=ssh_private_key, log_path=os.path.join(self.log_dir, 'setup.log'), diff --git a/sky/skylet/log_lib.py b/sky/skylet/log_lib.py index 386aa7747fc..93921ae894e 100644 --- a/sky/skylet/log_lib.py +++ b/sky/skylet/log_lib.py @@ -154,7 +154,6 @@ def make_task_bash_script(codegen: str) -> str: script = [ textwrap.dedent(f"""\ #!/bin/bash - source ~/.bashrc cd {SKY_REMOTE_WORKDIR}"""), codegen, ] diff --git a/sky/templates/aws-ray.yml.j2 b/sky/templates/aws-ray.yml.j2 index a8185964f98..795309cb93a 100644 --- a/sky/templates/aws-ray.yml.j2 +++ b/sky/templates/aws-ray.yml.j2 @@ -73,8 +73,6 @@ setup_commands: # This AMI's system Python is version 2+. - pip3 install -U ray[default]=={{ray_version}} && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app - pip3 uninstall sky -y &> /dev/null; pip3 install {{sky_remote_path}}/*.whl && python3 -c "from sky.skylet.ray_patches import patch; patch()" # patch the buggy ray file - # Make `conda activate` available - - grep -qxF '. $(conda info --base)/etc/profile.d/conda.sh' ~/.bashrc || echo '. $(conda info --base)/etc/profile.d/conda.sh' >> ~/.bashrc # Command to start ray on the head node. You don't need to change this. 
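The `-i` flag matters here because a plain, non-interactive `bash` invocation does not read `~/.bashrc`, which is where the conda shell hook lives; without it, `conda activate` is simply not defined. A rough illustration of the difference, assuming the hook has been added to `~/.bashrc` as arranged above (the exact error text varies by conda version; this is not part of the patch):

```bash
#!/bin/bash
# Non-interactive bash skips ~/.bashrc, so the conda shell function is missing
# and `conda activate` typically fails with a "shell has not been properly
# configured" error:
bash -c 'conda activate base'

# An interactive shell (-i) sources ~/.bashrc first, so the conda hook is
# loaded and activation succeeds:
bash -i -c 'conda activate base && python --version'
```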
head_start_ray_commands: From 8bd498ccc6efb70f58921a31a018401740bd41f8 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Tue, 22 Feb 2022 19:46:59 -0800 Subject: [PATCH 03/14] bring back conda activate command for GCP --- sky/skylet/log_lib.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sky/skylet/log_lib.py b/sky/skylet/log_lib.py index 93921ae894e..f44214c2c04 100644 --- a/sky/skylet/log_lib.py +++ b/sky/skylet/log_lib.py @@ -154,6 +154,8 @@ def make_task_bash_script(codegen: str) -> str: script = [ textwrap.dedent(f"""\ #!/bin/bash + source ~/.bashrc + . $(conda info --base)/etc/profile.d/conda.sh 2> /dev/null || true cd {SKY_REMOTE_WORKDIR}"""), codegen, ] From f14a281c0bc8af6016b2b9329fe4b9d179909531 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Tue, 22 Feb 2022 19:47:10 -0800 Subject: [PATCH 04/14] Move comment to quickstart --- docs/source/getting-started/quickstart.rst | 8 ++++++++ docs/source/getting-started/tutorial.rst | 8 -------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/source/getting-started/quickstart.rst b/docs/source/getting-started/quickstart.rst index 83029485f6d..287b3d4370e 100644 --- a/docs/source/getting-started/quickstart.rst +++ b/docs/source/getting-started/quickstart.rst @@ -48,10 +48,18 @@ requiring an NVIDIA Tesla K80 GPU on AWS. (See more example yaml files in the `r setup: | # Typical use: pip install -r requirements.txt + + # If using a `my_setup.sh` script to setup, please use + # `bash -i my_setup.sh` to capture the environment + # variable and make sure `conda activate` works echo "running setup" run: | # Typical use: make use of resources, such as running training. + + # If using a my_run.sh script to run commands, please use + # `bash -i my_run.sh` to capture the environment variable + # and make sure `conda activate` works echo "hello sky!" conda env list diff --git a/docs/source/getting-started/tutorial.rst b/docs/source/getting-started/tutorial.rst index ea61b566be5..b1dda49a028 100644 --- a/docs/source/getting-started/tutorial.rst +++ b/docs/source/getting-started/tutorial.rst @@ -17,10 +17,6 @@ and run command: accelerators: V100:4 setup: | - # If using a my_setup.sh script to setup, please use - # `bash -i my_setup.sh` to capture the environment - # variable and make sure `conda activate` works - git clone https://github.com/huggingface/transformers/ cd transformers pip3 install . 
@@ -28,10 +24,6 @@ and run command: pip3 install -r requirements.txt run: | - # If using a my_run.sh script to run commands, please use - # `bash -i my_run.sh` to capture the environment variable - # and make sure `conda activate` works - cd transformers/examples/pytorch/text-classification python3 run_glue.py \ --model_name_or_path bert-base-cased \ From b3d654319965910aa750d5c3a57fb00363c8831e Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Tue, 22 Feb 2022 19:50:48 -0800 Subject: [PATCH 05/14] format --- sky/backends/cloud_vm_ray_backend.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index b9a66c04705..4be6a064de4 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -732,9 +732,8 @@ def _retry_region_zones(self, to_provision: Resources, num_nodes: int, cluster_name = config_dict['cluster_name'] plural = '' if num_nodes == 1 else 's' - logger.info( - f'{style.BRIGHT}Successfully provisioned or found' - f' existing VM{plural}.{style.RESET_ALL}') + logger.info(f'{style.BRIGHT}Successfully provisioned or found' + f' existing VM{plural}.{style.RESET_ALL}') return config_dict message = ('Failed to acquire resources in all regions/zones' f' (requested {to_provision}).' From ec83eafed64088450af9160354b7dcfbd7deea9b Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 23 Feb 2022 10:23:02 -0800 Subject: [PATCH 06/14] Fix comments --- docs/source/getting-started/quickstart.rst | 13 +++++----- sky/backends/cloud_vm_ray_backend.py | 28 +++++++++++++++------- 2 files changed, 26 insertions(+), 15 deletions(-) diff --git a/docs/source/getting-started/quickstart.rst b/docs/source/getting-started/quickstart.rst index 287b3d4370e..573c1883678 100644 --- a/docs/source/getting-started/quickstart.rst +++ b/docs/source/getting-started/quickstart.rst @@ -49,17 +49,18 @@ requiring an NVIDIA Tesla K80 GPU on AWS. (See more example yaml files in the `r setup: | # Typical use: pip install -r requirements.txt - # If using a `my_setup.sh` script to setup, please use - # `bash -i my_setup.sh` to capture the environment - # variable and make sure `conda activate` works + # If using a `my_setup.sh` script that requires conda, + # invoke it as below to ensure `conda activate` works: + # bash -i my_setup.sh echo "running setup" run: | # Typical use: make use of resources, such as running training. - # If using a my_run.sh script to run commands, please use - # `bash -i my_run.sh` to capture the environment variable - # and make sure `conda activate` works + # If using a `my_run.sh` script that requires conda and (or) + # sky environment variables, e.g. $SKY_NODE_RANK and $SKY_NODE_IPS, + # invoke it as below to ensure both works: + # `bash -i my_run.sh` echo "hello sky!" 
conda env list diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 4be6a064de4..cd4b0658ddb 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -8,6 +8,7 @@ import json import os import re +import sys import subprocess import tempfile import textwrap @@ -1243,15 +1244,24 @@ def setup(self, handle: ResourceHandle, task: Task) -> None: source=setup_sh_path, target=f'/tmp/{setup_file}', with_outputs=False) - backend_utils.run_command_on_ip_via_ssh( - ip, - # -i will make sure `conda activate` works - f'/bin/bash -i /tmp/{setup_file}', - ssh_user=ssh_user, - ssh_private_key=ssh_private_key, - log_path=os.path.join(self.log_dir, 'setup.log'), - check=True, - ssh_control_name=self._ssh_control_name(handle)) + try: + backend_utils.run_command_on_ip_via_ssh( + ip, + # -i will make sure `conda activate` works + f'/bin/bash -i /tmp/{setup_file}', + ssh_user=ssh_user, + ssh_private_key=ssh_private_key, + log_path=os.path.join(self.log_dir, 'setup.log'), + check=True, + ssh_control_name=self._ssh_control_name(handle)) + except subprocess.CalledProcessError as e: + logger.error( + f'{fore.RED}Setup failed with return code' + f' {e.returncode}.{style.RESET_ALL}') + # Suppress the error traceback. Fail as soon as + # possible (head node). + sys.exit(e.returncode) + logger.info(f'{fore.GREEN}Setup completed.{style.RESET_ALL}') def sync_down_logs(self, handle: ResourceHandle, job_id: int) -> None: codegen = backend_utils.JobLibCodeGen() From d1f99e9772c902606fdf3d2e41976cdb8521c424 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 23 Feb 2022 10:50:03 -0800 Subject: [PATCH 07/14] Add test/example of using user_script --- .../resnet_distributed_torch_scripts/run.sh | 7 +++++++ .../resnet_distributed_torch_scripts/setup.sh | 19 +++++++++++++++++++ .../resnet_distributed_torch_with_script.yaml | 13 +++++++++++++ sky/backends/cloud_vm_ray_backend.py | 5 ++--- 4 files changed, 41 insertions(+), 3 deletions(-) create mode 100644 examples/resnet_distributed_torch_scripts/run.sh create mode 100644 examples/resnet_distributed_torch_scripts/setup.sh create mode 100644 examples/resnet_distributed_torch_with_script.yaml diff --git a/examples/resnet_distributed_torch_scripts/run.sh b/examples/resnet_distributed_torch_scripts/run.sh new file mode 100644 index 00000000000..5bb0b456259 --- /dev/null +++ b/examples/resnet_distributed_torch_scripts/run.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +conda activate resnet +cd pytorch-distributed-resnet +python3 -m torch.distributed.launch --nproc_per_node=1 \ +--nnodes=${#SKY_NODE_IPS[@]} --node_rank=${SKY_NODE_RANK} --master_addr=${SKY_NODE_IPS[0]} \ +--master_port=8008 resnet_ddp.py --num_epochs 20 diff --git a/examples/resnet_distributed_torch_scripts/setup.sh b/examples/resnet_distributed_torch_scripts/setup.sh new file mode 100644 index 00000000000..d9082b563b6 --- /dev/null +++ b/examples/resnet_distributed_torch_scripts/setup.sh @@ -0,0 +1,19 @@ +#!/bin/bash +[ -d pytorch-distributed-resnet ] || git clone https://github.com/michaelzhiluo/pytorch-distributed-resnet +cd pytorch-distributed-resnet + +conda activate resnet +if [ $? 
-eq 0 ]; then + echo "conda env exists" +else + echo "conda env does not exist" + conda create -n resnet python=3.6 -y + conda activate resnet + pip3 install -r requirements.txt +fi + +mkdir -p data +mkdir -p saved_models +cd data +wget -c --quiet https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz +tar -xvzf cifar-10-python.tar.gz diff --git a/examples/resnet_distributed_torch_with_script.yaml b/examples/resnet_distributed_torch_with_script.yaml new file mode 100644 index 00000000000..9f744a5c8a2 --- /dev/null +++ b/examples/resnet_distributed_torch_with_script.yaml @@ -0,0 +1,13 @@ +name: resnet-distributed-app + + +resources: + accelerators: V100 + +num_nodes: 2 + +setup: | + bash -i resnet_distributed_torch_scripts/setup.sh + +run: | + bash -i resnet_distributed_torch_scripts/run.sh diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index cd4b0658ddb..59ed2cbe3d7 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -1255,9 +1255,8 @@ def setup(self, handle: ResourceHandle, task: Task) -> None: check=True, ssh_control_name=self._ssh_control_name(handle)) except subprocess.CalledProcessError as e: - logger.error( - f'{fore.RED}Setup failed with return code' - f' {e.returncode}.{style.RESET_ALL}') + logger.error(f'{fore.RED}Setup failed with return code' + f' {e.returncode}.{style.RESET_ALL}') # Suppress the error traceback. Fail as soon as # possible (head node). sys.exit(e.returncode) From 1e6b64c9f64103193fec54a633185c5320d95af6 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 23 Feb 2022 12:01:06 -0800 Subject: [PATCH 08/14] Fix indents --- README.md | 4 + docs/source/examples/distributed-jobs.rst | 22 ++--- docs/source/examples/grid-search.rst | 10 +-- .../source/examples/iterative-dev-project.rst | 10 +-- docs/source/getting-started/installation.rst | 60 ++++++------- docs/source/getting-started/quickstart.rst | 48 +++++----- docs/source/getting-started/tutorial.rst | 88 +++++++++---------- 7 files changed, 123 insertions(+), 119 deletions(-) diff --git a/README.md b/README.md index 2952926e930..8c61b788146 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,11 @@ workdir: . # Sync code dir to cloud setup: | # Typical use: pip install -r requirements.txt + echo "running setup" + # If using a `my_setup.sh` script that requires conda, + # invoke it as below to ensure `conda activate` works: + # bash -i my_setup.sh run: | # Typical use: make use of resources, such as running training. 
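For the `bash -i my_setup.sh` usage recommended in the README hunk above, a sketch of what such a hypothetical `my_setup.sh` might look like, mirroring the example setup script added earlier in this series (the environment name, Python version, and requirements file are placeholders):

```bash
#!/bin/bash
# Hypothetical my_setup.sh, invoked on the cluster as: bash -i my_setup.sh
# The -i invocation sources ~/.bashrc, so `conda activate` is available here.

conda activate myenv 2> /dev/null
if [ $? -ne 0 ]; then
  # Environment does not exist yet: create it, then activate it.
  conda create -n myenv python=3.9 -y
  conda activate myenv
fi
pip install -r requirements.txt  # placeholder dependency install
```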
diff --git a/docs/source/examples/distributed-jobs.rst b/docs/source/examples/distributed-jobs.rst index 22cc2510c7d..fa043be1853 100644 --- a/docs/source/examples/distributed-jobs.rst +++ b/docs/source/examples/distributed-jobs.rst @@ -11,23 +11,23 @@ For example, here is a simple PyTorch Distributed training example: name: resnet-distributed-app resources: - accelerators: V100 + accelerators: V100 num_nodes: 2 setup: | - pip3 install --upgrade pip - git clone https://github.com/michaelzhiluo/pytorch-distributed-resnet - cd pytorch-distributed-resnet && pip3 install -r requirements.txt - mkdir -p data && mkdir -p saved_models && cd data && \ - wget -c --quiet https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz - tar -xvzf cifar-10-python.tar.gz + pip3 install --upgrade pip + git clone https://github.com/michaelzhiluo/pytorch-distributed-resnet + cd pytorch-distributed-resnet && pip3 install -r requirements.txt + mkdir -p data && mkdir -p saved_models && cd data && \ + wget -c --quiet https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz + tar -xvzf cifar-10-python.tar.gz run: | - cd pytorch-distributed-resnet - python3 -m torch.distributed.launch --nproc_per_node=1 \ - --nnodes=${#SKY_NODE_IPS[@]} --node_rank=${SKY_NODE_RANK} --master_addr=${SKY_NODE_IPS[0]} \ - --master_port=8008 resnet_ddp.py --num_epochs 20 + cd pytorch-distributed-resnet + python3 -m torch.distributed.launch --nproc_per_node=1 \ + --nnodes=${#SKY_NODE_IPS[@]} --node_rank=${SKY_NODE_RANK} --master_addr=${SKY_NODE_IPS[0]} \ + --master_port=8008 resnet_ddp.py --num_epochs 20 In the above, :code:`num_nodes: 2` specifies that this task is to be run on 2 nodes. The commands in :code:`run` are executed on both nodes. Several useful diff --git a/docs/source/examples/grid-search.rst b/docs/source/examples/grid-search.rst index b76437d1a00..e8a7a16381b 100644 --- a/docs/source/examples/grid-search.rst +++ b/docs/source/examples/grid-search.rst @@ -12,11 +12,11 @@ Submitting multiple trials with different hyperparameters is simple: .. code-block:: bash - # Launch 4 trials in parallel - sky exec mycluster --gpus V100:1 -d -- python train.py --lr 1e-3 - sky exec mycluster --gpus V100:1 -d -- python train.py --lr 3e-3 - sky exec mycluster --gpus V100:1 -d -- python train.py --lr 1e-4 - sky exec mycluster --gpus V100:1 -d -- python train.py --lr 1e-2 + $ # Launch 4 trials in parallel + $ sky exec mycluster --gpus V100:1 -d -- python train.py --lr 1e-3 + $ sky exec mycluster --gpus V100:1 -d -- python train.py --lr 3e-3 + $ sky exec mycluster --gpus V100:1 -d -- python train.py --lr 1e-4 + $ sky exec mycluster --gpus V100:1 -d -- python train.py --lr 1e-2 # gets queued and will run once a GPU is available sky exec mycluster --gpus V100:1 -d -- python train.py --lr 1e-6 diff --git a/docs/source/examples/iterative-dev-project.rst b/docs/source/examples/iterative-dev-project.rst index d8ae0984648..3dbdf66f06e 100644 --- a/docs/source/examples/iterative-dev-project.rst +++ b/docs/source/examples/iterative-dev-project.rst @@ -47,8 +47,8 @@ Use the familiar scp/rsync to transfer files between your local machine and remo .. code-block:: - $ scp -r my_code/ dev:/path/to/destination # copy files to remote VM - $ scp -r dev:/path/to/source my_code/ # copy files from remote VM + $ scp -r my_code/ dev:/path/to/destination # copy files to remote VM + $ scp -r dev:/path/to/source my_code/ # copy files from remote VM Sky **simplifies code syncing** by the automatic transfer of a working directory to the cluster. 
The working directory can be configured with the @@ -57,8 +57,8 @@ option: .. code-block:: - $ sky launch --workdir=/path/to/code task.yaml - $ sky exec --workdir=/path/to/code task.yaml + $ sky launch --workdir=/path/to/code task.yaml + $ sky exec --workdir=/path/to/code task.yaml These commands sync the working directory to a location on the remote VM, and the task is run under that working directory (e.g., to invoke scripts, access @@ -79,4 +79,4 @@ To restart a stopped cluster: .. code-block:: console - $ sky start dev + $ sky start dev diff --git a/docs/source/getting-started/installation.rst b/docs/source/getting-started/installation.rst index d2dbd1c2d16..ac2db91c70b 100644 --- a/docs/source/getting-started/installation.rst +++ b/docs/source/getting-started/installation.rst @@ -7,11 +7,11 @@ Install Sky using pip: .. code-block:: console - $ # Clone the sky codebase - $ git clone ssh://git@github.com/sky-proj/sky.git - $ cd sky - $ # Sky requires python >= 3.6. - $ pip install ".[all]" + $ # Clone the sky codebase + $ git clone ssh://git@github.com/sky-proj/sky.git + $ cd sky + $ # Sky requires python >= 3.6. + $ pip install ".[all]" If you only want the dependencies for certain clouds, you can also use :code:`".[aws,azure,gcp]"`. @@ -26,11 +26,11 @@ tasks in the clouds, configure access to at least one cloud: .. code-block:: console - $ # Install boto - $ pip install boto3 + $ # Install boto + $ pip install boto3 - $ # Configure your AWS credentials - $ aws configure + $ # Configure your AWS credentials + $ aws configure To get the **AWS Access Key** required by the :code:`aws configure`, please refer to the `AWS manual `_. The **Default region name [None]:** and **Default output format [None]:** are optional. @@ -38,16 +38,16 @@ To get the **AWS Access Key** required by the :code:`aws configure`, please refe .. code-block:: console - $ pip install google-api-python-client - $ # Install `gcloud`; see https://cloud.google.com/sdk/docs/quickstart - $ conda install -c conda-forge google-cloud-sdk + $ pip install google-api-python-client + $ # Install `gcloud`; see https://cloud.google.com/sdk/docs/quickstart + $ conda install -c conda-forge google-cloud-sdk - $ # Init. - $ gcloud init + $ # Init. + $ gcloud init - $ # Run this if you don't have a credentials file. - $ # This will generate ~/.config/gcloud/application_default_credentials.json. - $ gcloud auth application-default login + $ # Run this if you don't have a credentials file. + $ # This will generate ~/.config/gcloud/application_default_credentials.json. + $ gcloud auth application-default login If you meet the following error (*RemoveError: 'requests' is a dependency of conda and cannot be removed from conda's operating environment*) while running :code:`conda install -c conda-forge google-cloud-sdk`, please try :code:`conda update --force conda` and run it again. @@ -56,12 +56,12 @@ If you meet the following error (*RemoveError: 'requests' is a dependency of con .. code-block:: console - $ # Install the Azure CLI - $ pip install azure-cli==2.30.0 - $ # Login azure - $ az login - $ # Set the subscription to use - $ az account set -s + $ # Install the Azure CLI + $ pip install azure-cli==2.30.0 + $ # Login azure + $ az login + $ # Set the subscription to use + $ az account set -s **Verifying cloud setup** @@ -70,16 +70,16 @@ the CLI: .. 
code-block:: console - $ # Verify cloud account setup - $ sky check + $ # Verify cloud account setup + $ sky check This will produce output verifying the correct setup of each supported cloud. .. code-block:: text - Checking credentials to enable clouds for Sky. - AWS: enabled - GCP: enabled - Azure: enabled + Checking credentials to enable clouds for Sky. + AWS: enabled + GCP: enabled + Azure: enabled - Sky will use only the enabled clouds to run tasks. To change this, configure cloud credentials, and run sky check. + Sky will use only the enabled clouds to run tasks. To change this, configure cloud credentials, and run sky check. diff --git a/docs/source/getting-started/quickstart.rst b/docs/source/getting-started/quickstart.rst index 573c1883678..2a982e66760 100644 --- a/docs/source/getting-started/quickstart.rst +++ b/docs/source/getting-started/quickstart.rst @@ -35,34 +35,34 @@ requiring an NVIDIA Tesla K80 GPU on AWS. (See more example yaml files in the `r .. code-block:: yaml - # hello_sky.yaml + # hello_sky.yaml - resources: - # Optional; if left out, pick from the available clouds. - cloud: aws + resources: + # Optional; if left out, pick from the available clouds. + cloud: aws - # Get 1 K80 GPU. Use : to get more (e.g., "K80:8"). - accelerators: K80 + # Get 1 K80 GPU. Use : to get more (e.g., "K80:8"). + accelerators: K80 - workdir: . + workdir: . - setup: | - # Typical use: pip install -r requirements.txt + setup: | + # Typical use: pip install -r requirements.txt - # If using a `my_setup.sh` script that requires conda, - # invoke it as below to ensure `conda activate` works: - # bash -i my_setup.sh - echo "running setup" + echo "running setup" + # If using a `my_setup.sh` script that requires conda, + # invoke it as below to ensure `conda activate` works: + # bash -i my_setup.sh - run: | - # Typical use: make use of resources, such as running training. + run: | + # Typical use: make use of resources, such as running training. - # If using a `my_run.sh` script that requires conda and (or) - # sky environment variables, e.g. $SKY_NODE_RANK and $SKY_NODE_IPS, - # invoke it as below to ensure both works: - # `bash -i my_run.sh` - echo "hello sky!" - conda env list + echo "hello sky!" + conda env list + # If using a `my_run.sh` script that requires conda and (or) + # sky environment variables, e.g. $SKY_NODE_RANK and $SKY_NODE_IPS, + # invoke it as below to ensure both works: + # `bash -i my_run.sh` Sky handles selecting an appropriate VM based on user-specified resource constraints, launching the cluster on an appropriate cloud provider, and @@ -72,7 +72,7 @@ To launch a task based on our above YAML spec, we can use :code:`sky launch`. .. code-block:: console - $ sky launch -c mycluster hello_sky.yaml + $ sky launch -c mycluster hello_sky.yaml The :code:`-c` option allows us to specify a cluster name. If a cluster with the same name already exists, Sky will reuse that cluster. If no such cluster @@ -84,7 +84,7 @@ We can view our existing clusters by running :code:`sky status`: .. code-block:: console - $ sky status + $ sky status This may show multiple clusters, if you have created several: @@ -98,7 +98,7 @@ If you would like to log into the a cluster, Sky provides convenient SSH access .. code-block:: console - $ ssh mycluster + $ ssh mycluster Sky is more than a tool for easily provisioning and managing multiple clusters on different clouds. 
It also comes with features for storing and moving data, diff --git a/docs/source/getting-started/tutorial.rst b/docs/source/getting-started/tutorial.rst index b1dda49a028..0189ceb5ae7 100644 --- a/docs/source/getting-started/tutorial.rst +++ b/docs/source/getting-started/tutorial.rst @@ -9,32 +9,32 @@ and run command: .. code-block:: yaml - # dnn.yaml - - name: huggingface - - resources: - accelerators: V100:4 - - setup: | - git clone https://github.com/huggingface/transformers/ - cd transformers - pip3 install . - cd examples/pytorch/text-classification - pip3 install -r requirements.txt - - run: | - cd transformers/examples/pytorch/text-classification - python3 run_glue.py \ - --model_name_or_path bert-base-cased \ - --dataset_name imdb \ - --do_train \ - --max_seq_length 128 \ - --per_device_train_batch_size 32 \ - --learning_rate 2e-5 \ - --max_steps 50 \ - --output_dir /tmp/imdb/ --overwrite_output_dir \ - --fp16 + # dnn.yaml + + name: huggingface + + resources: + accelerators: V100:4 + + setup: | + git clone https://github.com/huggingface/transformers/ + cd transformers + pip3 install . + cd examples/pytorch/text-classification + pip3 install -r requirements.txt + + run: | + cd transformers/examples/pytorch/text-classification + python3 run_glue.py \ + --model_name_or_path bert-base-cased \ + --dataset_name imdb \ + --do_train \ + --max_seq_length 128 \ + --per_device_train_batch_size 32 \ + --learning_rate 2e-5 \ + --max_steps 50 \ + --output_dir /tmp/imdb/ --overwrite_output_dir \ + --fp16 We can launch training by running: @@ -57,12 +57,12 @@ terminal, which is useful for launching long-running jobs concurrently. .. code-block:: console - $ # Launch 4 jobs, perhaps with different hyperparameters. - $ # We can override the task name with `-n` (optional) and resource requirement with `--gpus` (optional) - $ sky exec lm-cluster dnn.yaml -d -n job2 --gpus=V100:1 - $ sky exec lm-cluster dnn.yaml -d -n job3 --gpus=V100:1 - $ sky exec lm-cluster dnn.yaml -d -n job4 --gpus=V100:3 - $ sky exec lm-cluster dnn.yaml -d -n job5 --gpus=V100:2 + $ # Launch 4 jobs, perhaps with different hyperparameters. + $ # We can override the task name with `-n` (optional) and resource requirement with `--gpus` (optional) + $ sky exec lm-cluster dnn.yaml -d -n job2 --gpus=V100:1 + $ sky exec lm-cluster dnn.yaml -d -n job3 --gpus=V100:1 + $ sky exec lm-cluster dnn.yaml -d -n job4 --gpus=V100:3 + $ sky exec lm-cluster dnn.yaml -d -n job5 --gpus=V100:2 Because the cluster only has 4 V100 GPU, we will see the following behavior: @@ -76,19 +76,19 @@ If we wish to view the output for each run after it has completed we can use: .. 
code-block:: console - $ # View the jobs in the queue - $ sky queue lm-cluster + $ # View the jobs in the queue + $ sky queue lm-cluster - ID NAME USER SUBMITTED STARTED STATUS - 5 job5 user 10 mins ago 10 mins ago RUNNING - 4 job4 user 10 mins ago - PENDING - 3 job3 user 10 mins ago 9 mins ago RUNNING - 2 job2 user 10 mins ago 9 mins ago RUNNING - 1 huggingface user 10 mins ago 1 min ago SUCCEEDED + ID NAME USER SUBMITTED STARTED STATUS + 5 job5 user 10 mins ago 10 mins ago RUNNING + 4 job4 user 10 mins ago - PENDING + 3 job3 user 10 mins ago 9 mins ago RUNNING + 2 job2 user 10 mins ago 9 mins ago RUNNING + 1 huggingface user 10 mins ago 1 min ago SUCCEEDED - $ # Stream the logs of job5 (ID: 5) to the console - $ sky logs lm-cluster 5 + $ # Stream the logs of job5 (ID: 5) to the console + $ sky logs lm-cluster 5 - $ # Cancel job job3 (ID: 3) - $ sky cancel lm-cluster 3 + $ # Cancel job job3 (ID: 3) + $ sky cancel lm-cluster 3 From 0b9d33160574f49c3f33c2d872f3e48955c82a9f Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 23 Feb 2022 14:27:29 -0800 Subject: [PATCH 09/14] bash -i only for conda activate --- docs/source/getting-started/quickstart.rst | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/source/getting-started/quickstart.rst b/docs/source/getting-started/quickstart.rst index 2a982e66760..7a4250f0bfb 100644 --- a/docs/source/getting-started/quickstart.rst +++ b/docs/source/getting-started/quickstart.rst @@ -59,9 +59,8 @@ requiring an NVIDIA Tesla K80 GPU on AWS. (See more example yaml files in the `r echo "hello sky!" conda env list - # If using a `my_run.sh` script that requires conda and (or) - # sky environment variables, e.g. $SKY_NODE_RANK and $SKY_NODE_IPS, - # invoke it as below to ensure both works: + # If using a `my_run.sh` script that requires conda, + # invoke it as below to ensure `conda activate` works: # `bash -i my_run.sh` Sky handles selecting an appropriate VM based on user-specified resource From 4cc5113d4dbfa5f75977f521374a3b43263a52da Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 23 Feb 2022 15:05:34 -0800 Subject: [PATCH 10/14] Fix the SKY_NODE_IPS fail to pass to the shell script --- docs/source/examples/distributed-jobs.rst | 11 ++++++++--- examples/resnet_distributed_torch.yaml | 5 ++++- examples/resnet_distributed_torch_scripts/run.sh | 5 ++++- examples/resnet_distributed_torch_scripts/setup.sh | 2 +- examples/resnet_distributed_torch_with_script.yaml | 7 +++++-- sky/backends/cloud_vm_ray_backend.py | 4 ++-- 6 files changed, 24 insertions(+), 10 deletions(-) diff --git a/docs/source/examples/distributed-jobs.rst b/docs/source/examples/distributed-jobs.rst index fa043be1853..12158eef228 100644 --- a/docs/source/examples/distributed-jobs.rst +++ b/docs/source/examples/distributed-jobs.rst @@ -25,8 +25,11 @@ For example, here is a simple PyTorch Distributed training example: run: | cd pytorch-distributed-resnet + + num_nodes=`echo "$SKY_NODE_IPS" | wc -l` + master_addr=`echo "$SKY_NODE_IPS" | sed -n 1p` python3 -m torch.distributed.launch --nproc_per_node=1 \ - --nnodes=${#SKY_NODE_IPS[@]} --node_rank=${SKY_NODE_RANK} --master_addr=${SKY_NODE_IPS[0]} \ + --nnodes=$num_nodes --node_rank=${SKY_NODE_RANK} --master_addr=$master_addr \ --master_port=8008 resnet_ddp.py --num_epochs 20 In the above, :code:`num_nodes: 2` specifies that this task is to be run on 2 @@ -35,5 +38,7 @@ environment variables are available to distinguish per-node commands: - :code:`SKY_NODE_RANK`: rank (an integer ID from 0 to :code:`num_nodes-1`) of 
the node executing the task -- :code:`SKY_NODE_IPS`: a list of IP addresses of the nodes reserved to execute - the task +- :code:`SKY_NODE_IPS`: a string of IP addresses of the nodes reserved to execute + the task, where each line contains one IP address. You can retrieve the number of + nodes by :code:`echo "$SKY_NODE_IPS" | wc -l` and the IP address of node-3 by + :code:`echo "$SKY_NODE_IPS" | cut -n 3p` diff --git a/examples/resnet_distributed_torch.yaml b/examples/resnet_distributed_torch.yaml index 4637ab90319..34ffbc8f6c2 100644 --- a/examples/resnet_distributed_torch.yaml +++ b/examples/resnet_distributed_torch.yaml @@ -16,6 +16,9 @@ setup: | run: | cd pytorch-distributed-resnet + + num_nodes=`echo "$SKY_NODE_IPS" | wc -l` + master_addr=`echo "$SKY_NODE_IPS" | sed -n 1p` python3 -m torch.distributed.launch --nproc_per_node=1 \ - --nnodes=${#SKY_NODE_IPS[@]} --node_rank=${SKY_NODE_RANK} --master_addr=${SKY_NODE_IPS[0]} \ + --nnodes=$num_nodes --node_rank=${SKY_NODE_RANK} --master_addr=$master_addr \ --master_port=8008 resnet_ddp.py --num_epochs 20 diff --git a/examples/resnet_distributed_torch_scripts/run.sh b/examples/resnet_distributed_torch_scripts/run.sh index 5bb0b456259..89f8e3ebc45 100644 --- a/examples/resnet_distributed_torch_scripts/run.sh +++ b/examples/resnet_distributed_torch_scripts/run.sh @@ -2,6 +2,9 @@ conda activate resnet cd pytorch-distributed-resnet +num_nodes=`echo "$SKY_NODE_IPS" | wc -l` +master_addr=`echo "$SKY_NODE_IPS" | sed -n 1p` +echo MASTER_ADDR $master_addr python3 -m torch.distributed.launch --nproc_per_node=1 \ ---nnodes=${#SKY_NODE_IPS[@]} --node_rank=${SKY_NODE_RANK} --master_addr=${SKY_NODE_IPS[0]} \ +--nnodes=$num_nodes --node_rank=${SKY_NODE_RANK} --master_addr=$master_addr \ --master_port=8008 resnet_ddp.py --num_epochs 20 diff --git a/examples/resnet_distributed_torch_scripts/setup.sh b/examples/resnet_distributed_torch_scripts/setup.sh index d9082b563b6..a87a0194041 100644 --- a/examples/resnet_distributed_torch_scripts/setup.sh +++ b/examples/resnet_distributed_torch_scripts/setup.sh @@ -9,8 +9,8 @@ else echo "conda env does not exist" conda create -n resnet python=3.6 -y conda activate resnet - pip3 install -r requirements.txt fi +pip install -r requirements.txt mkdir -p data mkdir -p saved_models diff --git a/examples/resnet_distributed_torch_with_script.yaml b/examples/resnet_distributed_torch_with_script.yaml index 9f744a5c8a2..374f5d74857 100644 --- a/examples/resnet_distributed_torch_with_script.yaml +++ b/examples/resnet_distributed_torch_with_script.yaml @@ -2,12 +2,15 @@ name: resnet-distributed-app resources: + cloud: aws accelerators: V100 num_nodes: 2 +workdir: . 
+ setup: | - bash -i resnet_distributed_torch_scripts/setup.sh + bash -i examples/resnet_distributed_torch_scripts/setup.sh run: | - bash -i resnet_distributed_torch_scripts/run.sh + bash -i examples/resnet_distributed_torch_scripts/run.sh diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 59ed2cbe3d7..fff3c74fa25 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -244,8 +244,8 @@ def check_ip(): for i in range(pg.bundle_count) ]) print('SKY INFO: Placement group IPs:', ip_list) - ip_list_str = ' '.join([repr(ip) for ip in ip_list]) - export_sky_env_vars = 'export SKY_NODE_IPS=(' + ip_list_str + ')\\n' + ip_list_str = '\\n'.join(ip_list) + export_sky_env_vars = 'export SKY_NODE_IPS="' + ip_list_str + '"\\n' """), ] From f18540e0984d49d614014cf3906bcae960ddf58f Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 23 Feb 2022 15:17:53 -0800 Subject: [PATCH 11/14] Update readme --- README.md | 19 ++++++++---- docs/source/getting-started/quickstart.rst | 34 +++++++++++----------- 2 files changed, 31 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index b49e99ae8d8..9305d137f9a 100644 --- a/README.md +++ b/README.md @@ -13,23 +13,32 @@ sky launch -c mycluster hello_sky.yaml ```yaml # hello_sky.yaml + resources: - accelerators: V100:1 # 1x NVIDIA V100 GPU + # Optional; if left out, pick from the available clouds. + cloud: aws -workdir: . # Sync code dir to cloud + # Get 1 K80 GPU. Use : to get more (e.g., "K80:8"). + accelerators: K80 -setup: | - # Typical use: pip install -r requirements.txt +# Working directory (optional) containing the project codebase. +# This directory will be synced to ~/sky_workdir on the provisioned cluster. +workdir: . +# Typical use: pip install -r requirements.txt +setup: | echo "running setup" # If using a `my_setup.sh` script that requires conda, # invoke it as below to ensure `conda activate` works: # bash -i my_setup.sh +# Typical use: make use of resources, such as running training. run: | - # Typical use: make use of resources, such as running training. echo "hello sky!" conda env list + # If using a `my_run.sh` script that requires conda, + # invoke it as below to ensure `conda activate` works: + # `bash -i my_run.sh` ``` ## Getting Started diff --git a/docs/source/getting-started/quickstart.rst b/docs/source/getting-started/quickstart.rst index c70f5d6b520..3211925f7df 100644 --- a/docs/source/getting-started/quickstart.rst +++ b/docs/source/getting-started/quickstart.rst @@ -31,23 +31,23 @@ Let's provision an instance with a single K80 GPU. .. code-block:: bash - # Provisions/reuses an interactive node with a single K80 GPU. - # Any of the interactive node commands (gpunode, tpunode, cpunode) - # will automatically log in to the cluster. - sky gpunode -c mygpu --gpus K80 - - Last login: Wed Feb 23 22:35:47 2022 from 136.152.143.101 - ubuntu@ip-172-31-86-108:~$ gpustat - ip-172-31-86-108 Wed Feb 23 22:42:43 2022 450.142.00 - [0] Tesla K80 | 31°C, 0 % | 0 / 11441 MB | - ubuntu@ip-172-31-86-108:~$ - ^D - - # View the machine in the cluster table. - sky status - - NAME LAUNCHED RESOURCES COMMAND STATUS - mygpu a few secs ago 1x Azure(Standard_NC6_Promo) sky gpunode -c mygpu --gpus K80 UP + # Provisions/reuses an interactive node with a single K80 GPU. + # Any of the interactive node commands (gpunode, tpunode, cpunode) + # will automatically log in to the cluster. 
+ sky gpunode -c mygpu --gpus K80 + + Last login: Wed Feb 23 22:35:47 2022 from 136.152.143.101 + ubuntu@ip-172-31-86-108:~$ gpustat + ip-172-31-86-108 Wed Feb 23 22:42:43 2022 450.142.00 + [0] Tesla K80 | 31°C, 0 % | 0 / 11441 MB | + ubuntu@ip-172-31-86-108:~$ + ^D + + # View the machine in the cluster table. + sky status + + NAME LAUNCHED RESOURCES COMMAND STATUS + mygpu a few secs ago 1x Azure(Standard_NC6_Promo) sky gpunode -c mygpu --gpus K80 UP After you are done, run :code:`sky down mygpu` to terminate the cluster. Find more details on managing the lifecycle of your cluster :ref:`here `. From b157fecafef6855d95adba02529ed01afeca306e Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 23 Feb 2022 15:38:25 -0800 Subject: [PATCH 12/14] update env_check --- examples/env_check.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/env_check.yaml b/examples/env_check.yaml index c14460f5d4e..c05d75188b8 100644 --- a/examples/env_check.yaml +++ b/examples/env_check.yaml @@ -25,4 +25,6 @@ run: | fi echo NODE ID: $SKY_NODE_RANK - echo NODE IP: ${SKY_NODE_IPS[$SKY_NODE_RANK]} + echo NODE IPS: "$SKY_NODE_IPS" + worker_addr=`echo "$SKY_NODE_IPS" | sed -n 2p` + echo Worker IP: $worker_addr From 4b91b2aedd60c14aa1ea621be8634fd9258b0349 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 23 Feb 2022 20:46:23 -0800 Subject: [PATCH 13/14] Fix comments --- README.md | 5 ++--- docs/source/examples/distributed-jobs.rst | 4 ++-- docs/source/getting-started/quickstart.rst | 3 +-- sky/backends/cloud_vm_ray_backend.py | 3 ++- 4 files changed, 7 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 9305d137f9a..d17ad3f5f61 100644 --- a/README.md +++ b/README.md @@ -18,8 +18,7 @@ resources: # Optional; if left out, pick from the available clouds. cloud: aws - # Get 1 K80 GPU. Use : to get more (e.g., "K80:8"). - accelerators: K80 + accelerators: V100:1 # 1x NVIDIA V100 GPU # Working directory (optional) containing the project codebase. # This directory will be synced to ~/sky_workdir on the provisioned cluster. @@ -38,7 +37,7 @@ run: | conda env list # If using a `my_run.sh` script that requires conda, # invoke it as below to ensure `conda activate` works: - # `bash -i my_run.sh` + # bash -i my_run.sh ``` ## Getting Started diff --git a/docs/source/examples/distributed-jobs.rst b/docs/source/examples/distributed-jobs.rst index 12158eef228..5bea00fd2c1 100644 --- a/docs/source/examples/distributed-jobs.rst +++ b/docs/source/examples/distributed-jobs.rst @@ -30,7 +30,7 @@ For example, here is a simple PyTorch Distributed training example: master_addr=`echo "$SKY_NODE_IPS" | sed -n 1p` python3 -m torch.distributed.launch --nproc_per_node=1 \ --nnodes=$num_nodes --node_rank=${SKY_NODE_RANK} --master_addr=$master_addr \ - --master_port=8008 resnet_ddp.py --num_epochs 20 + --master_port=8008 resnet_ddp.py --num_epochs 20 In the above, :code:`num_nodes: 2` specifies that this task is to be run on 2 nodes. The commands in :code:`run` are executed on both nodes. Several useful @@ -41,4 +41,4 @@ environment variables are available to distinguish per-node commands: - :code:`SKY_NODE_IPS`: a string of IP addresses of the nodes reserved to execute the task, where each line contains one IP address. 
You can retrieve the number of nodes by :code:`echo "$SKY_NODE_IPS" | wc -l` and the IP address of node-3 by - :code:`echo "$SKY_NODE_IPS" | cut -n 3p` + :code:`echo "$SKY_NODE_IPS" | sed -n 3p` diff --git a/docs/source/getting-started/quickstart.rst b/docs/source/getting-started/quickstart.rst index 3211925f7df..15deaed3322 100644 --- a/docs/source/getting-started/quickstart.rst +++ b/docs/source/getting-started/quickstart.rst @@ -84,8 +84,7 @@ requiring an NVIDIA Tesla K80 GPU on AWS. See more example yaml files in the `re # Optional; if left out, pick from the available clouds. cloud: aws - # Get 1 K80 GPU. Use : to get more (e.g., "K80:8"). - accelerators: K80 + accelerators: V100:1 # 1x NVIDIA V100 GPU # Working directory (optional) containing the project codebase. # This directory will be synced to ~/sky_workdir on the provisioned cluster. diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index fff3c74fa25..15edce9b34b 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -648,6 +648,7 @@ def _retry_region_zones(self, to_provision: Resources, num_nodes: int, dryrun: bool, stream_logs: bool, cluster_name: str): """The provision retry loop.""" style = colorama.Style + fore = colorama.Fore # Get log_path name log_path = os.path.join(self.log_dir, 'provision.log') log_abs_path = os.path.abspath(log_path) @@ -733,7 +734,7 @@ def _retry_region_zones(self, to_provision: Resources, num_nodes: int, cluster_name = config_dict['cluster_name'] plural = '' if num_nodes == 1 else 's' - logger.info(f'{style.BRIGHT}Successfully provisioned or found' + logger.info(f'{fore.GREEN}Successfully provisioned or found' f' existing VM{plural}.{style.RESET_ALL}') return config_dict message = ('Failed to acquire resources in all regions/zones' From 77bbea30bfdfde5e15c9a75f66592ef312216836 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 23 Feb 2022 21:29:17 -0800 Subject: [PATCH 14/14] Change to head -n1 --- docs/source/examples/distributed-jobs.rst | 2 +- examples/resnet_distributed_torch.yaml | 2 +- examples/resnet_distributed_torch_scripts/run.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/examples/distributed-jobs.rst b/docs/source/examples/distributed-jobs.rst index 5bea00fd2c1..f004522128e 100644 --- a/docs/source/examples/distributed-jobs.rst +++ b/docs/source/examples/distributed-jobs.rst @@ -27,7 +27,7 @@ For example, here is a simple PyTorch Distributed training example: cd pytorch-distributed-resnet num_nodes=`echo "$SKY_NODE_IPS" | wc -l` - master_addr=`echo "$SKY_NODE_IPS" | sed -n 1p` + master_addr=`echo "$SKY_NODE_IPS" | head -n1` python3 -m torch.distributed.launch --nproc_per_node=1 \ --nnodes=$num_nodes --node_rank=${SKY_NODE_RANK} --master_addr=$master_addr \ --master_port=8008 resnet_ddp.py --num_epochs 20 diff --git a/examples/resnet_distributed_torch.yaml b/examples/resnet_distributed_torch.yaml index 34ffbc8f6c2..8df3810c25e 100644 --- a/examples/resnet_distributed_torch.yaml +++ b/examples/resnet_distributed_torch.yaml @@ -18,7 +18,7 @@ run: | cd pytorch-distributed-resnet num_nodes=`echo "$SKY_NODE_IPS" | wc -l` - master_addr=`echo "$SKY_NODE_IPS" | sed -n 1p` + master_addr=`echo "$SKY_NODE_IPS" | head -n1` python3 -m torch.distributed.launch --nproc_per_node=1 \ --nnodes=$num_nodes --node_rank=${SKY_NODE_RANK} --master_addr=$master_addr \ --master_port=8008 resnet_ddp.py --num_epochs 20 diff --git a/examples/resnet_distributed_torch_scripts/run.sh 
b/examples/resnet_distributed_torch_scripts/run.sh
index 89f8e3ebc45..bc49f331280 100644
--- a/examples/resnet_distributed_torch_scripts/run.sh
+++ b/examples/resnet_distributed_torch_scripts/run.sh
@@ -3,7 +3,7 @@ conda activate resnet
 cd pytorch-distributed-resnet
 num_nodes=`echo "$SKY_NODE_IPS" | wc -l`
-master_addr=`echo "$SKY_NODE_IPS" | sed -n 1p`
+master_addr=`echo "$SKY_NODE_IPS" | head -n1`
 echo MASTER_ADDR $master_addr
 python3 -m torch.distributed.launch --nproc_per_node=1 \
 --nnodes=$num_nodes --node_rank=${SKY_NODE_RANK} --master_addr=$master_addr \
 --master_port=8008 resnet_ddp.py --num_epochs 20
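Taken together, the convention this series settles on is that `$SKY_NODE_IPS` is a newline-separated list of the reserved nodes' IP addresses and `$SKY_NODE_RANK` is the executing node's integer rank. A compact recap of how a `run` section, or a script invoked with `bash -i`, can consume them, consistent with the examples in the patches above (the echo line is illustrative only):

```bash
#!/bin/bash
# $SKY_NODE_IPS holds one IP address per line; $SKY_NODE_RANK is this node's rank.
num_nodes=`echo "$SKY_NODE_IPS" | wc -l`       # total number of reserved nodes
master_addr=`echo "$SKY_NODE_IPS" | head -n1`  # IP of the rank-0 (head) node
third_node=`echo "$SKY_NODE_IPS" | sed -n 3p`  # IP of node 3, empty if it does not exist

echo "rank=$SKY_NODE_RANK nodes=$num_nodes master=$master_addr third=$third_node"
```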