From f03c6acd7d6e1341b0754c81fac158af0675f9ce Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Tue, 22 Feb 2022 17:40:26 -0800 Subject: [PATCH 01/14] Add conda activate support to bashrc --- sky/backends/cloud_vm_ray_backend.py | 11 ++++------- sky/skylet/log_lib.py | 1 - sky/templates/aws-ray.yml.j2 | 2 ++ 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index a9d4e0d6ffd..4b1604a1cce 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -734,7 +734,7 @@ def _retry_region_zones(self, to_provision: Resources, num_nodes: int, plural = '' if num_nodes == 1 else 's' logger.info( f'{style.BRIGHT}Successfully provisioned or found' - f' existing VM{plural}. Setup completed.{style.RESET_ALL}') + f' existing VM{plural}.{style.RESET_ALL}') return config_dict message = ('Failed to acquire resources in all regions/zones' f' (requested {to_provision}).' @@ -1219,13 +1219,10 @@ def setup(self, handle: ResourceHandle, task: Task) -> None: if task.setup is None: return - codegen = textwrap.dedent(f"""\ - #!/bin/bash - # TODO(zhwu): Move this to bashrc - . $(conda info --base)/etc/profile.d/conda.sh - {task.setup}""") + + setup_script = log_lib.make_task_bash_script(task.setup) with tempfile.NamedTemporaryFile('w', prefix='sky_setup_') as f: - f.write(codegen) + f.write(setup_script) f.flush() setup_sh_path = f.name setup_file = os.path.basename(setup_sh_path) diff --git a/sky/skylet/log_lib.py b/sky/skylet/log_lib.py index f44214c2c04..386aa7747fc 100644 --- a/sky/skylet/log_lib.py +++ b/sky/skylet/log_lib.py @@ -155,7 +155,6 @@ def make_task_bash_script(codegen: str) -> str: textwrap.dedent(f"""\ #!/bin/bash source ~/.bashrc - . $(conda info --base)/etc/profile.d/conda.sh 2> /dev/null || true cd {SKY_REMOTE_WORKDIR}"""), codegen, ] diff --git a/sky/templates/aws-ray.yml.j2 b/sky/templates/aws-ray.yml.j2 index 795309cb93a..a8185964f98 100644 --- a/sky/templates/aws-ray.yml.j2 +++ b/sky/templates/aws-ray.yml.j2 @@ -73,6 +73,8 @@ setup_commands: # This AMI's system Python is version 2+. - pip3 install -U ray[default]=={{ray_version}} && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app - pip3 uninstall sky -y &> /dev/null; pip3 install {{sky_remote_path}}/*.whl && python3 -c "from sky.skylet.ray_patches import patch; patch()" # patch the buggy ray file + # Make `conda activate` available + - grep -qxF '. $(conda info --base)/etc/profile.d/conda.sh' ~/.bashrc || echo '. $(conda info --base)/etc/profile.d/conda.sh' >> ~/.bashrc # Command to start ray on the head node. You don't need to change this. 
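The `grep -qxF ... || echo ... >> ~/.bashrc` setup command added to `aws-ray.yml.j2` above uses a common idempotent-append pattern: the conda hook line is written to `~/.bashrc` only if an identical line is not already present, so re-running the setup commands never duplicates it. A minimal standalone sketch of the same pattern (the file and line below simply restate what the template writes; they are illustrative, not additional patch content):

```bash
#!/bin/bash
# Append a line to a file only if that exact line is not already there.
# grep flags: -q quiet, -x match the whole line, -F treat the pattern as a fixed string.
line='. $(conda info --base)/etc/profile.d/conda.sh'
file="$HOME/.bashrc"
grep -qxF "$line" "$file" || echo "$line" >> "$file"

# Running this snippet repeatedly leaves exactly one copy of the line in ~/.bashrc.
```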
head_start_ray_commands: From ae96c9387ddd59f0b73074f6e5803075adbb154a Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Tue, 22 Feb 2022 19:38:35 -0800 Subject: [PATCH 02/14] Add doc and make sure conda activate works --- docs/source/getting-started/tutorial.rst | 8 ++++++++ sky/backends/cloud_vm_ray_backend.py | 3 ++- sky/skylet/log_lib.py | 1 - sky/templates/aws-ray.yml.j2 | 2 -- 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/docs/source/getting-started/tutorial.rst b/docs/source/getting-started/tutorial.rst index b1dda49a028..ea61b566be5 100644 --- a/docs/source/getting-started/tutorial.rst +++ b/docs/source/getting-started/tutorial.rst @@ -17,6 +17,10 @@ and run command: accelerators: V100:4 setup: | + # If using a my_setup.sh script to setup, please use + # `bash -i my_setup.sh` to capture the environment + # variable and make sure `conda activate` works + git clone https://github.com/huggingface/transformers/ cd transformers pip3 install . @@ -24,6 +28,10 @@ and run command: pip3 install -r requirements.txt run: | + # If using a my_run.sh script to run commands, please use + # `bash -i my_run.sh` to capture the environment variable + # and make sure `conda activate` works + cd transformers/examples/pytorch/text-classification python3 run_glue.py \ --model_name_or_path bert-base-cased \ diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 4b1604a1cce..b9a66c04705 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -1246,7 +1246,8 @@ def setup(self, handle: ResourceHandle, task: Task) -> None: with_outputs=False) backend_utils.run_command_on_ip_via_ssh( ip, - f'cd {SKY_REMOTE_WORKDIR}; /bin/bash /tmp/{setup_file}', + # -i will make sure `conda activate` works + f'/bin/bash -i /tmp/{setup_file}', ssh_user=ssh_user, ssh_private_key=ssh_private_key, log_path=os.path.join(self.log_dir, 'setup.log'), diff --git a/sky/skylet/log_lib.py b/sky/skylet/log_lib.py index 386aa7747fc..93921ae894e 100644 --- a/sky/skylet/log_lib.py +++ b/sky/skylet/log_lib.py @@ -154,7 +154,6 @@ def make_task_bash_script(codegen: str) -> str: script = [ textwrap.dedent(f"""\ #!/bin/bash - source ~/.bashrc cd {SKY_REMOTE_WORKDIR}"""), codegen, ] diff --git a/sky/templates/aws-ray.yml.j2 b/sky/templates/aws-ray.yml.j2 index a8185964f98..795309cb93a 100644 --- a/sky/templates/aws-ray.yml.j2 +++ b/sky/templates/aws-ray.yml.j2 @@ -73,8 +73,6 @@ setup_commands: # This AMI's system Python is version 2+. - pip3 install -U ray[default]=={{ray_version}} && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app - pip3 uninstall sky -y &> /dev/null; pip3 install {{sky_remote_path}}/*.whl && python3 -c "from sky.skylet.ray_patches import patch; patch()" # patch the buggy ray file - # Make `conda activate` available - - grep -qxF '. $(conda info --base)/etc/profile.d/conda.sh' ~/.bashrc || echo '. $(conda info --base)/etc/profile.d/conda.sh' >> ~/.bashrc # Command to start ray on the head node. You don't need to change this. 
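The `-i` flag matters here because a plain, non-interactive `bash` invocation does not read `~/.bashrc`, which is where the conda shell hook lives; without it, `conda activate` is simply not defined. A rough illustration of the difference, assuming the hook has been added to `~/.bashrc` as arranged above (the exact error text varies by conda version; this is not part of the patch):

```bash
#!/bin/bash
# Non-interactive bash skips ~/.bashrc, so the conda shell function is missing
# and `conda activate` typically fails with a "shell has not been properly
# configured" error:
bash -c 'conda activate base'

# An interactive shell (-i) sources ~/.bashrc first, so the conda hook is
# loaded and activation succeeds:
bash -i -c 'conda activate base && python --version'
```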
head_start_ray_commands: From 8bd498ccc6efb70f58921a31a018401740bd41f8 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Tue, 22 Feb 2022 19:46:59 -0800 Subject: [PATCH 03/14] bring back conda activate command for GCP --- sky/skylet/log_lib.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sky/skylet/log_lib.py b/sky/skylet/log_lib.py index 93921ae894e..f44214c2c04 100644 --- a/sky/skylet/log_lib.py +++ b/sky/skylet/log_lib.py @@ -154,6 +154,8 @@ def make_task_bash_script(codegen: str) -> str: script = [ textwrap.dedent(f"""\ #!/bin/bash + source ~/.bashrc + . $(conda info --base)/etc/profile.d/conda.sh 2> /dev/null || true cd {SKY_REMOTE_WORKDIR}"""), codegen, ] From f14a281c0bc8af6016b2b9329fe4b9d179909531 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Tue, 22 Feb 2022 19:47:10 -0800 Subject: [PATCH 04/14] Move comment to quickstart --- docs/source/getting-started/quickstart.rst | 8 ++++++++ docs/source/getting-started/tutorial.rst | 8 -------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/source/getting-started/quickstart.rst b/docs/source/getting-started/quickstart.rst index 83029485f6d..287b3d4370e 100644 --- a/docs/source/getting-started/quickstart.rst +++ b/docs/source/getting-started/quickstart.rst @@ -48,10 +48,18 @@ requiring an NVIDIA Tesla K80 GPU on AWS. (See more example yaml files in the `r setup: | # Typical use: pip install -r requirements.txt + + # If using a `my_setup.sh` script to setup, please use + # `bash -i my_setup.sh` to capture the environment + # variable and make sure `conda activate` works echo "running setup" run: | # Typical use: make use of resources, such as running training. + + # If using a my_run.sh script to run commands, please use + # `bash -i my_run.sh` to capture the environment variable + # and make sure `conda activate` works echo "hello sky!" conda env list diff --git a/docs/source/getting-started/tutorial.rst b/docs/source/getting-started/tutorial.rst index ea61b566be5..b1dda49a028 100644 --- a/docs/source/getting-started/tutorial.rst +++ b/docs/source/getting-started/tutorial.rst @@ -17,10 +17,6 @@ and run command: accelerators: V100:4 setup: | - # If using a my_setup.sh script to setup, please use - # `bash -i my_setup.sh` to capture the environment - # variable and make sure `conda activate` works - git clone https://github.com/huggingface/transformers/ cd transformers pip3 install . 
@@ -28,10 +24,6 @@ and run command: pip3 install -r requirements.txt run: | - # If using a my_run.sh script to run commands, please use - # `bash -i my_run.sh` to capture the environment variable - # and make sure `conda activate` works - cd transformers/examples/pytorch/text-classification python3 run_glue.py \ --model_name_or_path bert-base-cased \ From b3d654319965910aa750d5c3a57fb00363c8831e Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Tue, 22 Feb 2022 19:50:48 -0800 Subject: [PATCH 05/14] format --- sky/backends/cloud_vm_ray_backend.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index b9a66c04705..4be6a064de4 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -732,9 +732,8 @@ def _retry_region_zones(self, to_provision: Resources, num_nodes: int, cluster_name = config_dict['cluster_name'] plural = '' if num_nodes == 1 else 's' - logger.info( - f'{style.BRIGHT}Successfully provisioned or found' - f' existing VM{plural}.{style.RESET_ALL}') + logger.info(f'{style.BRIGHT}Successfully provisioned or found' + f' existing VM{plural}.{style.RESET_ALL}') return config_dict message = ('Failed to acquire resources in all regions/zones' f' (requested {to_provision}).' From ec83eafed64088450af9160354b7dcfbd7deea9b Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 23 Feb 2022 10:23:02 -0800 Subject: [PATCH 06/14] Fix comments --- docs/source/getting-started/quickstart.rst | 13 +++++----- sky/backends/cloud_vm_ray_backend.py | 28 +++++++++++++++------- 2 files changed, 26 insertions(+), 15 deletions(-) diff --git a/docs/source/getting-started/quickstart.rst b/docs/source/getting-started/quickstart.rst index 287b3d4370e..573c1883678 100644 --- a/docs/source/getting-started/quickstart.rst +++ b/docs/source/getting-started/quickstart.rst @@ -49,17 +49,18 @@ requiring an NVIDIA Tesla K80 GPU on AWS. (See more example yaml files in the `r setup: | # Typical use: pip install -r requirements.txt - # If using a `my_setup.sh` script to setup, please use - # `bash -i my_setup.sh` to capture the environment - # variable and make sure `conda activate` works + # If using a `my_setup.sh` script that requires conda, + # invoke it as below to ensure `conda activate` works: + # bash -i my_setup.sh echo "running setup" run: | # Typical use: make use of resources, such as running training. - # If using a my_run.sh script to run commands, please use - # `bash -i my_run.sh` to capture the environment variable - # and make sure `conda activate` works + # If using a `my_run.sh` script that requires conda and (or) + # sky environment variables, e.g. $SKY_NODE_RANK and $SKY_NODE_IPS, + # invoke it as below to ensure both works: + # `bash -i my_run.sh` echo "hello sky!" 
conda env list diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 4be6a064de4..cd4b0658ddb 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -8,6 +8,7 @@ import json import os import re +import sys import subprocess import tempfile import textwrap @@ -1243,15 +1244,24 @@ def setup(self, handle: ResourceHandle, task: Task) -> None: source=setup_sh_path, target=f'/tmp/{setup_file}', with_outputs=False) - backend_utils.run_command_on_ip_via_ssh( - ip, - # -i will make sure `conda activate` works - f'/bin/bash -i /tmp/{setup_file}', - ssh_user=ssh_user, - ssh_private_key=ssh_private_key, - log_path=os.path.join(self.log_dir, 'setup.log'), - check=True, - ssh_control_name=self._ssh_control_name(handle)) + try: + backend_utils.run_command_on_ip_via_ssh( + ip, + # -i will make sure `conda activate` works + f'/bin/bash -i /tmp/{setup_file}', + ssh_user=ssh_user, + ssh_private_key=ssh_private_key, + log_path=os.path.join(self.log_dir, 'setup.log'), + check=True, + ssh_control_name=self._ssh_control_name(handle)) + except subprocess.CalledProcessError as e: + logger.error( + f'{fore.RED}Setup failed with return code' + f' {e.returncode}.{style.RESET_ALL}') + # Suppress the error traceback. Fail as soon as + # possible (head node). + sys.exit(e.returncode) + logger.info(f'{fore.GREEN}Setup completed.{style.RESET_ALL}') def sync_down_logs(self, handle: ResourceHandle, job_id: int) -> None: codegen = backend_utils.JobLibCodeGen() From d1f99e9772c902606fdf3d2e41976cdb8521c424 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 23 Feb 2022 10:50:03 -0800 Subject: [PATCH 07/14] Add test/example of using user_script --- .../resnet_distributed_torch_scripts/run.sh | 7 +++++++ .../resnet_distributed_torch_scripts/setup.sh | 19 +++++++++++++++++++ .../resnet_distributed_torch_with_script.yaml | 13 +++++++++++++ sky/backends/cloud_vm_ray_backend.py | 5 ++--- 4 files changed, 41 insertions(+), 3 deletions(-) create mode 100644 examples/resnet_distributed_torch_scripts/run.sh create mode 100644 examples/resnet_distributed_torch_scripts/setup.sh create mode 100644 examples/resnet_distributed_torch_with_script.yaml diff --git a/examples/resnet_distributed_torch_scripts/run.sh b/examples/resnet_distributed_torch_scripts/run.sh new file mode 100644 index 00000000000..5bb0b456259 --- /dev/null +++ b/examples/resnet_distributed_torch_scripts/run.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +conda activate resnet +cd pytorch-distributed-resnet +python3 -m torch.distributed.launch --nproc_per_node=1 \ +--nnodes=${#SKY_NODE_IPS[@]} --node_rank=${SKY_NODE_RANK} --master_addr=${SKY_NODE_IPS[0]} \ +--master_port=8008 resnet_ddp.py --num_epochs 20 diff --git a/examples/resnet_distributed_torch_scripts/setup.sh b/examples/resnet_distributed_torch_scripts/setup.sh new file mode 100644 index 00000000000..d9082b563b6 --- /dev/null +++ b/examples/resnet_distributed_torch_scripts/setup.sh @@ -0,0 +1,19 @@ +#!/bin/bash +[ -d pytorch-distributed-resnet ] || git clone https://github.com/michaelzhiluo/pytorch-distributed-resnet +cd pytorch-distributed-resnet + +conda activate resnet +if [ $? 
-eq 0 ]; then + echo "conda env exists" +else + echo "conda env does not exist" + conda create -n resnet python=3.6 -y + conda activate resnet + pip3 install -r requirements.txt +fi + +mkdir -p data +mkdir -p saved_models +cd data +wget -c --quiet https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz +tar -xvzf cifar-10-python.tar.gz diff --git a/examples/resnet_distributed_torch_with_script.yaml b/examples/resnet_distributed_torch_with_script.yaml new file mode 100644 index 00000000000..9f744a5c8a2 --- /dev/null +++ b/examples/resnet_distributed_torch_with_script.yaml @@ -0,0 +1,13 @@ +name: resnet-distributed-app + + +resources: + accelerators: V100 + +num_nodes: 2 + +setup: | + bash -i resnet_distributed_torch_scripts/setup.sh + +run: | + bash -i resnet_distributed_torch_scripts/run.sh diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index cd4b0658ddb..59ed2cbe3d7 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -1255,9 +1255,8 @@ def setup(self, handle: ResourceHandle, task: Task) -> None: check=True, ssh_control_name=self._ssh_control_name(handle)) except subprocess.CalledProcessError as e: - logger.error( - f'{fore.RED}Setup failed with return code' - f' {e.returncode}.{style.RESET_ALL}') + logger.error(f'{fore.RED}Setup failed with return code' + f' {e.returncode}.{style.RESET_ALL}') # Suppress the error traceback. Fail as soon as # possible (head node). sys.exit(e.returncode) From 1e6b64c9f64103193fec54a633185c5320d95af6 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 23 Feb 2022 12:01:06 -0800 Subject: [PATCH 08/14] Fix indents --- README.md | 4 + docs/source/examples/distributed-jobs.rst | 22 ++--- docs/source/examples/grid-search.rst | 10 +-- .../source/examples/iterative-dev-project.rst | 10 +-- docs/source/getting-started/installation.rst | 60 ++++++------- docs/source/getting-started/quickstart.rst | 48 +++++----- docs/source/getting-started/tutorial.rst | 88 +++++++++---------- 7 files changed, 123 insertions(+), 119 deletions(-) diff --git a/README.md b/README.md index 2952926e930..8c61b788146 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,11 @@ workdir: . # Sync code dir to cloud setup: | # Typical use: pip install -r requirements.txt + echo "running setup" + # If using a `my_setup.sh` script that requires conda, + # invoke it as below to ensure `conda activate` works: + # bash -i my_setup.sh run: | # Typical use: make use of resources, such as running training. 
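For the `bash -i my_setup.sh` usage recommended in the README hunk above, a sketch of what such a hypothetical `my_setup.sh` might look like, mirroring the example setup script added earlier in this series (the environment name, Python version, and requirements file are placeholders):

```bash
#!/bin/bash
# Hypothetical my_setup.sh, invoked on the cluster as: bash -i my_setup.sh
# The -i invocation sources ~/.bashrc, so `conda activate` is available here.

conda activate myenv 2> /dev/null
if [ $? -ne 0 ]; then
  # Environment does not exist yet: create it, then activate it.
  conda create -n myenv python=3.9 -y
  conda activate myenv
fi
pip install -r requirements.txt  # placeholder dependency install
```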
diff --git a/docs/source/examples/distributed-jobs.rst b/docs/source/examples/distributed-jobs.rst index 22cc2510c7d..fa043be1853 100644 --- a/docs/source/examples/distributed-jobs.rst +++ b/docs/source/examples/distributed-jobs.rst @@ -11,23 +11,23 @@ For example, here is a simple PyTorch Distributed training example: name: resnet-distributed-app resources: - accelerators: V100 + accelerators: V100 num_nodes: 2 setup: | - pip3 install --upgrade pip - git clone https://github.com/michaelzhiluo/pytorch-distributed-resnet - cd pytorch-distributed-resnet && pip3 install -r requirements.txt - mkdir -p data && mkdir -p saved_models && cd data && \ - wget -c --quiet https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz - tar -xvzf cifar-10-python.tar.gz + pip3 install --upgrade pip + git clone https://github.com/michaelzhiluo/pytorch-distributed-resnet + cd pytorch-distributed-resnet && pip3 install -r requirements.txt + mkdir -p data && mkdir -p saved_models && cd data && \ + wget -c --quiet https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz + tar -xvzf cifar-10-python.tar.gz run: | - cd pytorch-distributed-resnet - python3 -m torch.distributed.launch --nproc_per_node=1 \ - --nnodes=${#SKY_NODE_IPS[@]} --node_rank=${SKY_NODE_RANK} --master_addr=${SKY_NODE_IPS[0]} \ - --master_port=8008 resnet_ddp.py --num_epochs 20 + cd pytorch-distributed-resnet + python3 -m torch.distributed.launch --nproc_per_node=1 \ + --nnodes=${#SKY_NODE_IPS[@]} --node_rank=${SKY_NODE_RANK} --master_addr=${SKY_NODE_IPS[0]} \ + --master_port=8008 resnet_ddp.py --num_epochs 20 In the above, :code:`num_nodes: 2` specifies that this task is to be run on 2 nodes. The commands in :code:`run` are executed on both nodes. Several useful diff --git a/docs/source/examples/grid-search.rst b/docs/source/examples/grid-search.rst index b76437d1a00..e8a7a16381b 100644 --- a/docs/source/examples/grid-search.rst +++ b/docs/source/examples/grid-search.rst @@ -12,11 +12,11 @@ Submitting multiple trials with different hyperparameters is simple: .. code-block:: bash - # Launch 4 trials in parallel - sky exec mycluster --gpus V100:1 -d -- python train.py --lr 1e-3 - sky exec mycluster --gpus V100:1 -d -- python train.py --lr 3e-3 - sky exec mycluster --gpus V100:1 -d -- python train.py --lr 1e-4 - sky exec mycluster --gpus V100:1 -d -- python train.py --lr 1e-2 + $ # Launch 4 trials in parallel + $ sky exec mycluster --gpus V100:1 -d -- python train.py --lr 1e-3 + $ sky exec mycluster --gpus V100:1 -d -- python train.py --lr 3e-3 + $ sky exec mycluster --gpus V100:1 -d -- python train.py --lr 1e-4 + $ sky exec mycluster --gpus V100:1 -d -- python train.py --lr 1e-2 # gets queued and will run once a GPU is available sky exec mycluster --gpus V100:1 -d -- python train.py --lr 1e-6 diff --git a/docs/source/examples/iterative-dev-project.rst b/docs/source/examples/iterative-dev-project.rst index d8ae0984648..3dbdf66f06e 100644 --- a/docs/source/examples/iterative-dev-project.rst +++ b/docs/source/examples/iterative-dev-project.rst @@ -47,8 +47,8 @@ Use the familiar scp/rsync to transfer files between your local machine and remo .. code-block:: - $ scp -r my_code/ dev:/path/to/destination # copy files to remote VM - $ scp -r dev:/path/to/source my_code/ # copy files from remote VM + $ scp -r my_code/ dev:/path/to/destination # copy files to remote VM + $ scp -r dev:/path/to/source my_code/ # copy files from remote VM Sky **simplifies code syncing** by the automatic transfer of a working directory to the cluster. 
The working directory can be configured with the @@ -57,8 +57,8 @@ option: .. code-block:: - $ sky launch --workdir=/path/to/code task.yaml - $ sky exec --workdir=/path/to/code task.yaml + $ sky launch --workdir=/path/to/code task.yaml + $ sky exec --workdir=/path/to/code task.yaml These commands sync the working directory to a location on the remote VM, and the task is run under that working directory (e.g., to invoke scripts, access @@ -79,4 +79,4 @@ To restart a stopped cluster: .. code-block:: console - $ sky start dev + $ sky start dev diff --git a/docs/source/getting-started/installation.rst b/docs/source/getting-started/installation.rst index d2dbd1c2d16..ac2db91c70b 100644 --- a/docs/source/getting-started/installation.rst +++ b/docs/source/getting-started/installation.rst @@ -7,11 +7,11 @@ Install Sky using pip: .. code-block:: console - $ # Clone the sky codebase - $ git clone ssh://git@github.com/sky-proj/sky.git - $ cd sky - $ # Sky requires python >= 3.6. - $ pip install ".[all]" + $ # Clone the sky codebase + $ git clone ssh://git@github.com/sky-proj/sky.git + $ cd sky + $ # Sky requires python >= 3.6. + $ pip install ".[all]" If you only want the dependencies for certain clouds, you can also use :code:`".[aws,azure,gcp]"`. @@ -26,11 +26,11 @@ tasks in the clouds, configure access to at least one cloud: .. code-block:: console - $ # Install boto - $ pip install boto3 + $ # Install boto + $ pip install boto3 - $ # Configure your AWS credentials - $ aws configure + $ # Configure your AWS credentials + $ aws configure To get the **AWS Access Key** required by the :code:`aws configure`, please refer to the `AWS manual `_. The **Default region name [None]:** and **Default output format [None]:** are optional. @@ -38,16 +38,16 @@ To get the **AWS Access Key** required by the :code:`aws configure`, please refe .. code-block:: console - $ pip install google-api-python-client - $ # Install `gcloud`; see https://cloud.google.com/sdk/docs/quickstart - $ conda install -c conda-forge google-cloud-sdk + $ pip install google-api-python-client + $ # Install `gcloud`; see https://cloud.google.com/sdk/docs/quickstart + $ conda install -c conda-forge google-cloud-sdk - $ # Init. - $ gcloud init + $ # Init. + $ gcloud init - $ # Run this if you don't have a credentials file. - $ # This will generate ~/.config/gcloud/application_default_credentials.json. - $ gcloud auth application-default login + $ # Run this if you don't have a credentials file. + $ # This will generate ~/.config/gcloud/application_default_credentials.json. + $ gcloud auth application-default login If you meet the following error (*RemoveError: 'requests' is a dependency of conda and cannot be removed from conda's operating environment*) while running :code:`conda install -c conda-forge google-cloud-sdk`, please try :code:`conda update --force conda` and run it again. @@ -56,12 +56,12 @@ If you meet the following error (*RemoveError: 'requests' is a dependency of con .. code-block:: console - $ # Install the Azure CLI - $ pip install azure-cli==2.30.0 - $ # Login azure - $ az login - $ # Set the subscription to use - $ az account set -s + $ # Install the Azure CLI + $ pip install azure-cli==2.30.0 + $ # Login azure + $ az login + $ # Set the subscription to use + $ az account set -s **Verifying cloud setup** @@ -70,16 +70,16 @@ the CLI: .. 
code-block:: console - $ # Verify cloud account setup - $ sky check + $ # Verify cloud account setup + $ sky check This will produce output verifying the correct setup of each supported cloud. .. code-block:: text - Checking credentials to enable clouds for Sky. - AWS: enabled - GCP: enabled - Azure: enabled + Checking credentials to enable clouds for Sky. + AWS: enabled + GCP: enabled + Azure: enabled - Sky will use only the enabled clouds to run tasks. To change this, configure cloud credentials, and run sky check. + Sky will use only the enabled clouds to run tasks. To change this, configure cloud credentials, and run sky check. diff --git a/docs/source/getting-started/quickstart.rst b/docs/source/getting-started/quickstart.rst index 573c1883678..2a982e66760 100644 --- a/docs/source/getting-started/quickstart.rst +++ b/docs/source/getting-started/quickstart.rst @@ -35,34 +35,34 @@ requiring an NVIDIA Tesla K80 GPU on AWS. (See more example yaml files in the `r .. code-block:: yaml - # hello_sky.yaml + # hello_sky.yaml - resources: - # Optional; if left out, pick from the available clouds. - cloud: aws + resources: + # Optional; if left out, pick from the available clouds. + cloud: aws - # Get 1 K80 GPU. Use : to get more (e.g., "K80:8"). - accelerators: K80 + # Get 1 K80 GPU. Use : to get more (e.g., "K80:8"). + accelerators: K80 - workdir: . + workdir: . - setup: | - # Typical use: pip install -r requirements.txt + setup: | + # Typical use: pip install -r requirements.txt - # If using a `my_setup.sh` script that requires conda, - # invoke it as below to ensure `conda activate` works: - # bash -i my_setup.sh - echo "running setup" + echo "running setup" + # If using a `my_setup.sh` script that requires conda, + # invoke it as below to ensure `conda activate` works: + # bash -i my_setup.sh - run: | - # Typical use: make use of resources, such as running training. + run: | + # Typical use: make use of resources, such as running training. - # If using a `my_run.sh` script that requires conda and (or) - # sky environment variables, e.g. $SKY_NODE_RANK and $SKY_NODE_IPS, - # invoke it as below to ensure both works: - # `bash -i my_run.sh` - echo "hello sky!" - conda env list + echo "hello sky!" + conda env list + # If using a `my_run.sh` script that requires conda and (or) + # sky environment variables, e.g. $SKY_NODE_RANK and $SKY_NODE_IPS, + # invoke it as below to ensure both works: + # `bash -i my_run.sh` Sky handles selecting an appropriate VM based on user-specified resource constraints, launching the cluster on an appropriate cloud provider, and @@ -72,7 +72,7 @@ To launch a task based on our above YAML spec, we can use :code:`sky launch`. .. code-block:: console - $ sky launch -c mycluster hello_sky.yaml + $ sky launch -c mycluster hello_sky.yaml The :code:`-c` option allows us to specify a cluster name. If a cluster with the same name already exists, Sky will reuse that cluster. If no such cluster @@ -84,7 +84,7 @@ We can view our existing clusters by running :code:`sky status`: .. code-block:: console - $ sky status + $ sky status This may show multiple clusters, if you have created several: @@ -98,7 +98,7 @@ If you would like to log into the a cluster, Sky provides convenient SSH access .. code-block:: console - $ ssh mycluster + $ ssh mycluster Sky is more than a tool for easily provisioning and managing multiple clusters on different clouds. 
It also comes with features for storing and moving data, diff --git a/docs/source/getting-started/tutorial.rst b/docs/source/getting-started/tutorial.rst index b1dda49a028..0189ceb5ae7 100644 --- a/docs/source/getting-started/tutorial.rst +++ b/docs/source/getting-started/tutorial.rst @@ -9,32 +9,32 @@ and run command: .. code-block:: yaml - # dnn.yaml - - name: huggingface - - resources: - accelerators: V100:4 - - setup: | - git clone https://github.com/huggingface/transformers/ - cd transformers - pip3 install . - cd examples/pytorch/text-classification - pip3 install -r requirements.txt - - run: | - cd transformers/examples/pytorch/text-classification - python3 run_glue.py \ - --model_name_or_path bert-base-cased \ - --dataset_name imdb \ - --do_train \ - --max_seq_length 128 \ - --per_device_train_batch_size 32 \ - --learning_rate 2e-5 \ - --max_steps 50 \ - --output_dir /tmp/imdb/ --overwrite_output_dir \ - --fp16 + # dnn.yaml + + name: huggingface + + resources: + accelerators: V100:4 + + setup: | + git clone https://github.com/huggingface/transformers/ + cd transformers + pip3 install . + cd examples/pytorch/text-classification + pip3 install -r requirements.txt + + run: | + cd transformers/examples/pytorch/text-classification + python3 run_glue.py \ + --model_name_or_path bert-base-cased \ + --dataset_name imdb \ + --do_train \ + --max_seq_length 128 \ + --per_device_train_batch_size 32 \ + --learning_rate 2e-5 \ + --max_steps 50 \ + --output_dir /tmp/imdb/ --overwrite_output_dir \ + --fp16 We can launch training by running: @@ -57,12 +57,12 @@ terminal, which is useful for launching long-running jobs concurrently. .. code-block:: console - $ # Launch 4 jobs, perhaps with different hyperparameters. - $ # We can override the task name with `-n` (optional) and resource requirement with `--gpus` (optional) - $ sky exec lm-cluster dnn.yaml -d -n job2 --gpus=V100:1 - $ sky exec lm-cluster dnn.yaml -d -n job3 --gpus=V100:1 - $ sky exec lm-cluster dnn.yaml -d -n job4 --gpus=V100:3 - $ sky exec lm-cluster dnn.yaml -d -n job5 --gpus=V100:2 + $ # Launch 4 jobs, perhaps with different hyperparameters. + $ # We can override the task name with `-n` (optional) and resource requirement with `--gpus` (optional) + $ sky exec lm-cluster dnn.yaml -d -n job2 --gpus=V100:1 + $ sky exec lm-cluster dnn.yaml -d -n job3 --gpus=V100:1 + $ sky exec lm-cluster dnn.yaml -d -n job4 --gpus=V100:3 + $ sky exec lm-cluster dnn.yaml -d -n job5 --gpus=V100:2 Because the cluster only has 4 V100 GPU, we will see the following behavior: @@ -76,19 +76,19 @@ If we wish to view the output for each run after it has completed we can use: .. 
code-block:: console - $ # View the jobs in the queue - $ sky queue lm-cluster + $ # View the jobs in the queue + $ sky queue lm-cluster - ID NAME USER SUBMITTED STARTED STATUS - 5 job5 user 10 mins ago 10 mins ago RUNNING - 4 job4 user 10 mins ago - PENDING - 3 job3 user 10 mins ago 9 mins ago RUNNING - 2 job2 user 10 mins ago 9 mins ago RUNNING - 1 huggingface user 10 mins ago 1 min ago SUCCEEDED + ID NAME USER SUBMITTED STARTED STATUS + 5 job5 user 10 mins ago 10 mins ago RUNNING + 4 job4 user 10 mins ago - PENDING + 3 job3 user 10 mins ago 9 mins ago RUNNING + 2 job2 user 10 mins ago 9 mins ago RUNNING + 1 huggingface user 10 mins ago 1 min ago SUCCEEDED - $ # Stream the logs of job5 (ID: 5) to the console - $ sky logs lm-cluster 5 + $ # Stream the logs of job5 (ID: 5) to the console + $ sky logs lm-cluster 5 - $ # Cancel job job3 (ID: 3) - $ sky cancel lm-cluster 3 + $ # Cancel job job3 (ID: 3) + $ sky cancel lm-cluster 3 From 0b9d33160574f49c3f33c2d872f3e48955c82a9f Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 23 Feb 2022 14:27:29 -0800 Subject: [PATCH 09/14] bash -i only for conda activate --- docs/source/getting-started/quickstart.rst | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/source/getting-started/quickstart.rst b/docs/source/getting-started/quickstart.rst index 2a982e66760..7a4250f0bfb 100644 --- a/docs/source/getting-started/quickstart.rst +++ b/docs/source/getting-started/quickstart.rst @@ -59,9 +59,8 @@ requiring an NVIDIA Tesla K80 GPU on AWS. (See more example yaml files in the `r echo "hello sky!" conda env list - # If using a `my_run.sh` script that requires conda and (or) - # sky environment variables, e.g. $SKY_NODE_RANK and $SKY_NODE_IPS, - # invoke it as below to ensure both works: + # If using a `my_run.sh` script that requires conda, + # invoke it as below to ensure `conda activate` works: # `bash -i my_run.sh` Sky handles selecting an appropriate VM based on user-specified resource From 4cc5113d4dbfa5f75977f521374a3b43263a52da Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 23 Feb 2022 15:05:34 -0800 Subject: [PATCH 10/14] Fix the SKY_NODE_IPS fail to pass to the shell script --- docs/source/examples/distributed-jobs.rst | 11 ++++++++--- examples/resnet_distributed_torch.yaml | 5 ++++- examples/resnet_distributed_torch_scripts/run.sh | 5 ++++- examples/resnet_distributed_torch_scripts/setup.sh | 2 +- examples/resnet_distributed_torch_with_script.yaml | 7 +++++-- sky/backends/cloud_vm_ray_backend.py | 4 ++-- 6 files changed, 24 insertions(+), 10 deletions(-) diff --git a/docs/source/examples/distributed-jobs.rst b/docs/source/examples/distributed-jobs.rst index fa043be1853..12158eef228 100644 --- a/docs/source/examples/distributed-jobs.rst +++ b/docs/source/examples/distributed-jobs.rst @@ -25,8 +25,11 @@ For example, here is a simple PyTorch Distributed training example: run: | cd pytorch-distributed-resnet + + num_nodes=`echo "$SKY_NODE_IPS" | wc -l` + master_addr=`echo "$SKY_NODE_IPS" | sed -n 1p` python3 -m torch.distributed.launch --nproc_per_node=1 \ - --nnodes=${#SKY_NODE_IPS[@]} --node_rank=${SKY_NODE_RANK} --master_addr=${SKY_NODE_IPS[0]} \ + --nnodes=$num_nodes --node_rank=${SKY_NODE_RANK} --master_addr=$master_addr \ --master_port=8008 resnet_ddp.py --num_epochs 20 In the above, :code:`num_nodes: 2` specifies that this task is to be run on 2 @@ -35,5 +38,7 @@ environment variables are available to distinguish per-node commands: - :code:`SKY_NODE_RANK`: rank (an integer ID from 0 to :code:`num_nodes-1`) of 
the node executing the task -- :code:`SKY_NODE_IPS`: a list of IP addresses of the nodes reserved to execute - the task +- :code:`SKY_NODE_IPS`: a string of IP addresses of the nodes reserved to execute + the task, where each line contains one IP address. You can retrieve the number of + nodes by :code:`echo "$SKY_NODE_IPS" | wc -l` and the IP address of node-3 by + :code:`echo "$SKY_NODE_IPS" | cut -n 3p` diff --git a/examples/resnet_distributed_torch.yaml b/examples/resnet_distributed_torch.yaml index 4637ab90319..34ffbc8f6c2 100644 --- a/examples/resnet_distributed_torch.yaml +++ b/examples/resnet_distributed_torch.yaml @@ -16,6 +16,9 @@ setup: | run: | cd pytorch-distributed-resnet + + num_nodes=`echo "$SKY_NODE_IPS" | wc -l` + master_addr=`echo "$SKY_NODE_IPS" | sed -n 1p` python3 -m torch.distributed.launch --nproc_per_node=1 \ - --nnodes=${#SKY_NODE_IPS[@]} --node_rank=${SKY_NODE_RANK} --master_addr=${SKY_NODE_IPS[0]} \ + --nnodes=$num_nodes --node_rank=${SKY_NODE_RANK} --master_addr=$master_addr \ --master_port=8008 resnet_ddp.py --num_epochs 20 diff --git a/examples/resnet_distributed_torch_scripts/run.sh b/examples/resnet_distributed_torch_scripts/run.sh index 5bb0b456259..89f8e3ebc45 100644 --- a/examples/resnet_distributed_torch_scripts/run.sh +++ b/examples/resnet_distributed_torch_scripts/run.sh @@ -2,6 +2,9 @@ conda activate resnet cd pytorch-distributed-resnet +num_nodes=`echo "$SKY_NODE_IPS" | wc -l` +master_addr=`echo "$SKY_NODE_IPS" | sed -n 1p` +echo MASTER_ADDR $master_addr python3 -m torch.distributed.launch --nproc_per_node=1 \ ---nnodes=${#SKY_NODE_IPS[@]} --node_rank=${SKY_NODE_RANK} --master_addr=${SKY_NODE_IPS[0]} \ +--nnodes=$num_nodes --node_rank=${SKY_NODE_RANK} --master_addr=$master_addr \ --master_port=8008 resnet_ddp.py --num_epochs 20 diff --git a/examples/resnet_distributed_torch_scripts/setup.sh b/examples/resnet_distributed_torch_scripts/setup.sh index d9082b563b6..a87a0194041 100644 --- a/examples/resnet_distributed_torch_scripts/setup.sh +++ b/examples/resnet_distributed_torch_scripts/setup.sh @@ -9,8 +9,8 @@ else echo "conda env does not exist" conda create -n resnet python=3.6 -y conda activate resnet - pip3 install -r requirements.txt fi +pip install -r requirements.txt mkdir -p data mkdir -p saved_models diff --git a/examples/resnet_distributed_torch_with_script.yaml b/examples/resnet_distributed_torch_with_script.yaml index 9f744a5c8a2..374f5d74857 100644 --- a/examples/resnet_distributed_torch_with_script.yaml +++ b/examples/resnet_distributed_torch_with_script.yaml @@ -2,12 +2,15 @@ name: resnet-distributed-app resources: + cloud: aws accelerators: V100 num_nodes: 2 +workdir: . 
+ setup: | - bash -i resnet_distributed_torch_scripts/setup.sh + bash -i examples/resnet_distributed_torch_scripts/setup.sh run: | - bash -i resnet_distributed_torch_scripts/run.sh + bash -i examples/resnet_distributed_torch_scripts/run.sh diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 59ed2cbe3d7..fff3c74fa25 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -244,8 +244,8 @@ def check_ip(): for i in range(pg.bundle_count) ]) print('SKY INFO: Placement group IPs:', ip_list) - ip_list_str = ' '.join([repr(ip) for ip in ip_list]) - export_sky_env_vars = 'export SKY_NODE_IPS=(' + ip_list_str + ')\\n' + ip_list_str = '\\n'.join(ip_list) + export_sky_env_vars = 'export SKY_NODE_IPS="' + ip_list_str + '"\\n' """), ] From f18540e0984d49d614014cf3906bcae960ddf58f Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 23 Feb 2022 15:17:53 -0800 Subject: [PATCH 11/14] Update readme --- README.md | 19 ++++++++---- docs/source/getting-started/quickstart.rst | 34 +++++++++++----------- 2 files changed, 31 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index b49e99ae8d8..9305d137f9a 100644 --- a/README.md +++ b/README.md @@ -13,23 +13,32 @@ sky launch -c mycluster hello_sky.yaml ```yaml # hello_sky.yaml + resources: - accelerators: V100:1 # 1x NVIDIA V100 GPU + # Optional; if left out, pick from the available clouds. + cloud: aws -workdir: . # Sync code dir to cloud + # Get 1 K80 GPU. Use : to get more (e.g., "K80:8"). + accelerators: K80 -setup: | - # Typical use: pip install -r requirements.txt +# Working directory (optional) containing the project codebase. +# This directory will be synced to ~/sky_workdir on the provisioned cluster. +workdir: . +# Typical use: pip install -r requirements.txt +setup: | echo "running setup" # If using a `my_setup.sh` script that requires conda, # invoke it as below to ensure `conda activate` works: # bash -i my_setup.sh +# Typical use: make use of resources, such as running training. run: | - # Typical use: make use of resources, such as running training. echo "hello sky!" conda env list + # If using a `my_run.sh` script that requires conda, + # invoke it as below to ensure `conda activate` works: + # `bash -i my_run.sh` ``` ## Getting Started diff --git a/docs/source/getting-started/quickstart.rst b/docs/source/getting-started/quickstart.rst index c70f5d6b520..3211925f7df 100644 --- a/docs/source/getting-started/quickstart.rst +++ b/docs/source/getting-started/quickstart.rst @@ -31,23 +31,23 @@ Let's provision an instance with a single K80 GPU. .. code-block:: bash - # Provisions/reuses an interactive node with a single K80 GPU. - # Any of the interactive node commands (gpunode, tpunode, cpunode) - # will automatically log in to the cluster. - sky gpunode -c mygpu --gpus K80 - - Last login: Wed Feb 23 22:35:47 2022 from 136.152.143.101 - ubuntu@ip-172-31-86-108:~$ gpustat - ip-172-31-86-108 Wed Feb 23 22:42:43 2022 450.142.00 - [0] Tesla K80 | 31°C, 0 % | 0 / 11441 MB | - ubuntu@ip-172-31-86-108:~$ - ^D - - # View the machine in the cluster table. - sky status - - NAME LAUNCHED RESOURCES COMMAND STATUS - mygpu a few secs ago 1x Azure(Standard_NC6_Promo) sky gpunode -c mygpu --gpus K80 UP + # Provisions/reuses an interactive node with a single K80 GPU. + # Any of the interactive node commands (gpunode, tpunode, cpunode) + # will automatically log in to the cluster. 
+ sky gpunode -c mygpu --gpus K80 + + Last login: Wed Feb 23 22:35:47 2022 from 136.152.143.101 + ubuntu@ip-172-31-86-108:~$ gpustat + ip-172-31-86-108 Wed Feb 23 22:42:43 2022 450.142.00 + [0] Tesla K80 | 31°C, 0 % | 0 / 11441 MB | + ubuntu@ip-172-31-86-108:~$ + ^D + + # View the machine in the cluster table. + sky status + + NAME LAUNCHED RESOURCES COMMAND STATUS + mygpu a few secs ago 1x Azure(Standard_NC6_Promo) sky gpunode -c mygpu --gpus K80 UP After you are done, run :code:`sky down mygpu` to terminate the cluster. Find more details on managing the lifecycle of your cluster :ref:`here `. From b157fecafef6855d95adba02529ed01afeca306e Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 23 Feb 2022 15:38:25 -0800 Subject: [PATCH 12/14] update env_check --- examples/env_check.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/env_check.yaml b/examples/env_check.yaml index c14460f5d4e..c05d75188b8 100644 --- a/examples/env_check.yaml +++ b/examples/env_check.yaml @@ -25,4 +25,6 @@ run: | fi echo NODE ID: $SKY_NODE_RANK - echo NODE IP: ${SKY_NODE_IPS[$SKY_NODE_RANK]} + echo NODE IPS: "$SKY_NODE_IPS" + worker_addr=`echo "$SKY_NODE_IPS" | sed -n 2p` + echo Worker IP: $worker_addr From 4b91b2aedd60c14aa1ea621be8634fd9258b0349 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 23 Feb 2022 20:46:23 -0800 Subject: [PATCH 13/14] Fix comments --- README.md | 5 ++--- docs/source/examples/distributed-jobs.rst | 4 ++-- docs/source/getting-started/quickstart.rst | 3 +-- sky/backends/cloud_vm_ray_backend.py | 3 ++- 4 files changed, 7 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 9305d137f9a..d17ad3f5f61 100644 --- a/README.md +++ b/README.md @@ -18,8 +18,7 @@ resources: # Optional; if left out, pick from the available clouds. cloud: aws - # Get 1 K80 GPU. Use : to get more (e.g., "K80:8"). - accelerators: K80 + accelerators: V100:1 # 1x NVIDIA V100 GPU # Working directory (optional) containing the project codebase. # This directory will be synced to ~/sky_workdir on the provisioned cluster. @@ -38,7 +37,7 @@ run: | conda env list # If using a `my_run.sh` script that requires conda, # invoke it as below to ensure `conda activate` works: - # `bash -i my_run.sh` + # bash -i my_run.sh ``` ## Getting Started diff --git a/docs/source/examples/distributed-jobs.rst b/docs/source/examples/distributed-jobs.rst index 12158eef228..5bea00fd2c1 100644 --- a/docs/source/examples/distributed-jobs.rst +++ b/docs/source/examples/distributed-jobs.rst @@ -30,7 +30,7 @@ For example, here is a simple PyTorch Distributed training example: master_addr=`echo "$SKY_NODE_IPS" | sed -n 1p` python3 -m torch.distributed.launch --nproc_per_node=1 \ --nnodes=$num_nodes --node_rank=${SKY_NODE_RANK} --master_addr=$master_addr \ - --master_port=8008 resnet_ddp.py --num_epochs 20 + --master_port=8008 resnet_ddp.py --num_epochs 20 In the above, :code:`num_nodes: 2` specifies that this task is to be run on 2 nodes. The commands in :code:`run` are executed on both nodes. Several useful @@ -41,4 +41,4 @@ environment variables are available to distinguish per-node commands: - :code:`SKY_NODE_IPS`: a string of IP addresses of the nodes reserved to execute the task, where each line contains one IP address. 
You can retrieve the number of nodes by :code:`echo "$SKY_NODE_IPS" | wc -l` and the IP address of node-3 by - :code:`echo "$SKY_NODE_IPS" | cut -n 3p` + :code:`echo "$SKY_NODE_IPS" | sed -n 3p` diff --git a/docs/source/getting-started/quickstart.rst b/docs/source/getting-started/quickstart.rst index 3211925f7df..15deaed3322 100644 --- a/docs/source/getting-started/quickstart.rst +++ b/docs/source/getting-started/quickstart.rst @@ -84,8 +84,7 @@ requiring an NVIDIA Tesla K80 GPU on AWS. See more example yaml files in the `re # Optional; if left out, pick from the available clouds. cloud: aws - # Get 1 K80 GPU. Use : to get more (e.g., "K80:8"). - accelerators: K80 + accelerators: V100:1 # 1x NVIDIA V100 GPU # Working directory (optional) containing the project codebase. # This directory will be synced to ~/sky_workdir on the provisioned cluster. diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index fff3c74fa25..15edce9b34b 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -648,6 +648,7 @@ def _retry_region_zones(self, to_provision: Resources, num_nodes: int, dryrun: bool, stream_logs: bool, cluster_name: str): """The provision retry loop.""" style = colorama.Style + fore = colorama.Fore # Get log_path name log_path = os.path.join(self.log_dir, 'provision.log') log_abs_path = os.path.abspath(log_path) @@ -733,7 +734,7 @@ def _retry_region_zones(self, to_provision: Resources, num_nodes: int, cluster_name = config_dict['cluster_name'] plural = '' if num_nodes == 1 else 's' - logger.info(f'{style.BRIGHT}Successfully provisioned or found' + logger.info(f'{fore.GREEN}Successfully provisioned or found' f' existing VM{plural}.{style.RESET_ALL}') return config_dict message = ('Failed to acquire resources in all regions/zones' From 77bbea30bfdfde5e15c9a75f66592ef312216836 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Wed, 23 Feb 2022 21:29:17 -0800 Subject: [PATCH 14/14] Change to head -n1 --- docs/source/examples/distributed-jobs.rst | 2 +- examples/resnet_distributed_torch.yaml | 2 +- examples/resnet_distributed_torch_scripts/run.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/examples/distributed-jobs.rst b/docs/source/examples/distributed-jobs.rst index 5bea00fd2c1..f004522128e 100644 --- a/docs/source/examples/distributed-jobs.rst +++ b/docs/source/examples/distributed-jobs.rst @@ -27,7 +27,7 @@ For example, here is a simple PyTorch Distributed training example: cd pytorch-distributed-resnet num_nodes=`echo "$SKY_NODE_IPS" | wc -l` - master_addr=`echo "$SKY_NODE_IPS" | sed -n 1p` + master_addr=`echo "$SKY_NODE_IPS" | head -n1` python3 -m torch.distributed.launch --nproc_per_node=1 \ --nnodes=$num_nodes --node_rank=${SKY_NODE_RANK} --master_addr=$master_addr \ --master_port=8008 resnet_ddp.py --num_epochs 20 diff --git a/examples/resnet_distributed_torch.yaml b/examples/resnet_distributed_torch.yaml index 34ffbc8f6c2..8df3810c25e 100644 --- a/examples/resnet_distributed_torch.yaml +++ b/examples/resnet_distributed_torch.yaml @@ -18,7 +18,7 @@ run: | cd pytorch-distributed-resnet num_nodes=`echo "$SKY_NODE_IPS" | wc -l` - master_addr=`echo "$SKY_NODE_IPS" | sed -n 1p` + master_addr=`echo "$SKY_NODE_IPS" | head -n1` python3 -m torch.distributed.launch --nproc_per_node=1 \ --nnodes=$num_nodes --node_rank=${SKY_NODE_RANK} --master_addr=$master_addr \ --master_port=8008 resnet_ddp.py --num_epochs 20 diff --git a/examples/resnet_distributed_torch_scripts/run.sh 
b/examples/resnet_distributed_torch_scripts/run.sh
index 89f8e3ebc45..bc49f331280 100644
--- a/examples/resnet_distributed_torch_scripts/run.sh
+++ b/examples/resnet_distributed_torch_scripts/run.sh
@@ -3,7 +3,7 @@ conda activate resnet
 cd pytorch-distributed-resnet
 num_nodes=`echo "$SKY_NODE_IPS" | wc -l`
-master_addr=`echo "$SKY_NODE_IPS" | sed -n 1p`
+master_addr=`echo "$SKY_NODE_IPS" | head -n1`
 echo MASTER_ADDR $master_addr
 python3 -m torch.distributed.launch --nproc_per_node=1 \
 --nnodes=$num_nodes --node_rank=${SKY_NODE_RANK} --master_addr=$master_addr \
 --master_port=8008 resnet_ddp.py --num_epochs 20
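Taken together, the convention this series settles on is that `$SKY_NODE_IPS` is a newline-separated list of the reserved nodes' IP addresses and `$SKY_NODE_RANK` is the executing node's integer rank. A compact recap of how a `run` section, or a script invoked with `bash -i`, can consume them, consistent with the examples in the patches above (the echo line is illustrative only):

```bash
#!/bin/bash
# $SKY_NODE_IPS holds one IP address per line; $SKY_NODE_RANK is this node's rank.
num_nodes=`echo "$SKY_NODE_IPS" | wc -l`       # total number of reserved nodes
master_addr=`echo "$SKY_NODE_IPS" | head -n1`  # IP of the rank-0 (head) node
third_node=`echo "$SKY_NODE_IPS" | sed -n 3p`  # IP of node 3, empty if it does not exist

echo "rank=$SKY_NODE_RANK nodes=$num_nodes master=$master_addr third=$third_node"
```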