
Commit 003eac8 (parent: fa91016)

Support 'name:cnt' accelerators spec in YAML (#396)

* Support 'name:cnt' accelerators spec in YAML
* Fixes #373: 'sky start/down' should error out

15 files changed: +104, -63 lines
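In short, the `accelerators` field (in task YAML and in `sky.Resources`) now accepts a compact `<name>:<cnt>` string alongside the existing name-only and dict forms. A minimal sketch of the equivalence, mirroring the docstring updated in sky/resources.py below:

```python
import sky

# These three forms are equivalent after this commit; each resolves to
# the accelerator requirement {'V100': 1}.
r1 = sky.Resources(accelerators='V100')       # bare name; count defaults to 1
r2 = sky.Resources(accelerators='V100:1')     # new '<name>:<cnt>' string form
r3 = sky.Resources(accelerators={'V100': 1})  # original dict form
assert r1.accelerators == r2.accelerators == r3.accelerators == {'V100': 1}
```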

README.md (+1, -2)

@@ -14,8 +14,7 @@ sky launch -c mycluster hello_sky.yaml
 ```yaml
 # hello_sky.yaml
 resources:
-  accelerators:
-    K80: 4
+  accelerators: K80:4
 
 setup: |
   # Typical use: pip install -r requirements.txt

docs/source/getting-started/quickstart.rst (+9, -11)

@@ -37,22 +37,20 @@ requiring an NVIDIA Tesla K80 GPU on AWS. (More example yaml files can be found
    # hello_sky.yaml
 
    resources:
-      # Optional; if left out, pick from the available clouds.
-      cloud: aws
+     # Optional; if left out, pick from the available clouds.
+     cloud: aws
 
-      # Get more GPUs with
-      # accelerators:
-      #   K80: 8
-      accelerators: K80
+     # Get 1 K80 GPU. Use <name>:<n> to get more (e.g., "K80:8").
+     accelerators: K80
 
    setup: |
-      # Typical use: pip install -r requirements.txt
-      echo "running setup"
+     # Typical use: pip install -r requirements.txt
+     echo "running setup"
 
    run: |
-      # Typical use: make use of resources, such as running training.
-      echo "hello sky!"
-      conda env list
+     # Typical use: make use of resources, such as running training.
+     echo "hello sky!"
+     conda env list
 
 Sky handles selecting an appropriate VM based on user-specified resource
 constraints, launching the cluster on an appropriate cloud provider, and

docs/source/getting-started/tutorial.rst (+17, -19)

@@ -14,28 +14,27 @@ and run command:
    name: huggingface
 
    resources:
-      accelerators:
-         V100: 4
+     accelerators: V100:4
 
    setup: |
-      git clone https://github.com/huggingface/transformers/
-      cd transformers
-      pip3 install .
-      cd examples/pytorch/text-classification
-      pip3 install -r requirements.txt
+     git clone https://github.com/huggingface/transformers/
+     cd transformers
+     pip3 install .
+     cd examples/pytorch/text-classification
+     pip3 install -r requirements.txt
 
    run: |
-      cd transformers/examples/pytorch/text-classification
-      python3 run_glue.py \
-         --model_name_or_path bert-base-cased \
-         --dataset_name imdb \
-         --do_train \
-         --max_seq_length 128 \
-         --per_device_train_batch_size 32 \
-         --learning_rate 2e-5 \
-         --max_steps 50 \
-         --output_dir /tmp/imdb/ --overwrite_output_dir \
-         --fp16
+     cd transformers/examples/pytorch/text-classification
+     python3 run_glue.py \
+       --model_name_or_path bert-base-cased \
+       --dataset_name imdb \
+       --do_train \
+       --max_seq_length 128 \
+       --per_device_train_batch_size 32 \
+       --learning_rate 2e-5 \
+       --max_steps 50 \
+       --output_dir /tmp/imdb/ --overwrite_output_dir \
+       --fp16
 
 
 We can launch training by running:

@@ -93,4 +92,3 @@ If we wish to view the output for each run after it has completed we can use:
 
    $ # Cancel job job3 (ID: 3)
    $ sky cancel lm-cluster 3
-

docs/source/reference/interactive-nodes.rst (+1, -3)

@@ -69,8 +69,7 @@ By default, interactive clusters are a single node. If you require a cluster wit
 
    num_nodes: 16
    resources:
-     accelerators:
-       V100: 8
+     accelerators: V100:8
 
 .. code-block:: console
 

@@ -81,4 +80,3 @@ To log in to the head node:
 .. code-block:: console
 
    $ ssh my-cluster
-

docs/source/reference/yaml-spec.rst (+4, -3)

@@ -23,10 +23,11 @@ describe all fields available.
   resources:
     cloud: aws  # A cloud (optional) can be specified, if desired.
 
-    # Accelerator requirements (optional) can be specified, use sky show-gpus
+    # Accelerator requirements (optional) can be specified, use `sky show-gpus`
     # to view available accelerator configurations.
-    accelerators:
-      V100: 4  # Specify the accelerator type and the count per node.
+    # This specifies the accelerator type and the count per node. Format:
+    # <name>:<cnt> or <name> (short for a count of 1).
+    accelerators: V100:4
 
     # Accelerator arguments (optional) provides additional metadata for some
     # accelerators, such as the TensorFlow version for TPUs.

examples/huggingface_glue_imdb_app.yaml (+1, -2)

@@ -3,8 +3,7 @@ name: huggingface
 resources:
   accelerators: V100
   # The above is a shorthand for <name>: <count=1>. Same as:
-  # accelerators:
-  #   V100: 1
+  # accelerators: V100:1
 
 # The setup command. Will be run under the working directory.
 setup: |

examples/job_queue/job.yaml (+1, -2)

@@ -9,8 +9,7 @@
 name: job
 
 resources:
-  accelerators:
-    K80: 0.5
+  accelerators: K80:0.5
 
 setup: |
   echo "running setup"

examples/job_queue/job_gpu.yaml (+1, -3)

@@ -9,8 +9,7 @@
 name: job
 
 resources:
-  accelerators:
-    K80: 0.5
+  accelerators: K80:0.5
 
 # setup: |
 #   conda create -n test python=3.7 -y

@@ -25,4 +24,3 @@ run: |
   echo "started"
   python -u -c "import torch; a = torch.randn(10000, 10000).cuda(); b = torch.randn(10000, 10000).cuda(); [print((a @ b).sum()) for _ in range(10000000000)]"
   echo "ended"
-

examples/job_queue/job_multinode.yaml (+1, -2)

@@ -10,8 +10,7 @@
 name: job_multinode
 
 resources:
-  accelerators:
-    K80: 0.5
+  accelerators: K80:0.5
 
 num_nodes: 2
 

examples/many_gpu_vms.yaml (+1, -2)

@@ -2,8 +2,7 @@ name: many_gpu_vms
 
 resources:
   cloud: aws
-  accelerators:
-    V100: 8
+  accelerators: V100:8
   # use_spot: true
 
 num_nodes: 16

examples/ray_tune_app.yaml (+1, -2)

@@ -1,7 +1,6 @@
 resources:
   cloud: aws
-  accelerators:
-    V100: 1
+  accelerators: V100
 
 num_nodes: 2
 

examples/resnet_distributed_torch.yaml (+1, -2)

@@ -2,8 +2,7 @@ name: resnet-distributed-app
 
 
 resources:
-  accelerators:
-    V100: 1
+  accelerators: V100
 
 num_nodes: 2
 

sky/cli.py (+2, -5)

@@ -904,7 +904,7 @@ def stop(
 
 
 @cli.command(cls=_DocumentedCodeCommand)
-@click.argument('clusters', nargs=-1, required=False)
+@click.argument('clusters', nargs=-1, required=True)
 def start(clusters: Tuple[str]):
     """Restart cluster(s).
 

@@ -1038,10 +1038,7 @@ def down(
       sky down -a
 
     """
-    names = clusters
-    if not all and not names:
-        return
-    _terminate_or_stop_clusters(names, apply_to_all=all, terminate=True)
+    _terminate_or_stop_clusters(clusters, apply_to_all=all, terminate=True)
 
 
 def _terminate_or_stop_clusters(names: Tuple[str], apply_to_all: Optional[bool],
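The `start` change implements the #373 fix at the argument-parsing layer: click enforces `required=True` even for variadic (`nargs=-1`) arguments, so `sky start` with no clusters now errors out instead of silently doing nothing. The `down` change removes the early silent return, presumably deferring the empty-input case to `_terminate_or_stop_clusters`. A standalone sketch (not Sky's actual CLI) of the click behavior relied on here:

```python
import click


@click.command()
@click.argument('clusters', nargs=-1, required=True)
def start(clusters):
    """With required=True, invoking this command with no arguments makes
    click exit with a "Missing argument 'CLUSTERS...'" usage error."""
    click.echo(f"Restarting: {', '.join(clusters)}")


if __name__ == '__main__':
    start()  # `python start.py` fails; `python start.py c1 c2` succeeds
```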

sky/resources.py (+22, -5)

@@ -21,13 +21,15 @@ class Resources:
     Examples:
 
         # Fully specified cloud and instance type (is_launchable() is True).
-        sky.Resources(clouds.AWS(), 'p3.2xlarge'),
-        sky.Resources(clouds.GCP(), 'n1-standard-16'),
+        sky.Resources(clouds.AWS(), 'p3.2xlarge')
+        sky.Resources(clouds.GCP(), 'n1-standard-16')
         sky.Resources(clouds.GCP(), 'n1-standard-8', 'V100')
 
         # Specifying required resources; Sky decides the cloud/instance type.
-        sky.Resources(accelerators='V100'),
-        sky.Resources(clouds.GCP(), accelerators={'V100': 1}),
+        # The below are equivalent:
+        sky.Resources(accelerators='V100')
+        sky.Resources(accelerators='V100:1')
+        sky.Resources(accelerators={'V100': 1})
 
         # TODO:
         sky.Resources(requests={'mem': '16g', 'cpu': 8})

@@ -48,7 +50,22 @@ def __init__(
                 'If instance_type is specified, must specify the cloud'
         if accelerators is not None:
             if isinstance(accelerators, str):  # Convert to Dict[str, int].
-                accelerators = {accelerators: 1}
+                if ':' not in accelerators:
+                    accelerators = {accelerators: 1}
+                else:
+                    splits = accelerators.split(':')
+                    parse_error = ('The "accelerators" field as a str '
+                                   'should be <name> or <name>:<cnt>. '
+                                   f'Found: {accelerators!r}')
+                    if len(splits) != 2:
+                        raise ValueError(parse_error)
+                    try:
+                        accelerators = {splits[0]: int(splits[1])}
+                    except ValueError:
+                        try:
+                            accelerators = {splits[0]: float(splits[1])}
+                        except ValueError:
+                            raise ValueError(parse_error) from None
             assert len(accelerators) == 1, accelerators
 
         acc, _ = list(accelerators.items())[0]
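For reference, the new parsing rule pulled out as a standalone function (an illustrative sketch; the real logic lives inline in `Resources.__init__` above, and `parse_accelerators` is a hypothetical name):

```python
from typing import Dict, Union


def parse_accelerators(spec: str) -> Dict[str, Union[int, float]]:
    if ':' not in spec:
        return {spec: 1}  # bare name, e.g. 'V100', means a count of 1
    splits = spec.split(':')
    parse_error = ValueError('The "accelerators" field as a str '
                             'should be <name> or <name>:<cnt>. '
                             f'Found: {spec!r}')
    if len(splits) != 2:
        raise parse_error
    name, cnt = splits
    try:
        return {name: int(cnt)}  # e.g. 'V100:4' -> {'V100': 4}
    except ValueError:
        try:
            # Fractional counts are allowed, e.g. 'K80:0.5' for GPU sharing.
            return {name: float(cnt)}
        except ValueError:
            raise parse_error from None


assert parse_accelerators('V100') == {'V100': 1}
assert parse_accelerators('V100:4') == {'V100': 4}
assert parse_accelerators('K80:0.5') == {'K80': 0.5}
```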

tests/test_optimizer_dryruns.py (+41)

@@ -1,10 +1,21 @@
 import pytest
+import tempfile
+import textwrap
 
 import sky
 from sky import clouds
 from sky import exceptions
 
 
+def _test_parse_accelerators(spec, expected_accelerators):
+    with tempfile.NamedTemporaryFile('w') as f:
+        f.write(spec)
+        f.flush()
+        with sky.Dag():
+            task = sky.Task.from_yaml(f.name)
+        assert list(task.resources)[0].accelerators == expected_accelerators
+
+
 # Monkey-patching is required because in the test environment, no cloud is
 # enabled. The optimizer checks the environment to find enabled clouds, and
 # only generates plans within these clouds. The tests assume that all three

@@ -133,3 +144,33 @@ def test_instance_type_matches_accelerators(monkeypatch):
         sky.Resources(sky.AWS(),
                       instance_type='p3.16xlarge',
                       accelerators={'V100': 1}))
+
+
+def test_parse_accelerators_from_yaml():
+    spec = textwrap.dedent("""\
+      resources:
+        accelerators: V100""")
+    _test_parse_accelerators(spec, {'V100': 1})
+
+    spec = textwrap.dedent("""\
+      resources:
+        accelerators: V100:4""")
+    _test_parse_accelerators(spec, {'V100': 4})
+
+    spec = textwrap.dedent("""\
+      resources:
+        accelerators: V100:0.5""")
+    _test_parse_accelerators(spec, {'V100': 0.5})
+
+    spec = textwrap.dedent("""\
+      resources:
+        accelerators: \"V100: 0.5\"""")
+    _test_parse_accelerators(spec, {'V100': 0.5})
+
+    # Invalid.
+    spec = textwrap.dedent("""\
+      resources:
+        accelerators: \"V100: expected_a_float_here\"""")
+    with pytest.raises(ValueError) as e:
+        _test_parse_accelerators(spec, None)
+    assert 'The "accelerators" field as a str ' in str(e.value)
