[CI][GCE][RLlib] Add GCE variations to RLlib release tests (#34080)

* Update cluster compute yamls
* Add tests to release_tests.yaml
* Update region and allowed azs to fit GCE

Signed-off-by: Artur Niederfahrenhorst <artur@anyscale.com>
ray-project · Apr 5, 2023 · 3d335e1 · 3d335e1
1 parent fb441ce
commit 3d335e1
Show file tree

Hide file tree

Showing 15 changed files with 323 additions and 38 deletions.
diff --git a/release/long_running_tests/tpl_cpu_3_gce.yaml b/release/long_running_tests/tpl_cpu_3_gce.yaml
@@ -1,6 +1,6 @@
 cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
 region: us-west1
-allowed_azs: 
+allowed_azs:
     - us-west1-c
 
 max_workers: 2

diff --git a/release/release_tests.yaml b/release/release_tests.yaml
@@ -1707,21 +1707,21 @@
     wait_for_nodes:
       num_nodes: 3
 
-
   smoke_test:
     frequency: nightly
 
     run:
       timeout: 3600
 
+
   alert: long_running_tests
 
   variations:
     - __suffix__: aws
     - __suffix__: gce
       env: gce
       frequency: manual
-      smoke_test: 
+      smoke_test:
         frequency: manual
         run:
           timeout: 3600
@@ -2820,6 +2820,15 @@
 
   alert: default
 
+  variations:
+    - __suffix__: aws
+    - __suffix__: gce
+      env: gce
+      frequency: manual
+      cluster:
+        cluster_env: app_config.yaml
+        cluster_compute: 2gpus_32cpus_gce.yaml
+
 - name: rllib_learning_tests_appo_torch
   group: RLlib tests
   working_dir: rllib_tests
@@ -3142,6 +3151,15 @@
 
   alert: default
 
+  variations:
+    - __suffix__: aws
+    - __suffix__: gce
+      env: gce
+      frequency: manual
+      cluster:
+        cluster_env: app_config.yaml
+        cluster_compute: 2gpus_32cpus_gce.yaml
+
 - name: rllib_learning_tests_sac_tf
   group: RLlib tests
   working_dir: rllib_tests
@@ -3287,6 +3305,15 @@
 
   alert: default
 
+  variations:
+    - __suffix__: aws
+    - __suffix__: gce
+      env: gce
+      frequency: manual
+      cluster:
+        cluster_env: app_config.yaml
+        cluster_compute: 8gpus_96cpus_gce.yaml
+
 - name: rllib_multi_gpu_with_attention_learning_tests
   group: RLlib tests
   working_dir: rllib_tests
@@ -3323,8 +3350,6 @@
     wait_for_nodes:
       num_nodes: 6
 
-
-
   smoke_test:
     frequency: nightly
 
@@ -3344,7 +3369,8 @@
           timeout: 2000
       cluster:
         cluster_env: app_config.yaml
-        cluster_compute: 4gpus_544_cpus_gce.yaml 
+        cluster_compute: 4gpus_512_cpus_gce.yaml
+
 
 ########################
 # Core Nightly Tests

diff --git a/release/rllib_tests/12gpus_192cpus.yaml b/release/rllib_tests/12gpus_192cpus.yaml
diff --git a/release/rllib_tests/1gpu_16cpus_gce.yaml b/release/rllib_tests/1gpu_16cpus_gce.yaml
@@ -0,0 +1,32 @@
+cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
+region: us-west1
+allowed_azs:
+    - us-west1-b
+
+max_workers: 0
+
+head_node_type:
+    name: head_node
+    instance_type: n1-standard-16-nvidia-tesla-t4-1 # g3.4xlarge
+
+worker_node_types:
+    - name: worker_node
+      instance_type: n2-standard-4 # m5.xlarge
+      min_workers: 0
+      max_workers: 0
+      use_spot: false
+
+gcp_advanced_configurations_json:
+  instance_properties:
+    disks:
+      - boot: true
+        auto_delete: true
+        initialize_params:
+          disk_size_gb: 500
+
+#aws:
+#    BlockDeviceMappings:
+#        - DeviceName: /dev/sda1
+#          Ebs:
+#            DeleteOnTermination: true
+#            VolumeSize: 500
diff --git a/release/rllib_tests/1gpu_24cpus_gce.yaml b/release/rllib_tests/1gpu_24cpus_gce.yaml
@@ -0,0 +1,32 @@
+cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
+region: us-west1
+allowed_azs:
+    - us-west1-b
+
+max_workers: 2
+
+head_node_type:
+    name: head_node
+    instance_type: n1-standard-16-nvidia-tesla-t4-1 # g3.4xlarge
+
+worker_node_types:
+    - name: worker_node
+      instance_type: n2-standard-8 # m5.2xlarge
+      min_workers: 1
+      max_workers: 1
+      use_spot: false
+
+gcp_advanced_configurations_json:
+  instance_properties:
+    disks:
+      - boot: true
+        auto_delete: true
+        initialize_params:
+          disk_size_gb: 500
+
+#aws:
+#    BlockDeviceMappings:
+#        - DeviceName: /dev/sda1
+#          Ebs:
+#            DeleteOnTermination: true
+#            VolumeSize: 500
diff --git a/release/rllib_tests/8gpus_64cpus.yaml b/release/rllib_tests/1gpu_32cpus.yaml
@@ -1,17 +1,17 @@
 cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
 region: us-west-2
 
-max_workers: 0
+max_workers: 7
 
 head_node_type:
     name: head_node
-    instance_type: p3.16xlarge
+    instance_type: g3s.xlarge
 
 worker_node_types:
     - name: worker_node
       instance_type: m5.xlarge
-      min_workers: 0
-      max_workers: 0
+      min_workers: 7
+      max_workers: 7
       use_spot: false
 
 aws:

diff --git a/release/rllib_tests/1gpu_32cpus_gce.yaml b/release/rllib_tests/1gpu_32cpus_gce.yaml
@@ -0,0 +1,32 @@
+cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
+region: us-west1
+allowed_azs:
+    - us-west1-b
+
+max_workers: 7
+
+head_node_type:
+    name: head_node
+    instance_type: n1-highmem-4-nvidia-tesla-v100-1 # g3s.xlarge
+
+worker_node_types:
+    - name: worker_node
+      instance_type: n2-standard-4 # m5.xlarge
+      min_workers: 7
+      max_workers: 7
+      use_spot: false
+
+gcp_advanced_configurations_json:
+  instance_properties:
+    disks:
+      - boot: true
+        auto_delete: true
+        initialize_params:
+          disk_size_gb: 500
+
+#aws:
+#    BlockDeviceMappings:
+#        - DeviceName: /dev/sda1
+#          Ebs:
+#            DeleteOnTermination: true
+#            VolumeSize: 500
diff --git a/release/rllib_tests/1gpu_4cpus_gce.yaml b/release/rllib_tests/1gpu_4cpus_gce.yaml
@@ -0,0 +1,32 @@
+cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
+region: us-west1
+allowed_azs:
+    - us-west1-b
+
+max_workers: 0
+
+head_node_type:
+    name: head_node
+    instance_type: n1-standard-4-nvidia-tesla-t4-1 # p2.xlarge
+
+worker_node_types:
+    - name: worker_node
+      instance_type: n2-standard-4 # m5.xlarge
+      min_workers: 0
+      max_workers: 0
+      use_spot: false
+
+gcp_advanced_configurations_json:
+  instance_properties:
+    disks:
+      - boot: true
+        auto_delete: true
+        initialize_params:
+          disk_size_gb: 500
+
+#aws:
+#    BlockDeviceMappings:
+#        - DeviceName: /dev/sda1
+#          Ebs:
+#            DeleteOnTermination: true
+#            VolumeSize: 500
diff --git a/release/rllib_tests/2gpus_32cpus_gce.yaml b/release/rllib_tests/2gpus_32cpus_gce.yaml
@@ -0,0 +1,37 @@
+cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
+region: us-west1
+allowed_azs:
+    - us-west1-b
+
+max_workers: 0
+
+# This head node was intentionally chosen to be larger than 2GPUs (has 4GPUs).
+# This makes sure we can safely migrate the 2GPU testcases to GCE without having
+# 2GPU instances available on GCE today.
+# TODO (Artur): Move to a 2-GPU head node once one is available, or update the tests that use this config to use 4 GPUs.
+# Note: Remember to also update the worker counts in this file when doing this!
+head_node_type:
+    name: head_node
+    instance_type: n1-highmem-32-nvidia-tesla-v100-4 # g3.8xlarge
+
+worker_node_types:
+    - name: worker_node
+      instance_type: n2-standard-4 # m5.xlarge
+      min_workers: 0
+      max_workers: 0
+      use_spot: false
+
+gcp_advanced_configurations_json:
+  instance_properties:
+    disks:
+      - boot: true
+        auto_delete: true
+        initialize_params:
+          disk_size_gb: 500
+
+#aws:
+#    BlockDeviceMappings:
+#        - DeviceName: /dev/sda1
+#          Ebs:
+#            DeleteOnTermination: true
+#            VolumeSize: 500
diff --git a/release/rllib_tests/2gpus_64cpus_gce.yaml b/release/rllib_tests/2gpus_64cpus_gce.yaml
@@ -0,0 +1,36 @@
+cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
+region: us-west1
+allowed_azs:
+    - us-west1-b
+
+max_workers: 1
+
+# This head node was intentionally chosen to be larger than 2GPUs (has 4GPUs).
+# This makes sure we can safely migrate the 2GPU testcases to GCE without having
+# 2GPU instances available on GCE today.
+# TODO (Artur): Move to a 2-GPU head node once one is available, or update the tests that use this config to use 4 GPUs.
+head_node_type:
+    name: head_node
+    instance_type: n1-highmem-32-nvidia-tesla-v100-4 # g3.8xlarge
+
+worker_node_types:
+    - name: worker_node
+      instance_type: n2-standard-32 # m5.8xlarge
+      min_workers: 1
+      max_workers: 1
+      use_spot: false
+
+gcp_advanced_configurations_json:
+  instance_properties:
+    disks:
+      - boot: true
+        auto_delete: true
+        initialize_params:
+          disk_size_gb: 500
+
+#aws:
+#    BlockDeviceMappings:
+#        - DeviceName: /dev/sda1
+#          Ebs:
+#            DeleteOnTermination: true
+#            VolumeSize: 500
diff --git a/release/rllib_tests/32cpus_gce.yaml b/release/rllib_tests/32cpus_gce.yaml
@@ -0,0 +1,32 @@
+cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
+region: us-west1
+allowed_azs:
+    - us-west1-c
+
+max_workers: 0
+
+head_node_type:
+    name: head_node
+    instance_type: n2-standard-32 # m5.8xlarge
+
+worker_node_types:
+    - name: worker_node
+      instance_type: n2-standard-4 # m5.xlarge
+      min_workers: 0
+      max_workers: 0
+      use_spot: false
+
+gcp_advanced_configurations_json:
+  instance_properties:
+    disks:
+      - boot: true
+        auto_delete: true
+        initialize_params:
+          disk_size_gb: 500
+
+#aws:
+#    BlockDeviceMappings:
+#        - DeviceName: /dev/sda1
+#          Ebs:
+#            DeleteOnTermination: true
+#            VolumeSize: 500