Skip to content

Commit

Permalink
[CI][GCE][RLlib] Add GCE variations to RLlib release tests (#34080)
Browse files Browse the repository at this point in the history
* Update cluster compute yamls
* Add tests to release_tests.yaml
* Update region and allowed azs to fit GCE

Signed-off-by: Artur Niederfahrenhorst <artur@anyscale.com>
  • Loading branch information
ArturNiederfahrenhorst authored Apr 5, 2023
1 parent fb441ce commit 3d335e1
Show file tree
Hide file tree
Showing 15 changed files with 323 additions and 38 deletions.
2 changes: 1 addition & 1 deletion release/long_running_tests/tpl_cpu_3_gce.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west1
allowed_azs:
allowed_azs:
- us-west1-c

max_workers: 2
Expand Down
36 changes: 31 additions & 5 deletions release/release_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1707,21 +1707,21 @@
wait_for_nodes:
num_nodes: 3


smoke_test:
frequency: nightly

run:
timeout: 3600


alert: long_running_tests

variations:
- __suffix__: aws
- __suffix__: gce
env: gce
frequency: manual
smoke_test:
smoke_test:
frequency: manual
run:
timeout: 3600
Expand Down Expand Up @@ -2820,6 +2820,15 @@

alert: default

variations:
- __suffix__: aws
- __suffix__: gce
env: gce
frequency: manual
cluster:
cluster_env: app_config.yaml
cluster_compute: 2gpus_32cpus_gce.yaml

- name: rllib_learning_tests_appo_torch
group: RLlib tests
working_dir: rllib_tests
Expand Down Expand Up @@ -3142,6 +3151,15 @@

alert: default

variations:
- __suffix__: aws
- __suffix__: gce
env: gce
frequency: manual
cluster:
cluster_env: app_config.yaml
cluster_compute: 2gpus_32cpus_gce.yaml

- name: rllib_learning_tests_sac_tf
group: RLlib tests
working_dir: rllib_tests
Expand Down Expand Up @@ -3287,6 +3305,15 @@

alert: default

variations:
- __suffix__: aws
- __suffix__: gce
env: gce
frequency: manual
cluster:
cluster_env: app_config.yaml
cluster_compute: 8gpus_96cpus_gce.yaml

- name: rllib_multi_gpu_with_attention_learning_tests
group: RLlib tests
working_dir: rllib_tests
Expand Down Expand Up @@ -3323,8 +3350,6 @@
wait_for_nodes:
num_nodes: 6



smoke_test:
frequency: nightly

Expand All @@ -3344,7 +3369,8 @@
timeout: 2000
cluster:
cluster_env: app_config.yaml
cluster_compute: 4gpus_544_cpus_gce.yaml
cluster_compute: 4gpus_512_cpus_gce.yaml


########################
# Core Nightly Tests
Expand Down
22 changes: 0 additions & 22 deletions release/rllib_tests/12gpus_192cpus.yaml

This file was deleted.

32 changes: 32 additions & 0 deletions release/rllib_tests/1gpu_16cpus_gce.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Cluster compute template for RLlib release tests on GCE:
# a single GPU head node (n1-standard-16 with one T4) and no worker nodes.
# Trailing comments on instance_type lines appear to name the AWS instance type
# that each GCE type stands in for.

# Template placeholder; presumably rendered before the file is parsed as YAML.
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west1
allowed_azs:
  - us-west1-b

# Cluster-wide cap on worker nodes.
max_workers: 0

head_node_type:
  name: head_node
  instance_type: n1-standard-16-nvidia-tesla-t4-1  # g3.4xlarge

worker_node_types:
  # A worker type is declared, but min/max of 0 means none are started.
  - name: worker_node
    instance_type: n2-standard-4  # m5.xlarge
    min_workers: 0
    max_workers: 0
    use_spot: false

gcp_advanced_configurations_json:
  instance_properties:
    disks:
      # 500 GB boot disk with auto_delete enabled.
      - boot: true
        auto_delete: true
        initialize_params:
          disk_size_gb: 500

# Commented-out AWS settings, apparently the counterpart of the disk block above.
#aws:
#  BlockDeviceMappings:
#    - DeviceName: /dev/sda1
#      Ebs:
#        DeleteOnTermination: true
#        VolumeSize: 500
32 changes: 32 additions & 0 deletions release/rllib_tests/1gpu_24cpus_gce.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Cluster compute template for RLlib release tests on GCE:
# one GPU head node (n1-standard-16 with one T4) plus exactly one
# n2-standard-8 worker node.
# Trailing comments on instance_type lines appear to name the AWS instance type
# that each GCE type stands in for.

# Template placeholder; presumably rendered before the file is parsed as YAML.
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west1
allowed_azs:
  - us-west1-b

# Cluster-wide cap on worker nodes.
# NOTE(review): this is 2 while the only worker type below is pinned to 1;
# confirm the higher cluster-wide value is intended.
max_workers: 2

head_node_type:
  name: head_node
  instance_type: n1-standard-16-nvidia-tesla-t4-1  # g3.4xlarge

worker_node_types:
  # min_workers == max_workers, so the worker count is fixed at 1.
  - name: worker_node
    instance_type: n2-standard-8  # m5.2xlarge
    min_workers: 1
    max_workers: 1
    use_spot: false

gcp_advanced_configurations_json:
  instance_properties:
    disks:
      # 500 GB boot disk with auto_delete enabled.
      - boot: true
        auto_delete: true
        initialize_params:
          disk_size_gb: 500

# Commented-out AWS settings, apparently the counterpart of the disk block above.
#aws:
#  BlockDeviceMappings:
#    - DeviceName: /dev/sda1
#      Ebs:
#        DeleteOnTermination: true
#        VolumeSize: 500
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west-2

max_workers: 0
max_workers: 7

head_node_type:
name: head_node
instance_type: p3.16xlarge
instance_type: g3s.xlarge

worker_node_types:
- name: worker_node
instance_type: m5.xlarge
min_workers: 0
max_workers: 0
min_workers: 7
max_workers: 7
use_spot: false

aws:
Expand Down
32 changes: 32 additions & 0 deletions release/rllib_tests/1gpu_32cpus_gce.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Cluster compute template for RLlib release tests on GCE:
# one GPU head node (n1-highmem-4 with one V100) plus seven n2-standard-4
# worker nodes.
# Trailing comments on instance_type lines appear to name the AWS instance type
# that each GCE type stands in for.

# Template placeholder; presumably rendered before the file is parsed as YAML.
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west1
allowed_azs:
  - us-west1-b

# Cluster-wide cap on worker nodes; matches the worker type's fixed count.
max_workers: 7

head_node_type:
  name: head_node
  instance_type: n1-highmem-4-nvidia-tesla-v100-1  # g3s.xlarge

worker_node_types:
  # min_workers == max_workers, so the worker count is fixed at 7.
  - name: worker_node
    instance_type: n2-standard-4  # m5.xlarge
    min_workers: 7
    max_workers: 7
    use_spot: false

gcp_advanced_configurations_json:
  instance_properties:
    disks:
      # 500 GB boot disk with auto_delete enabled.
      - boot: true
        auto_delete: true
        initialize_params:
          disk_size_gb: 500

# Commented-out AWS settings, apparently the counterpart of the disk block above.
#aws:
#  BlockDeviceMappings:
#    - DeviceName: /dev/sda1
#      Ebs:
#        DeleteOnTermination: true
#        VolumeSize: 500
32 changes: 32 additions & 0 deletions release/rllib_tests/1gpu_4cpus_gce.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Cluster compute template for RLlib release tests on GCE:
# a single GPU head node (n1-standard-4 with one T4) and no worker nodes.
# Trailing comments on instance_type lines appear to name the AWS instance type
# that each GCE type stands in for.

# Template placeholder; presumably rendered before the file is parsed as YAML.
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west1
allowed_azs:
  - us-west1-b

# Cluster-wide cap on worker nodes.
max_workers: 0

head_node_type:
  name: head_node
  instance_type: n1-standard-4-nvidia-tesla-t4-1  # p2.xlarge

worker_node_types:
  # A worker type is declared, but min/max of 0 means none are started.
  - name: worker_node
    instance_type: n2-standard-4  # m5.xlarge
    min_workers: 0
    max_workers: 0
    use_spot: false

gcp_advanced_configurations_json:
  instance_properties:
    disks:
      # 500 GB boot disk with auto_delete enabled.
      - boot: true
        auto_delete: true
        initialize_params:
          disk_size_gb: 500

# Commented-out AWS settings, apparently the counterpart of the disk block above.
#aws:
#  BlockDeviceMappings:
#    - DeviceName: /dev/sda1
#      Ebs:
#        DeleteOnTermination: true
#        VolumeSize: 500
37 changes: 37 additions & 0 deletions release/rllib_tests/2gpus_32cpus_gce.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Cluster compute template for RLlib release tests on GCE:
# a single GPU head node (n1-highmem-32 with four V100s) and no worker nodes.
# Trailing comments on instance_type lines appear to name the AWS instance type
# that each GCE type stands in for.

# Template placeholder; presumably rendered before the file is parsed as YAML.
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west1
allowed_azs:
  - us-west1-b

# Cluster-wide cap on worker nodes.
max_workers: 0

# The head node is deliberately larger than the 2 GPUs in the file name: it
# has 4 GPUs. This lets the 2-GPU test cases be migrated to GCE safely even
# though no 2-GPU instance type is available on GCE today.
# TODO (Artur): Move to a 2-GPU head node once one is available, or possibly
# make the tests that use this file use 4 GPUs.
# Note: Remember to also update the worker counts in this file when doing this!
head_node_type:
  name: head_node
  instance_type: n1-highmem-32-nvidia-tesla-v100-4  # g3.8xlarge

worker_node_types:
  # A worker type is declared, but min/max of 0 means none are started.
  - name: worker_node
    instance_type: n2-standard-4  # m5.xlarge
    min_workers: 0
    max_workers: 0
    use_spot: false

gcp_advanced_configurations_json:
  instance_properties:
    disks:
      # 500 GB boot disk with auto_delete enabled.
      - boot: true
        auto_delete: true
        initialize_params:
          disk_size_gb: 500

# Commented-out AWS settings, apparently the counterpart of the disk block above.
#aws:
#  BlockDeviceMappings:
#    - DeviceName: /dev/sda1
#      Ebs:
#        DeleteOnTermination: true
#        VolumeSize: 500
36 changes: 36 additions & 0 deletions release/rllib_tests/2gpus_64cpus_gce.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Cluster compute template for RLlib release tests on GCE:
# one GPU head node (n1-highmem-32 with four V100s) plus exactly one
# n2-standard-32 worker node.
# Trailing comments on instance_type lines appear to name the AWS instance type
# that each GCE type stands in for.

# Template placeholder; presumably rendered before the file is parsed as YAML.
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west1
allowed_azs:
  - us-west1-b

# Cluster-wide cap on worker nodes; matches the worker type's fixed count.
max_workers: 1

# The head node is deliberately larger than the 2 GPUs in the file name: it
# has 4 GPUs. This lets the 2-GPU test cases be migrated to GCE safely even
# though no 2-GPU instance type is available on GCE today.
# TODO (Artur): Move to a 2-GPU head node once one is available, or possibly
# make the tests that use this file use 4 GPUs.
head_node_type:
  name: head_node
  instance_type: n1-highmem-32-nvidia-tesla-v100-4  # g3.8xlarge

worker_node_types:
  # min_workers == max_workers, so the worker count is fixed at 1.
  - name: worker_node
    instance_type: n2-standard-32  # m5.8xlarge
    min_workers: 1
    max_workers: 1
    use_spot: false

gcp_advanced_configurations_json:
  instance_properties:
    disks:
      # 500 GB boot disk with auto_delete enabled.
      - boot: true
        auto_delete: true
        initialize_params:
          disk_size_gb: 500

# Commented-out AWS settings, apparently the counterpart of the disk block above.
#aws:
#  BlockDeviceMappings:
#    - DeviceName: /dev/sda1
#      Ebs:
#        DeleteOnTermination: true
#        VolumeSize: 500
32 changes: 32 additions & 0 deletions release/rllib_tests/32cpus_gce.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Cluster compute template for RLlib release tests on GCE:
# a single CPU-only head node (n2-standard-32) and no worker nodes.
# Trailing comments on instance_type lines appear to name the AWS instance type
# that each GCE type stands in for.

# Template placeholder; presumably rendered before the file is parsed as YAML.
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west1
allowed_azs:
  - us-west1-c

# Cluster-wide cap on worker nodes.
max_workers: 0

head_node_type:
  name: head_node
  instance_type: n2-standard-32  # m5.8xlarge

worker_node_types:
  # A worker type is declared, but min/max of 0 means none are started.
  - name: worker_node
    instance_type: n2-standard-4  # m5.xlarge
    min_workers: 0
    max_workers: 0
    use_spot: false

gcp_advanced_configurations_json:
  instance_properties:
    disks:
      # 500 GB boot disk with auto_delete enabled.
      - boot: true
        auto_delete: true
        initialize_params:
          disk_size_gb: 500

# Commented-out AWS settings, apparently the counterpart of the disk block above.
#aws:
#  BlockDeviceMappings:
#    - DeviceName: /dev/sda1
#      Ebs:
#        DeleteOnTermination: true
#        VolumeSize: 500
Loading

0 comments on commit 3d335e1

Please sign in to comment.