Skip to content

Commit

Permalink
[dataset][nightly-test] add pipelined ingestion/training nightly test
Browse files Browse the repository at this point in the history
  • Loading branch information
scv119 authored Sep 24, 2021
1 parent 565131a commit 7c99aae
Show file tree
Hide file tree
Showing 8 changed files with 473 additions and 6 deletions.
2 changes: 2 additions & 0 deletions release/.buildkite/build_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,8 @@ def __init__(self, name: str, retry: int = 0):
"~/ray/release/nightly_tests/dataset/dataset_test.yaml": [
"inference",
"shuffle_data_loader",
"pipelined_training_50_gb",
"pipelined_ingestion_1500_gb_15_windows",
],
}

Expand Down
30 changes: 29 additions & 1 deletion release/nightly_tests/dataset/dataset_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

run:
timeout: 600
prepare: python wait_cluster.py
prepare: python wait_cluster.py 2 600
script: python inference.py

- name: shuffle_data_loader
Expand All @@ -24,3 +24,31 @@
run:
timeout: 1800
script: python dataset_shuffle_data_loader.py

# Nightly test: pipelined training run (test name says 50 GB).
# Cluster is built from the pipelined_training app config / compute template.
- name: pipelined_training_50_gb
  owner:
    mail: 'core@anyscale.com'
    slack: '@Chen Shen'

  cluster:
    app_config: pipelined_training_app.yaml
    compute_template: pipelined_training_compute.yaml

  run:
    # NOTE(review): timeout is presumably in seconds - confirm with the runner.
    timeout: 4800
    # Runs before `script`. The two arguments are presumably
    # <node count> <wait timeout> - confirm against wait_cluster.py.
    prepare: python wait_cluster.py 15 1200
    script: python pipelined_training.py --epochs 5

# Nightly test: pipelined ingestion (test name says 1500 GB, 15 windows).
# Reuses pipelined_training.py with --debug and explicit window/file counts.
- name: pipelined_ingestion_1500_gb_15_windows
  owner:
    mail: 'core@anyscale.com'
    slack: '@Chen Shen'

  cluster:
    app_config: pipelined_ingestion_app.yaml
    compute_template: pipelined_ingestion_compute.yaml

  run:
    # NOTE(review): timeout is presumably in seconds - confirm with the runner.
    timeout: 4800
    # If the first argument is a node count, 21 matches the compute template:
    # 1 head + 16 memory_node + 4 gpu_node - confirm against wait_cluster.py.
    prepare: python wait_cluster.py 21 2400
    script: python pipelined_training.py --epochs 2 --num-windows 15 --num-files 915 --debug
17 changes: 17 additions & 0 deletions release/nightly_tests/dataset/pipelined_ingestion_app.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# App config (image + post-build steps) for the pipelined ingestion nightly test.
base_image: 'anyscale/ray-ml:pinned-nightly-py37-gpu'
env_vars: {}

# No extra pip/conda packages are declared here; everything is installed
# by the post-build commands below.
python:
  pip_packages: []
  conda_packages: []

post_build_cmds:
  # Remove the image's numpy and ray, then reinstall numpy.
  # `|| true` lets the build continue if a step fails.
  - pip uninstall -y numpy ray || true
  - sudo rm -rf /home/ray/anaconda3/lib/python3.7/site-packages/numpy
  - pip install numpy || true
  # Installs the wheel named by the RAY_WHEELS env var, falling back to "ray".
  - pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }}
  # Data loader installed from the `add-embedding-model` git ref.
  - pip install -U git+https://github.com/ray-project/ray_shuffling_data_loader.git@add-embedding-model
  - pip install ray[default]
  - pip install pyarrow
  - pip install torch torchvision
  # Horovod built from git with Gloo + PyTorch enabled; MPI, TensorFlow and MXNet disabled.
  - HOROVOD_WITH_GLOO=1 HOROVOD_WITHOUT_MPI=1 HOROVOD_WITHOUT_TENSORFLOW=1 HOROVOD_WITHOUT_MXNET=1 HOROVOD_WITH_PYTORCH=1 pip install -U git+https://github.com/horovod/horovod.git
29 changes: 29 additions & 0 deletions release/nightly_tests/dataset/pipelined_ingestion_compute.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Compute template for the pipelined ingestion nightly test:
# one head node plus two fixed-size worker groups (16 + 4 nodes).
cloud_id: cld_17WvYIBBkdgLwEUNcLeRAE
region: us-west-2

# NOTE(review): presumably a cluster-wide cap; the worker groups below
# each pin min_workers == max_workers, so they do not scale.
max_workers: 999

aws:
  IamInstanceProfile:
    Name: ray-autoscaler-v1
  BlockDeviceMappings:
    - DeviceName: /dev/sda1
      Ebs:
        VolumeSize: 500

head_node_type:
  name: head_node
  instance_type: i3.8xlarge

worker_node_types:
  # 16 on-demand (use_spot: false) nodes.
  - name: memory_node
    instance_type: i3.8xlarge
    min_workers: 16
    max_workers: 16
    use_spot: false
  # 4 on-demand nodes that declare a `gpu: 4` resource.
  # NOTE(review): same i3.8xlarge type as the other groups - confirm the
  # `gpu` entry is meant as a logical resource label, not physical GPUs.
  - name: gpu_node
    instance_type: i3.8xlarge
    min_workers: 4
    max_workers: 4
    use_spot: false
    resources:
      gpu: 4
Loading

0 comments on commit 7c99aae

Please sign in to comment.