[dataset][nightly-test] add pipelined ingestion/training nightly test

ray-project · Sep 24, 2021 · 7c99aae · 7c99aae
1 parent 565131a
commit 7c99aae
Show file tree

Hide file tree

Showing 8 changed files with 473 additions and 6 deletions.
diff --git a/release/.buildkite/build_pipeline.py b/release/.buildkite/build_pipeline.py
@@ -82,6 +82,8 @@ def __init__(self, name: str, retry: int = 0):
     "~/ray/release/nightly_tests/dataset/dataset_test.yaml": [
         "inference",
         "shuffle_data_loader",
+        "pipelined_training_50_gb",
+        "pipelined_ingestion_1500_gb_15_windows",
     ],
 }
 

diff --git a/release/nightly_tests/dataset/dataset_test.yaml b/release/nightly_tests/dataset/dataset_test.yaml
@@ -9,7 +9,7 @@
 
   run:
     timeout: 600
-    prepare: python wait_cluster.py
+    prepare: python wait_cluster.py 2 600
     script: python inference.py
 
 - name: shuffle_data_loader
@@ -24,3 +24,31 @@
   run:
     timeout: 1800
     script: python dataset_shuffle_data_loader.py
+
+- name: pipelined_training_50_gb  # also listed for this file in release/.buildkite/build_pipeline.py
+  owner:
+    mail: "core@anyscale.com"
+    slack: "@Chen Shen"
+
+  cluster:
+    app_config: pipelined_training_app.yaml
+    compute_template: pipelined_training_compute.yaml
+
+  run:
+    timeout: 4800  # presumably seconds — TODO confirm against the release test runner
+    prepare: python wait_cluster.py 15 1200  # presumably <node count> <timeout>; verify in wait_cluster.py
+    script: python pipelined_training.py --epochs 5
+
+- name: pipelined_ingestion_1500_gb_15_windows  # also listed for this file in release/.buildkite/build_pipeline.py
+  owner:
+    mail: "core@anyscale.com"
+    slack: "@Chen Shen"
+
+  cluster:
+    app_config: pipelined_ingestion_app.yaml
+    compute_template: pipelined_ingestion_compute.yaml
+
+  run:
+    timeout: 4800  # presumably seconds — TODO confirm against the release test runner
+    prepare: python wait_cluster.py 21 2400  # 21 = 16 memory_node + 4 gpu_node + 1 head in pipelined_ingestion_compute.yaml; arg meaning assumed — verify
+    script: python pipelined_training.py --epochs 2 --num-windows 15  --num-files 915 --debug  # same script as pipelined_training_50_gb, different flags
diff --git a/release/nightly_tests/dataset/pipelined_ingestion_app.yaml b/release/nightly_tests/dataset/pipelined_ingestion_app.yaml
@@ -0,0 +1,17 @@
+base_image: "anyscale/ray-ml:pinned-nightly-py37-gpu"
+env_vars: {}
+
+python:
+  pip_packages: []
+  conda_packages: []
+
+post_build_cmds:
+  - pip uninstall -y numpy ray || true  # "|| true" lets the build continue if the uninstall fails
+  - sudo rm -rf /home/ray/anaconda3/lib/python3.7/site-packages/numpy  # clear any leftover numpy directory before reinstalling
+  - pip install numpy || true  # failure is tolerated here as well
+  - pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }}  # template falls back to plain ray when RAY_WHEELS is not defined
+  - pip install -U git+https://github.com/ray-project/ray_shuffling_data_loader.git@add-embedding-model  # NOTE(review): named ref (looks like a branch), not a commit SHA
+  - pip install ray[default]
+  - pip install pyarrow
+  - pip install torch torchvision
+  - HOROVOD_WITH_GLOO=1 HOROVOD_WITHOUT_MPI=1 HOROVOD_WITHOUT_TENSORFLOW=1 HOROVOD_WITHOUT_MXNET=1 HOROVOD_WITH_PYTORCH=1 pip install -U git+https://github.com/horovod/horovod.git  # no ref given: installs the repo's default branch as of build time
diff --git a/release/nightly_tests/dataset/pipelined_ingestion_compute.yaml b/release/nightly_tests/dataset/pipelined_ingestion_compute.yaml
@@ -0,0 +1,29 @@
+cloud_id: cld_17WvYIBBkdgLwEUNcLeRAE
+region: us-west-2
+
+max_workers: 999  # far above the 20 workers (16 + 4) the node types below allow
+
+aws:
+    IamInstanceProfile: {"Name": "ray-autoscaler-v1"}
+    BlockDeviceMappings:
+        - DeviceName: /dev/sda1
+          Ebs:
+            VolumeSize: 500  # GiB, per the EC2 EbsBlockDevice API
+
+head_node_type:
+    name: head_node
+    instance_type: i3.8xlarge
+
+worker_node_types:
+    - name: memory_node
+      instance_type: i3.8xlarge
+      min_workers: 16
+      max_workers: 16  # min == max: fixed-size group
+      use_spot: false
+    - name: gpu_node
+      instance_type: i3.8xlarge  # same instance type as head_node and memory_node
+      min_workers: 4
+      max_workers: 4  # min == max: fixed-size group
+      use_spot: false
+      resources:
+        gpu: 4  # NOTE(review): i3.8xlarge is a storage-optimized type; presumably a logical GPU count for scheduling — confirm