From 2b27c59b24553ea8a1b49529f391ec8a587e1522 Mon Sep 17 00:00:00 2001 From: kevin Date: Tue, 18 Jun 2024 18:07:52 +0000 Subject: [PATCH 01/11] p Signed-off-by: kevin --- .buildkite/test-pipeline.yaml | 1 + .buildkite/test-template-aws.j2 | 50 ++------------------------------- 2 files changed, 4 insertions(+), 47 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 6439a315e327..8c0aea324023 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -24,6 +24,7 @@ steps: - label: Core Test mirror_hardwares: [amd] + gpu: a100 command: pytest -v -s core - label: Distributed Comm Ops Test diff --git a/.buildkite/test-template-aws.j2 b/.buildkite/test-template-aws.j2 index 01f7ff1e0e2b..d2287f692d72 100644 --- a/.buildkite/test-template-aws.j2 +++ b/.buildkite/test-template-aws.j2 @@ -2,52 +2,6 @@ {% set default_working_dir = "/vllm-workspace/tests" %} steps: - - label: ":docker: build image" - agents: - queue: cpu_queue - commands: - - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --tag {{ docker_image }} --target test --progress plain ." - - "docker push {{ docker_image }}" - env: - DOCKER_BUILDKIT: "1" - retry: - automatic: - - exit_status: -1 # Agent was lost - limit: 5 - - exit_status: -10 # Agent was lost - limit: 5 - - wait - - - group: "AMD Tests" - depends_on: ~ - steps: - {% for step in steps %} - {% if step.mirror_hardwares and "amd" in step.mirror_hardwares %} - - label: "AMD: {{ step.label }}" - agents: - queue: amd - command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" ; ")) | safe }}" - env: - DOCKER_BUILDKIT: "1" - priority: 100 - soft_fail: true - {% endif %} - {% endfor %} - - - label: "Neuron Test" - depends_on: ~ - agents: - queue: neuron - command: bash .buildkite/run-neuron-test.sh - soft_fail: false - - - label: "Intel Test" - depends_on: ~ - agents: - queue: intel - command: bash .buildkite/run-cpu-test.sh - {% for step in steps %} - label: "{{ step.label }}" agents: @@ -55,6 +9,8 @@ steps: queue: small_cpu_queue {% elif step.no_gpu %} queue: cpu_queue + {% elif step.gpu == "a100" %} + queue: a100-queue {% elif step.num_gpus == 2 or step.num_gpus == 4 %} queue: gpu_4_queue {% else %} @@ -72,7 +28,7 @@ steps: limit: 5 plugins: - docker#v5.2.0: - image: {{ docker_image }} + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:13db4369d9ab3158a01192d60c744c6523961824 always-pull: true propagate-environment: true {% if not step.no_gpu %} From b86d5d07044fcfb32c5ffcfbcb4f09276a9d4c5a Mon Sep 17 00:00:00 2001 From: kevin Date: Tue, 18 Jun 2024 19:03:10 +0000 Subject: [PATCH 02/11] p Signed-off-by: kevin --- .buildkite/test-template-aws.j2 | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.buildkite/test-template-aws.j2 b/.buildkite/test-template-aws.j2 index d2287f692d72..bfa2804f3e79 100644 --- a/.buildkite/test-template-aws.j2 +++ b/.buildkite/test-template-aws.j2 @@ -2,6 +2,12 @@ {% set default_working_dir = "/vllm-workspace/tests" %} steps: + - label: "user check" + agents: + queue: a100-queue + commands: + - whoami + - wait {% for step in steps %} - label: "{{ step.label }}" agents: From 3f47a010c4b51fb728bada759250f214b8fcfcb0 Mon Sep 17 00:00:00 2001 From: kevin Date: Tue, 18 Jun 2024 19:16:49 +0000 Subject: [PATCH 03/11] p Signed-off-by: kevin --- .buildkite/test-template-aws.j2 | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.buildkite/test-template-aws.j2 b/.buildkite/test-template-aws.j2 index bfa2804f3e79..f34a417bd580 100644 --- a/.buildkite/test-template-aws.j2 +++ b/.buildkite/test-template-aws.j2 @@ -7,6 +7,9 @@ steps: queue: a100-queue commands: - whoami + - sudo yum update -y + - sudo amazon-linux-extras install docker + - sudo service docker start - wait {% for step in steps %} - label: "{{ step.label }}" From 195a3ca2f3d58927040324115fa387a258fe82d8 Mon Sep 17 00:00:00 2001 From: kevin Date: Tue, 18 Jun 2024 19:24:22 +0000 Subject: [PATCH 04/11] p Signed-off-by: kevin --- .buildkite/test-template-aws.j2 | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.buildkite/test-template-aws.j2 b/.buildkite/test-template-aws.j2 index f34a417bd580..7c778fb42216 100644 --- a/.buildkite/test-template-aws.j2 +++ b/.buildkite/test-template-aws.j2 @@ -7,9 +7,9 @@ steps: queue: a100-queue commands: - whoami - - sudo yum update -y - - sudo amazon-linux-extras install docker - - sudo service docker start + - yum update -y + - amazon-linux-extras install docker + - service docker start - wait {% for step in steps %} - label: "{{ step.label }}" From 67bfd0ee22e1f63bd15b3ea74e20a966ce92f5aa Mon Sep 17 00:00:00 2001 From: kevin Date: Tue, 18 Jun 2024 19:32:40 +0000 Subject: [PATCH 05/11] p Signed-off-by: kevin --- .buildkite/test-template-aws.j2 | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.buildkite/test-template-aws.j2 b/.buildkite/test-template-aws.j2 index 7c778fb42216..44e46edc4c91 100644 --- a/.buildkite/test-template-aws.j2 +++ b/.buildkite/test-template-aws.j2 @@ -7,9 +7,8 @@ steps: queue: a100-queue commands: - whoami - - yum update -y - - amazon-linux-extras install docker - - service docker start + - cat /etc/os-release + - nvidia-smi - wait {% for step in steps %} - label: "{{ step.label }}" From f318df6a73078412595eca3bec71f4515297f97c Mon Sep 17 00:00:00 2001 From: kevin Date: Tue, 18 Jun 2024 21:53:51 +0000 Subject: [PATCH 06/11] p Signed-off-by: kevin --- .buildkite/test-pipeline.yaml | 1 + .buildkite/test-template-aws.j2 | 54 +++++++++++++++++++++++++++------ 2 files changed, 46 insertions(+), 9 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 8c0aea324023..130bc3f521e3 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -15,6 +15,7 @@ steps: - label: Basic Correctness Test mirror_hardwares: [amd] + gpu: a100 commands: - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py diff --git a/.buildkite/test-template-aws.j2 b/.buildkite/test-template-aws.j2 index 44e46edc4c91..e899ea927e0b 100644 --- a/.buildkite/test-template-aws.j2 +++ b/.buildkite/test-template-aws.j2 @@ -2,23 +2,58 @@ {% set default_working_dir = "/vllm-workspace/tests" %} steps: - - label: "user check" + {% for step in steps %} + {% if step.gpu == "a100" %} + - label: "{{ step.label }}" agents: queue: a100-queue - commands: - - whoami - - cat /etc/os-release - - nvidia-smi - - wait - {% for step in steps %} + soft_fail: {{ step.soft_fail or false }} + {% if step.parallelism %} + parallelism: {{ step.parallelism }} + {% endif %} + retry: + automatic: + - exit_status: -1 # Agent was lost + limit: 5 + - exit_status: -10 # Agent was lost + limit: 5 + plugins: + - kubernetes: + podSpec: + containers: + - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:13db4369d9ab3158a01192d60c744c6523961824 + command: ["bash"] + args: + - '-c' + - "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'" + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: devshm + mountPath: /dev/shm + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory + {% endif %} + {% if step.gpu != "a100" %} - label: "{{ step.label }}" agents: {% if step.label == "Documentation Build" %} queue: small_cpu_queue {% elif step.no_gpu %} queue: cpu_queue - {% elif step.gpu == "a100" %} - queue: a100-queue {% elif step.num_gpus == 2 or step.num_gpus == 4 %} queue: gpu_4_queue {% else %} @@ -54,4 +89,5 @@ steps: {% endif %} volumes: - /dev/shm:/dev/shm + {% endif %} {% endfor %} From 87c9b87e9ebd2b301e64b890bf5bddfb97709e8b Mon Sep 17 00:00:00 2001 From: kevin Date: Tue, 18 Jun 2024 22:50:03 +0000 Subject: [PATCH 07/11] p Signed-off-by: kevin --- .buildkite/test-template-aws.j2 | 50 +++++++++++++++++++++++++++++++-- 1 file changed, 48 insertions(+), 2 deletions(-) diff --git a/.buildkite/test-template-aws.j2 b/.buildkite/test-template-aws.j2 index e899ea927e0b..703159182b22 100644 --- a/.buildkite/test-template-aws.j2 +++ b/.buildkite/test-template-aws.j2 @@ -2,6 +2,52 @@ {% set default_working_dir = "/vllm-workspace/tests" %} steps: + - label: ":docker: build image" + agents: + queue: cpu_queue + commands: + - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" + - "docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --tag {{ docker_image }} --target test --progress plain ." + - "docker push {{ docker_image }}" + env: + DOCKER_BUILDKIT: "1" + retry: + automatic: + - exit_status: -1 # Agent was lost + limit: 5 + - exit_status: -10 # Agent was lost + limit: 5 + - wait + + - group: "AMD Tests" + depends_on: ~ + steps: + {% for step in steps %} + {% if step.mirror_hardwares and "amd" in step.mirror_hardwares %} + - label: "AMD: {{ step.label }}" + agents: + queue: amd + command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" ; ")) | safe }}" + env: + DOCKER_BUILDKIT: "1" + priority: 100 + soft_fail: true + {% endif %} + {% endfor %} + + - label: "Neuron Test" + depends_on: ~ + agents: + queue: neuron + command: bash .buildkite/run-neuron-test.sh + soft_fail: false + + - label: "Intel Test" + depends_on: ~ + agents: + queue: intel + command: bash .buildkite/run-cpu-test.sh + {% for step in steps %} {% if step.gpu == "a100" %} - label: "{{ step.label }}" @@ -21,7 +67,7 @@ steps: - kubernetes: podSpec: containers: - - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:13db4369d9ab3158a01192d60c744c6523961824 + - image: {{ docker_image }} command: ["bash"] args: - '-c' @@ -71,7 +117,7 @@ steps: limit: 5 plugins: - docker#v5.2.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:13db4369d9ab3158a01192d60c744c6523961824 + image: {{ docker_image }} always-pull: true propagate-environment: true {% if not step.no_gpu %} From dcb77ec37040c20ac96301d20f5f7f81379b18f0 Mon Sep 17 00:00:00 2001 From: kevin Date: Tue, 18 Jun 2024 22:50:15 +0000 Subject: [PATCH 08/11] p Signed-off-by: kevin --- .buildkite/test-pipeline.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 130bc3f521e3..6439a315e327 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -15,7 +15,6 @@ steps: - label: Basic Correctness Test mirror_hardwares: [amd] - gpu: a100 commands: - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py @@ -25,7 +24,6 @@ steps: - label: Core Test mirror_hardwares: [amd] - gpu: a100 command: pytest -v -s core - label: Distributed Comm Ops Test From 17fe660d8d3a9903cca8d1f9c947c6b5e5ee4467 Mon Sep 17 00:00:00 2001 From: kevin Date: Tue, 18 Jun 2024 22:51:16 +0000 Subject: [PATCH 09/11] p Signed-off-by: kevin --- .buildkite/test-template-aws.j2 | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.buildkite/test-template-aws.j2 b/.buildkite/test-template-aws.j2 index 703159182b22..dcb73684254f 100644 --- a/.buildkite/test-template-aws.j2 +++ b/.buildkite/test-template-aws.j2 @@ -92,8 +92,7 @@ steps: - name: devshm emptyDir: medium: Memory - {% endif %} - {% if step.gpu != "a100" %} + {% else %} - label: "{{ step.label }}" agents: {% if step.label == "Documentation Build" %} From d0fa07e4c5313363970749cec4305ef63119c367 Mon Sep 17 00:00:00 2001 From: kevin Date: Wed, 19 Jun 2024 00:14:13 +0000 Subject: [PATCH 10/11] p Signed-off-by: kevin --- .buildkite/nightly-benchmarks/benchmark-pipeline.yaml | 1 + .buildkite/test-pipeline.yaml | 4 ++++ .buildkite/test-template-aws.j2 | 1 + 3 files changed, 6 insertions(+) diff --git a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml index 8f12748b68f3..2b25c954b5c5 100644 --- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml +++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml @@ -17,6 +17,7 @@ steps: plugins: - kubernetes: podSpec: + priorityClassName: perf-benchmark containers: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT command: diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 6439a315e327..ea9901c208df 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -172,3 +172,7 @@ steps: commands: - pip install -r requirements-docs.txt - SPHINXOPTS=\"-W\" make html + +- label: A100 status + gpu: a100 + commands: nvidia-smi diff --git a/.buildkite/test-template-aws.j2 b/.buildkite/test-template-aws.j2 index dcb73684254f..08146bf4454c 100644 --- a/.buildkite/test-template-aws.j2 +++ b/.buildkite/test-template-aws.j2 @@ -66,6 +66,7 @@ steps: plugins: - kubernetes: podSpec: + priorityClassName: ci containers: - image: {{ docker_image }} command: ["bash"] From 486b423255fdc19477cbb4cf74041469cd7a5f71 Mon Sep 17 00:00:00 2001 From: kevin Date: Wed, 19 Jun 2024 01:18:19 +0000 Subject: [PATCH 11/11] p Signed-off-by: kevin --- .buildkite/test-pipeline.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index ea9901c208df..203bb9c6f53e 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -175,4 +175,5 @@ steps: - label: A100 status gpu: a100 - commands: nvidia-smi + commands: + - nvidia-smi