CI: Add ROCm Docker Build #2886

Merged: 31 commits merged on Mar 18, 2024

Changes from 29 commits

Commits (31)
cfba4cd
CI: Add ROCm Docker Build
simon-mo Feb 15, 2024
d237ff6
fix name
simon-mo Feb 15, 2024
03aba6b
move
simon-mo Feb 15, 2024
e226c59
Add sanity test
simon-mo Feb 20, 2024
51d2d75
Add sanity test
simon-mo Feb 20, 2024
5a70ad7
remove only the rocm container
simon-mo Feb 20, 2024
2156f9e
Merge branch 'main' of github.com:vllm-project/vllm into rcom-ci
simon-mo Mar 8, 2024
cd910dc
migrate to buildkite
simon-mo Mar 8, 2024
1c872cb
change to shell script
simon-mo Mar 15, 2024
e1d886f
fix typo
simon-mo Mar 15, 2024
4d87cec
fix typo
simon-mo Mar 15, 2024
456b76b
fix docker command
simon-mo Mar 15, 2024
cb7989a
fix docker command
simon-mo Mar 15, 2024
7a60114
fix docker command
simon-mo Mar 15, 2024
7367304
run docker sync
simon-mo Mar 15, 2024
ff78d36
add cmd
simon-mo Mar 15, 2024
2c5277e
Merge branch 'main' of github.com:vllm-project/vllm into rcom-ci
simon-mo Mar 15, 2024
21b0dbb
add outlines
simon-mo Mar 15, 2024
fe983cc
Revert "Dynamically configure shared memory size for moe_align_block_…
simon-mo Mar 16, 2024
822bffd
add debug info
simon-mo Mar 16, 2024
0125268
use test server due to outlines issue
simon-mo Mar 16, 2024
5e3180d
actually use gpus
simon-mo Mar 16, 2024
fbc07a9
use rocm docker
simon-mo Mar 16, 2024
83447e4
use better healthcheck
simon-mo Mar 16, 2024
1117e46
use curl on host instead
simon-mo Mar 16, 2024
1d8911f
fix curl
simon-mo Mar 16, 2024
ec2d7ec
reset tests
simon-mo Mar 16, 2024
94fa91d
newline
simon-mo Mar 16, 2024
c5850ad
add comments
simon-mo Mar 16, 2024
4a88632
Revert "Revert "Dynamically configure shared memory size for moe_alig…
simon-mo Mar 18, 2024
06f9ccf
Merge branch 'main' of github.com:vllm-project/vllm into rcom-ci
simon-mo Mar 18, 2024
38 changes: 38 additions & 0 deletions .buildkite/run-amd-test.sh
@@ -0,0 +1,38 @@
# This script builds the ROCm Docker image and runs the API server inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -ex

# Print ROCm version
rocminfo

# Try building the docker image
docker build -t rocm -f Dockerfile.rocm .

# Setup cleanup
remove_docker_container() { docker rm -f rocm || true; }
trap remove_docker_container EXIT
remove_docker_container

# Run the image
docker run --device /dev/kfd --device /dev/dri --network host --name rocm rocm python3 -m vllm.entrypoints.api_server &

# Wait for the server to start
wait_for_server_to_start() {
    timeout=300
    counter=0

    while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do
        sleep 1
        counter=$((counter + 1))
        if [ $counter -ge $timeout ]; then
            echo "Timeout after $timeout seconds"
            break
        fi
    done
}
wait_for_server_to_start

# Test a simple prompt
curl -X POST -H "Content-Type: application/json" \
localhost:8000/generate \
-d '{"prompt": "San Francisco is a"}'
5 changes: 5 additions & 0 deletions .buildkite/test-template.j2
@@ -3,6 +3,11 @@
{% set default_working_dir = "/vllm-workspace/tests" %}

steps:
- label: "AMD Test"
agents:
queue: amd
command: bash .buildkite/run-amd-test.sh

- label: ":docker: build image"
commands:
- "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
42 changes: 13 additions & 29 deletions csrc/moe_align_block_size_kernels.cu
Collaborator:
Why is this file reverted?

Collaborator:
Oh, I just saw the PR description. I believe this can be easily fixed.

Collaborator:
@simon-mo Do you mind if I directly push a change to fix the kernel compilation error?

Collaborator (Author):
go ahead!

Collaborator:
Oh, just created a new PR #3470 to get reviewed.

Collaborator:
I merged #3470. Can you please rebase the PR with the current main branch?

Collaborator (Author):
done!

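For context on the thread above: the diff below swaps the kernel's dynamically sized extern shared-memory buffer (the version being reverted) back to fixed-size __shared__ arrays bounded by NUM_MAX_EXPERTS; commit 4a88632 later restores the dynamic version once the fix from #3470 landed. A minimal sketch of the two declaration styles, with illustrative names and empty kernel bodies (not the kernel's exact code), is:

#include <cstddef>
#include <cstdint>

// (a) Fixed-size shared arrays: extents must be compile-time constants, so the
//     kernel caps the supported expert count at NUM_MAX_EXPERTS.
const static size_t NUM_MAX_EXPERTS = 64;
__global__ void static_shared_sketch(int num_experts) {
    __shared__ int32_t tokens_cnts[NUM_MAX_EXPERTS + 1][NUM_MAX_EXPERTS];
    __shared__ int32_t cumsum[NUM_MAX_EXPERTS + 1];
    // ... kernel body indexes tokens_cnts / cumsum up to num_experts ...
}

// (b) Dynamically sized shared memory: one extern buffer whose byte count is
//     passed as the third launch parameter and carved into the logical tensors.
__global__ void dynamic_shared_sketch(int num_experts) {
    extern __shared__ int32_t shared_mem[];
    int32_t *tokens_cnts = shared_mem;                              // (num_experts + 1) x num_experts
    int32_t *cumsum = shared_mem + (num_experts + 1) * num_experts; // num_experts + 1
    // ... kernel body ...
}

// Host-side launch of (b); cudaFuncSetAttribute is typically needed when the
// requested size exceeds the default per-block shared-memory limit:
// dynamic_shared_sketch<<<1, num_experts, shared_mem_bytes, stream>>>(num_experts);
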
@@ -7,17 +7,10 @@
#include "cuda_compat.h"
#include "dispatch_utils.h"

const static size_t NUM_MAX_EXPERTS = 64;
#define CEILDIV(x,y) (((x) + (y) - 1) / (y))

namespace vllm {

namespace {
__device__ __forceinline__ int32_t index(int32_t total_col, int32_t row, int32_t col) {
// don't worry about overflow because num_experts is relatively small
return row * total_col + col;
}
}

template <typename scalar_t>
__global__ void moe_align_block_size_kernel(scalar_t *__restrict__ topk_ids,
int32_t *sorted_token_ids,
@@ -28,14 +21,10 @@ __global__ void moe_align_block_size_kernel(scalar_t *__restrict__ topk_ids,
size_t numel) {
const size_t tokens_per_thread = CEILDIV(numel, blockDim.x);
const size_t start_idx = threadIdx.x * tokens_per_thread;

extern __shared__ int32_t shared_mem[];

int32_t* tokens_cnts = shared_mem; // 2d tensor with shape (num_experts + 1, num_experts)
int32_t* cumsum = shared_mem + (num_experts + 1) * num_experts; // 1d tensor with shape (num_experts + 1)

__shared__ int32_t tokens_cnts[NUM_MAX_EXPERTS + 1][NUM_MAX_EXPERTS];
__shared__ int32_t cumsum[NUM_MAX_EXPERTS + 1];
for (int i = 0; i < num_experts; ++i) {
tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
tokens_cnts[threadIdx.x + 1][i] = 0;
}

/**
@@ -44,15 +33,15 @@ __global__ void moe_align_block_size_kernel(scalar_t *__restrict__ topk_ids,
* to expert expert_index.
*/
for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
++tokens_cnts[index(num_experts, threadIdx.x + 1, topk_ids[i])];
++tokens_cnts[threadIdx.x + 1][topk_ids[i]];
}

__syncthreads();

// For each expert we accumulate the token counts from the different threads.
tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0;
tokens_cnts[0][threadIdx.x] = 0;
for (int i = 1; i <= blockDim.x; ++i) {
tokens_cnts[index(num_experts, i, threadIdx.x)] += tokens_cnts[index(num_experts, i-1, threadIdx.x)];
tokens_cnts[i][threadIdx.x] += tokens_cnts[i-1][threadIdx.x];
}

__syncthreads();
@@ -61,7 +50,7 @@ __global__ void moe_align_block_size_kernel(scalar_t *__restrict__ topk_ids,
if (threadIdx.x == 0) {
cumsum[0] = 0;
for (int i = 1; i <= num_experts; ++i) {
cumsum[i] = cumsum[i-1] + CEILDIV(tokens_cnts[index(num_experts, blockDim.x, i - 1)], block_size) * block_size;
cumsum[i] = cumsum[i-1] + CEILDIV(tokens_cnts[blockDim.x][i - 1], block_size) * block_size;
}
*total_tokens_post_pad = cumsum[num_experts];
}
@@ -89,9 +78,9 @@ __global__ void moe_align_block_size_kernel(scalar_t *__restrict__ topk_ids,
* stores the indices of the tokens processed by the expert with expert_id within
* the current thread's token shard.
*/
int32_t rank_post_pad = tokens_cnts[index(num_experts, threadIdx.x, expert_id)] + cumsum[expert_id];
int32_t rank_post_pad = tokens_cnts[threadIdx.x][expert_id] + cumsum[expert_id];
sorted_token_ids[rank_post_pad] = i;
++tokens_cnts[index(num_experts, threadIdx.x, expert_id)];
++tokens_cnts[threadIdx.x][expert_id];
}
}
}
@@ -104,16 +93,11 @@ void moe_align_block_size(
torch::Tensor experts_ids,
torch::Tensor num_tokens_post_pad) {
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
assert(num_experts <= NUM_MAX_EXPERTS);
VLLM_DISPATCH_INTEGRAL_TYPES(
topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
// calc needed amount of shared mem for `tokens_cnts` and `cumsum` tensors
const int32_t shared_mem = ((num_experts + 1) * num_experts + (num_experts + 1)) * sizeof(int32_t);

// set dynamic shared mem
auto kernel = vllm::moe_align_block_size_kernel<scalar_t>;
AT_CUDA_CHECK(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem));
kernel<<<1, num_experts, shared_mem, stream>>>(
topk_ids.data_ptr<scalar_t>(),
vllm::moe_align_block_size_kernel<scalar_t><<<1, num_experts, 0, stream>>>(
topk_ids.data_ptr<scalar_t>(),
sorted_token_ids.data_ptr<int32_t>(),
experts_ids.data_ptr<int32_t>(),
num_tokens_post_pad.data_ptr<int32_t>(),
1 change: 1 addition & 0 deletions requirements-rocm.txt
@@ -11,3 +11,4 @@ fastapi
uvicorn[standard]
pydantic >= 2.0 # Required for OpenAI server.
prometheus_client >= 0.18.0
outlines == 0.0.34