
Commit 6ccc0bf

Authored by tjtanaa, pcmoritz, iAmir97, and kliuae
Merge EmbeddedLLM/vllm-rocm into vLLM main (vllm-project#1836)
Co-authored-by: Philipp Moritz <pcmoritz@gmail.com>
Co-authored-by: Amir Balwel <amoooori04@gmail.com>
Co-authored-by: root <kuanfu.liu@akirakan.com>
Co-authored-by: tjtanaa <tunjian.tan@embeddedllm.com>
Co-authored-by: kuanfu <kuanfu.liu@embeddedllm.com>
Co-authored-by: miloice <17350011+kliuae@users.noreply.github.com>
1 parent c8e7eb1 commit 6ccc0bf

29 files changed: +873 -118 lines changed

.gitignore (+4)

@@ -177,3 +177,7 @@ _build/
 # vim swap files
 *.swo
 *.swp
+
+# hip files generated by PyTorch
+*.hip
+*_hip*

Dockerfile.rocm (+62)

@@ -0,0 +1,62 @@
+FROM rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1
+
+# Install some basic utilities
+RUN apt-get update && apt-get install python3 python3-pip -y
+
+# Install some basic utilities
+RUN apt-get update && apt-get install -y \
+  curl \
+  ca-certificates \
+  sudo \
+  git \
+  bzip2 \
+  libx11-6 \
+  build-essential \
+  wget \
+  unzip \
+  nvidia-cuda-toolkit \
+  tmux \
+ && rm -rf /var/lib/apt/lists/*
+
+### Mount Point ###
+# When launching the container, mount the code directory to /app
+ARG APP_MOUNT=/app
+VOLUME [ ${APP_MOUNT} ]
+WORKDIR ${APP_MOUNT}
+
+RUN python3 -m pip install --upgrade pip
+RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
+
+ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
+ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin:
+ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib:
+ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/:
+
+# Install ROCm flash-attention
+RUN mkdir libs \
+    && cd libs \
+    && git clone https://github.com/ROCmSoftwarePlatform/flash-attention.git \
+    && cd flash-attention \
+    && git checkout 3d2b6f5 \
+    && git submodule update --init \
+    && export GPU_ARCHS=$(/opt/rocm/llvm/bin/amdgpu-offload-arch) \
+    && patch /opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/utils/hipify/hipify_python.py hipify_patch.patch \
+    && python3 setup.py install \
+    && cd ..
+
+COPY ./ /app/vllm
+
+RUN python3 -m pip install --upgrade pip
+RUN pip install xformers==0.0.22.post7 --no-deps
+
+RUN cd /app \
+    && cd vllm \
+    && pip install -U -r requirements-rocm.txt \
+    && bash patch_xformers-0.0.22.post7.rocm.sh \
+    && python3 setup.py install \
+    && cd ..
+
+RUN python3 -m pip install --upgrade pip
+RUN python3 -m pip install --no-cache-dir ray[all]
+
+CMD ["/bin/bash"]

README.md (+2)

@@ -17,6 +17,7 @@ Easy, fast, and cheap LLM serving for everyone
 ---
 
 *Latest News* 🔥
+- [2023/12] Added ROCm support to vLLM.
 - [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
 - [2023/09] We created our [Discord server](https://discord.gg/jz7wjKhh6g)! Join us to discuss vLLM and LLM serving! We will also post the latest announcements and updates there.
 - [2023/09] We released our [PagedAttention paper](https://arxiv.org/abs/2309.06180) on arXiv!
@@ -43,6 +44,7 @@ vLLM is flexible and easy to use with:
 - Tensor parallelism support for distributed inference
 - Streaming outputs
 - OpenAI-compatible API server
+- Support NVIDIA CUDA and AMD ROCm.
 
 vLLM seamlessly supports many Hugging Face models, including the following architectures:
 
csrc/activation_kernels.cu (+4 / -3)

@@ -1,6 +1,7 @@
 #include <torch/extension.h>
 #include <ATen/cuda/CUDAContext.h>
 
+#include "cuda_compat.h"
 #include "dispatch_utils.h"
 
 namespace vllm {
@@ -18,8 +19,8 @@ __global__ void silu_and_mul_kernel(
   const int d) {
   const int64_t token_idx = blockIdx.x;
   for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
-    const scalar_t x = __ldg(&input[token_idx * 2 * d + idx]);
-    const scalar_t y = __ldg(&input[token_idx * 2 * d + d + idx]);
+    const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]);
+    const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]);
     out[token_idx * d + idx] = silu(x) * y;
   }
 }
@@ -57,7 +58,7 @@ __global__ void activation_kernel(
   const int d) {
   const int64_t token_idx = blockIdx.x;
   for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
-    const scalar_t x = __ldg(&input[token_idx * d + idx]);
+    const scalar_t x = VLLM_LDG(&input[token_idx * d + idx]);
     out[token_idx * d + idx] = ACT_FN(x);
   }
 }
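Note: the VLLM_LDG wrapper comes from the new cuda_compat.h header, which is not included in this commit view. As a rough sketch of the pattern (an assumption about that header's contents, not the committed file), it keeps the __ldg read-only-cache intrinsic on CUDA and falls back to a plain load on ROCm:

// Hypothetical sketch of the VLLM_LDG macro assumed to live in csrc/cuda_compat.h
// (that header is not shown in this diff).
#ifndef USE_ROCM
  // CUDA: load through the read-only data cache.
  #define VLLM_LDG(arg) __ldg(arg)
#else
  // ROCm: a plain global-memory load.
  #define VLLM_LDG(arg) *(arg)
#endif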

csrc/attention/attention_kernels.cu (+21 / -13)

@@ -15,6 +15,10 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#ifdef USE_ROCM
+#include <hip/hip_runtime.h>
+#endif
+
 #include <torch/extension.h>
 #include <ATen/cuda/CUDAContext.h>
 
@@ -23,7 +27,11 @@
 
 #include <algorithm>
 
+#ifndef USE_ROCM
 #define WARP_SIZE 32
+#else
+#define WARP_SIZE warpSize
+#endif
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b))
@@ -40,7 +48,7 @@ inline __device__ float block_sum(float* red_smem, float sum) {
   // Compute the sum per warp.
 #pragma unroll
   for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) {
-    sum += __shfl_xor_sync(uint32_t(-1), sum, mask);
+    sum += VLLM_SHFL_XOR_SYNC(sum, mask);
   }
 
   // Warp leaders store the data to shared memory.
@@ -59,11 +67,11 @@ inline __device__ float block_sum(float* red_smem, float sum) {
   // Parallel reduction inside the warp.
 #pragma unroll
   for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) {
-    sum += __shfl_xor_sync(uint32_t(-1), sum, mask);
+    sum += VLLM_SHFL_XOR_SYNC(sum, mask);
   }
 
   // Broadcast to other threads.
-  return __shfl_sync(uint32_t(-1), sum, 0);
+  return VLLM_SHFL_SYNC(sum, 0);
 }
 
 // TODO(woosuk): Merge the last two dimensions of the grid.
@@ -223,7 +231,7 @@ __device__ void paged_attention_kernel(
   // The 0-th thread of each thread group already has its max qk value.
 #pragma unroll
   for (int mask = WARP_SIZE / 2; mask >= THREAD_GROUP_SIZE; mask /= 2) {
-    qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask));
+    qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask));
   }
   if (lane == 0) {
     red_smem[warp_idx] = qk_max;
@@ -235,10 +243,10 @@ __device__ void paged_attention_kernel(
   qk_max = lane < NUM_WARPS ? red_smem[lane] : -FLT_MAX;
 #pragma unroll
   for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) {
-    qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask));
+    qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask));
   }
   // Broadcast the max qk value to all threads.
-  qk_max = __shfl_sync(uint32_t(-1), qk_max, 0);
+  qk_max = VLLM_SHFL_SYNC(qk_max, 0);
 
   // Get the sum of the exp values.
   float exp_sum = 0.f;
@@ -326,7 +334,7 @@ __device__ void paged_attention_kernel(
     float acc = accs[i];
 #pragma unroll
     for (int mask = NUM_V_VECS_PER_ROW / 2; mask >= 1; mask /= 2) {
-      acc += __shfl_xor_sync(uint32_t(-1), acc, mask);
+      acc += VLLM_SHFL_XOR_SYNC(acc, mask);
     }
     accs[i] = acc;
   }
@@ -492,7 +500,7 @@ __global__ void paged_attention_v2_reduce_kernel(
   // Reduce within the warp.
 #pragma unroll
   for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) {
-    max_logit = fmaxf(max_logit, __shfl_xor_sync(uint32_t(-1), max_logit, mask));
+    max_logit = fmaxf(max_logit, VLLM_SHFL_XOR_SYNC(max_logit, mask));
   }
   if (lane == 0) {
     red_smem[warp_idx] = max_logit;
@@ -502,10 +510,10 @@ __global__ void paged_attention_v2_reduce_kernel(
   max_logit = lane < NUM_WARPS ? red_smem[lane] : -FLT_MAX;
 #pragma unroll
   for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) {
-    max_logit = fmaxf(max_logit, __shfl_xor_sync(uint32_t(-1), max_logit, mask));
+    max_logit = fmaxf(max_logit, VLLM_SHFL_XOR_SYNC(max_logit, mask));
   }
   // Broadcast the max value to all threads.
-  max_logit = __shfl_sync(uint32_t(-1), max_logit, 0);
+  max_logit = VLLM_SHFL_SYNC(max_logit, 0);
 
   // Load rescaled exp sums to shared memory.
   float* shared_exp_sums = reinterpret_cast<float*>(shared_mem + sizeof(float) * num_partitions);
@@ -539,9 +547,9 @@ __global__ void paged_attention_v2_reduce_kernel(
 } // namespace vllm
 
 #define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE) \
-  cudaFuncSetAttribute( \
-    vllm::paged_attention_v1_kernel<T, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS>, \
-    cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem_size); \
+  VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( \
+    ((void*)vllm::paged_attention_v1_kernel<T, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS>), \
+    shared_mem_size); \
   vllm::paged_attention_v1_kernel<T, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS> \
     <<<grid, block, shared_mem_size, stream>>>( \
       out_ptr, \
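Note: the shuffle and launch-attribute wrappers used above are also expected to come from cuda_compat.h. As a sketch of what they plausibly expand to (again an assumption, since the header is not part of this view): on CUDA they forward to the *_sync intrinsics with a full lane mask, while on ROCm they use the plain shuffle builtins, which operate on the whole wavefront. That is also why WARP_SIZE switches from a hard-coded 32 to the device-defined warpSize (typically 64 on AMD data-center GPUs).

// Hypothetical sketch of the shuffle/attribute wrappers assumed in csrc/cuda_compat.h
// (not shown in this diff).
#ifndef USE_ROCM
  #define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor_sync(uint32_t(-1), var, lane_mask)
  #define VLLM_SHFL_SYNC(var, src_lane) __shfl_sync(uint32_t(-1), var, src_lane)
  // Forward to the CUDA runtime call that the old macro invoked directly.
  #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \
    cudaFuncSetAttribute(FUNC, cudaFuncAttributeMaxDynamicSharedMemorySize, VAL)
#else
  // ROCm has no *_sync shuffle variants; the plain builtins act on the full wavefront.
  #define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor(var, lane_mask)
  #define VLLM_SHFL_SYNC(var, src_lane) __shfl(var, src_lane)
  #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \
    hipFuncSetAttribute(FUNC, hipFuncAttributeMaxDynamicSharedMemorySize, VAL)
#endif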

csrc/attention/attention_utils.cuh (+2 / -1)

@@ -17,6 +17,7 @@
  */
 #pragma once
 
+#include "../cuda_compat.h"
 #include "attention_dtypes.h"
 
 #include <float.h>
@@ -39,7 +40,7 @@ inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) {
   float qk = sum(qk_vec);
 #pragma unroll
   for (int mask = THREAD_GROUP_SIZE / 2; mask >= 1; mask /= 2) {
-    qk += __shfl_xor_sync(uint32_t(-1), qk, mask);
+    qk += VLLM_SHFL_XOR_SYNC(qk, mask);
   }
   return qk;
 }

csrc/attention/dtype_bfloat16.cuh (+16 / -3)

@@ -21,8 +21,17 @@
 #include "attention_generic.cuh"
 #include "dtype_float32.cuh"
 
-#include <cuda_bf16.h>
-#include <cuda_fp16.h>
+#ifndef USE_ROCM
+  #include <cuda_bf16.h>
+  #include <cuda_fp16.h>
+#else
+  #include <hip/hip_bf16.h>
+  #include <hip/hip_fp16.h>
+
+  typedef __hip_bfloat162 __nv_bfloat162;
+  typedef __hip_bfloat16 __nv_bfloat16;
+#endif
+
 #include <stdint.h>
 
 namespace vllm {
@@ -98,7 +107,11 @@ inline __device__ __nv_bfloat16 add(__nv_bfloat16 a, __nv_bfloat16 b) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
   assert(false);
 #else
-  return a + b;
+  #ifndef USE_ROCM
+    return a + b;
+  #else
+    return __hadd(a, b);
+  #endif
 #endif
 }
 
