This repository was archived by the owner on Oct 11, 2024. It is now read-only.

Upstream sync 2024 06 08 #288

Merged
101 commits merged Jun 10, 2024
Commits (101). The file diff further below shows the changes from a single commit.
e69d23b
[Kernel] Add marlin_24 unit tests (#4901)
alexm-redhat May 19, 2024
81ec16b
[Kernel] Add flash-attn back (#4907)
WoosukKwon May 20, 2024
5500975
[Model] LLaVA model refactor (#4910)
DarkLight1337 May 20, 2024
b913d04
Remove marlin warning (#4918)
alexm-redhat May 20, 2024
683a30b
[Misc]: allow user to specify port in distributed setting (#4914)
ZwwWayne May 20, 2024
c8794c3
[Build/CI] Enabling AMD Entrypoints Test (#4834)
Alexei-V-Ivanov-AMD May 20, 2024
5b6a7b5
[Bugfix] Fix dummy weight for fp8 (#4916)
mzusman May 20, 2024
a5e66c7
[Core] Sharded State Loader download from HF (#4889)
aurickq May 20, 2024
8a78ed8
[Doc]Add documentation to benchmarking script when running TGI (#4920)
KuntaiDu May 20, 2024
6b46dcf
[Core] Fix scheduler considering "no LoRA" as "LoRA" (#4897)
Yard1 May 21, 2024
907d48a
[Model] add rope_scaling support for qwen2 (#4930)
hzhwcmhf May 21, 2024
11d6f7e
[Model] Add Phi-2 LoRA support (#4886)
Isotr0py May 21, 2024
5d98989
[Docs] Add acknowledgment for sponsors (#4925)
simon-mo May 21, 2024
58a235b
[CI/Build] Codespell ignore `build/` directory (#4945)
mgoin May 21, 2024
253d8fb
[Bugfix] Fix flag name for `max_seq_len_to_capture` (#4935)
kerthcet May 21, 2024
f744125
[Bugfix][Kernel] Add head size check for attention backend selection …
Isotr0py May 21, 2024
c1672a9
[Frontend] Dynamic RoPE scaling (#4638)
sasha0552 May 22, 2024
4b6c961
[CI/Build] Enforce style for C++ and CUDA code with `clang-format` (#…
mgoin May 22, 2024
4b74974
[misc] remove comments that were supposed to be removed (#4977)
rkooo567 May 22, 2024
39c15ee
[Kernel] Fixup for CUTLASS kernels in CUDA graphs (#4954)
tlrmchlsmth May 22, 2024
2835fc6
[Misc] Load FP8 kv-cache scaling factors from checkpoints (#4893)
comaniac May 22, 2024
3db99a6
[Model] LoRA gptbigcode implementation (#3949)
raywanb May 22, 2024
39a0a40
[Core] Eliminate parallel worker per-step task scheduling overhead (#…
njhill May 22, 2024
847ca88
[Minor] Fix small typo in llama.py: QKVParallelLinear -> Quantization…
pcmoritz May 22, 2024
c60384c
[Misc] Take user preference in attention selector (#4960)
comaniac May 22, 2024
dae5aaf
Marlin 24 prefill performance improvement (about 25% better on averag…
alexm-redhat May 23, 2024
05a4f64
[Bugfix] Update Dockerfile.cpu to fix NameError: name 'vllm_ops' is n…
LetianLee May 23, 2024
bf4c411
[Core][1/N] Support send/recv in PyNCCL Groups (#4988)
andoorve May 23, 2024
c623663
[Kernel] Initial Activation Quantization Support (#4525)
dsikka May 23, 2024
a9ca32d
[Core]: Option To Use Prompt Token Ids Inside Logits Processor (#4985)
kezouke May 23, 2024
0eb33b1
[Doc] add ccache guide in doc (#5012)
youkaichao May 23, 2024
acf362c
[Kernel] Initial Activation Quantization Support (#4525)
robertgshaw2-redhat May 24, 2024
1226d5d
[Core][Bugfix]: fix prefix caching for blockv2 (#4764)
leiwen83 May 24, 2024
29a2098
[Kernel][Backend][Model] Blocksparse flash attention kernel and Phi-3…
linxihui May 25, 2024
3fe7e52
[Misc] add logging level env var (#5045)
youkaichao May 25, 2024
8768b3f
[Dynamic Spec Decoding] Minor fix for disabling speculative decoding …
LiuXiaoxuanPKU May 25, 2024
e7e376f
[Misc] Make Serving Benchmark More User-friendly (#5044)
ywang96 May 25, 2024
67ce9ea
[Bugfix / Core] Prefix Caching Guards (merged with main) (#4846)
zhuohan123 May 27, 2024
2c59c91
[Core] Allow AQLM on Pascal (#5058)
sasha0552 May 27, 2024
9fb7b82
[Model] Add support for falcon-11B (#5069)
Isotr0py May 27, 2024
954c332
[Core] Sliding window for block manager v2 (#4545)
mmoskal May 28, 2024
9929fb2
[BugFix] Fix Embedding Models with TP>1 (#5075)
robertgshaw2-redhat May 28, 2024
b22d985
[Kernel][ROCm][AMD] Add fused_moe Triton configs for MI300X (#4951)
divakar-amd May 28, 2024
54c17a9
[Docs] Add Dropbox as sponsors (#5089)
simon-mo May 28, 2024
8c9aab4
[Core] Consolidate prompt arguments to LLM engines (#4328)
DarkLight1337 May 28, 2024
705789d
[Bugfix] Remove the last EOS token unless explicitly specified (#5077)
jsato8094 May 29, 2024
95c2a3d
[Misc] add gpu_memory_utilization arg (#5079)
pandyamarut May 29, 2024
9175890
[Core][Optimization] remove vllm-nccl (#5091)
youkaichao May 29, 2024
420c4ff
[Bugfix] Fix arguments passed to `Sequence` in stop checker test (#5092)
DarkLight1337 May 29, 2024
5bde5ba
[Core][Distributed] improve p2p access check (#4992)
youkaichao May 29, 2024
b86aa89
[Core] Cross-attention KV caching and memory-management (towards even…
afeldman-nm May 29, 2024
f63e8dd
[Doc]Replace deprecated flag in readme (#4526)
ronensc May 29, 2024
62a4fcb
[Bugfix][CI/Build] Fix test and improve code for `merge_async_iterato…
DarkLight1337 May 29, 2024
f900bcc
[Bugfix][CI/Build] Fix codespell failing to skip files in `git diff` …
DarkLight1337 May 29, 2024
6824b2f
[Core] Avoid the need to pass `None` values to `Sequence.inputs` (#5099)
DarkLight1337 May 29, 2024
623275f
[Bugfix] logprobs is not compatible with the OpenAI spec #4795 (#5031)
Etelis May 29, 2024
15dcd3e
[Bugfix / Core] Prefix Caching Guards (merged with main) (#4846)
youkaichao May 29, 2024
5763c73
[Bugfix] gptq_marlin: Ensure g_idx_sort_indices is not a Parameter (#…
alexm-redhat May 30, 2024
3a8332c
[CI/Build] Docker cleanup functionality for amd servers (#5112)
okakarpa May 30, 2024
11a5a26
[BUGFIX] [FRONTEND] Correct chat logprobs (#5029)
br3no May 30, 2024
2827c68
[Bugfix] Automatically Detect SparseML models (#5119)
robertgshaw2-redhat May 30, 2024
4ae80dd
[CI/Build] increase wheel size limit to 200 MB (#5130)
youkaichao May 30, 2024
886ead6
[Misc] remove duplicate definition of `seq_lens_tensor` in model_runn…
ita9naiwa May 30, 2024
758b903
[Doc] Use intersphinx and update entrypoints docs (#5125)
DarkLight1337 May 30, 2024
a190463
add doc about serving option on dstack (#3074)
deep-diver May 30, 2024
51cf757
Bump version to v0.4.3 (#5046)
simon-mo May 30, 2024
c72d890
[Build] Disable sm_90a in cu11 (#5141)
simon-mo May 30, 2024
cf0711b
[Bugfix] Avoid Warnings in SparseML Activation Quantization (#5120)
robertgshaw2-redhat May 31, 2024
dcaf819
[Kernel] Marlin_24: Ensure the mma.sp instruction is using the ::orde…
alexm-redhat May 31, 2024
7da3c3f
Fix cutlass sm_90a vesrion in CMakeList
simon-mo May 31, 2024
2c66f17
[Model] Support MAP-NEO model (#5081)
xingweiqu May 31, 2024
5388c64
Revert "[Kernel] Marlin_24: Ensure the mma.sp instruction is using th…
simon-mo May 31, 2024
5e9f300
[Misc]: optimize eager mode host time (#4196)
FuncSherl May 31, 2024
f329e2e
[Model] Enable FP8 QKV in MoE and refine kernel tuning script (#5039)
comaniac May 31, 2024
951e3d2
[Doc] Add checkmark for GPTBigCodeForCausalLM LoRA support (#5171)
njhill Jun 1, 2024
d349dbd
[Build] Guard against older CUDA versions when building CUTLASS 3.x k…
tlrmchlsmth Jun 1, 2024
031fd4e
format
Jun 8, 2024
9ed5f76
skip blockspase attention
Jun 9, 2024
ec71544
fix falcon
Jun 9, 2024
7381340
skip sliding window chunked prefill
Jun 9, 2024
c23ca05
skip prefix prefill
Jun 9, 2024
85512eb
skip tensorizer
Jun 9, 2024
0cea2c2
[Misc][Breaking] Change FP8 checkpoint format from act_scale -> input…
mgoin Jun 8, 2024
31147df
format
Jun 9, 2024
2256610
fix issue with internal method
Jun 9, 2024
01973f5
formatting
Jun 9, 2024
a1a659d
disabled more kernel tests that use triton
Jun 9, 2024
c50784c
updated cutlass skipping. We need cuda 12.4 in automation
Jun 9, 2024
99fa9f8
trigger kernel tests in automation
Jun 9, 2024
2ec6643
cleanup spurious setup.py change
Jun 9, 2024
0bb099c
readded the missing images
Jun 9, 2024
198f364
multilora inference
Jun 9, 2024
ec0e89a
offline inference with prefix
Jun 9, 2024
e6f1cbd
backend request func
Jun 9, 2024
ca8d74a
benchmark serving
Jun 9, 2024
5335ad9
prod monitoring readme
Jun 9, 2024
611cfed
format
Jun 9, 2024
73132a5
fix benchmark issue - internal method changed
Jun 9, 2024
7f5c715
removed skip for remote push edits
Jun 9, 2024
437912e
update internal method in benchmark throughput too
Jun 10, 2024
950981c
skip triton sampler tests
Jun 10, 2024
[CI/Build] Enforce style for C++ and CUDA code with clang-format (v…
mgoin authored and Robert Shaw committed Jun 8, 2024
commit 4b6c96163bb9749d4f84d1158970dd0535e06bda
26 changes: 26 additions & 0 deletions .clang-format
@@ -0,0 +1,26 @@
BasedOnStyle: Google
UseTab: Never
IndentWidth: 2
ColumnLimit: 80

# Force pointers to the type for C++.
DerivePointerAlignment: false
PointerAlignment: Left

# Reordering #include statements can (and currently will) introduce errors
SortIncludes: false

# Style choices
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
IndentPPDirectives: BeforeHash

IncludeCategories:
  - Regex: '^<'
    Priority: 4
  - Regex: '^"(llvm|llvm-c|clang|clang-c|mlir|mlir-c)/'
    Priority: 3
  - Regex: '^"(qoda|\.\.)/'
    Priority: 2
  - Regex: '.*'
    Priority: 1
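
For context, a minimal sketch of what these settings produce on a declaration (a simplified, hypothetical signature modeled on act_and_mul_kernel from the diff below, not a line taken from the commit):

// Old vLLM layout, kept here as a comment for comparison:
//
//   template<typename T>
//   void act_and_mul(
//     T* out,
//     const T* input,
//     const int d);
//
// Under BasedOnStyle: Google with IndentWidth: 2, ColumnLimit: 80 and
// PointerAlignment: Left, clang-format emits a space after `template`,
// binds `*` to the type, and packs arguments onto one line while they
// fit within 80 columns.
template <typename T>
void act_and_mul(T* out, const T* input, const int d);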
42 changes: 42 additions & 0 deletions .github/workflows/clang-format.yml
@@ -0,0 +1,42 @@
name: clang-format

on:
  # Trigger the workflow on push or pull request,
  # but only for the main branch
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

jobs:
  clang-format:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.11"]
    steps:
    - uses: actions/checkout@v2
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v2
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install clang-format==18.1.5
    - name: Running clang-format
      run: |
        EXCLUDES=(
            'csrc/moe/topk_softmax_kernels.cu'
            'csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu'
            'csrc/punica/bgmv/bgmv_config.h'
            'csrc/punica/bgmv/bgmv_impl.cuh'
            'csrc/punica/bgmv/vec_dtypes.cuh'
            'csrc/punica/punica_ops.cu'
            'csrc/punica/type_convert.h'
        )
        find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \
            | grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \
            | xargs clang-format --dry-run --Werror
139 changes: 64 additions & 75 deletions csrc/activation_kernels.cu
@@ -10,11 +10,11 @@
namespace vllm {

// Activation and gating kernel template.
template<typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
__global__ void act_and_mul_kernel(
scalar_t* __restrict__ out, // [..., d]
const scalar_t* __restrict__ input, // [..., 2, d]
const int d) {
scalar_t* __restrict__ out, // [..., d]
const scalar_t* __restrict__ input, // [..., 2, d]
const int d) {
const int64_t token_idx = blockIdx.x;
for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]);
@@ -23,139 +23,128 @@ __global__ void act_and_mul_kernel(
}
}

template<typename T>
template <typename T>
__device__ __forceinline__ T silu_kernel(const T& x) {
// x * sigmoid(x)
return (T) (((float) x) / (1.0f + expf((float) -x)));
return (T)(((float)x) / (1.0f + expf((float)-x)));
}

template<typename T>
template <typename T>
__device__ __forceinline__ T gelu_kernel(const T& x) {
// Equivalent to PyTorch GELU with 'none' approximation.
// Refer to:
// https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L36-L38
const float f = (float) x;
const float f = (float)x;
constexpr float ALPHA = M_SQRT1_2;
return (T) (f * 0.5f * (1.0f + ::erf(f * ALPHA)));
return (T)(f * 0.5f * (1.0f + ::erf(f * ALPHA)));
}

template<typename T>
template <typename T>
__device__ __forceinline__ T gelu_tanh_kernel(const T& x) {
// Equivalent to PyTorch GELU with 'tanh' approximation.
// Refer to:
// https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L25-L30
const float f = (float) x;
const float f = (float)x;
constexpr float BETA = M_SQRT2 * M_2_SQRTPI * 0.5f;
constexpr float KAPPA = 0.044715;
float x_cube = f * f * f;
float inner = BETA * (f + KAPPA * x_cube);
return (T) (0.5f * f * (1.0f + ::tanhf(inner)));
return (T)(0.5f * f * (1.0f + ::tanhf(inner)));
}

} // namespace vllm
} // namespace vllm

// Launch activation and gating kernel.
#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL) \
int d = input.size(-1) / 2; \
int64_t num_tokens = input.numel() / input.size(-1); \
dim3 grid(num_tokens); \
dim3 block(std::min(d, 1024)); \
const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
VLLM_DISPATCH_FLOATING_TYPES( \
input.scalar_type(), \
"act_and_mul_kernel", \
[&] { \
vllm::act_and_mul_kernel<scalar_t, KERNEL<scalar_t>><<<grid, block, 0, stream>>>( \
out.data_ptr<scalar_t>(), \
input.data_ptr<scalar_t>(), \
d); \
});

void silu_and_mul(
torch::Tensor& out, // [..., d]
torch::Tensor& input) // [..., 2 * d]
#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL) \
int d = input.size(-1) / 2; \
int64_t num_tokens = input.numel() / input.size(-1); \
dim3 grid(num_tokens); \
dim3 block(std::min(d, 1024)); \
const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
VLLM_DISPATCH_FLOATING_TYPES( \
input.scalar_type(), "act_and_mul_kernel", [&] { \
vllm::act_and_mul_kernel<scalar_t, KERNEL<scalar_t>> \
<<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(), \
input.data_ptr<scalar_t>(), d); \
});

void silu_and_mul(torch::Tensor& out, // [..., d]
torch::Tensor& input) // [..., 2 * d]
{
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel);
}

void gelu_and_mul(
torch::Tensor& out, // [..., d]
torch::Tensor& input) // [..., 2 * d]
void gelu_and_mul(torch::Tensor& out, // [..., d]
torch::Tensor& input) // [..., 2 * d]
{
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel);
}

void gelu_tanh_and_mul(
torch::Tensor& out, // [..., d]
torch::Tensor& input) // [..., 2 * d]
void gelu_tanh_and_mul(torch::Tensor& out, // [..., d]
torch::Tensor& input) // [..., 2 * d]
{
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel);
}

namespace vllm {

// Element-wise activation kernel template.
template<typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
__global__ void activation_kernel(
scalar_t* __restrict__ out, // [..., d]
const scalar_t* __restrict__ input, // [..., d]
const int d) {
scalar_t* __restrict__ out, // [..., d]
const scalar_t* __restrict__ input, // [..., d]
const int d) {
const int64_t token_idx = blockIdx.x;
for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
const scalar_t x = VLLM_LDG(&input[token_idx * d + idx]);
out[token_idx * d + idx] = ACT_FN(x);
}
}

} // namespace vllm
} // namespace vllm

// Launch element-wise activation kernel.
#define LAUNCH_ACTIVATION_KERNEL(KERNEL) \
int d = input.size(-1); \
int64_t num_tokens = input.numel() / d; \
dim3 grid(num_tokens); \
dim3 block(std::min(d, 1024)); \
const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
VLLM_DISPATCH_FLOATING_TYPES( \
input.scalar_type(), \
"activation_kernel", \
[&] { \
vllm::activation_kernel<scalar_t, KERNEL<scalar_t>><<<grid, block, 0, stream>>>( \
out.data_ptr<scalar_t>(), \
input.data_ptr<scalar_t>(), \
d); \
});
#define LAUNCH_ACTIVATION_KERNEL(KERNEL) \
int d = input.size(-1); \
int64_t num_tokens = input.numel() / d; \
dim3 grid(num_tokens); \
dim3 block(std::min(d, 1024)); \
const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "activation_kernel", [&] { \
vllm::activation_kernel<scalar_t, KERNEL<scalar_t>> \
<<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(), \
input.data_ptr<scalar_t>(), d); \
});

namespace vllm {

template<typename T>
template <typename T>
__device__ __forceinline__ T gelu_new_kernel(const T& x) {
const float x3 = (float) (x * x * x);
const T t = (T) tanhf((T) (0.79788456f * (float) (x + (T) (0.044715f * x3))));
return ((T) 0.5) * x * (((T) 1.0) + t);
const float x3 = (float)(x * x * x);
const T t = (T)tanhf((T)(0.79788456f * (float)(x + (T)(0.044715f * x3))));
return ((T)0.5) * x * (((T)1.0) + t);
}

template<typename T>
template <typename T>
__device__ __forceinline__ T gelu_fast_kernel(const T& x) {
const float f = (float) x;
const T t = (T) tanhf(((T) (f * 0.79788456f)) * (((T) 1.0) + (T) (0.044715f * f) * x));
return ((T) 0.5) * x * (((T) 1.0) + t);
const float f = (float)x;
const T t =
(T)tanhf(((T)(f * 0.79788456f)) * (((T)1.0) + (T)(0.044715f * f) * x));
return ((T)0.5) * x * (((T)1.0) + t);
}

} // namespace vllm
} // namespace vllm

void gelu_new(
torch::Tensor& out, // [..., d]
torch::Tensor& input) // [..., d]
void gelu_new(torch::Tensor& out, // [..., d]
torch::Tensor& input) // [..., d]
{
LAUNCH_ACTIVATION_KERNEL(vllm::gelu_new_kernel);
}

void gelu_fast(
torch::Tensor& out, // [..., d]
torch::Tensor& input) // [..., d]
void gelu_fast(torch::Tensor& out, // [..., d]
torch::Tensor& input) // [..., d]
{
LAUNCH_ACTIVATION_KERNEL(vllm::gelu_fast_kernel);
}
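
The activation_kernels.cu changes above are formatting-only. For readers skimming the diff, here is a hedged CPU reference (not part of the commit) of what the silu_and_mul path computes, assuming the usual vLLM semantics: each input row holds 2*d values, the first d go through SiLU and are multiplied element-wise by the second d.

#include <cmath>
#include <vector>

// Plain-C++ sketch of silu_and_mul for one batch of tokens; the CUDA kernel
// above parallelizes the same arithmetic across blocks (tokens) and
// threads (elements).
inline float silu(float x) { return x / (1.0f + std::exp(-x)); }

std::vector<float> silu_and_mul_ref(const std::vector<float>& input,
                                    int num_tokens, int d) {
  std::vector<float> out(static_cast<size_t>(num_tokens) * d);
  for (int t = 0; t < num_tokens; ++t) {
    for (int i = 0; i < d; ++i) {
      const float x = input[t * 2 * d + i];      // first half of the row
      const float y = input[t * 2 * d + d + i];  // second half of the row
      out[t * d + i] = silu(x) * y;
    }
  }
  return out;
}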
19 changes: 10 additions & 9 deletions csrc/attention/attention_generic.cuh
@@ -1,5 +1,6 @@
/*
* Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
* Adapted from
* https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
* Copyright (c) 2023, The vLLM team.
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
@@ -22,31 +23,31 @@
namespace vllm {

// A vector type to store Q, K, V elements.
template<typename T, int VEC_SIZE>
template <typename T, int VEC_SIZE>
struct Vec {};

// A vector type to store FP32 accumulators.
template<typename T>
template <typename T>
struct FloatVec {};

// Template vector operations.
template<typename Acc, typename A, typename B>
template <typename Acc, typename A, typename B>
inline __device__ Acc mul(A a, B b);

template<typename T>
template <typename T>
inline __device__ float sum(T v);

template<typename T>
template <typename T>
inline __device__ float dot(T a, T b) {
return sum(mul<T, T, T>(a, b));
}

template<typename A, typename T>
template <typename A, typename T>
inline __device__ float dot(T a, T b) {
return sum(mul<A, T, T>(a, b));
}

template<typename T>
template <typename T>
inline __device__ void zero(T& dst) {
constexpr int WORDS = sizeof(T) / 4;
union {
@@ -61,4 +62,4 @@ inline __device__ void zero(T& dst) {
dst = tmp.raw;
}

} // namespace vllm
} // namespace vllm
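
attention_generic.cuh only declares the generic Vec, FloatVec, mul, sum and dot templates; concrete dtypes specialize them in sibling headers (the dtype_*.cuh files under csrc/attention/). As a hypothetical illustration of how a specialization plugs into the declarations above, and not code from this commit:

#include <cuda_runtime.h>  // float2, make_float2

// Sketch of a float specialization, assuming the declarations from
// attention_generic.cuh above are in scope.
namespace vllm {

// Two packed Q/K/V elements carried as a CUDA float2.
template <>
struct Vec<float, 2> {
  using Type = float2;
};

// FP32 accumulator type for that vector width.
template <>
struct FloatVec<float2> {
  using Type = float2;
};

// Element-wise multiply; dot() combines it with sum() below.
template <>
inline __device__ float2 mul<float2, float2>(float2 a, float2 b) {
  return make_float2(a.x * b.x, a.y * b.y);
}

// Horizontal reduction of a float2 to a scalar.
template <>
inline __device__ float sum(float2 v) {
  return v.x + v.y;
}

}  // namespace vllm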