This repository was archived by the owner on Oct 11, 2024. It is now read-only.

Commit 7bbd2cc

Authored by andy-neuma, ronensc, Maxusmusti, esmeetu, WoosukKwon, and others
upstream merge sync 2024-03-11 (#108)
SUMMARY:
* upstream merge (sync) up to `657061fdced8a33a60c1b09f5da2525de9da8f03`
* some minor changes related to `ruff` and `yapf`

NOTES: we are now consistently getting out-of-memory errors when running `tests/models/test_marlin.py`. I've disabled the test and created an ASANA ticket to track the issue.

TEST PLAN: runs on remote push

---------

Signed-off-by: Tao He <sighingnow@gmail.com>
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
Co-authored-by: Ronen Schaffer <ronen.schaffer@ibm.com>
Co-authored-by: Mustafa Eyceoz <maxusmusti@gmail.com>
Co-authored-by: Roy <jasonailu87@gmail.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: Massimiliano Pronesti <massimiliano.pronesti@gmail.com>
Co-authored-by: 44670 <44670@users.noreply.github.com>
Co-authored-by: zhaoyang-star <zhaoyangstar@foxmail.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Jared Moore <27744679+jlcmoore@users.noreply.github.com>
Co-authored-by: Philipp Moritz <pcmoritz@gmail.com>
Co-authored-by: Cade Daniel <edacih@gmail.com>
Co-authored-by: 张大成 <1345739055@qq.com>
Co-authored-by: zhangdacheng <zhangdacheng@ainirobot.com>
Co-authored-by: Jingru <niejingru@hotmail.com>
Co-authored-by: Dylan Hawk <51147702+dylanwhawk@users.noreply.github.com>
Co-authored-by: Tao He <sighingnow@gmail.com>
Co-authored-by: Ganesh Jagadeesan <ganesh.jcs@gmail.com>
Co-authored-by: Allen.Dou <allen.dou@hotmail.com>
Co-authored-by: Liangfu Chen <liangfc@amazon.com>
Co-authored-by: CHU Tianxiang <tianxiang.ctx@alibaba-inc.com>
Co-authored-by: Jae-Won Chung <jwnchung@umich.edu>
Co-authored-by: Seonghyeon <seonghyeon.drew@gmail.com>
Co-authored-by: Billy Cao <aliencaocao@gmail.com>
Co-authored-by: Nick Hill <nickhill@us.ibm.com>
Co-authored-by: felixzhu555 <79335195+felixzhu555@users.noreply.github.com>
Co-authored-by: br3no <breno@veltefaria.de>
Co-authored-by: simon-mo <simon.mo@hey.com>
Co-authored-by: Sherry <503147114@qq.com>
Co-authored-by: Yuan Tang <terrytangyuan@gmail.com>
Co-authored-by: Huarong <huohuarong@gmail.com>
Co-authored-by: huohuarong <huohuarong@zuoshouyisheng.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Co-authored-by: Robert Shaw <114415538+rib-2@users.noreply.github.com>
Co-authored-by: alexm <alexm@neuralmagic.com>
Co-authored-by: zixiao <shunli.dsl@alibaba-inc.com>
Co-authored-by: cloudhan <cloudhan@outlook.com>
Co-authored-by: Sage Moore <sagemoore@utexas.edu>
Co-authored-by: ElizaWszola <eliza@neuralmagic.com>
Co-authored-by: Michael Goin <michael@neuralmagic.com>
Co-authored-by: Jason Cox <jason@jasonacox.com>
Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
Co-authored-by: TianYu GUO <guoty9@mail2.sysu.edu.cn>
Co-authored-by: Jialun Lyu <43287111+pian13131@users.noreply.github.com>
Co-authored-by: ttbachyinsda <ttbachyinsda@outlook.com>
Co-authored-by: guofangze <guofangze@kuaishou.com>
Co-authored-by: Antoni Baum <antoni.baum@protonmail.com>
Co-authored-by: Avnish Narayan <avnish@anyscale.com>
Co-authored-by: Chen Wang <Chen.Wang1@ibm.com>
Co-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com>
Co-authored-by: lcskrishna <lollachaitanya@gmail.com>
Co-authored-by: SangBin Cho <rkooo567@gmail.com>
Co-authored-by: Chujie Zheng <chujiezhengchn@gmail.com>
Co-authored-by: TechxGenus <jianghao0728@mail.ustc.edu.cn>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: jacobthebanana <50071502+jacobthebanana@users.noreply.github.com>
Co-authored-by: whyiug <whyiug@hotmail.com>
Co-authored-by: Terry <149540247+tterrysun@users.noreply.github.com>
Co-authored-by: Douglas Lehr <91553416+dllehr-amd@users.noreply.github.com>
Co-authored-by: andy-neuma <andy@neuralmagic.com>

1 parent aebf20b commit 7bbd2cc

126 files changed: +5123 -813 lines


.buildkite/test-pipeline.yaml (+7 -1)

@@ -13,6 +13,9 @@ steps:
 
 - label: Basic Correctness Test
   command: pytest -v -s --forked basic_correctness
+
+- label: Core Test
+  command: pytest -v -s core
 
 - label: Distributed Comm Ops Test
   command: pytest -v -s --forked test_comm_ops.py
@@ -25,7 +28,7 @@ steps:
   num_gpus: 2 # only support 1 or 2 for now.
 
 - label: Engine Test
-  command: pytest -v -s engine
+  command: pytest -v -s engine test_sequence.py
 
 - label: Entrypoints Test
   command: pytest -v -s entrypoints
@@ -49,6 +52,9 @@ steps:
 - label: Worker Test
   command: pytest -v -s worker
 
+- label: Speculative decoding tests
+  command: pytest -v -s spec_decode
+
 - label: LoRA Test
   command: pytest -v -s lora --forked
 

.github/workflows/remote-push.yml (+1 -1)

@@ -21,7 +21,7 @@ jobs:
     uses: ./.github/workflows/build-test.yml
     with:
       label: aws-avx2-192G-4-a10g-96G
-      timeout: 180
+      timeout: 240
       gitref: '${{ github.ref }}'
       Gi_per_thread: 4
       python: ${{ matrix.python }}

Dockerfile.rocm (+25 -5)

@@ -23,6 +23,9 @@ RUN echo "FA_BRANCH is $FA_BRANCH"
 # In that case, we need to use the python reference attention implementation in vllm
 ARG BUILD_FA="1"
 
+# whether to build cupy on rocm
+ARG BUILD_CUPY="1"
+
 # Install some basic utilities
 RUN apt-get update && apt-get install python3 python3-pip -y
 
@@ -70,16 +73,33 @@ RUN if [ "$BUILD_FA" = "1" ]; then \
     && cd ..; \
     fi
 
-COPY ./ /app/vllm
-
-RUN python3 -m pip install --upgrade pip
-RUN python3 -m pip install xformers==0.0.23 --no-deps
-
 # Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt.
 # Manually removed it so that later steps of numpy upgrade can continue
 RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \
     rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi
 
+# build cupy
+RUN if [ "$BUILD_CUPY" = "1" ]; then \
+    mkdir -p libs \
+    && cd libs \
+    && git clone -b hipgraph_enablement --recursive https://github.com/ROCm/cupy.git \
+    && cd cupy \
+    && pip install mpi4py-mpich \
+    && pip install scipy==1.9.3 \
+    && pip install cython==0.29.* \
+    && env CC=$MPI_HOME/bin/mpicc python -m pip install mpi4py \
+    && export CUPY_INSTALL_USE_HIP=1 \
+    && export ROCM_HOME=/opt/rocm \
+    && export HCC_AMDGPU_TARGET="gfx90a,gfx942,gfx1100" \
+    && pip install . \
+    && cd ..; \
+    fi
+
+COPY ./ /app/vllm
+
+RUN python3 -m pip install --upgrade pip
+RUN python3 -m pip install xformers==0.0.23 --no-deps
+
 RUN cd /app \
     && cd vllm \
     && pip install -U -r requirements-rocm.txt \
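If an image is built from this Dockerfile with BUILD_CUPY=1, one way to confirm the HIP-enabled cupy install actually works is a short interactive check inside the container. This is only a suggested sanity test, not part of the commit:

import cupy

# Report the ROCm devices cupy can see, then launch a trivial kernel.
print(cupy.cuda.runtime.getDeviceCount())
x = cupy.arange(8)
print(int(cupy.sum(x)))  # expect 28 if the HIP backend is functional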

README.md (+2 -2)

@@ -27,7 +27,7 @@ pip install -e .
 
 ## Quickstart
 
-Neural Magic maintains a variety of sparse models on our Hugging Face organization profiles, [neuralmagic](https://huggingface.co/neuralmagic) and [nm-testing](https://huggingface.co/nm-testing). 
+Neural Magic maintains a variety of sparse models on our Hugging Face organization profiles, [neuralmagic](https://huggingface.co/neuralmagic) and [nm-testing](https://huggingface.co/nm-testing).
 
 A collection of ready-to-use SparseGPT and GPTQ models in inference optimized marlin format are [available on Hugging Face](https://huggingface.co/collections/neuralmagic/compressed-llms-for-nm-vllm-65e73e3d51d3200e34b77431)
 
@@ -63,7 +63,7 @@ For a quick demonstration, here's how to run a small [50% sparse llama2-110M](ht
 from vllm import LLM, SamplingParams
 
 model = LLM(
-    "neuralmagic/llama2.c-stories110M-pruned50", 
+    "neuralmagic/llama2.c-stories110M-pruned50",
     sparsity="sparse_w16a16",   # If left off, model will be loaded as dense
 )
 
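The README hunk above only shows constructing the sparse model. For context, a minimal end-to-end sketch using the same vLLM API could look like the following; the prompt and sampling settings are illustrative assumptions, not taken from the README:

from vllm import LLM, SamplingParams

# Load the 50% sparse llama2.c model with the sparse kernels enabled.
model = LLM(
    "neuralmagic/llama2.c-stories110M-pruned50",
    sparsity="sparse_w16a16",  # If left off, model will be loaded as dense
)

# Illustrative generation call.
sampling = SamplingParams(max_tokens=64, temperature=0.8)
outputs = model.generate(["Once upon a time,"], sampling)
print(outputs[0].outputs[0].text)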

benchmarks/backend_request_func.py (+70)

@@ -277,10 +277,80 @@ async def async_request_openai_completions(
         return output
 
 
+async def async_request_openai_chat_completions(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    api_url = request_func_input.api_url
+    assert api_url.endswith(
+        "v1/chat/completions"
+    ), "OpenAI Chat API URL must end with 'v1/chat/completions'."
+
+    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+        assert not request_func_input.use_beam_search
+        payload = {
+            "model": request_func_input.model,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": request_func_input.prompt,
+                },
+            ],
+            "temperature": 0.0,
+            "max_tokens": request_func_input.output_len,
+            "stream": True,
+        }
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
+        }
+
+        output = RequestFuncOutput()
+        output.prompt_len = request_func_input.prompt_len
+
+        generated_text = ""
+        ttft = 0
+        st = time.perf_counter()
+        try:
+            async with session.post(url=api_url, json=payload,
+                                    headers=headers) as response:
+                if response.status == 200:
+                    async for chunk in response.content:
+                        if ttft == 0:
+                            ttft = time.perf_counter() - st
+                            output.ttft = ttft
+
+                        chunk = chunk.strip()
+                        if not chunk:
+                            continue
+
+                        chunk = chunk.decode("utf-8").lstrip("data: ")
+                        if chunk == "[DONE]":
+                            latency = time.perf_counter() - st
+                        else:
+                            body = json.loads(chunk)
+                            if "content" in body["choices"][0]["delta"]:
+                                generated_text += body["choices"][0]["delta"][
+                                    "content"]
+
+                    output.generated_text = generated_text
+                    output.success = True
+                    output.latency = latency
+                else:
+                    output.success = False
+        except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
+            output.success = False
+
+    if pbar:
+        pbar.update(1)
+    return output
+
+
 ASYNC_REQUEST_FUNCS = {
     "tgi": async_request_tgi,
     "vllm": async_request_vllm,
     "deepspeed-mii": async_request_deepspeed_mii,
     "openai": async_request_openai_completions,
+    "openai-chat": async_request_openai_chat_completions,
     "tensorrt-llm": async_request_trt_llm,
 }
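The new "openai-chat" key is how benchmark_serving.py selects this coroutine through its --backend flag. As a rough illustration of driving the registry directly, here is a hedged sketch; it assumes RequestFuncInput is a dataclass accepting the fields the function above reads, and the URL, model name, and prompt are placeholders:

import asyncio

from backend_request_func import ASYNC_REQUEST_FUNCS, RequestFuncInput

async def main():
    # Illustrative input; field values are assumptions for this sketch.
    request = RequestFuncInput(
        prompt="Hello, world",
        api_url="http://localhost:8000/v1/chat/completions",
        prompt_len=3,
        output_len=32,
        model="meta-llama/Llama-2-7b-chat-hf",
        use_beam_search=False,
    )
    # Look up the coroutine registered under the new "openai-chat" key.
    output = await ASYNC_REQUEST_FUNCS["openai-chat"](request)
    print(output.success, output.ttft, output.generated_text)

asyncio.run(main())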

benchmarks/benchmark_serving.py (+3 -3)

@@ -12,7 +12,7 @@
     On the client side, run:
         python benchmarks/benchmark_serving.py \
             --backend <backend> \
-            --tokenizer <your_model> --dataset <target_dataset> \
+            --model <your_model> --dataset <target_dataset> \
             --request-rate <request_rate>
 """
 import argparse
@@ -171,10 +171,10 @@ async def benchmark(
     else:
         raise ValueError(f"Unknown backend: {backend}")
 
-    pbar = None if disable_tqdm else tqdm(total=len(input_requests))
-
     print(f"Traffic request rate: {request_rate}")
 
+    pbar = None if disable_tqdm else tqdm(total=len(input_requests))
+
     benchmark_start_time = time.perf_counter()
     tasks = []
     async for request in get_request(input_requests, request_rate):

csrc/attention/attention_kernels.cu (-8)

@@ -15,9 +15,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#ifdef USE_ROCM
-#include <hip/hip_runtime.h>
-#endif
 
 #include <torch/extension.h>
 #include <ATen/cuda/CUDAContext.h>
@@ -31,11 +28,6 @@
 
 #include <algorithm>
 
-#ifndef USE_ROCM
-#define WARP_SIZE 32
-#else
-#define WARP_SIZE warpSize
-#endif
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b))

csrc/cuda_compat.h (+10)

@@ -1,5 +1,15 @@
 #pragma once
 
+#ifdef USE_ROCM
+#include <hip/hip_runtime.h>
+#endif
+
+#ifndef USE_ROCM
+#define WARP_SIZE 32
+#else
+#define WARP_SIZE warpSize
+#endif
+
 #ifndef USE_ROCM
 #define VLLM_LDG(arg) __ldg(arg)
 #else

csrc/punica/bgmv/bgmv_config.h (+3)

@@ -14,13 +14,15 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
     f(in_T, out_T, W_T, narrow, 128) \
     f(in_T, out_T, W_T, narrow, 256) \
     f(in_T, out_T, W_T, narrow, 512) \
+    f(in_T, out_T, W_T, narrow, 768) \
     f(in_T, out_T, W_T, narrow, 1024) \
     f(in_T, out_T, W_T, narrow, 1280) \
     f(in_T, out_T, W_T, narrow, 1728) \
     f(in_T, out_T, W_T, narrow, 1792) \
     f(in_T, out_T, W_T, narrow, 2048) \
     f(in_T, out_T, W_T, narrow, 2560) \
     f(in_T, out_T, W_T, narrow, 2752) \
+    f(in_T, out_T, W_T, narrow, 2816) \
     f(in_T, out_T, W_T, narrow, 3072) \
     f(in_T, out_T, W_T, narrow, 3456) \
     f(in_T, out_T, W_T, narrow, 3584) \
@@ -36,6 +38,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
     f(in_T, out_T, W_T, narrow, 10240) \
     f(in_T, out_T, W_T, narrow, 11008) \
     f(in_T, out_T, W_T, narrow, 12288) \
+    f(in_T, out_T, W_T, narrow, 13696) \
     f(in_T, out_T, W_T, narrow, 13824) \
     f(in_T, out_T, W_T, narrow, 14336) \
     f(in_T, out_T, W_T, narrow, 16384) \

csrc/reduction_utils.cuh (+3 -3)

@@ -24,15 +24,15 @@ namespace vllm {
 template<typename T>
 __inline__ __device__ T warpReduceSum(T val) {
 #pragma unroll
-  for (int mask = 16; mask > 0; mask >>= 1)
+  for (int mask = WARP_SIZE/2; mask > 0; mask >>= 1)
     val += VLLM_SHFL_XOR_SYNC(val, mask);
   return val;
 }
 
 /* Calculate the sum of all elements in a block */
 template<typename T>
 __inline__ __device__ T blockReduceSum(T val) {
-  static __shared__ T shared[32];
+  static __shared__ T shared[WARP_SIZE];
   int lane = threadIdx.x & 0x1f;
   int wid = threadIdx.x >> 5;
 
@@ -45,7 +45,7 @@ __inline__ __device__ T blockReduceSum(T val) {
 
   // Modify from blockDim.x << 5 to blockDim.x / 32. to prevent
   // blockDim.x is not divided by 32
-  val = (threadIdx.x < (blockDim.x / 32.f)) ? shared[lane] : (T)(0.0f);
+  val = (threadIdx.x < (blockDim.x / (WARP_SIZE * 1.0f))) ? shared[lane] : (T)(0.0f);
   val = warpReduceSum<T>(val);
   return val;
 }
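For intuition about this change, here is a small Python model of the XOR-butterfly that warpReduceSum performs; it is illustrative only, not code from the commit. Starting the mask at WARP_SIZE/2 rather than a hardcoded 16 lets the loop cover every lane when warpSize is 64 on ROCm:

def warp_reduce_sum(lanes):
    """Pure-Python model of the shfl_xor butterfly over one warp."""
    warp_size = len(lanes)        # 32 on CUDA, 64 (warpSize) on ROCm
    vals = list(lanes)
    mask = warp_size // 2         # the WARP_SIZE/2 starting point from the diff
    while mask > 0:
        # Each lane adds the value of the lane whose index differs in the
        # `mask` bit, mirroring VLLM_SHFL_XOR_SYNC(val, mask).
        vals = [vals[i] + vals[i ^ mask] for i in range(warp_size)]
        mask >>= 1
    return vals[0]                # every lane ends up holding the full sum

print(warp_reduce_sum(range(64)))  # 2016; a fixed start of 16 would only reduce half the warp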

docs/source/conf.py (+9 -2)

@@ -72,8 +72,15 @@
 
 # Mock out external dependencies here.
 autodoc_mock_imports = [
-    "torch", "transformers", "psutil", "prometheus_client", "sentencepiece",
-    "vllm.cuda_utils", "vllm._C"
+    "torch",
+    "transformers",
+    "psutil",
+    "prometheus_client",
+    "sentencepiece",
+    "vllm.cuda_utils",
+    "vllm._C",
+    "numpy",
+    "tqdm",
 ]
 
 for mock_target in autodoc_mock_imports:

docs/source/models/lora.rst (+2 -2)

@@ -90,9 +90,9 @@ Requests can specify the LoRA adapter as if it were any other model via the ``mo
 processed according to the server-wide LoRA configuration (i.e. in parallel with base model requests, and potentially other
 LoRA adapter requests if they were provided and ``max_loras`` is set high enough).
 
-The following is an example request 
+The following is an example request
 
-.. code-block::bash 
+.. code-block::bash
     curl http://localhost:8000/v1/completions \
     -H "Content-Type: application/json" \
    -d '{
