This repository was archived by the owner on Oct 11, 2024. It is now read-only.

upstream merge sync 2024-03-11 #108

Merged: 90 commits, merged Mar 12, 2024

Commits (90)
d7f3964
Update comment (#2934)
ronensc Feb 22, 2024
5574081
Added early stopping to completion APIs (#2939)
Maxusmusti Feb 22, 2024
344020c
Migrate MistralForCausalLM to LlamaForCausalLM (#2868)
esmeetu Feb 22, 2024
95529e3
Use Llama RMSNorm custom op for Gemma (#2974)
WoosukKwon Feb 22, 2024
93dc5a2
chore(vllm): codespell for spell checking (#2820)
mspronesti Feb 22, 2024
fd5dcc5
Optimize GeGLU layer in Gemma (#2975)
WoosukKwon Feb 22, 2024
c530e2c
[FIX] Fix a bug in initializing Yarn RoPE (#2983)
44670 Feb 22, 2024
6f32cdd
Remove Flash Attention in test env (#2982)
WoosukKwon Feb 22, 2024
4caf704
Include tokens from prompt phase in `counter_generation_tokens` (#2802)
ronensc Feb 22, 2024
57f0449
Fix nvcc not found in vlm-openai image (#2781)
zhaoyang-star Feb 22, 2024
f7c1234
[Fix] Fix assertion on YaRN model len (#2984)
WoosukKwon Feb 23, 2024
ef978fe
Port metrics from `aioprometheus` to `prometheus_client` (#2730)
hmellor Feb 25, 2024
70f3e8e
Add LogProbs for Chat Completions in OpenAI (#2918)
jlcmoore Feb 26, 2024
cfc15a1
Optimize Triton MoE Kernel (#2979)
pcmoritz Feb 26, 2024
d6e4a13
[Minor] Remove gather_cached_kv kernel (#3043)
WoosukKwon Feb 26, 2024
d9f726c
[Minor] Remove unused config files (#3039)
esmeetu Feb 27, 2024
c1c0d00
Don't use cupy when `enforce_eager=True` (#3037)
esmeetu Feb 27, 2024
4dd6416
Fix stablelm (#3038)
esmeetu Feb 27, 2024
48a8f4a
Support Orion model (#2539)
dachengai Feb 27, 2024
2410e32
fix `get_ip` error in pure ipv6 environment (#2931)
Jingru Feb 27, 2024
4bd18ec
[Minor] Fix type annotation in fused moe (#3045)
WoosukKwon Feb 27, 2024
e0ade06
Support logit bias for OpenAI API (#3027)
dylanwhawk Feb 27, 2024
8b430d7
[Minor] Fix StableLMEpochForCausalLM -> StableLmForCausalLM (#3046)
WoosukKwon Feb 27, 2024
71bcaf9
Enable GQA support in the prefix prefill kernels (#3007)
sighingnow Feb 27, 2024
a868310
multi-lora documentation fix (#3064)
ElefHead Feb 28, 2024
e46fa5d
Restrict prometheus_client >= 0.18.0 to prevent errors when importing…
AllenDou Feb 28, 2024
3b7178c
[Neuron] Support inference with transformers-neuronx (#2569)
liangfu Feb 28, 2024
929b4f2
Add LoRA support for Gemma (#3050)
WoosukKwon Feb 28, 2024
01a5d18
Add Support for 2/3/8-bit GPTQ Quantization Models (#2330)
chu-tianxiang Feb 29, 2024
a6d471c
Fix: `AttributeError` in OpenAI-compatible server (#3018)
jaywonchung Feb 29, 2024
9289e57
add cache_config's info to prometheus metrics. (#3100)
AllenDou Feb 29, 2024
bfdcfa6
Support starcoder2 architecture (#3089)
sh0416 Feb 29, 2024
2c08ff2
Fix building from source on WSL (#3112)
aliencaocao Feb 29, 2024
29a8d6a
[Fix] Don't deep-copy LogitsProcessors when copying SamplingParams (#…
njhill Feb 29, 2024
703e42e
Add guided decoding for OpenAI API server (#2819)
felixzhu555 Feb 29, 2024
54d3544
Fix: Output text is always truncated in some models (#3016)
HyperdriveHustle Mar 1, 2024
27ca23d
Remove exclude_unset in streaming response (#3143)
sh0416 Mar 1, 2024
49d849b
docs: Add tutorial on deploying vLLM model with KServe (#2586)
terrytangyuan Mar 1, 2024
90fbf12
fix relative import path of protocol.py (#3134)
Huarong Mar 1, 2024
c0c2335
Integrate Marlin Kernels for Int4 GPTQ inference (#2497)
robertgshaw2-redhat Mar 1, 2024
82091b8
Bump up to v0.3.3 (#3129)
WoosukKwon Mar 1, 2024
29e70e3
allow user chose log level by --log-level instead of fixed 'info'. (#…
AllenDou Mar 1, 2024
baee28c
Reorder kv dtype check to avoid nvcc not found error on AMD platform …
cloudhan Mar 2, 2024
ce4f5a2
Add Automatic Prefix Caching (#2762)
SageMoore Mar 2, 2024
d65fac2
Add vLLM version info to logs and openai API server (#3161)
jasonacox Mar 3, 2024
996d095
[FIX] Fix styles in automatic prefix caching & add a automatic prefix…
zhuohan123 Mar 3, 2024
17c3103
Make it easy to profile workers with nsight (#3162)
pcmoritz Mar 4, 2024
d0fae88
[DOC] add setup document to support neuron backend (#2777)
liangfu Mar 4, 2024
901cf4c
[Minor Fix] Remove unused code in benchmark_prefix_caching.py (#3171)
gty111 Mar 4, 2024
27a7b07
Add document for vllm paged attention kernel. (#2978)
pian13131 Mar 4, 2024
9cbc7e5
enable --gpu-memory-utilization in benchmark_throughput.py (#3175)
AllenDou Mar 4, 2024
76e8a70
[Minor fix] The domain dns.google may cause a socket.gaierror excepti…
ttbachyinsda Mar 4, 2024
22de452
Push logprob generation to LLMEngine (#3065)
Yard1 Mar 4, 2024
ff578ca
Add health check, make async Engine more robust (#3015)
Yard1 Mar 4, 2024
9a4548b
Fix the openai benchmarking requests to work with latest OpenAI apis …
wangchen615 Mar 4, 2024
05af6da
[ROCm] enable cupy in order to enable cudagraph mode for AMD GPUs (#…
hongxiayang Mar 5, 2024
8999ec3
Store `eos_token_id` in `Sequence` for easy access (#3166)
njhill Mar 5, 2024
2efce05
[Fix] Avoid pickling entire LLMEngine for Ray workers (#3207)
njhill Mar 6, 2024
24aecf4
[Tests] Add block manager and scheduler tests (#3108)
rkooo567 Mar 6, 2024
a33ce60
[Testing] Fix core tests (#3224)
cadedaniel Mar 6, 2024
4cb3b92
Add tqdm `dynamic_ncols=True` (#3242)
chujiezheng Mar 6, 2024
d3c04b6
Add GPTQ support for Gemma (#3200)
TechxGenus Mar 7, 2024
cbf4c05
Update requirements-dev.txt to include package for benchmarking scrip…
wangchen615 Mar 7, 2024
2daf23a
Separate attention backends (#3005)
WoosukKwon Mar 7, 2024
385da2d
Measure model memory usage (#3120)
mgoin Mar 7, 2024
8cbba46
Possible fix for conflict between Automated Prefix Caching (#2762) an…
jacobthebanana Mar 7, 2024
b35cc93
Fix auto prefix bug (#3239)
ElizaWszola Mar 8, 2024
d2339d6
Connect engine healthcheck to openai server (#3260)
njhill Mar 8, 2024
c59e120
Feature add lora support for Qwen2 (#3177)
whyiug Mar 8, 2024
1ece1ae
[Minor Fix] Fix comments in benchmark_serving (#3252)
gty111 Mar 8, 2024
99c3cfb
[Docs] Fix Unmocked Imports (#3275)
ywang96 Mar 8, 2024
1cb0cc2
[FIX] Make `flash_attn` optional (#3269)
WoosukKwon Mar 8, 2024
c2c5e09
Move model filelocks from `/tmp/` to `~/.cache/vllm/locks/` dir (#3241)
mgoin Mar 8, 2024
f48c679
[FIX] Fix prefix test error on main (#3286)
zhuohan123 Mar 9, 2024
8437bae
[Speculative decoding 3/9] Worker which speculates, scores, and appli…
cadedaniel Mar 9, 2024
0bba88d
Enhance lora tests with more layer and rank variations (#3243)
tterrysun Mar 10, 2024
e4a28e5
[ROCM] Fix blockReduceSum to use correct warp counts for ROCm and CUD…
dllehr-amd Mar 10, 2024
9e8744a
[BugFix] Fix get tokenizer when using ray (#3301)
esmeetu Mar 11, 2024
4b59f00
[Fix] Fix best_of behavior when n=1 (#3298)
njhill Mar 11, 2024
2f8844b
Re-enable the 80 char line width limit (#3305)
zhuohan123 Mar 11, 2024
657061f
[docs] Add LoRA support information for models (#3299)
pcmoritz Mar 11, 2024
46d67bf
Merge remote-tracking branch 'upstream/main' into upstream-sync-2024-…
Mar 11, 2024
9179ac9
yapf formatting
Mar 11, 2024
44eb021
ruff and yapf patches
Mar 11, 2024
1d64afb
local yapf version mismatch with automation
Mar 11, 2024
5c41a4d
undo bad merge
Mar 11, 2024
8180044
typo
Mar 11, 2024
f2dcba7
adjust timeout
Mar 11, 2024
a78d827
out of memory
Mar 12, 2024
6b9a5a4
Merge remote-tracking branch 'origin/main' into upstream-sync-2024-03-11
Mar 12, 2024
8 changes: 7 additions & 1 deletion .buildkite/test-pipeline.yaml
@@ -13,6 +13,9 @@ steps:

- label: Basic Correctness Test
command: pytest -v -s --forked basic_correctness

- label: Core Test
command: pytest -v -s core

- label: Distributed Comm Ops Test
command: pytest -v -s --forked test_comm_ops.py
@@ -25,7 +28,7 @@ steps:
num_gpus: 2 # only support 1 or 2 for now.

- label: Engine Test
command: pytest -v -s engine
command: pytest -v -s engine test_sequence.py

- label: Entrypoints Test
command: pytest -v -s entrypoints
@@ -49,6 +52,9 @@ steps:
- label: Worker Test
command: pytest -v -s worker

- label: Speculative decoding tests
command: pytest -v -s spec_decode

- label: LoRA Test
command: pytest -v -s lora --forked

2 changes: 1 addition & 1 deletion .github/workflows/remote-push.yml
@@ -21,7 +21,7 @@ jobs:
uses: ./.github/workflows/build-test.yml
with:
label: aws-avx2-192G-4-a10g-96G
timeout: 180
timeout: 240
gitref: '${{ github.ref }}'
Gi_per_thread: 4
python: ${{ matrix.python }}
30 changes: 25 additions & 5 deletions Dockerfile.rocm
@@ -23,6 +23,9 @@ RUN echo "FA_BRANCH is $FA_BRANCH"
# In that case, we need to use the python reference attention implementation in vllm
ARG BUILD_FA="1"

# whether to build cupy on rocm
ARG BUILD_CUPY="1"

# Install some basic utilities
RUN apt-get update && apt-get install python3 python3-pip -y

@@ -70,16 +73,33 @@ RUN if [ "$BUILD_FA" = "1" ]; then \
&& cd ..; \
fi

COPY ./ /app/vllm

RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install xformers==0.0.23 --no-deps

# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt.
# Manually removed it so that later steps of numpy upgrade can continue
RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \
rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi

# build cupy
RUN if [ "$BUILD_CUPY" = "1" ]; then \
mkdir -p libs \
&& cd libs \
&& git clone -b hipgraph_enablement --recursive https://github.com/ROCm/cupy.git \
&& cd cupy \
&& pip install mpi4py-mpich \
&& pip install scipy==1.9.3 \
&& pip install cython==0.29.* \
&& env CC=$MPI_HOME/bin/mpicc python -m pip install mpi4py \
&& export CUPY_INSTALL_USE_HIP=1 \
&& export ROCM_HOME=/opt/rocm \
&& export HCC_AMDGPU_TARGET="gfx90a,gfx942,gfx1100" \
&& pip install . \
&& cd ..; \
fi

COPY ./ /app/vllm

RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install xformers==0.0.23 --no-deps

RUN cd /app \
&& cd vllm \
&& pip install -U -r requirements-rocm.txt \
4 changes: 2 additions & 2 deletions README.md
@@ -27,7 +27,7 @@ pip install -e .

## Quickstart

Neural Magic maintains a variety of sparse models on our Hugging Face organization profiles, [neuralmagic](https://huggingface.co/neuralmagic) and [nm-testing](https://huggingface.co/nm-testing).
Neural Magic maintains a variety of sparse models on our Hugging Face organization profiles, [neuralmagic](https://huggingface.co/neuralmagic) and [nm-testing](https://huggingface.co/nm-testing).

A collection of ready-to-use SparseGPT and GPTQ models in inference optimized marlin format are [available on Hugging Face](https://huggingface.co/collections/neuralmagic/compressed-llms-for-nm-vllm-65e73e3d51d3200e34b77431)

@@ -63,7 +63,7 @@ For a quick demonstration, here's how to run a small [50% sparse llama2-110M](ht
from vllm import LLM, SamplingParams

model = LLM(
"neuralmagic/llama2.c-stories110M-pruned50",
"neuralmagic/llama2.c-stories110M-pruned50",
sparsity="sparse_w16a16", # If left off, model will be loaded as dense
)

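For completeness, a short sketch of how the quickstart above might continue (this is illustrative, not the README's actual text; the prompt and sampling settings are placeholders):

prompts = ["Once upon a time, "]
sampling_params = SamplingParams(temperature=0.8, max_tokens=64)
outputs = model.generate(prompts, sampling_params)
for output in outputs:
    # Each RequestOutput holds the prompt and its generated completions.
    print(output.outputs[0].text)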
70 changes: 70 additions & 0 deletions benchmarks/backend_request_func.py
@@ -277,10 +277,80 @@ async def async_request_openai_completions(
return output


async def async_request_openai_chat_completions(
request_func_input: RequestFuncInput,
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith(
"v1/chat/completions"
), "OpenAI Chat API URL must end with 'v1/chat/completions'."

async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
assert not request_func_input.use_beam_search
payload = {
"model": request_func_input.model,
"messages": [
{
"role": "user",
"content": request_func_input.prompt,
},
],
"temperature": 0.0,
"max_tokens": request_func_input.output_len,
"stream": True,
}
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
}

output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len

generated_text = ""
ttft = 0
st = time.perf_counter()
try:
async with session.post(url=api_url, json=payload,
headers=headers) as response:
if response.status == 200:
async for chunk in response.content:
if ttft == 0:
ttft = time.perf_counter() - st
output.ttft = ttft

chunk = chunk.strip()
if not chunk:
continue

chunk = chunk.decode("utf-8").lstrip("data: ")
if chunk == "[DONE]":
latency = time.perf_counter() - st
else:
body = json.loads(chunk)
if "content" in body["choices"][0]["delta"]:
generated_text += body["choices"][0]["delta"][
"content"]

output.generated_text = generated_text
output.success = True
output.latency = latency
else:
output.success = False
except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
output.success = False

if pbar:
pbar.update(1)
return output


ASYNC_REQUEST_FUNCS = {
"tgi": async_request_tgi,
"vllm": async_request_vllm,
"deepspeed-mii": async_request_deepspeed_mii,
"openai": async_request_openai_completions,
"openai-chat": async_request_openai_chat_completions,
"tensorrt-llm": async_request_trt_llm,
}
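For context, a minimal usage sketch of the new "openai-chat" backend (not part of this PR's diff): it drives the registered request function directly against a local OpenAI-compatible server. The RequestFuncInput field names are taken from the code above, but the exact constructor signature, model name, and URL are assumptions.

import asyncio

from backend_request_func import ASYNC_REQUEST_FUNCS, RequestFuncInput


async def main() -> None:
    # Look up the newly registered chat-completions backend.
    request_func = ASYNC_REQUEST_FUNCS["openai-chat"]
    request_input = RequestFuncInput(
        prompt="Summarize paged attention in one sentence.",
        api_url="http://localhost:8000/v1/chat/completions",
        prompt_len=8,
        output_len=64,
        model="mistralai/Mistral-7B-Instruct-v0.2",
        use_beam_search=False,  # the chat backend asserts beam search is off
    )
    output = await request_func(request_func_input=request_input)
    print(output.success, output.ttft, output.latency)
    print(output.generated_text)


asyncio.run(main())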
6 changes: 3 additions & 3 deletions benchmarks/benchmark_serving.py
@@ -12,7 +12,7 @@
On the client side, run:
python benchmarks/benchmark_serving.py \
--backend <backend> \
--tokenizer <your_model> --dataset <target_dataset> \
--model <your_model> --dataset <target_dataset> \
--request-rate <request_rate>
"""
import argparse
@@ -171,10 +171,10 @@ async def benchmark(
else:
raise ValueError(f"Unknown backend: {backend}")

pbar = None if disable_tqdm else tqdm(total=len(input_requests))

print(f"Traffic request rate: {request_rate}")

pbar = None if disable_tqdm else tqdm(total=len(input_requests))

benchmark_start_time = time.perf_counter()
tasks = []
async for request in get_request(input_requests, request_rate):
8 changes: 0 additions & 8 deletions csrc/attention/attention_kernels.cu
@@ -15,9 +15,6 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifdef USE_ROCM
#include <hip/hip_runtime.h>
#endif

#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
@@ -31,11 +28,6 @@

#include <algorithm>

#ifndef USE_ROCM
#define WARP_SIZE 32
#else
#define WARP_SIZE warpSize
#endif
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b))
10 changes: 10 additions & 0 deletions csrc/cuda_compat.h
@@ -1,5 +1,15 @@
#pragma once

#ifdef USE_ROCM
#include <hip/hip_runtime.h>
#endif

#ifndef USE_ROCM
#define WARP_SIZE 32
#else
#define WARP_SIZE warpSize
#endif

#ifndef USE_ROCM
#define VLLM_LDG(arg) __ldg(arg)
#else
3 changes: 3 additions & 0 deletions csrc/punica/bgmv/bgmv_config.h
@@ -14,13 +14,15 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
f(in_T, out_T, W_T, narrow, 128) \
f(in_T, out_T, W_T, narrow, 256) \
f(in_T, out_T, W_T, narrow, 512) \
f(in_T, out_T, W_T, narrow, 768) \
f(in_T, out_T, W_T, narrow, 1024) \
f(in_T, out_T, W_T, narrow, 1280) \
f(in_T, out_T, W_T, narrow, 1728) \
f(in_T, out_T, W_T, narrow, 1792) \
f(in_T, out_T, W_T, narrow, 2048) \
f(in_T, out_T, W_T, narrow, 2560) \
f(in_T, out_T, W_T, narrow, 2752) \
f(in_T, out_T, W_T, narrow, 2816) \
f(in_T, out_T, W_T, narrow, 3072) \
f(in_T, out_T, W_T, narrow, 3456) \
f(in_T, out_T, W_T, narrow, 3584) \
@@ -36,6 +38,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
f(in_T, out_T, W_T, narrow, 10240) \
f(in_T, out_T, W_T, narrow, 11008) \
f(in_T, out_T, W_T, narrow, 12288) \
f(in_T, out_T, W_T, narrow, 13696) \
f(in_T, out_T, W_T, narrow, 13824) \
f(in_T, out_T, W_T, narrow, 14336) \
f(in_T, out_T, W_T, narrow, 16384) \
6 changes: 3 additions & 3 deletions csrc/reduction_utils.cuh
@@ -24,15 +24,15 @@ namespace vllm {
template<typename T>
__inline__ __device__ T warpReduceSum(T val) {
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1)
for (int mask = WARP_SIZE/2; mask > 0; mask >>= 1)
val += VLLM_SHFL_XOR_SYNC(val, mask);
return val;
}

/* Calculate the sum of all elements in a block */
template<typename T>
__inline__ __device__ T blockReduceSum(T val) {
static __shared__ T shared[32];
static __shared__ T shared[WARP_SIZE];
int lane = threadIdx.x & 0x1f;
int wid = threadIdx.x >> 5;

@@ -45,7 +45,7 @@ __inline__ __device__ T blockReduceSum(T val) {

// Modify from blockDim.x << 5 to blockDim.x / 32. to prevent
// blockDim.x is not divided by 32
val = (threadIdx.x < (blockDim.x / 32.f)) ? shared[lane] : (T)(0.0f);
val = (threadIdx.x < (blockDim.x / (WARP_SIZE * 1.0f))) ? shared[lane] : (T)(0.0f);
val = warpReduceSum<T>(val);
return val;
}
11 changes: 9 additions & 2 deletions docs/source/conf.py
@@ -72,8 +72,15 @@

# Mock out external dependencies here.
autodoc_mock_imports = [
"torch", "transformers", "psutil", "prometheus_client", "sentencepiece",
"vllm.cuda_utils", "vllm._C"
"torch",
"transformers",
"psutil",
"prometheus_client",
"sentencepiece",
"vllm.cuda_utils",
"vllm._C",
"numpy",
"tqdm",
]

for mock_target in autodoc_mock_imports:
4 changes: 2 additions & 2 deletions docs/source/models/lora.rst
@@ -90,9 +90,9 @@ Requests can specify the LoRA adapter as if it were any other model via the ``mo
processed according to the server-wide LoRA configuration (i.e. in parallel with base model requests, and potentially other
LoRA adapter requests if they were provided and ``max_loras`` is set high enough).

The following is an example request
The following is an example request

.. code-block::bash
.. code-block::bash
curl http://localhost:8000/v1/completions \
-H "Content-Type: application/json" \
-d '{
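The same request can also be issued from Python; below is a hedged sketch assuming the server was started with a LoRA adapter registered under the name "sql-lora" (adapter name, prompt, and port are placeholders, not taken from the docs):

import json

import requests

response = requests.post(
    "http://localhost:8000/v1/completions",
    headers={"Content-Type": "application/json"},
    data=json.dumps({
        "model": "sql-lora",  # LoRA adapter name exposed by the server (assumed)
        "prompt": "San Francisco is a ",
        "max_tokens": 32,
        "temperature": 0,
    }),
)
print(response.json())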