
Merge branch 'main' into fix-spec-decode-seed-test
tdoublep committed Jul 24, 2024
2 parents 4f2c900 + ccc4a73 commit f8dbc0f
Showing 23 changed files with 365 additions and 245 deletions.
2 changes: 1 addition & 1 deletion .buildkite/check-wheel-size.py
@@ -1,7 +1,7 @@
import os
import zipfile

-MAX_SIZE_MB = 200
+MAX_SIZE_MB = 250


def print_top_10_largest_files(zip_file):
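For context, the gate this constant feeds works roughly as follows; a minimal sketch, assuming the script walks a directory of built wheels (the directory argument and exact helper wiring are illustrative, not the script's full contents):

import os
import sys
import zipfile

MAX_SIZE_MB = 250


def print_top_10_largest_files(zip_file):
    """Print the 10 largest members of the wheel to help debug size growth."""
    with zipfile.ZipFile(zip_file, "r") as z:
        entries = [(info.file_size, info.filename) for info in z.infolist()]
    for size, name in sorted(entries, reverse=True)[:10]:
        print(f"{name}: {size / (1024 * 1024):.2f} MB")


def check_wheel_size(directory: str) -> int:
    for root, _, files in os.walk(directory):
        for fname in files:
            if fname.endswith(".whl"):
                wheel_path = os.path.join(root, fname)
                wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
                if wheel_size_mb > MAX_SIZE_MB:
                    print(f"Wheel {wheel_path} is {wheel_size_mb:.2f} MB, "
                          f"exceeding the {MAX_SIZE_MB} MB limit.")
                    print_top_10_largest_files(wheel_path)
                    return 1
    return 0


if __name__ == "__main__":
    sys.exit(check_wheel_size(sys.argv[1]))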
29 changes: 29 additions & 0 deletions docs/source/getting_started/amd-installation.rst
@@ -107,6 +107,35 @@ Alternatively, wheels intended for vLLM use can be accessed under the releases.
$ python setup.py develop # This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation.

.. tip::

    For example, vLLM v0.5.3 on ROCm 6.1 can be built with the following steps:

    .. code-block:: console

        $ pip install --upgrade pip
        $ # Install PyTorch
        $ pip uninstall torch -y
        $ pip install --no-cache-dir --pre torch==2.5.0.dev20240710 --index-url https://download.pytorch.org/whl/nightly/rocm6.1
        $ # Build & install AMD SMI
        $ pip install /opt/rocm/share/amd_smi
        $ # Install dependencies
        $ pip install --upgrade numba scipy huggingface-hub[cli]
        $ pip install "numpy<2"
        $ pip install -r requirements-rocm.txt
        $ # Apply the patch to ROCm 6.1 (requires root permission)
        $ wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P /opt/rocm/lib
        $ rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so*
        $ # Build vLLM for MI210/MI250/MI300.
        $ export PYTORCH_ROCM_ARCH="gfx90a;gfx942"
        $ python3 setup.py develop

.. tip::

    - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm-up step before collecting performance numbers.
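To make the warm-up advice concrete, here is a minimal benchmarking sketch assuming the offline ``LLM`` API (model, prompts, and token budget are illustrative):

.. code-block:: python

    import time

    from vllm import LLM, SamplingParams

    llm = LLM(model="facebook/opt-125m")  # illustrative model
    params = SamplingParams(max_tokens=64)

    # Warm-up: the first call pays one-time costs such as Triton
    # flash-attention autotuning, so exclude it from measurements.
    llm.generate(["Hello, my name is"], params)

    start = time.perf_counter()
    llm.generate(["The capital of France is"], params)
    print(f"generation took {time.perf_counter() - start:.3f}s")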
1 change: 1 addition & 0 deletions docs/source/index.rst
@@ -105,6 +105,7 @@ Documentation

quantization/supported_hardware
quantization/auto_awq
quantization/bnb
quantization/fp8
quantization/fp8_e5m2_kvcache
quantization/fp8_e4m3_kvcache
43 changes: 43 additions & 0 deletions docs/source/quantization/bnb.rst
@@ -0,0 +1,43 @@
.. _bits_and_bytes:

BitsAndBytes
==================

vLLM now supports `BitsAndBytes <https://github.com/TimDettmers/bitsandbytes>`_ for more efficient model inference.
BitsAndBytes quantizes models to reduce memory usage and enhance performance without significantly sacrificing accuracy.
Compared to other quantization methods, BitsAndBytes eliminates the need for calibrating the quantized model with input data.

Below are the steps to utilize BitsAndBytes with vLLM.

.. code-block:: console

    $ pip install "bitsandbytes>=0.42.0"

vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoints.

You can find bitsandbytes quantized models on https://huggingface.co/models?other=bitsandbytes.
Usually, these repositories include a config.json file with a quantization_config section.

Read quantized checkpoint.
--------------------------

.. code-block:: python

    from vllm import LLM
    import torch

    # unsloth/tinyllama-bnb-4bit is a pre-quantized checkpoint.
    model_id = "unsloth/tinyllama-bnb-4bit"
    llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True,
              quantization="bitsandbytes", load_format="bitsandbytes")
In-flight quantization: load as 4-bit quantization
--------------------------------------------------

.. code-block:: python

    from vllm import LLM
    import torch

    model_id = "huggyllama/llama-7b"
    llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True,
              quantization="bitsandbytes", load_format="bitsandbytes")
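In either case, generation then proceeds as usual. A brief sketch, reusing the ``llm`` object from either example above (prompt and token budget are illustrative):

.. code-block:: python

    from vllm import SamplingParams

    outputs = llm.generate(["To be or not to be,"],
                           SamplingParams(max_tokens=32))
    print(outputs[0].outputs[0].text)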
2 changes: 1 addition & 1 deletion requirements-common.txt
@@ -6,7 +6,7 @@ numpy < 2.0.0
requests
tqdm
py-cpuinfo
-transformers >= 4.42.4 # Required for Gemma 2 and for additional chat template parameters.
+transformers >= 4.43.1 # Required for Chameleon and Llama 3.1 hotfix.
tokenizers >= 0.19.1 # Required for Llama 3.
fastapi
aiohttp
24 changes: 24 additions & 0 deletions tests/distributed/test_pipeline_parallel.py
@@ -61,3 +61,27 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME,
tp_args.append("--enforce-eager")

compare_two_settings(MODEL_NAME, pp_args, tp_args)


@pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [
(2, "JackFram/llama-160m"),
])
@pytest.mark.parametrize("ATTN_BACKEND", [
"FLASH_ATTN",
"FLASHINFER",
])
def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND):
cudagraph_args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
"float16",
"--pipeline-parallel-size",
str(PP_SIZE),
"--distributed-executor-backend",
"ray",
]
os.environ["VLLM_ATTENTION_BACKEND"] = ATTN_BACKEND

eager_args = cudagraph_args + ["--enforce-eager"]

compare_two_settings(MODEL_NAME, eager_args, cudagraph_args)
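The backend override in this test works because vLLM reads VLLM_ATTENTION_BACKEND when the engine is constructed. A minimal sketch of the same pattern outside the test harness (model and prompt are illustrative):

import os

# Must be set before the engine is constructed; vLLM consults this
# environment variable when selecting the attention backend.
os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"

from vllm import LLM

llm = LLM(model="JackFram/llama-160m")
print(llm.generate(["Hello"])[0].outputs[0].text)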
91 changes: 91 additions & 0 deletions tests/metrics/test_metrics.py
@@ -1,3 +1,4 @@
import time
from typing import List

import pytest
@@ -10,6 +11,8 @@
from vllm.engine.metrics import RayPrometheusStatLogger
from vllm.sampling_params import SamplingParams

from ..conftest import cleanup

MODELS = [
"facebook/opt-125m",
]
@@ -219,6 +222,94 @@ def test_metric_spec_decode(
"does not meet expectation")


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [10])
@pytest.mark.parametrize("log_interval", [1, 3, 5, 7])
def test_metric_spec_decode_interval(
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
log_interval: int,
) -> None:
k = 5

engine_args = EngineArgs(model=model,
dtype=dtype,
disable_log_stats=False,
gpu_memory_utilization=0.4,
speculative_model=model,
num_speculative_tokens=k,
use_v2_block_manager=True,
enforce_eager=True)

engine = LLMEngine.from_engine_args(engine_args)

try:

engine.add_request(
"request-id-0",
example_prompts[0],
SamplingParams(max_tokens=max_tokens),
)

# set log interval
stat_logger = engine.stat_loggers['prometheus']
stat_logger.local_interval = log_interval

# prefill
engine.step()

# wait for 5 seconds to ensure that spec decode metrics
# get triggered in first decode step
time.sleep(5)

# first decode step should trigger async collection of metrics
engine.step()

# wait one second to allow H2D transfer to finish
time.sleep(1)

# second decode step should now be able to collect the spec
# decode stats and the request should also be finished
engine.step()

# must have finished now
assert not engine.has_unfinished_requests()

# wait to ensure logging occurs
time.sleep(log_interval)

# force logging
engine.step()

# Note that the purpose of this test is to verify spec decode
# metrics instead of functional correctness, so the expected values
# are intended to be loose.
metric_name_to_expected_fn = {
"gauge_spec_decode_draft_acceptance_rate": lambda v: 0 <= v <= 1,
"gauge_spec_decode_efficiency": lambda v: 0 <= v <= 1,
"counter_spec_decode_num_accepted_tokens": lambda v: 0 <= v <= k,
"counter_spec_decode_num_draft_tokens": lambda v: v == k,
"counter_spec_decode_num_emitted_tokens":
lambda v: 0 <= v <= k + 1,
}

for metric_name, is_expected in metric_name_to_expected_fn.items():
metric_val = getattr(
stat_logger.metrics,
metric_name).labels(**stat_logger.labels)._value.get()
assert is_expected(metric_val), (
f"the value of metric {metric_name} ({metric_val}) "
"does not meet expectation")

finally:
del engine
cleanup()


def assert_metrics(engine: LLMEngine, disable_log_stats: bool,
num_requests: int) -> None:
if disable_log_stats:
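The interval behavior exercised above is, at heart, time-based gating inside the stat logger. A rough sketch of that pattern (not the actual StatLogger implementation):

import time


class IntervalLogger:
    """Rough sketch: emit aggregated stats at most once per local_interval."""

    def __init__(self, local_interval: float) -> None:
        self.local_interval = local_interval
        self.last_log = time.monotonic()

    def maybe_log(self, stats: dict) -> bool:
        now = time.monotonic()
        if now - self.last_log < self.local_interval:
            return False  # too soon; caller keeps accumulating stats
        self.last_log = now
        print(stats)
        return True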
18 changes: 14 additions & 4 deletions tests/quantization/test_bitsandbytes.py
@@ -8,15 +8,20 @@
from tests.quantization.utils import is_quant_method_supported
from vllm import SamplingParams

models_to_test = [
('huggyllama/llama-7b', 'quantize model inflight'),
('lllyasviel/omost-llama-3-8b-4bits', 'read pre-quantized model'),
]


@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
reason='bitsandbytes is not supported on this GPU type.')
-def test_load_bnb_model(vllm_runner) -> None:
-    with vllm_runner('huggyllama/llama-7b',
+@pytest.mark.parametrize("model_name, description", models_to_test)
+def test_load_bnb_model(vllm_runner, model_name, description) -> None:
+    with vllm_runner(model_name,
quantization='bitsandbytes',
load_format='bitsandbytes',
enforce_eager=True) as llm:

model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501

# check the weights in MLP & SelfAttention are quantized to torch.uint8
@@ -65,12 +70,17 @@ def test_load_bnb_model(vllm_runner) -> None:
'To be or not to be, that is the question.'
]
outputs = llm.generate(prompts, sampling_params=sampling_params)

assert len(outputs) == len(prompts)

for index in range(len(outputs)):
# compare the first line of the output
actual_output = outputs[index][1][0].split('\n', 1)[0]
expected_output = expected_outputs[index].split('\n', 1)[0]

assert len(actual_output) >= len(expected_output), (
f'Actual {actual_output} should be larger than or equal to '
f'expected {expected_output}')
actual_output = actual_output[:len(expected_output)]

assert actual_output == expected_output, (
f'Expected: {expected_output}, but got: {actual_output}')
1 change: 1 addition & 0 deletions tests/quantization/test_fp8.py
@@ -13,6 +13,7 @@
MODELS = [
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
"nm-testing/Phi-3-mini-128k-instruct-FP8",
"nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV",
]


6 changes: 3 additions & 3 deletions tests/spec_decode/e2e/test_mlp_correctness.py
@@ -24,14 +24,14 @@
from .conftest import run_greedy_equality_correctness_test

# main model
MAIN_MODEL = "ibm-granite/granite-3b-code-instruct"
MAIN_MODEL = "JackFram/llama-160m"

# speculative model
SPEC_MODEL = "ibm-granite/granite-3b-code-instruct-accelerator"
SPEC_MODEL = "ibm-fms/llama-160m-accelerator"

# max. number of speculative tokens: this corresponds to
# n_predict in the config.json of the speculator model.
-MAX_SPEC_TOKENS = 5
+MAX_SPEC_TOKENS = 3

# precision
PRECISION = "float32"
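For reference, wiring these constants into an engine configuration might look like the following sketch of the offline API (values mirror the constants above; the exact plumbing lives in the test's conftest):

from vllm import LLM

# A sketch only: pairs the main model with its MLP speculator.
llm = LLM(
    model="JackFram/llama-160m",                         # MAIN_MODEL
    speculative_model="ibm-fms/llama-160m-accelerator",  # SPEC_MODEL
    num_speculative_tokens=3,   # MAX_SPEC_TOKENS, i.e. n_predict
    use_v2_block_manager=True,  # required for speculative decoding
    dtype="float32",            # PRECISION
)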
53 changes: 27 additions & 26 deletions tests/test_config.py
@@ -64,9 +64,8 @@ def test_get_sliding_window():


def test_rope_customization():
TEST_ROPE_SCALING = {"type": "dynamic", "factor": 2.0}
TEST_ROPE_SCALING = {"rope_type": "dynamic", "factor": 2.0}
TEST_ROPE_THETA = 16_000_000.0
LONGCHAT_ROPE_SCALING = {"type": "linear", "factor": 8.0}

llama_model_config = ModelConfig(
"meta-llama/Meta-Llama-3-8B-Instruct",
@@ -96,27 +95,29 @@ def test_rope_customization():
None) == TEST_ROPE_THETA
assert llama_model_config.max_model_len == 16384

-    longchat_model_config = ModelConfig(
-        "lmsys/longchat-13b-16k",
-        "lmsys/longchat-13b-16k",
-        tokenizer_mode="auto",
-        trust_remote_code=False,
-        dtype="float16",
-        seed=0,
-    )
-    assert getattr(longchat_model_config.hf_config, "rope_scaling",
-                   None) == LONGCHAT_ROPE_SCALING
-    assert longchat_model_config.max_model_len == 16384
-
-    longchat_model_config = ModelConfig(
-        "lmsys/longchat-13b-16k",
-        "lmsys/longchat-13b-16k",
-        tokenizer_mode="auto",
-        trust_remote_code=False,
-        dtype="float16",
-        seed=0,
-        rope_scaling=TEST_ROPE_SCALING,
-    )
-    assert getattr(longchat_model_config.hf_config, "rope_scaling",
-                   None) == TEST_ROPE_SCALING
-    assert longchat_model_config.max_model_len == 4096
+    # TODO: add these back when the rope configs are fixed
+    # LONGCHAT_ROPE_SCALING = {"rope_type": "linear", "factor": 8.0}
+    # longchat_model_config = ModelConfig(
+    #     "lmsys/longchat-13b-16k",
+    #     "lmsys/longchat-13b-16k",
+    #     tokenizer_mode="auto",
+    #     trust_remote_code=False,
+    #     dtype="float16",
+    #     seed=0,
+    # )
+    # assert getattr(longchat_model_config.hf_config, "rope_scaling",
+    #                None) == LONGCHAT_ROPE_SCALING
+    # assert longchat_model_config.max_model_len == 16384
+
+    # longchat_model_config = ModelConfig(
+    #     "lmsys/longchat-13b-16k",
+    #     "lmsys/longchat-13b-16k",
+    #     tokenizer_mode="auto",
+    #     trust_remote_code=False,
+    #     dtype="float16",
+    #     seed=0,
+    #     rope_scaling=TEST_ROPE_SCALING,
+    # )
+    # assert getattr(longchat_model_config.hf_config, "rope_scaling",
+    #                None) == TEST_ROPE_SCALING
+    # assert longchat_model_config.max_model_len == 4096
2 changes: 2 additions & 0 deletions vllm/config.py
@@ -591,9 +591,11 @@ class LoadConfig:
mainly for profiling.
"tensorizer" will use CoreWeave's tensorizer library for
fast weight loading.
"bitsandbytes" will load nf4 type weights.
ignore_patterns: The list of patterns to ignore when loading the model.
Default to "original/**/*" to avoid repeated loading of llama's
checkpoints.
"""

load_format: Union[str, LoadFormat, "BaseModelLoader"] = LoadFormat.AUTO
4 changes: 2 additions & 2 deletions vllm/engine/arg_utils.py
@@ -676,8 +676,8 @@ def create_engine_config(self, ) -> EngineConfig:
# bitsandbytes quantization needs a specific model loader
# so we make sure the quant method and the load format are consistent
if (self.quantization == "bitsandbytes" or
self.qlora_adapter_name_or_path is not None) and \
self.load_format != "bitsandbytes":
self.qlora_adapter_name_or_path is not None) and \
self.load_format != "bitsandbytes":
raise ValueError(
"BitsAndBytes quantization and QLoRA adapter only support "
f"'bitsandbytes' load format, but got {self.load_format}")