
Merge branch 'main' into fix-spec-decode-seed-test
tdoublep committed Jul 24, 2024
2 parents 4f2c900 + ccc4a73 commit f8dbc0f
Showing 23 changed files with 365 additions and 245 deletions.
2 changes: 1 addition & 1 deletion .buildkite/check-wheel-size.py
@@ -1,7 +1,7 @@
import os
import zipfile

-MAX_SIZE_MB = 200
+MAX_SIZE_MB = 250


def print_top_10_largest_files(zip_file):
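For context, the gate this constant feeds works roughly as follows; a minimal sketch, assuming the script walks a directory of built wheels (the directory argument and exact helper wiring are illustrative, not the script's full contents):

import os
import sys
import zipfile

MAX_SIZE_MB = 250


def print_top_10_largest_files(zip_file):
    """Print the 10 largest members of the wheel to help debug size growth."""
    with zipfile.ZipFile(zip_file, "r") as z:
        entries = [(info.file_size, info.filename) for info in z.infolist()]
    for size, name in sorted(entries, reverse=True)[:10]:
        print(f"{name}: {size / (1024 * 1024):.2f} MB")


def check_wheel_size(directory: str) -> int:
    for root, _, files in os.walk(directory):
        for fname in files:
            if fname.endswith(".whl"):
                wheel_path = os.path.join(root, fname)
                wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
                if wheel_size_mb > MAX_SIZE_MB:
                    print(f"Wheel {wheel_path} is {wheel_size_mb:.2f} MB, "
                          f"exceeding the {MAX_SIZE_MB} MB limit.")
                    print_top_10_largest_files(wheel_path)
                    return 1
    return 0


if __name__ == "__main__":
    sys.exit(check_wheel_size(sys.argv[1]))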
29 changes: 29 additions & 0 deletions docs/source/getting_started/amd-installation.rst
@@ -107,6 +107,35 @@ Alternatively, wheels intended for vLLM use can be accessed under the releases.
$ python setup.py develop # This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation.

.. tip::

    For example, vLLM v0.5.3 on ROCm 6.1 can be built with the following steps:

    .. code-block:: console

        $ pip install --upgrade pip
        $ # Install PyTorch
        $ pip uninstall torch -y
        $ pip install --no-cache-dir --pre torch==2.5.0.dev20240710 --index-url https://download.pytorch.org/whl/nightly/rocm6.1
        $ # Build & install AMD SMI
        $ pip install /opt/rocm/share/amd_smi
        $ # Install dependencies
        $ pip install --upgrade numba scipy huggingface-hub[cli]
        $ pip install "numpy<2"
        $ pip install -r requirements-rocm.txt
        $ # Apply the patch to ROCm 6.1 (requires root permission)
        $ wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P /opt/rocm/lib
        $ rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so*
        $ # Build vLLM for MI210/MI250/MI300.
        $ export PYTORCH_ROCM_ARCH="gfx90a;gfx942"
        $ python3 setup.py develop

.. tip::

    - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm-up step before collecting performance numbers.
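To make the warm-up advice concrete, here is a minimal benchmarking sketch assuming the offline ``LLM`` API (model, prompts, and token budget are illustrative):

.. code-block:: python

    import time

    from vllm import LLM, SamplingParams

    llm = LLM(model="facebook/opt-125m")  # illustrative model
    params = SamplingParams(max_tokens=64)

    # Warm-up: the first call pays one-time costs such as Triton
    # flash-attention autotuning, so exclude it from measurements.
    llm.generate(["Hello, my name is"], params)

    start = time.perf_counter()
    llm.generate(["The capital of France is"], params)
    print(f"generation took {time.perf_counter() - start:.3f}s")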
1 change: 1 addition & 0 deletions docs/source/index.rst
@@ -105,6 +105,7 @@ Documentation

quantization/supported_hardware
quantization/auto_awq
quantization/bnb
quantization/fp8
quantization/fp8_e5m2_kvcache
quantization/fp8_e4m3_kvcache
43 changes: 43 additions & 0 deletions docs/source/quantization/bnb.rst
@@ -0,0 +1,43 @@
.. _bits_and_bytes:

BitsAndBytes
==================

vLLM now supports `BitsAndBytes <https://github.com/TimDettmers/bitsandbytes>`_ for more efficient model inference.
BitsAndBytes quantizes models to reduce memory usage and enhance performance without significantly sacrificing accuracy.
Compared to other quantization methods, BitsAndBytes eliminates the need for calibrating the quantized model with input data.

Below are the steps to utilize BitsAndBytes with vLLM.

.. code-block:: console

    $ pip install "bitsandbytes>=0.42.0"

vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoints.

You can find bitsandbytes quantized models on https://huggingface.co/models?other=bitsandbytes.
Usually, these repositories include a config.json file with a quantization_config section.

Read quantized checkpoint.
--------------------------

.. code-block:: python

    from vllm import LLM
    import torch

    # unsloth/tinyllama-bnb-4bit is a pre-quantized checkpoint.
    model_id = "unsloth/tinyllama-bnb-4bit"
    llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True,
              quantization="bitsandbytes", load_format="bitsandbytes")
In-flight quantization: load as 4-bit quantization
--------------------------------------------------

.. code-block:: python

    from vllm import LLM
    import torch

    model_id = "huggyllama/llama-7b"
    llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True,
              quantization="bitsandbytes", load_format="bitsandbytes")
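In either case, generation then proceeds as usual. A brief sketch, reusing the ``llm`` object from either example above (prompt and token budget are illustrative):

.. code-block:: python

    from vllm import SamplingParams

    outputs = llm.generate(["To be or not to be,"],
                           SamplingParams(max_tokens=32))
    print(outputs[0].outputs[0].text)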
2 changes: 1 addition & 1 deletion requirements-common.txt
@@ -6,7 +6,7 @@ numpy < 2.0.0
requests
tqdm
py-cpuinfo
-transformers >= 4.42.4 # Required for Gemma 2 and for additional chat template parameters.
+transformers >= 4.43.1 # Required for Chameleon and Llama 3.1 hotfix.
tokenizers >= 0.19.1 # Required for Llama 3.
fastapi
aiohttp
24 changes: 24 additions & 0 deletions tests/distributed/test_pipeline_parallel.py
@@ -61,3 +61,27 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME,
tp_args.append("--enforce-eager")

compare_two_settings(MODEL_NAME, pp_args, tp_args)


@pytest.mark.parametrize("PP_SIZE, MODEL_NAME", [
(2, "JackFram/llama-160m"),
])
@pytest.mark.parametrize("ATTN_BACKEND", [
"FLASH_ATTN",
"FLASHINFER",
])
def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND):
cudagraph_args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
"float16",
"--pipeline-parallel-size",
str(PP_SIZE),
"--distributed-executor-backend",
"ray",
]
os.environ["VLLM_ATTENTION_BACKEND"] = ATTN_BACKEND

eager_args = cudagraph_args + ["--enforce-eager"]

compare_two_settings(MODEL_NAME, eager_args, cudagraph_args)
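The backend override in this test works because vLLM reads VLLM_ATTENTION_BACKEND when the engine is constructed. A minimal sketch of the same pattern outside the test harness (model and prompt are illustrative):

import os

# Must be set before the engine is constructed; vLLM consults this
# environment variable when selecting the attention backend.
os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"

from vllm import LLM

llm = LLM(model="JackFram/llama-160m")
print(llm.generate(["Hello"])[0].outputs[0].text)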
91 changes: 91 additions & 0 deletions tests/metrics/test_metrics.py
@@ -1,3 +1,4 @@
import time
from typing import List

import pytest
@@ -10,6 +11,8 @@
from vllm.engine.metrics import RayPrometheusStatLogger
from vllm.sampling_params import SamplingParams

from ..conftest import cleanup

MODELS = [
"facebook/opt-125m",
]
@@ -219,6 +222,94 @@ def test_metric_spec_decode(
"does not meet expectation")


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [10])
@pytest.mark.parametrize("log_interval", [1, 3, 5, 7])
def test_metric_spec_decode_interval(
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
log_interval: int,
) -> None:
k = 5

engine_args = EngineArgs(model=model,
dtype=dtype,
disable_log_stats=False,
gpu_memory_utilization=0.4,
speculative_model=model,
num_speculative_tokens=k,
use_v2_block_manager=True,
enforce_eager=True)

engine = LLMEngine.from_engine_args(engine_args)

try:

engine.add_request(
"request-id-0",
example_prompts[0],
SamplingParams(max_tokens=max_tokens),
)

# set log interval
stat_logger = engine.stat_loggers['prometheus']
stat_logger.local_interval = log_interval

# prefill
engine.step()

# wait for 5 seconds to ensure that spec decode metrics
# get triggered in first decode step
time.sleep(5)

# first decode step should trigger async collection of metrics
engine.step()

# wait one second to allow H2D transfer to finish
time.sleep(1)

# second decode step should now be able to collect the spec
# decode stats and the request should also be finished
engine.step()

# must have finished now
assert not engine.has_unfinished_requests()

# wait to ensure logging occurs
time.sleep(log_interval)

# force logging
engine.step()

# Note that the purpose of this test is to verify spec decode
# metrics instead of functional correctness, so the expected values
# are intended to be loose.
metric_name_to_expected_fn = {
"gauge_spec_decode_draft_acceptance_rate": lambda v: 0 <= v <= 1,
"gauge_spec_decode_efficiency": lambda v: 0 <= v <= 1,
"counter_spec_decode_num_accepted_tokens": lambda v: 0 <= v <= k,
"counter_spec_decode_num_draft_tokens": lambda v: v == k,
"counter_spec_decode_num_emitted_tokens":
lambda v: 0 <= v <= k + 1,
}

for metric_name, is_expected in metric_name_to_expected_fn.items():
metric_val = getattr(
stat_logger.metrics,
metric_name).labels(**stat_logger.labels)._value.get()
assert is_expected(metric_val), (
f"the value of metric {metric_name} ({metric_val}) "
"does not meet expectation")

finally:
del engine
cleanup()


def assert_metrics(engine: LLMEngine, disable_log_stats: bool,
num_requests: int) -> None:
if disable_log_stats:
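The interval behavior exercised above is, at heart, time-based gating inside the stat logger. A rough sketch of that pattern (not the actual StatLogger implementation):

import time


class IntervalLogger:
    """Rough sketch: emit aggregated stats at most once per local_interval."""

    def __init__(self, local_interval: float) -> None:
        self.local_interval = local_interval
        self.last_log = time.monotonic()

    def maybe_log(self, stats: dict) -> bool:
        now = time.monotonic()
        if now - self.last_log < self.local_interval:
            return False  # too soon; caller keeps accumulating stats
        self.last_log = now
        print(stats)
        return True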
18 changes: 14 additions & 4 deletions tests/quantization/test_bitsandbytes.py
@@ -8,15 +8,20 @@
from tests.quantization.utils import is_quant_method_supported
from vllm import SamplingParams

models_to_test = [
('huggyllama/llama-7b', 'quantize model inflight'),
('lllyasviel/omost-llama-3-8b-4bits', 'read pre-quantized model'),
]


@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
reason='bitsandbytes is not supported on this GPU type.')
-def test_load_bnb_model(vllm_runner) -> None:
-    with vllm_runner('huggyllama/llama-7b',
+@pytest.mark.parametrize("model_name, description", models_to_test)
+def test_load_bnb_model(vllm_runner, model_name, description) -> None:
+    with vllm_runner(model_name,
quantization='bitsandbytes',
load_format='bitsandbytes',
enforce_eager=True) as llm:

model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501

# check the weights in MLP & SelfAttention are quantized to torch.uint8
@@ -65,12 +70,17 @@ def test_load_bnb_model(vllm_runner) -> None:
'To be or not to be, that is the question.'
]
outputs = llm.generate(prompts, sampling_params=sampling_params)

assert len(outputs) == len(prompts)

for index in range(len(outputs)):
# compare the first line of the output
actual_output = outputs[index][1][0].split('\n', 1)[0]
expected_output = expected_outputs[index].split('\n', 1)[0]

assert len(actual_output) >= len(expected_output), (
f'Actual {actual_output} should be larger than or equal to '
f'expected {expected_output}')
actual_output = actual_output[:len(expected_output)]

assert actual_output == expected_output, (
f'Expected: {expected_output}, but got: {actual_output}')
1 change: 1 addition & 0 deletions tests/quantization/test_fp8.py
@@ -13,6 +13,7 @@
MODELS = [
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
"nm-testing/Phi-3-mini-128k-instruct-FP8",
"nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV",
]


6 changes: 3 additions & 3 deletions tests/spec_decode/e2e/test_mlp_correctness.py
@@ -24,14 +24,14 @@
from .conftest import run_greedy_equality_correctness_test

# main model
MAIN_MODEL = "ibm-granite/granite-3b-code-instruct"
MAIN_MODEL = "JackFram/llama-160m"

# speculative model
SPEC_MODEL = "ibm-granite/granite-3b-code-instruct-accelerator"
SPEC_MODEL = "ibm-fms/llama-160m-accelerator"

# max. number of speculative tokens: this corresponds to
# n_predict in the config.json of the speculator model.
-MAX_SPEC_TOKENS = 5
+MAX_SPEC_TOKENS = 3

# precision
PRECISION = "float32"
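For reference, wiring these constants into an engine configuration might look like the following sketch of the offline API (values mirror the constants above; the exact plumbing lives in the test's conftest):

from vllm import LLM

# A sketch only: pairs the main model with its MLP speculator.
llm = LLM(
    model="JackFram/llama-160m",                         # MAIN_MODEL
    speculative_model="ibm-fms/llama-160m-accelerator",  # SPEC_MODEL
    num_speculative_tokens=3,   # MAX_SPEC_TOKENS, i.e. n_predict
    use_v2_block_manager=True,  # required for speculative decoding
    dtype="float32",            # PRECISION
)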
53 changes: 27 additions & 26 deletions tests/test_config.py
@@ -64,9 +64,8 @@ def test_get_sliding_window():


def test_rope_customization():
TEST_ROPE_SCALING = {"type": "dynamic", "factor": 2.0}
TEST_ROPE_SCALING = {"rope_type": "dynamic", "factor": 2.0}
TEST_ROPE_THETA = 16_000_000.0
LONGCHAT_ROPE_SCALING = {"type": "linear", "factor": 8.0}

llama_model_config = ModelConfig(
"meta-llama/Meta-Llama-3-8B-Instruct",
@@ -96,27 +95,29 @@ def test_rope_customization():
None) == TEST_ROPE_THETA
assert llama_model_config.max_model_len == 16384

-    longchat_model_config = ModelConfig(
-        "lmsys/longchat-13b-16k",
-        "lmsys/longchat-13b-16k",
-        tokenizer_mode="auto",
-        trust_remote_code=False,
-        dtype="float16",
-        seed=0,
-    )
-    assert getattr(longchat_model_config.hf_config, "rope_scaling",
-                   None) == LONGCHAT_ROPE_SCALING
-    assert longchat_model_config.max_model_len == 16384
-
-    longchat_model_config = ModelConfig(
-        "lmsys/longchat-13b-16k",
-        "lmsys/longchat-13b-16k",
-        tokenizer_mode="auto",
-        trust_remote_code=False,
-        dtype="float16",
-        seed=0,
-        rope_scaling=TEST_ROPE_SCALING,
-    )
-    assert getattr(longchat_model_config.hf_config, "rope_scaling",
-                   None) == TEST_ROPE_SCALING
-    assert longchat_model_config.max_model_len == 4096
+    # TODO: add these back when the rope configs are fixed
+    # LONGCHAT_ROPE_SCALING = {"rope_type": "linear", "factor": 8.0}
+    # longchat_model_config = ModelConfig(
+    #     "lmsys/longchat-13b-16k",
+    #     "lmsys/longchat-13b-16k",
+    #     tokenizer_mode="auto",
+    #     trust_remote_code=False,
+    #     dtype="float16",
+    #     seed=0,
+    # )
+    # assert getattr(longchat_model_config.hf_config, "rope_scaling",
+    #                None) == LONGCHAT_ROPE_SCALING
+    # assert longchat_model_config.max_model_len == 16384
+
+    # longchat_model_config = ModelConfig(
+    #     "lmsys/longchat-13b-16k",
+    #     "lmsys/longchat-13b-16k",
+    #     tokenizer_mode="auto",
+    #     trust_remote_code=False,
+    #     dtype="float16",
+    #     seed=0,
+    #     rope_scaling=TEST_ROPE_SCALING,
+    # )
+    # assert getattr(longchat_model_config.hf_config, "rope_scaling",
+    #                None) == TEST_ROPE_SCALING
+    # assert longchat_model_config.max_model_len == 4096
2 changes: 2 additions & 0 deletions vllm/config.py
@@ -591,9 +591,11 @@ class LoadConfig:
mainly for profiling.
"tensorizer" will use CoreWeave's tensorizer library for
fast weight loading.
"bitsandbytes" will load nf4 type weights.
ignore_patterns: The list of patterns to ignore when loading the model.
Default to "original/**/*" to avoid repeated loading of llama's
checkpoints.
"""

load_format: Union[str, LoadFormat, "BaseModelLoader"] = LoadFormat.AUTO
4 changes: 2 additions & 2 deletions vllm/engine/arg_utils.py
@@ -676,8 +676,8 @@ def create_engine_config(self, ) -> EngineConfig:
# bitsandbytes quantization needs a specific model loader
# so we make sure the quant method and the load format are consistent
if (self.quantization == "bitsandbytes" or
self.qlora_adapter_name_or_path is not None) and \
self.load_format != "bitsandbytes":
self.qlora_adapter_name_or_path is not None) and \
self.load_format != "bitsandbytes":
raise ValueError(
"BitsAndBytes quantization and QLoRA adapter only support "
f"'bitsandbytes' load format, but got {self.load_format}")