Merge branch 'vllm-project:main' into main
sfc-gh-mkeralapura authored Aug 15, 2024
2 parents 42a7229 + f4da5f7 commit bec7bc6
Showing 21 changed files with 230 additions and 31 deletions.
2 changes: 1 addition & 1 deletion .buildkite/run-cpu-test.sh
@@ -22,7 +22,7 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"

# Run basic model test
docker exec cpu-test bash -c "
pip install pytest Pillow protobuf
pip install pytest
pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_oot_registration.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B are not supported on CPU

# online inference
2 changes: 1 addition & 1 deletion Dockerfile.cpu
@@ -13,7 +13,7 @@ RUN --mount=type=cache,target=/var/cache/apt \
RUN --mount=type=cache,target=/root/.cache/pip \
pip install intel-openmp

ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so:$LD_PRELOAD"
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so"

RUN echo 'ulimit -c 0' >> ~/.bashrc

1 change: 1 addition & 0 deletions docs/source/serving/integrations.rst
@@ -12,3 +12,4 @@ Integrations
deploying_with_lws
deploying_with_dstack
serving_with_langchain
serving_with_llamaindex
27 changes: 27 additions & 0 deletions docs/source/serving/serving_with_llamaindex.rst
@@ -0,0 +1,27 @@
.. _run_on_llamaindex:

Serving with llama_index
============================

vLLM is also available via `llama_index <https://github.com/run-llama/llama_index>`_.

To install llama_index, run

.. code-block:: console

    $ pip install llama-index-llms-vllm -q

To run inference on a single or multiple GPUs, use the ``Vllm`` class from ``llama_index``.

.. code-block:: python

    from llama_index.llms.vllm import Vllm

    llm = Vllm(
        model="microsoft/Orca-2-7b",
        tensor_parallel_size=4,
        max_new_tokens=100,
        vllm_kwargs={"swap_space": 1, "gpu_memory_utilization": 0.5},
    )

Please refer to this `Tutorial <https://docs.llamaindex.ai/en/latest/examples/llm/vllm/>`_ for more details.
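
Once constructed, the ``Vllm`` object can be used like any other llama_index LLM, so a completion call works directly. A minimal sketch (the prompt is illustrative, and the construction assumes the same model as above):

from llama_index.llms.vllm import Vllm

# Illustrative prompt and settings; mirrors the snippet above.
llm = Vllm(model="microsoft/Orca-2-7b", max_new_tokens=100)
response = llm.complete("What is the capital of France?")
print(response.text)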
2 changes: 1 addition & 1 deletion requirements-common.txt
@@ -6,6 +6,7 @@ tqdm
py-cpuinfo
transformers >= 4.43.2 # Required for Chameleon and Llama 3.1 hotfix.
tokenizers >= 0.19.1 # Required for Llama 3.
protobuf # Required by LlamaTokenizer.
fastapi
aiohttp
openai
@@ -24,4 +25,3 @@ librosa # Required for audio processing
soundfile # Required for audio processing
gguf == 0.9.1
importlib_metadata
compressed-tensors == 0.5.0
2 changes: 1 addition & 1 deletion requirements-test.txt
@@ -17,7 +17,7 @@ peft
requests
ray
sentence-transformers # required for embedding
compressed-tensors==0.5.0 # required for compressed-tensors
compressed-tensors==0.4.0 # required for compressed-tensors
timm # required for internvl test

# TODO: Add this after fully implementing llava(mantis)
3 changes: 2 additions & 1 deletion tests/quantization/test_compressed_tensors.py
@@ -5,12 +5,13 @@

import pytest
import torch
from compressed_tensors.quantization import QuantizationType

from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501
CompressedTensorsLinearMethod, CompressedTensorsW4A16Sparse24,
CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8,
CompressedTensorsW8A16Fp8, CompressedTensorsWNA16)
from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
QuantizationType)


@pytest.mark.parametrize("model_args", [
14 changes: 13 additions & 1 deletion vllm/config.py
@@ -847,7 +847,8 @@ def __init__(self,
delay_factor: float = 0.0,
enable_chunked_prefill: bool = False,
embedding_mode: Optional[bool] = False,
preemption_mode: Optional[str] = None) -> None:
preemption_mode: Optional[str] = None,
num_scheduler_steps: int = 1) -> None:
if max_num_batched_tokens is not None:
self.max_num_batched_tokens = max_num_batched_tokens
else:
@@ -876,6 +877,7 @@ def __init__(self,
self.chunked_prefill_enabled = enable_chunked_prefill
self.embedding_mode = embedding_mode
self.preemption_mode = preemption_mode
self.num_scheduler_steps = num_scheduler_steps
self._verify_args()

def _verify_args(self) -> None:
@@ -901,6 +903,16 @@ def _verify_args(self) -> None:
f"({self.num_lookahead_slots}) must be greater than or "
"equal to 0.")

if self.num_scheduler_steps < 1:
raise ValueError(
"num_scheduler_steps "
f"({self.num_scheduler_steps}) must be greater than or "
"equal to 1.")

@property
def is_multi_step(self) -> bool:
return self.num_scheduler_steps > 1


class DeviceConfig:
device: Optional[torch.device]
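
Taken together, the new field and property amount to the following behaviour. This is a simplified standalone sketch that mirrors the diff above, not the real vllm.config.SchedulerConfig class:

# Simplified sketch of the SchedulerConfig changes; not the actual vLLM class.
class SchedulerConfigSketch:
    def __init__(self, num_scheduler_steps: int = 1) -> None:
        if num_scheduler_steps < 1:
            raise ValueError(
                f"num_scheduler_steps ({num_scheduler_steps}) must be "
                "greater than or equal to 1.")
        self.num_scheduler_steps = num_scheduler_steps

    @property
    def is_multi_step(self) -> bool:
        return self.num_scheduler_steps > 1

assert not SchedulerConfigSketch().is_multi_step
assert SchedulerConfigSketch(num_scheduler_steps=8).is_multi_step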
5 changes: 5 additions & 0 deletions vllm/core/scheduler.py
@@ -805,6 +805,9 @@ def _schedule_prefills(
curr_loras.add(lora_int_id)
waiting_queue.popleft()
self._allocate_and_set_running(seq_group)
seq_group.init_multi_step(
num_scheduler_steps=self._get_num_lookahead_slots(
is_prefill=True) + 1)
seq_groups.append(
ScheduledSequenceGroup(seq_group=seq_group,
token_chunk_size=num_new_tokens))
@@ -1108,6 +1111,7 @@ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]:
computed_block_nums=common_computed_block_nums,
encoder_seq_data=encoder_seq_data,
cross_block_table=cross_block_table,
state=seq_group.state,
# `multi_modal_data` will only be present for the 1st comm
# between engine and worker.
# the subsequent comms can still use delta, but
@@ -1184,6 +1188,7 @@ def _append_slots(
slots.
"""
num_lookahead_slots = self._get_num_lookahead_slots(is_prefill=False)
seq_group.init_multi_step(num_scheduler_steps=num_lookahead_slots + 1)

for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
cows = self.block_manager.append_slots(seq, num_lookahead_slots)
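
In both call sites the conversion is the same: a sequence group given N lookahead slots is set up to run N + 1 scheduler steps (the first step plus one per lookahead slot). A small illustration with an assumed value:

# Assumed example value; in the code above it comes from
# _get_num_lookahead_slots(is_prefill=...).
num_lookahead_slots = 3
num_scheduler_steps = num_lookahead_slots + 1  # value passed to init_multi_step
assert num_scheduler_steps == 4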
30 changes: 26 additions & 4 deletions vllm/engine/arg_utils.py
@@ -115,6 +115,7 @@ class EngineArgs:
lora_dtype: str = 'auto'
max_cpu_loras: Optional[int] = None
device: str = 'auto'
num_scheduler_steps: int = 1
ray_workers_use_nsight: bool = False
num_gpu_blocks_override: Optional[int] = None
num_lookahead_slots: int = 0
@@ -315,7 +316,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
parser.add_argument('--block-size',
type=int,
default=EngineArgs.block_size,
choices=[8, 16, 32],
choices=[8, 16, 32, 128, 256, 512, 1024, 2048],
help='Token block size for contiguous chunks of '
'tokens.')

@@ -543,6 +544,11 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
"tpu", "xpu"
],
help='Device type for vLLM execution.')
parser.add_argument('--num-scheduler-steps',
type=int,
default=1,
help=('Maximum number of forward steps per '
'scheduler call.'))

parser.add_argument(
'--scheduler-delay-factor',
@@ -858,18 +864,34 @@ def create_engine_config(self, ) -> EngineConfig:
disable_logprobs=self.disable_logprobs_during_spec_decoding,
)

if self.num_scheduler_steps > 1:
raise NotImplementedError("Multi-step is not yet supported.")
if speculative_config is not None:
raise ValueError("Speculative decoding is not supported with "
"multi-step (--num-scheduler-steps > 1)")
if self.enable_chunked_prefill:
raise ValueError("Chunked prefill is not supported with "
"multi-step (--num-scheduler-steps > 1)")

# make sure num_lookahead_slots is set the higher value depending on
# if we are using speculative decoding or multi-step
num_lookahead_slots = max(self.num_lookahead_slots,
self.num_scheduler_steps - 1)
num_lookahead_slots = num_lookahead_slots \
if speculative_config is None \
else speculative_config.num_lookahead_slots

scheduler_config = SchedulerConfig(
max_num_batched_tokens=self.max_num_batched_tokens,
max_num_seqs=self.max_num_seqs,
max_model_len=model_config.max_model_len,
use_v2_block_manager=self.use_v2_block_manager,
num_lookahead_slots=(self.num_lookahead_slots
if speculative_config is None else
speculative_config.num_lookahead_slots),
num_lookahead_slots=num_lookahead_slots,
delay_factor=self.scheduler_delay_factor,
enable_chunked_prefill=self.enable_chunked_prefill,
embedding_mode=model_config.embedding_mode,
preemption_mode=self.preemption_mode,
num_scheduler_steps=self.num_scheduler_steps,
)
lora_config = LoRAConfig(
max_lora_rank=self.max_lora_rank,
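
The interaction between the two knobs in create_engine_config is worth spelling out: multi-step reserves num_scheduler_steps - 1 lookahead slots unless speculative decoding overrides them (and in this commit values above 1 still raise NotImplementedError, so the sketch only illustrates the intended arithmetic). Assumed values for illustration:

# Assumed values; defaults match the diff above.
num_lookahead_slots = 0          # EngineArgs.num_lookahead_slots default
num_scheduler_steps = 8          # e.g. --num-scheduler-steps 8
speculative_config = None

num_lookahead_slots = max(num_lookahead_slots, num_scheduler_steps - 1)
if speculative_config is not None:
    num_lookahead_slots = speculative_config.num_lookahead_slots

assert num_lookahead_slots == 7  # multi-step reserves steps - 1 slots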
2 changes: 1 addition & 1 deletion vllm/envs.py
@@ -334,7 +334,7 @@ def get_default_config_root():
"VLLM_XLA_CACHE_PATH":
lambda: os.path.expanduser(
os.getenv(
"VLLM_ASSETS_CACHE",
"VLLM_XLA_CACHE_PATH",
os.path.join(get_default_cache_root(), "vllm", "xla_cache"),
)),
"VLLM_FUSED_MOE_CHUNK_SIZE":
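
The fix makes the VLLM_XLA_CACHE_PATH entry read its own environment variable instead of VLLM_ASSETS_CACHE. A minimal standalone sketch of the corrected lookup (the default root is simplified here; the real value comes from get_default_cache_root()):

import os

def xla_cache_path() -> str:
    # Simplified default; vllm.envs derives this from get_default_cache_root().
    default_root = os.path.join(os.path.expanduser("~"), ".cache")
    return os.path.expanduser(
        os.getenv("VLLM_XLA_CACHE_PATH",
                  os.path.join(default_root, "vllm", "xla_cache")))

print(xla_cache_path())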
5 changes: 2 additions & 3 deletions vllm/executor/neuron_executor.py
@@ -100,9 +100,8 @@ async def execute_model_async(
self,
execute_model_req: ExecuteModelRequest,
) -> List[SamplerOutput]:
output = await make_async(
self.driver_worker.execute_model
)(seq_group_metadata_list=execute_model_req.seq_group_metadata_list, )
output = await make_async(self.driver_worker.execute_model
)(execute_model_req=execute_model_req, )
return output

async def check_health_async(self) -> None:
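
make_async (from vllm.utils) offloads the blocking driver call to an executor and forwards keyword arguments unchanged, which is why the keyword must match the worker's execute_model signature (execute_model_req rather than seq_group_metadata_list). A simplified sketch of the wrapper pattern, assuming asyncio's default thread pool:

import asyncio
import functools
from typing import Awaitable, Callable, TypeVar

T = TypeVar("T")

def make_async(func: Callable[..., T]) -> Callable[..., Awaitable[T]]:
    """Simplified sketch: run a blocking call in the default executor."""
    def _async_wrapper(*args, **kwargs) -> Awaitable[T]:
        loop = asyncio.get_event_loop()
        p_func = functools.partial(func, *args, **kwargs)
        return loop.run_in_executor(None, p_func)
    return _async_wrapper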
@@ -1,10 +1,6 @@
from typing import Any, Dict, List, Optional

import torch
from compressed_tensors.config import CompressionFormat
from compressed_tensors.quantization import (QuantizationArgs,
QuantizationStrategy,
QuantizationType)
from pydantic import BaseModel

from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
@@ -17,7 +13,8 @@
CompressedTensorsW8A8Int8, CompressedTensorsW8A16Fp8,
CompressedTensorsWNA16)
from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
find_matched_target, is_activation_quantization_format,
CompressionFormat, QuantizationArgs, QuantizationStrategy,
QuantizationType, find_matched_target, is_activation_quantization_format,
should_ignore_layer)
from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
from vllm.platforms import current_platform
@@ -1,10 +1,11 @@
from typing import Callable, List, Optional

import torch
from compressed_tensors.quantization import QuantizationStrategy

from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
CompressedTensorsScheme)
from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
QuantizationStrategy)
from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin)
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
@@ -1,11 +1,12 @@
from typing import Callable, List, Optional

import torch
from compressed_tensors.quantization import QuantizationStrategy
from torch.nn import Parameter

from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
CompressedTensorsScheme)
from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
QuantizationStrategy)
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
apply_fp8_linear, cutlass_fp8_supported, requantize_with_max_scale)
from vllm.model_executor.parameter import (ChannelQuantScaleParameter,
@@ -1,11 +1,12 @@
from typing import Callable, List, Optional

import torch
from compressed_tensors.quantization import QuantizationStrategy
from torch.nn import Parameter

from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
CompressedTensorsScheme)
from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
QuantizationStrategy)
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
apply_int8_linear, convert_to_channelwise)
from vllm.model_executor.parameter import (BasevLLMParameter,
@@ -1,13 +1,85 @@
import re
from typing import Iterable, Optional
from enum import Enum
from typing import Any, Dict, Iterable, Optional

from compressed_tensors import CompressionFormat
from pydantic import BaseModel, Field
from torch.nn import Module

from vllm.model_executor.layers.quantization.utils.quant_utils import (
FUSED_LAYER_NAME_MAPPING)


class CompressionFormat(Enum):
dense = "dense"
sparse_bitmask = "sparse-bitmask"
naive_quantized = "naive-quantized"
float_quantized = "float-quantized"
int_quantized = "int-quantized"
pack_quantized = "pack-quantized"
marlin_24 = "marlin-24"


class QuantizationType(str, Enum):
"""
Enum storing quantization type options
"""

INT = "int"
FLOAT = "float"


class QuantizationStrategy(str, Enum):
"""
Enum storing quantization strategy options
"""

TENSOR = "tensor"
CHANNEL = "channel"
GROUP = "group"
BLOCK = "block"
TOKEN = "token"


class QuantizationArgs(BaseModel):
"""
User facing arguments used to define a quantization config
for weights or activations
:param num_bits: quantization bit depth
:param type: dtype to quantize to, either int or float
:param symmetric: whether or not quantization scale is symmetric
:param strategy: string determining the scope of scale/zero-point to apply
:param group_size: group length to use for the group strategy
:param block_structure: 2d block structure to use for the block
strategy, must be of the format "2x4", "8x16", etc.
:param dynamic: set True to perform dynamic quantization -
values will not be calibrated during calibration phase,
instead during inference new quantization ranges will be
observed with every sample. Defaults to False for static
quantization. Note that enabling dynamic quantization
will change the default observer to a memoryless one
"""

num_bits: int = 8
type: QuantizationType = QuantizationType.INT
symmetric: bool = True
group_size: Optional[int] = None
strategy: Optional[QuantizationStrategy] = None
block_structure: Optional[str] = None
dynamic: bool = False
observer: str = Field(
default="minmax",
description=("The class to use to compute the quantization param - "
"scale and zero-point'"),
)
observer_kwargs: Dict[str, Any] = Field(
default_factory=dict,
description=
("optional dict of kwargs to be passed directly to torch quantization "
"Observers constructor excluding quantization range or symmetry"),
)


def is_activation_quantization_format(format: str) -> bool:
_ACTIVATION_QUANTIZATION_FORMATS = [
CompressionFormat.naive_quantized.value,
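
Since QuantizationArgs, QuantizationStrategy, and QuantizationType now live in vLLM's own utils module, the schemes can construct them without the external compressed-tensors package. A hypothetical construction (the values are illustrative only):

from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
    QuantizationArgs, QuantizationStrategy, QuantizationType)

# Illustrative: an 8-bit, symmetric, per-channel integer scheme.
weight_args = QuantizationArgs(
    num_bits=8,
    type=QuantizationType.INT,
    symmetric=True,
    strategy=QuantizationStrategy.CHANNEL,
)
assert weight_args.strategy == QuantizationStrategy.CHANNEL
assert not weight_args.dynamic  # static quantization by default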