Merging new commits #3

Merged
14 changes: 14 additions & 0 deletions .buildkite/run-openvino-test.sh
@@ -0,0 +1,14 @@
# This script builds the OpenVINO docker image and runs offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -ex

# Try building the docker image
docker build -t openvino-test -f Dockerfile.openvino .

# Setup cleanup
remove_docker_container() { docker rm -f openvino-test || true; }
trap remove_docker_container EXIT
remove_docker_container

# Run the image and launch offline inference
docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/vllm/examples/offline_inference.py
26 changes: 26 additions & 0 deletions Dockerfile.openvino
@@ -0,0 +1,26 @@
# This Dockerfile is used to construct a vLLM image that can be directly used
# to run the OpenAI-compatible server.

FROM ubuntu:22.04 AS dev

RUN apt-get update -y && \
apt-get install -y python3-pip git
WORKDIR /workspace

# copy requirements
COPY requirements-build.txt /workspace/vllm/
COPY requirements-common.txt /workspace/vllm/
COPY requirements-openvino.txt /workspace/vllm/

COPY vllm/ /workspace/vllm/vllm
COPY setup.py /workspace/vllm/

# install build requirements
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt
# build vLLM with OpenVINO backend
RUN PIP_PRE=1 PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/nightly/" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/

COPY examples/ /workspace/vllm/examples
COPY benchmarks/ /workspace/vllm/benchmarks

CMD ["/bin/bash"]
7 changes: 4 additions & 3 deletions benchmarks/benchmark_latency.py
@@ -207,9 +207,10 @@ def run_to_completion(profile_dir: Optional[str] = None):
parser.add_argument(
"--device",
type=str,
default="cuda",
choices=["cuda", "cpu", "tpu", "xpu"],
help='device type for vLLM execution, supporting CUDA and CPU.')
default="auto",
choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
help='device type for vLLM execution, supporting CUDA, OpenVINO and '
'CPU.')
parser.add_argument('--block-size',
type=int,
default=16,
7 changes: 4 additions & 3 deletions benchmarks/benchmark_throughput.py
@@ -349,9 +349,10 @@ def main(args: argparse.Namespace):
parser.add_argument(
"--device",
type=str,
default="cuda",
choices=["cuda", "cpu", "tpu", "xpu"],
help='device type for vLLM execution, supporting CUDA and CPU.')
default="auto",
choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"],
help='device type for vLLM execution, supporting CUDA, OpenVINO and '
'CPU.')
parser.add_argument(
"--enable-prefix-caching",
action='store_true',
20 changes: 20 additions & 0 deletions docs/source/dev/input_processing/input_processing_pipeline.rst
@@ -0,0 +1,20 @@
.. _input_processing_pipeline:

Input Processing Pipeline
=========================

1. Input data is passed to :class:`~vllm.LLMEngine` (or :class:`~vllm.AsyncLLMEngine`).

2. Tokenize the data if necessary.

3. Process the inputs using :meth:`INPUT_REGISTRY.process_input <vllm.inputs.registry.InputRegistry.process_input>`.

- For example, add placeholder tokens to reserve KV cache for multi-modal embeddings.

4. Send the processed inputs to :class:`~vllm.executor.executor_base.ExecutorBase`.

5. Distribute the inputs via :class:`~vllm.worker.worker_base.WorkerBase` to :class:`~vllm.worker.model_runner_base.ModelRunnerBase`.

6. If the inputs contain multi-modal data, convert it into keyword arguments using :meth:`MULTIMODAL_REGISTRY.map_input <vllm.multimodal.MultiModalRegistry.map_input>`.

- For example, convert a :class:`PIL.Image.Image` input to its pixel values for a vision language model.
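
As a rough illustration of step 3, the sketch below registers a hypothetical input processor that prepends placeholder tokens. It is a minimal sketch: the processor signature ``(ctx, llm_inputs) -> llm_inputs`` and the decorator-style registration are assumptions based on the registry cited above, and the token id and count are illustrative rather than real model values.

.. code-block:: python

    from torch import nn

    from vllm.inputs import INPUT_REGISTRY

    def add_image_placeholders(ctx, llm_inputs):
        # Prepend placeholder token ids that reserve KV cache space for
        # multi-modal embeddings (step 3 above); values are illustrative.
        placeholder_ids = [32000] * 576
        llm_inputs["prompt_token_ids"] = (placeholder_ids +
                                          llm_inputs["prompt_token_ids"])
        return llm_inputs

    # Hypothetical registration: applies the processor to every input
    # destined for this model class.
    @INPUT_REGISTRY.register_input_processor(add_image_placeholders)
    class MyVisionLanguageModel(nn.Module):
        ...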
39 changes: 39 additions & 0 deletions docs/source/dev/input_processing/model_inputs_index.rst
@@ -0,0 +1,39 @@
.. _input_processing:

Input Processing
================

.. currentmodule:: vllm.inputs

vLLM provides a mechanism for defining input processors for each model so that the inputs are processed
in :class:`~vllm.LLMEngine` before they are passed to model executors.

Currently, this mechanism is only utilized in **multi-modal models** for preprocessing multi-modal input
data in addition to the input prompt, but it can be extended to text-only language models when needed.

Guides
++++++

.. toctree::
:maxdepth: 1

input_processing_pipeline

Module Contents
+++++++++++++++

LLM Engine Inputs
-----------------

.. autoclass:: vllm.inputs.LLMInputs
:members:
:show-inheritance:

Registry
--------

.. autodata:: vllm.inputs.INPUT_REGISTRY

.. automodule:: vllm.inputs.registry
:members:
:show-inheritance:
8 changes: 1 addition & 7 deletions docs/source/dev/multimodal/multimodal_index.rst
@@ -12,10 +12,6 @@ By default, vLLM models do not support multi-modal inputs. To enable multi-modal
you must decorate the model class with :meth:`MULTIMODAL_REGISTRY.register_dummy_data <MultiModalRegistry.register_dummy_data>`,
as well as :meth:`MULTIMODAL_REGISTRY.register_input <MultiModalRegistry.register_input>` for each modality type to support.
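
A minimal sketch of this opt-in, using the two registry methods cited above; the dummy-data factory's signature and return values, and the ``register_input`` arguments, are assumptions rather than the documented API:

.. code-block:: python

    import torch
    from torch import nn

    from vllm.multimodal import MULTIMODAL_REGISTRY

    def dummy_image_data(seq_len: int):
        # Hypothetical factory: placeholder token ids plus a dummy image
        # tensor, used only to profile peak memory usage.
        return [0] * seq_len, torch.zeros(1, 3, 336, 336)

    @MULTIMODAL_REGISTRY.register_dummy_data(dummy_image_data)
    @MULTIMODAL_REGISTRY.register_input()  # repeat once per modality type
    class MyVLModel(nn.Module):
        ...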

.. contents::
:local:
:backlinks: none

Module Contents
+++++++++++++++

@@ -24,9 +20,7 @@ Module Contents
Registry
--------

.. data:: vllm.multimodal.MULTIMODAL_REGISTRY

The global :class:`MultiModalRegistry` which is used by model runners.
.. autodata:: vllm.multimodal.MULTIMODAL_REGISTRY

.. autoclass:: vllm.multimodal.MultiModalRegistry
:members:
95 changes: 95 additions & 0 deletions docs/source/getting_started/openvino-installation.rst
@@ -0,0 +1,95 @@
.. _installation_openvino:

Installation with OpenVINO
==========================

vLLM powered by OpenVINO supports all LLM models from the :doc:`vLLM supported models list <../models/supported_models>` and can perform optimal model serving on all x86-64 CPUs with at least AVX2 support. The OpenVINO vLLM backend supports the following advanced vLLM features:

- Prefix caching (``--enable-prefix-caching``)
- Chunked prefill (``--enable-chunked-prefill``)

**Table of contents**:

- :ref:`Requirements <openvino_backend_requirements>`
- :ref:`Quick start using Dockerfile <openvino_backend_quick_start_dockerfile>`
- :ref:`Build from source <install_openvino_backend_from_source>`
- :ref:`Performance tips <openvino_backend_performance_tips>`
- :ref:`Limitations <openvino_backend_limitations>`

.. _openvino_backend_requirements:

Requirements
------------

* OS: Linux
* Instruction set architecture (ISA) requirement: at least AVX2.

.. _openvino_backend_quick_start_dockerfile:

Quick start using Dockerfile
----------------------------

.. code-block:: console

$ docker build -f Dockerfile.openvino -t vllm-openvino-env .
$ docker run -it --rm vllm-openvino-env

.. _install_openvino_backend_from_source:

Install from source
-------------------

- First, install Python. For example, on Ubuntu 22.04, you can run:

.. code-block:: console

$ sudo apt-get update -y
$ sudo apt-get install python3

- Second, install the prerequisites for vLLM OpenVINO backend installation:

.. code-block:: console

$ pip install --upgrade pip
$ pip install -r requirements-build.txt --extra-index-url https://download.pytorch.org/whl/cpu

- Finally, install vLLM with the OpenVINO backend:

.. code-block:: console

$ PIP_PRE=1 PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/nightly/" VLLM_TARGET_DEVICE=openvino python -m pip install -v .

.. _openvino_backend_performance_tips:

Performance tips
----------------

The vLLM OpenVINO backend uses the following environment variables to control behavior:

- ``VLLM_OPENVINO_KVCACHE_SPACE`` to specify the KV cache size (e.g., ``VLLM_OPENVINO_KVCACHE_SPACE=40`` means 40 GB of space for the KV cache); a larger setting allows vLLM to run more requests in parallel. This parameter should be set based on the user's hardware configuration and memory management pattern.

- ``VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8`` to control KV cache precision. By default, FP16 / BF16 is used, depending on the platform.

- ``VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON`` to enable U8 weight compression during the model loading stage. By default, compression is turned off.

To improve TPOT / TTFT latency, you can use vLLM's chunked prefill feature (``--enable-chunked-prefill``). Based on experiments, the recommended batch size is ``256`` (``--max-num-batched-tokens``).

The OpenVINO best-known configuration is:

.. code-block:: console

$ VLLM_OPENVINO_KVCACHE_SPACE=100 VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8 VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \
python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json --enable-chunked-prefill --max-num-batched-tokens 256

.. _openvino_backend_limitations:

Limitations
-----------

- LoRA serving is not supported.

- Only LLM models are currently supported. LLaVA and encoder-decoder models are not currently enabled in the vLLM OpenVINO integration.

- Tensor and pipeline parallelism are not currently enabled in the vLLM integration.

- Speculative sampling is not tested within the vLLM integration.
2 changes: 2 additions & 0 deletions docs/source/index.rst
@@ -63,6 +63,7 @@ Documentation

getting_started/installation
getting_started/amd-installation
getting_started/openvino-installation
getting_started/cpu-installation
getting_started/neuron-installation
getting_started/tpu-installation
@@ -120,6 +121,7 @@ Documentation
dev/offline_inference/offline_index
dev/engine/engine_index
dev/kernel/paged_attention
dev/input_processing/model_inputs_index
dev/multimodal/multimodal_index
dev/dockerfile/dockerfile

4 changes: 2 additions & 2 deletions docs/source/models/adding_model.rst
@@ -37,7 +37,7 @@ For instance, vLLM's `OPT model <https://github.com/vllm-project/vllm/blob/main/
2. Rewrite the :code:`forward` methods
--------------------------------------

Next, you need to rewrite the :code:`forward` methods of your model by following these steps:
Next, you need to rewrite the :meth:`~torch.nn.Module.forward` method of your model by following these steps:

1. Remove any unnecessary code, such as the code only used for training.
2. Change the input parameters:
@@ -75,7 +75,7 @@ Next, you need to rewrite the :code:`forward` methods of your model by following

If your model is too large to fit into a single GPU, you can use tensor parallelism to manage it.
To do this, substitute your model's linear and embedding layers with their tensor-parallel versions.
For the embedding layer, you can simply replace :code:`nn.Embedding` with :code:`VocabParallelEmbedding`. For the output LM head, you can use :code:`ParallelLMHead`.
For the embedding layer, you can simply replace :class:`torch.nn.Embedding` with :code:`VocabParallelEmbedding`. For the output LM head, you can use :code:`ParallelLMHead`.
When it comes to the linear layers, we provide the following options to parallelize them:

* :code:`ReplicatedLinear`: Replicates the inputs and weights across multiple GPUs. No memory saving.
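
As a minimal sketch of the substitution described above (assuming the parallel layers live under ``vllm.model_executor.layers.vocab_parallel_embedding``; extra constructor options such as quantization configs are omitted):

.. code-block:: python

    from torch import nn

    from vllm.model_executor.layers.vocab_parallel_embedding import (
        ParallelLMHead, VocabParallelEmbedding)

    class MyModel(nn.Module):
        def __init__(self, vocab_size: int, hidden_size: int):
            super().__init__()
            # was: nn.Embedding(vocab_size, hidden_size)
            self.embed_tokens = VocabParallelEmbedding(vocab_size, hidden_size)
            # was: nn.Linear(hidden_size, vocab_size, bias=False)
            self.lm_head = ParallelLMHead(vocab_size, hidden_size)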
3 changes: 2 additions & 1 deletion examples/phi3v_example.py
@@ -11,14 +11,15 @@ def run_phi3v():
model_path = "microsoft/Phi-3-vision-128k-instruct"

# Note: The model has 128k context length by default, which may cause OOM
# If that's the case, override `max_model_len` with a smaller value via args
# In this example, we override max_model_len to 2048.
llm = LLM(
model=model_path,
trust_remote_code=True,
image_input_type="pixel_values",
image_token_id=32044,
image_input_shape="1,3,1008,1344",
image_feature_size=1921,
max_model_len=2048,
)

image = Image.open("images/cherry_blossom.jpg")
9 changes: 9 additions & 0 deletions requirements-openvino.txt
@@ -0,0 +1,9 @@
# Common dependencies
-r requirements-common.txt

# OpenVINO dependencies
torch >= 2.1.2
openvino ~= 2024.3.0.dev
optimum-intel[openvino] >= 1.17.2

triton >= 2.2.0 # FIXME(woosuk): This is a hack to avoid import error.
11 changes: 10 additions & 1 deletion setup.py
@@ -233,6 +233,10 @@ def _is_cpu() -> bool:
return VLLM_TARGET_DEVICE == "cpu"


def _is_openvino() -> bool:
return VLLM_TARGET_DEVICE == "openvino"


def _is_xpu() -> bool:
return VLLM_TARGET_DEVICE == "xpu"

@@ -337,6 +341,8 @@ def get_vllm_version() -> str:
if neuron_version != MAIN_CUDA_VERSION:
neuron_version_str = neuron_version.replace(".", "")[:3]
version += f"+neuron{neuron_version_str}"
elif _is_openvino():
version += "+openvino"
elif _is_tpu():
version += "+tpu"
elif _is_cpu():
@@ -388,6 +394,8 @@ def _read_requirements(filename: str) -> List[str]:
requirements = _read_requirements("requirements-rocm.txt")
elif _is_neuron():
requirements = _read_requirements("requirements-neuron.txt")
elif _is_openvino():
requirements = _read_requirements("requirements-openvino.txt")
elif _is_tpu():
requirements = _read_requirements("requirements-tpu.txt")
elif _is_cpu():
@@ -396,7 +404,8 @@ def _read_requirements(filename: str) -> List[str]:
requirements = _read_requirements("requirements-xpu.txt")
else:
raise ValueError(
"Unsupported platform, please use CUDA, ROCm, Neuron, or CPU.")
"Unsupported platform, please use CUDA, ROCm, Neuron, "
"OpenVINO, or CPU.")
return requirements


9 changes: 7 additions & 2 deletions tests/kernels/test_attention_selector.py
@@ -9,8 +9,8 @@


@pytest.mark.parametrize(
"name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER"])
@pytest.mark.parametrize("device", ["cpu", "hip", "cuda"])
"name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER", "OPENVINO"])
@pytest.mark.parametrize("device", ["cpu", "openvino", "hip", "cuda"])
def test_env(name: str, device: str, monkeypatch):
"""Test that the attention selector can be set via environment variable.
Note that we do not test FlashAttn because it is the default backend.
@@ -28,6 +28,11 @@ def test_env(name: str, device: str, monkeypatch):
backend = which_attn_to_use(8, 16, 8, None, torch.float16,
torch.float16, 16)
assert backend.name == "ROCM_FLASH"
elif device == "openvino":
with patch("vllm.attention.selector.is_openvino", return_value=True):
backend = which_attn_to_use(8, 16, 8, None, torch.float16,
torch.float16, 16)
assert backend.name == "OPENVINO"
else:
backend = which_attn_to_use(8, 16, 8, None, torch.float16,
torch.float16, 16)