[Frontend] Add OpenAI Vision API Support #5237

Merged
merged 54 commits into main from gpt4v-fe on Jun 7, 2024
Commits (54)
8ba11d4
initial
ywang96 Jun 2, 2024
d361d20
iterate
ywang96 Jun 3, 2024
fd5aba5
Merge branch 'main' into gpt4v-fe
ywang96 Jun 3, 2024
730cda7
iterate
ywang96 Jun 3, 2024
1c0b89d
iterate
ywang96 Jun 4, 2024
520f5a0
iterate
ywang96 Jun 4, 2024
3a57a6d
iterate
ywang96 Jun 4, 2024
31b941b
adding test
ywang96 Jun 4, 2024
9b3cf48
iterate
ywang96 Jun 4, 2024
af94f8c
docstring
ywang96 Jun 4, 2024
332dd10
remove unused lib
ywang96 Jun 4, 2024
d52a907
revert hardcoded chat template
ywang96 Jun 4, 2024
58746fc
address feedback
ywang96 Jun 4, 2024
99d9197
update pytestmark
ywang96 Jun 4, 2024
0b65271
apply asyncio mark
ywang96 Jun 4, 2024
3a965d9
update doc
ywang96 Jun 4, 2024
f9b9707
update test
ywang96 Jun 5, 2024
04ebbf7
minor doc update
ywang96 Jun 5, 2024
0cdd54f
minor doc update
ywang96 Jun 5, 2024
82a0052
Clarify experiment support
ywang96 Jun 5, 2024
dd01246
note regarding prompt format when using API server
ywang96 Jun 5, 2024
e40da86
Merge branch 'main' into gpt4v-fe
ywang96 Jun 5, 2024
088ad81
fix typo
ywang96 Jun 5, 2024
daa7085
update template
ywang96 Jun 5, 2024
1b32e2f
revert and update token count
ywang96 Jun 5, 2024
c45b34e
update template
ywang96 Jun 5, 2024
d6c1322
update
ywang96 Jun 5, 2024
05fe635
update
ywang96 Jun 5, 2024
938e5c9
template format
ywang96 Jun 5, 2024
b9318bc
correct and add test for multi image
ywang96 Jun 5, 2024
199ced7
fix test
ywang96 Jun 5, 2024
9e686e0
Add unit test for `fetch_image`
DarkLight1337 Jun 5, 2024
d9fbb17
Apply formatter
DarkLight1337 Jun 5, 2024
2833ba0
address feedback
ywang96 Jun 6, 2024
6c365bd
fix notes
ywang96 Jun 6, 2024
26c38f1
use aiohttp
ywang96 Jun 6, 2024
734e50b
fix test
ywang96 Jun 6, 2024
0cd2931
test
ywang96 Jun 6, 2024
561f07f
fix test
ywang96 Jun 6, 2024
481fea8
update test
ywang96 Jun 6, 2024
9585cc6
update fixture
ywang96 Jun 6, 2024
7f9500d
fix field
ywang96 Jun 6, 2024
32d1a25
fix field
ywang96 Jun 6, 2024
1e665b7
format
ywang96 Jun 6, 2024
cce804e
fix image loading
ywang96 Jun 6, 2024
31b219c
revert change that merges fetch and parse
ywang96 Jun 6, 2024
dcf8c8d
add encoded image fixture
ywang96 Jun 6, 2024
a9a9712
Merge branch 'main' into gpt4v-fe
ywang96 Jun 6, 2024
89a452a
update fetch image and remove unused fixture
ywang96 Jun 6, 2024
4e3eca9
cleanup
ywang96 Jun 6, 2024
afadfac
fix fixture
ywang96 Jun 6, 2024
d3bae73
remove unused client close
ywang96 Jun 6, 2024
a149368
add TODO and format
ywang96 Jun 6, 2024
72d4bc4
address comment
ywang96 Jun 7, 2024
68 changes: 67 additions & 1 deletion docs/source/models/vlm.rst
@@ -3,7 +3,7 @@
Using VLMs
==========

This document shows you how to run and serve Vision Language Models (VLMs) using vLLM.
vLLM provides experimental support for Vision Language Models (VLMs). This document shows you how to run and serve these models using vLLM.

Engine Arguments
----------------
@@ -54,3 +54,69 @@ For now, we only support a single image per text prompt. To pass an image to the
print(generated_text)

A code example can be found in `examples/llava_example.py <https://github.com/vllm-project/vllm/blob/main/examples/llava_example.py>`_.

Online OpenAI Vision API Compatible Inference
----------------------------------------------

You can serve vision language models with vLLM's HTTP server, which is compatible with the `OpenAI Vision API <https://platform.openai.com/docs/guides/vision>`_.

.. note::
Currently, vLLM supports only a **single** ``image_url`` input per ``messages`` request. Support for multi-image inputs will be
added in the future.

Below is an example of how to launch the same ``llava-hf/llava-1.5-7b-hf`` model with the vLLM API server.

.. important::
Since the OpenAI Vision API is based on the `Chat <https://platform.openai.com/docs/api-reference/chat>`_ API, a chat template
is **required** to launch the API server if the model's tokenizer does not come with one. In this example, we use the
HuggingFace Llava chat template, which you can find in the examples folder `here <https://github.com/vllm-project/vllm/blob/main/examples/template_llava.jinja>`_.

.. code-block:: bash

python -m vllm.entrypoints.openai.api_server \
--model llava-hf/llava-1.5-7b-hf \
--image-input-type pixel_values \
--image-token-id 32000 \
--image-input-shape 1,3,336,336 \
--image-feature-size 576 \
--chat-template template_llava.jinja
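
Once the server is up, you can confirm it is reachable before sending chat requests; the snippet below is a minimal sketch, assuming the server above is listening on ``http://localhost:8000`` (the OpenAI-compatible server also exposes the ``/v1/models`` route):

.. code-block:: python

    from openai import OpenAI

    # Sanity check: list the models served by the OpenAI-compatible endpoint.
    # Assumes the server launched above is listening on http://localhost:8000.
    client = OpenAI(api_key="EMPTY", base_url="http://localhost:8000/v1")
    print([model.id for model in client.models.list().data])
    # The output is expected to include "llava-hf/llava-1.5-7b-hf".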

To consume the server, you can use the OpenAI client as in the example below:

.. code-block:: python

from openai import OpenAI
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
chat_response = client.chat.completions.create(
model="llava-hf/llava-1.5-7b-hf",
messages=[{
"role": "user",
"content": [
{"type": "text", "text": "What's in this image?"},
{
"type": "image_url",
"image_url": {
"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
},
},
],
}],
)
print("Chat response:", chat_response)

.. note::

By default, the timeout for fetching images through an HTTP URL is ``5`` seconds. You can override this by setting the environment variable:

.. code-block:: shell

export VLLM_IMAGE_FETCH_TIMEOUT=<timeout>

.. note::
Prompt formatting with the image token ``<image>`` is not needed when serving VLMs with the API server, since the prompt is
processed automatically by the server.
4 changes: 3 additions & 1 deletion docs/source/serving/openai_compatible_server.md
@@ -30,6 +30,8 @@ Please see the [OpenAI API Reference](https://platform.openai.com/docs/api-refer
- Chat: `tools`, and `tool_choice`.
- Completions: `suffix`.

vLLM also provides experimental support for OpenAI Vision API compatible inference. See more details in [Using VLMs](../models/vlm.rst).

## Extra Parameters
vLLM supports a set of parameters that are not part of the OpenAI API.
In order to use them, you can pass them as extra parameters in the OpenAI client.
@@ -120,4 +122,4 @@ It is the callers responsibility to prompt the model with the tool information,

vLLM will use guided decoding to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter.

Please refer to the OpenAI API reference documentation for more information.
Please refer to the OpenAI API reference documentation for more information.
23 changes: 23 additions & 0 deletions examples/template_llava.jinja
@@ -0,0 +1,23 @@
{%- if messages[0]['role'] == 'system' -%}
{%- set system_message = messages[0]['content'] -%}
{%- set messages = messages[1:] -%}
{%- else -%}
{% set system_message = '' -%}
{%- endif -%}

{{ bos_token + system_message }}
{%- for message in messages -%}
{%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
{%- endif -%}

{%- if message['role'] == 'user' -%}
{{ 'USER: ' + message['content'] + '\n' }}
{%- elif message['role'] == 'assistant' -%}
{{ 'ASSISTANT: ' + message['content'] + eos_token + '\n' }}
{%- endif -%}
{%- endfor -%}

{%- if add_generation_prompt -%}
{{ 'ASSISTANT:' }}
{% endif %}
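
For reference, the template above can be rendered on its own to inspect the prompt string it produces; the snippet below is a minimal sketch, assuming ``jinja2`` is installed and supplying ``bos_token``/``eos_token`` by hand (when serving, vLLM fills these in from the model's tokenizer).

# Standalone sketch: render examples/template_llava.jinja with plain jinja2
# (run from the vLLM repo root). The bos_token/eos_token values below are
# assumed Llama-style special tokens, not something defined by this PR.
import jinja2

with open("examples/template_llava.jinja") as f:
    template = jinja2.Template(f.read())

prompt = template.render(
    messages=[{"role": "user", "content": "<image>\nWhat's in this image?"}],
    bos_token="<s>",
    eos_token="</s>",
    add_generation_prompt=True,
)
print(repr(prompt))
# Roughly: "<s>USER: <image>\nWhat's in this image?\nASSISTANT:"
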
275 changes: 275 additions & 0 deletions tests/entrypoints/test_openai_vision.py
@@ -0,0 +1,275 @@
from pathlib import Path

import openai
import pytest
import ray

from vllm.multimodal.utils import ImageFetchAiohttp, encode_image_base64

from ..utils import ServerRunner

MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
LLAVA_CHAT_TEMPLATE = (Path(__file__).parent.parent.parent /
"examples/template_llava.jinja")
assert LLAVA_CHAT_TEMPLATE.exists()
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
TEST_IMAGE_URLS = [
"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
"https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
"https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
"https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
]

pytestmark = pytest.mark.openai


@pytest.fixture(scope="module")
def server():
ray.init()
server_runner = ServerRunner.remote([
"--model",
MODEL_NAME,
"--dtype",
"bfloat16",
"--max-model-len",
"4096",
"--enforce-eager",
"--image-input-type",
"pixel_values",
"--image-token-id",
"32000",
"--image-input-shape",
"1,3,336,336",
"--image-feature-size",
"576",
"--chat-template",
str(LLAVA_CHAT_TEMPLATE),
])
ray.get(server_runner.ready.remote())
yield server_runner
ray.shutdown()


@pytest.fixture(scope="session")
def client():
client = openai.AsyncOpenAI(
base_url="http://localhost:8000/v1",
api_key="token-abc123",
)
yield client


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_single_chat_session_image(server, client: openai.AsyncOpenAI,
model_name: str, image_url: str):
messages = [{
"role":
"user",
"content": [
{
"type": "image_url",
"image_url": {
"url": image_url
}
},
{
"type": "text",
"text": "What's in this image?"
},
],
}]

# test single completion
chat_completion = await client.chat.completions.create(model=model_name,
messages=messages,
max_tokens=10,
logprobs=True,
top_logprobs=5)
assert len(chat_completion.choices) == 1

choice = chat_completion.choices[0]
assert choice.finish_reason == "length"
assert chat_completion.usage == openai.types.CompletionUsage(
completion_tokens=10, prompt_tokens=596, total_tokens=606)

message = choice.message
assert message.content is not None and len(message.content) >= 10
assert message.role == "assistant"
messages.append({"role": "assistant", "content": message.content})

# test multi-turn dialogue
messages.append({"role": "user", "content": "express your result in json"})
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
max_tokens=10,
)
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_single_chat_session_image_base64encoded(
server, client: openai.AsyncOpenAI, model_name: str, image_url: str):

image_encoded = encode_image_base64(
await ImageFetchAiohttp.fetch_image(image_url=image_url))
messages = [{
"role":
"user",
"content": [
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{image_encoded}"
}
},
{
"type": "text",
"text": "What's in this image?"
},
],
}]

# test single completion
chat_completion = await client.chat.completions.create(model=model_name,
messages=messages,
max_tokens=10,
logprobs=True,
top_logprobs=5)
assert len(chat_completion.choices) == 1

choice = chat_completion.choices[0]
assert choice.finish_reason == "length"
assert chat_completion.usage == openai.types.CompletionUsage(
completion_tokens=10, prompt_tokens=596, total_tokens=606)

message = choice.message
assert message.content is not None and len(message.content) >= 10
assert message.role == "assistant"
messages.append({"role": "assistant", "content": message.content})

# test multi-turn dialogue
messages.append({"role": "user", "content": "express your result in json"})
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
max_tokens=10,
)
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_chat_streaming_image(server, client: openai.AsyncOpenAI,
model_name: str, image_url: str):
messages = [{
"role":
"user",
"content": [
{
"type": "image_url",
"image_url": {
"url": image_url
}
},
{
"type": "text",
"text": "What's in this image?"
},
],
}]

# test single completion
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
max_tokens=10,
temperature=0.0,
)
output = chat_completion.choices[0].message.content
stop_reason = chat_completion.choices[0].finish_reason

# test streaming
stream = await client.chat.completions.create(
model=model_name,
messages=messages,
max_tokens=10,
temperature=0.0,
stream=True,
)
chunks = []
finish_reason_count = 0
async for chunk in stream:
delta = chunk.choices[0].delta
if delta.role:
assert delta.role == "assistant"
if delta.content:
chunks.append(delta.content)
if chunk.choices[0].finish_reason is not None:
finish_reason_count += 1
# finish reason should only return in last block
assert finish_reason_count == 1
assert chunk.choices[0].finish_reason == stop_reason
assert delta.content
assert "".join(chunks) == output


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_multi_image_input(server, client: openai.AsyncOpenAI,
model_name: str, image_url: str):

messages = [{
"role":
"user",
"content": [
{
"type": "image_url",
"image_url": {
"url": image_url
}
},
{
"type": "image_url",
"image_url": {
"url": image_url
}
},
{
"type": "text",
"text": "What's in this image?"
},
],
}]

with pytest.raises(openai.BadRequestError): # test multi-image input
await client.chat.completions.create(
model=model_name,
messages=messages,
max_tokens=10,
temperature=0.0,
)

# the server should still work afterwards
completion = await client.completions.create(
model=model_name,
prompt=[0, 0, 0, 0, 0],
max_tokens=5,
temperature=0.0,
)
completion = completion.choices[0].text
assert completion is not None and len(completion) >= 0


if __name__ == "__main__":
pytest.main([__file__])