[Frontend] Add OpenAI Vision API Support #5237

Merged: 54 commits from gpt4v-fe into main, Jun 7, 2024
(The diff below shows changes from 14 of the 54 commits.)

Commits:
8ba11d4  initial (ywang96, Jun 2, 2024)
d361d20  iterate (ywang96, Jun 3, 2024)
fd5aba5  Merge branch 'main' into gpt4v-fe (ywang96, Jun 3, 2024)
730cda7  iterate (ywang96, Jun 3, 2024)
1c0b89d  iterate (ywang96, Jun 4, 2024)
520f5a0  iterate (ywang96, Jun 4, 2024)
3a57a6d  iterate (ywang96, Jun 4, 2024)
31b941b  adding test (ywang96, Jun 4, 2024)
9b3cf48  iterate (ywang96, Jun 4, 2024)
af94f8c  docstring (ywang96, Jun 4, 2024)
332dd10  remove unused lib (ywang96, Jun 4, 2024)
d52a907  revert hardcoded chat template (ywang96, Jun 4, 2024)
58746fc  address feedback (ywang96, Jun 4, 2024)
99d9197  update pytestmark (ywang96, Jun 4, 2024)
0b65271  apply asyncio mark (ywang96, Jun 4, 2024)
3a965d9  update doc (ywang96, Jun 4, 2024)
f9b9707  update test (ywang96, Jun 5, 2024)
04ebbf7  minor doc update (ywang96, Jun 5, 2024)
0cdd54f  minor doc update (ywang96, Jun 5, 2024)
82a0052  Clarify experiment support (ywang96, Jun 5, 2024)
dd01246  note regarding prompt format when using API server (ywang96, Jun 5, 2024)
e40da86  Merge branch 'main' into gpt4v-fe (ywang96, Jun 5, 2024)
088ad81  fix typo (ywang96, Jun 5, 2024)
daa7085  update template (ywang96, Jun 5, 2024)
1b32e2f  revert and update token count (ywang96, Jun 5, 2024)
c45b34e  update template (ywang96, Jun 5, 2024)
d6c1322  update (ywang96, Jun 5, 2024)
05fe635  update (ywang96, Jun 5, 2024)
938e5c9  template format (ywang96, Jun 5, 2024)
b9318bc  correct and add test for multi image (ywang96, Jun 5, 2024)
199ced7  fix test (ywang96, Jun 5, 2024)
9e686e0  Add unit test for `fetch_image` (DarkLight1337, Jun 5, 2024)
d9fbb17  Apply formatter (DarkLight1337, Jun 5, 2024)
2833ba0  address feedback (ywang96, Jun 6, 2024)
6c365bd  fix notes (ywang96, Jun 6, 2024)
26c38f1  use aiohttp (ywang96, Jun 6, 2024)
734e50b  fix test (ywang96, Jun 6, 2024)
0cd2931  test (ywang96, Jun 6, 2024)
561f07f  fix test (ywang96, Jun 6, 2024)
481fea8  update test (ywang96, Jun 6, 2024)
9585cc6  update fixture (ywang96, Jun 6, 2024)
7f9500d  fix field (ywang96, Jun 6, 2024)
32d1a25  fix field (ywang96, Jun 6, 2024)
1e665b7  format (ywang96, Jun 6, 2024)
cce804e  fix image loading (ywang96, Jun 6, 2024)
31b219c  revert change that merges fetch and parse (ywang96, Jun 6, 2024)
dcf8c8d  add encoded image fixture (ywang96, Jun 6, 2024)
a9a9712  Merge branch 'main' into gpt4v-fe (ywang96, Jun 6, 2024)
89a452a  update fetch image and remove unused fixture (ywang96, Jun 6, 2024)
4e3eca9  cleanup (ywang96, Jun 6, 2024)
afadfac  fix fixture (ywang96, Jun 6, 2024)
d3bae73  remove unused client close (ywang96, Jun 6, 2024)
a149368  add TODO and format (ywang96, Jun 6, 2024)
72d4bc4  address comment (ywang96, Jun 7, 2024)
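
For context, the request shape this PR enables is sketched below. This is a minimal sketch, assuming a vLLM OpenAI-compatible server is already running on localhost:8000 with the model and flags used in the test file further down; the image URL and API key here are placeholders, not part of the PR.

import openai

client = openai.OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="token-abc123",  # placeholder key, matching the test setup
)
chat = client.chat.completions.create(
    model="llava-hf/llava-1.5-7b-hf",
    messages=[{
        "role": "user",
        "content": [
            # OpenAI Vision-style content parts: an image plus a text prompt
            {"type": "image_url",
             "image_url": {"url": "https://example.com/image.jpg"}},
            {"type": "text", "text": "What's in this image?"},
        ],
    }],
    max_tokens=64,
)
print(chat.choices[0].message.content)

The changed files follow.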
23 changes: 23 additions & 0 deletions examples/template_vicuna.jinja
@@ -0,0 +1,23 @@
{% if messages[0]['role'] == 'system' %}
{% set system_message = messages[0]['content'] | trim + '\n\n' %}
{% set messages = messages[1:] %}
{% else %}
{% set system_message = '' %}
{% endif %}

{{ bos_token + system_message }}
{% for message in messages %}
{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}
{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
{% endif %}

{% if message['role'] == 'user' %}
{{ 'USER: ' + message['content'] | trim + '\n' }}
{% elif message['role'] == 'assistant' %}
{{ 'ASSISTANT: ' + message['content'] | trim + eos_token + '\n' }}
{% endif %}
{% endfor %}

{% if add_generation_prompt %}
{{ 'ASSISTANT:' }}
{% endif %}
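
A quick way to sanity-check this template outside the server is to render it through transformers. The snippet below is a sketch, assuming transformers is installed and the llava-hf tokenizer can be downloaded; the exact whitespace of the rendered prompt depends on the chat-template renderer's trim settings.

from pathlib import Path

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("llava-hf/llava-1.5-7b-hf")
chat_template = Path("examples/template_vicuna.jinja").read_text()
prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "What's in this image?"}],
    chat_template=chat_template,
    add_generation_prompt=True,
    tokenize=False,
)
print(prompt)  # roughly "<s>USER: What's in this image?\nASSISTANT:"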
167 changes: 167 additions & 0 deletions tests/entrypoints/test_openai_vision.py
@@ -0,0 +1,167 @@
from pathlib import Path

import openai
import pytest
import ray

from ..utils import ServerRunner

MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
VICUNA_CHAT_TEMPLATE = (Path(__file__).parent.parent.parent /
                        "examples/template_vicuna.jinja")
assert VICUNA_CHAT_TEMPLATE.exists()
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
TEST_IMAGE_URLS = [
    "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
    "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
]

pytestmark = pytest.mark.openai
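# Note: the custom "openai" marker lets pytest's standard marker filter
# select this suite, e.g. (assuming the marker is registered in the repo):
#   pytest -m openai tests/entrypoints/test_openai_vision.py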


@pytest.fixture(scope="module")
def server():
ray.init()
server_runner = ServerRunner.remote([
"--model",
MODEL_NAME,
"--dtype",
"bfloat16",
"--max-model-len",
"4096",
"--enforce-eager",
"--image-input-type",
"pixel_values",
"--image-token-id",
"32000",
"--image-input-shape",
"1,3,336,336",
"--image-feature-size",
"576",
"--chat-template",
str(VICUNA_CHAT_TEMPLATE),
])
ray.get(server_runner.ready.remote())
yield server_runner
ray.shutdown()


@pytest.fixture(scope="session")
def client():
client = openai.AsyncOpenAI(
base_url="http://localhost:8000/v1",
api_key="token-abc123",
)
yield client


@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_single_chat_session_image(server, client: openai.AsyncOpenAI,
model_name: str, image_url: str):
messages = [{
"role":
"user",
"content": [
{
"type": "image_url",
"image_url": {
"url": image_url
}
},
{
"type": "text",
"text": "What's in this image?"
},
],
}]

# test single completion
chat_completion = await client.chat.completions.create(model=model_name,
messages=messages,
max_tokens=10,
logprobs=True,
top_logprobs=5)
assert chat_completion.id is not None
    assert chat_completion.choices is not None and len(
        chat_completion.choices) == 1
    assert chat_completion.choices[0].message is not None
    assert chat_completion.choices[0].logprobs is not None
    assert chat_completion.choices[0].logprobs.top_logprobs is not None
    assert len(chat_completion.choices[0].logprobs.top_logprobs[0]) == 5
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"
    messages.append({"role": "assistant", "content": message.content})

    # test multi-turn dialogue
    messages.append({"role": "user", "content": "express your result in json"})
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_tokens=10,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0


@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_chat_streaming_image(server, client: openai.AsyncOpenAI,
model_name: str, image_url: str):
messages = [{
"role":
"user",
"content": [
{
"type": "image_url",
"image_url": {
"url": image_url
}
},
{
"type": "text",
"text": "What's in this image?"
},
],
}]

# test single completion
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
max_tokens=10,
temperature=0.0,
)
output = chat_completion.choices[0].message.content
stop_reason = chat_completion.choices[0].finish_reason

# test streaming
stream = await client.chat.completions.create(
model=model_name,
messages=messages,
max_tokens=10,
temperature=0.0,
stream=True,
)
chunks = []
finish_reason_count = 0
async for chunk in stream:
delta = chunk.choices[0].delta
if delta.role:
assert delta.role == "assistant"
if delta.content:
chunks.append(delta.content)
if chunk.choices[0].finish_reason is not None:
finish_reason_count += 1
# finish reason should only return in last block
assert finish_reason_count == 1
assert chunk.choices[0].finish_reason == stop_reason
assert delta.content
assert "".join(chunks) == output


if __name__ == "__main__":
    pytest.main([__file__])
10 changes: 9 additions & 1 deletion vllm/config.py
@@ -4,7 +4,7 @@
from typing import TYPE_CHECKING, ClassVar, List, Optional, Tuple, Union

import torch
from transformers import PretrainedConfig
from transformers import PretrainedConfig, PreTrainedTokenizerBase

from vllm.logger import init_logger
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
@@ -1108,6 +1108,14 @@ def get_image_input_enum_type(cls, value: str) -> ImageInputType:
f"Expecting to choose from "
f"{[x.name for x in cls.ImageInputType]}.") from e

    def get_image_token_text(
            self, tokenizer: PreTrainedTokenizerBase) -> Tuple[str, str]:
        """Get the image token placeholder text to be inserted into the
        text prompt and the string representation of the image token id.
        """
        image_token_str = tokenizer.decode(self.image_token_id)
        return image_token_str * self.image_feature_size, image_token_str


_STR_DTYPE_TO_TORCH_DTYPE = {
"half": torch.float16,
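
To illustrate the new helper: a sketch of what get_image_token_text returns for the LLaVA-1.5 settings exercised above, assuming the llava-hf tokenizer (where token id 32000 decodes to "<image>").

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("llava-hf/llava-1.5-7b-hf")
image_token_str = tokenizer.decode(32000)  # "<image>"
# The frontend inserts one placeholder per visual feature, so the text
# prompt accounts for all 576 image embeddings fed to the model.
placeholder = image_token_str * 576
print(placeholder.count("<image>"))  # 576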