From 2e19b90b17c4dd9a898da81da50b40d89660aa11 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 24 May 2024 08:49:42 +0000 Subject: [PATCH 01/16] Fix logprobs for chat completion API --- vllm/entrypoints/openai/protocol.py | 27 +++- vllm/entrypoints/openai/serving_chat.py | 4 +- vllm/entrypoints/openai/serving_completion.py | 11 +- vllm/entrypoints/openai/serving_engine.py | 129 +++++++++++++----- 4 files changed, 125 insertions(+), 46 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 41e2f77fe56f..dc2e3496ee2a 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -416,17 +416,18 @@ def to_pooling_params(self): return PoolingParams(additional_data=self.additional_data) -class LogProbs(OpenAIBaseModel): +class CompletionLogProbs(OpenAIBaseModel): text_offset: List[int] = Field(default_factory=list) token_logprobs: List[Optional[float]] = Field(default_factory=list) tokens: List[str] = Field(default_factory=list) - top_logprobs: Optional[List[Optional[Dict[str, float]]]] = None + top_logprobs: List[Optional[Dict[str, + float]]] = Field(default_factory=list) class CompletionResponseChoice(OpenAIBaseModel): index: int text: str - logprobs: Optional[LogProbs] = None + logprobs: Optional[CompletionLogProbs] = None finish_reason: Optional[str] = None stop_reason: Optional[Union[int, str]] = Field( default=None, @@ -449,7 +450,7 @@ class CompletionResponse(OpenAIBaseModel): class CompletionResponseStreamChoice(OpenAIBaseModel): index: int text: str - logprobs: Optional[LogProbs] = None + logprobs: Optional[CompletionLogProbs] = None finish_reason: Optional[str] = None stop_reason: Optional[Union[int, str]] = Field( default=None, @@ -489,10 +490,24 @@ class ChatMessage(OpenAIBaseModel): content: str +class ChatCompletionTopLogprob(OpenAIBaseModel): + token: str + bytes: Optional[List[int]] = None + logprob: Optional[float] + + +class ChatCompletionLogProb(ChatCompletionTopLogprob): + top_logprobs: List[ChatCompletionTopLogprob] = Field(default_factory=list) + + +class ChatCompletionLogProbs(OpenAIBaseModel): + content: Optional[List[ChatCompletionLogProb]] = None + + class ChatCompletionResponseChoice(OpenAIBaseModel): index: int message: ChatMessage - logprobs: Optional[LogProbs] = None + logprobs: Optional[ChatCompletionLogProbs] = None finish_reason: Optional[str] = None stop_reason: Optional[Union[int, str]] = None @@ -514,7 +529,7 @@ class DeltaMessage(OpenAIBaseModel): class ChatCompletionResponseStreamChoice(OpenAIBaseModel): index: int delta: DeltaMessage - logprobs: Optional[LogProbs] = None + logprobs: Optional[ChatCompletionLogProbs] = None finish_reason: Optional[str] = None stop_reason: Optional[Union[int, str]] = None diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 7e179362eef8..9b7fafea0552 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -277,7 +277,7 @@ async def chat_completion_stream_generator( previous_num_tokens[i]:] if output.logprobs else None if request.logprobs: - logprobs = self._create_logprobs( + logprobs = self._create_chat_logprobs( token_ids=delta_token_ids, top_logprobs=top_logprobs, num_output_top_logprobs=request.logprobs, @@ -364,7 +364,7 @@ async def chat_completion_full_generator( top_logprobs = output.logprobs if request.logprobs: - logprobs = self._create_logprobs( + logprobs = self._create_chat_logprobs( token_ids=token_ids, top_logprobs=top_logprobs, num_output_top_logprobs=request.logprobs, diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 158d8ed7fbbf..8a41895c236c 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -6,12 +6,13 @@ from vllm.config import ModelConfig from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.entrypoints.openai.protocol import (CompletionRequest, +from vllm.entrypoints.openai.protocol import (CompletionLogProbs, + CompletionRequest, CompletionResponse, CompletionResponseChoice, CompletionResponseStreamChoice, CompletionStreamResponse, - LogProbs, UsageInfo) + UsageInfo) from vllm.entrypoints.openai.serving_engine import (LoRAModulePath, OpenAIServing) from vllm.logger import init_logger @@ -25,7 +26,7 @@ TypeTokenIDs = List[int] TypeTopLogProbs = List[Optional[Dict[int, float]]] TypeCreateLogProbsFn = Callable[ - [TypeTokenIDs, TypeTopLogProbs, Optional[int], int], LogProbs] + [TypeTokenIDs, TypeTopLogProbs, Optional[int], int], CompletionLogProbs] def parse_prompt_format(prompt) -> Tuple[bool, list]: @@ -230,7 +231,7 @@ async def completion_stream_generator( i]:] if output.logprobs else None if request.logprobs is not None: - logprobs = self._create_logprobs( + logprobs = self._create_completion_logprobs( token_ids=delta_token_ids, top_logprobs=top_logprobs, num_output_top_logprobs=request.logprobs, @@ -312,7 +313,7 @@ def request_output_to_completion_response( assert top_logprobs is not None, ( "top_logprobs must be provided when logprobs " "is requested") - logprobs = self._create_logprobs( + logprobs = self._create_completion_logprobs( token_ids=token_ids, top_logprobs=top_logprobs, num_output_top_logprobs=request.logprobs, diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index db3fc85decd7..7e7484530c07 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -1,17 +1,23 @@ import json from dataclasses import dataclass from http import HTTPStatus -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional +from typing import Sequence as GenericSequence +from typing import Tuple, TypeVar, Union from pydantic import Field from typing_extensions import Annotated from vllm.config import ModelConfig from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, +from vllm.entrypoints.openai.protocol import (ChatCompletionLogProb, + ChatCompletionLogProbs, + ChatCompletionRequest, + ChatCompletionTopLogprob, + CompletionLogProbs, CompletionRequest, EmbeddingRequest, ErrorResponse, - LogProbs, ModelCard, ModelList, + ModelCard, ModelList, ModelPermission) from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -20,6 +26,8 @@ logger = init_logger(__name__) +T = TypeVar("T") + @dataclass class LoRAModulePath: @@ -75,49 +83,104 @@ async def show_available_models(self) -> ModelList: model_cards.extend(lora_cards) return ModelList(data=model_cards) - def _create_logprobs( + def _assert_not_none(self, v: Optional[T]) -> T: + assert v is not None + return v + + def _create_completion_logprobs( self, - token_ids: List[int], - top_logprobs: List[Optional[Dict[int, Logprob]]], - num_output_top_logprobs: Optional[int] = None, + token_ids: GenericSequence[int], + top_logprobs: Optional[GenericSequence[Optional[Dict[int, Logprob]]]], + num_output_top_logprobs: int, initial_text_offset: int = 0, - ) -> LogProbs: - """Create OpenAI-style logprobs.""" - logprobs = LogProbs() + ) -> CompletionLogProbs: + """Create logprobs for OpenAI Completion API.""" + if top_logprobs is None: + top_logprobs = [] + + _assert_not_none = self._assert_not_none + + out_text_offset: List[int] = [] + out_token_logprobs: List[Optional[float]] = [] + out_tokens: List[str] = [] + out_top_logprobs: List[Optional[Dict[str, float]]] = [] + last_token_len = 0 - if num_output_top_logprobs: - logprobs.top_logprobs = [] for i, token_id in enumerate(token_ids): step_top_logprobs = top_logprobs[i] if step_top_logprobs is None: token = self.tokenizer.decode(token_id) - logprobs.tokens.append(token) - logprobs.token_logprobs.append(None) - assert logprobs.top_logprobs is not None - logprobs.top_logprobs.append(None) + out_tokens.append(token) + out_token_logprobs.append(None) + out_top_logprobs.append(None) else: token_logprob = step_top_logprobs[token_id].logprob + assert len(step_top_logprobs) == num_output_top_logprobs, \ + "Failed to set SamplingParams.logprob" + token = step_top_logprobs[token_id].decoded_token - logprobs.tokens.append(token) - logprobs.token_logprobs.append(token_logprob) - - if num_output_top_logprobs: - assert logprobs.top_logprobs is not None - logprobs.top_logprobs.append({ - # Convert float("-inf") to the - # JSON-serializable float that OpenAI uses - p.decoded_token: max(p.logprob, -9999.0) - for i, p in step_top_logprobs.items() - } if step_top_logprobs else None) - - if len(logprobs.text_offset) == 0: - logprobs.text_offset.append(initial_text_offset) + assert token is not None + + out_tokens.append(token) + out_token_logprobs.append(token_logprob) + out_top_logprobs.append({ + # Convert float("-inf") to the + # JSON-serializable float that OpenAI uses + _assert_not_none(p.decoded_token): max(p.logprob, -9999.0) + for p in step_top_logprobs.values() + }) + + if len(out_text_offset) == 0: + out_text_offset.append(initial_text_offset) else: - logprobs.text_offset.append(logprobs.text_offset[-1] + - last_token_len) + out_text_offset.append(out_text_offset[-1] + last_token_len) + last_token_len = len(token) - return logprobs + + return CompletionLogProbs( + text_offset=out_text_offset, + token_logprobs=out_token_logprobs, + tokens=out_tokens, + top_logprobs=out_top_logprobs, + ) + + def _token_to_int_array(self, token: str) -> List[int]: + return list(token.encode("utf-8")) + + def _create_chat_logprobs( + self, + token_ids: GenericSequence[int], + top_logprobs: Optional[GenericSequence[Optional[Dict[int, Logprob]]]], + num_output_top_logprobs: int, + initial_text_offset: int = 0, + ) -> ChatCompletionLogProbs: + """Create logprobs for OpenAI Chat Completion API.""" + completion_output = self._create_completion_logprobs( + token_ids, + top_logprobs, + num_output_top_logprobs, + initial_text_offset=initial_text_offset) + + _token_to_int_array = self._token_to_int_array + + return ChatCompletionLogProbs(content=[ + ChatCompletionLogProb( + token=token, + bytes=_token_to_int_array(token), + logprob=logprob, + top_logprobs=[] if top_logprobs is None else [ + ChatCompletionTopLogprob( + token=top_token, + bytes=_token_to_int_array(token), + logprob=top_logprob, + ) for top_token, top_logprob in top_logprobs.items() + ]) for logprob, token, top_logprobs in zip( + completion_output.token_logprobs, + completion_output.tokens, + completion_output.top_logprobs, + ) + ]) def create_error_response( self, From 08e41d750b54ce8cf1ee0a580aa4f32d1b567a8c Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 24 May 2024 14:11:35 +0000 Subject: [PATCH 02/16] Update and fix tests --- tests/entrypoints/test_openai_server.py | 42 ++++++++++++++++--- vllm/entrypoints/openai/serving_completion.py | 2 + vllm/entrypoints/openai/serving_engine.py | 8 +++- 3 files changed, 45 insertions(+), 7 deletions(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 1b04e3205c4b..d2e5abd7af43 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -204,6 +204,33 @@ async def test_zero_logprobs(server, client: openai.AsyncOpenAI, "model_name", [MODEL_NAME, "zephyr-lora"], ) +async def test_zero_logprobs_chat(server, client: openai.AsyncOpenAI, + model_name: str): + messages = [{ + "role": "system", + "content": "you are a helpful assistant" + }, { + "role": "user", + "content": "what is 1+1?" + }] + + chat_completion = await client.chat.completions.create(model=model_name, + messages=messages, + max_tokens=5, + temperature=0.0, + logprobs=True, + top_logprobs=0) + + choice = chat_completion.choices[0] + assert choice.logprobs is not None + assert choice.logprobs.content is not None + assert choice.logprobs.content[0].top_logprobs is None + + +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME, "zephyr-lora"], +) async def test_single_chat_session(server, client: openai.AsyncOpenAI, model_name: str): messages = [{ @@ -221,12 +248,12 @@ async def test_single_chat_session(server, client: openai.AsyncOpenAI, logprobs=True, top_logprobs=5) assert chat_completion.id is not None - assert chat_completion.choices is not None and len( - chat_completion.choices) == 1 + assert len(chat_completion.choices) == 1 assert chat_completion.choices[0].message is not None assert chat_completion.choices[0].logprobs is not None - assert chat_completion.choices[0].logprobs.top_logprobs is not None - assert len(chat_completion.choices[0].logprobs.top_logprobs[0]) == 5 + assert chat_completion.choices[0].logprobs.content is not None + assert len( + chat_completion.choices[0].logprobs.content[0].top_logprobs) == 5 message = chat_completion.choices[0].message assert message.content is not None and len(message.content) >= 10 assert message.role == "assistant" @@ -723,7 +750,10 @@ async def test_guided_choice_chat_logprobs(server, client: openai.AsyncOpenAI, top_logprobs=5, extra_body=dict(guided_choice=TEST_CHOICE, guided_decoding_backend=guided_decoding_backend)) - top_logprobs = chat_completion.choices[0].logprobs.top_logprobs + + assert chat_completion.choices[0].logprobs is not None + assert chat_completion.choices[0].logprobs.content is not None + top_logprobs = chat_completion.choices[0].logprobs.content[0].top_logprobs # -9999.0 is the minimum logprob returned by OpenAI assert all( @@ -745,6 +775,8 @@ async def test_response_format_json_object(server, client: openai.AsyncOpenAI): response_format={"type": "json_object"}) content = resp.choices[0].message.content + assert content is not None + loaded = json.loads(content) assert loaded == {"result": 2}, loaded diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 8a41895c236c..2cf972113b76 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -6,6 +6,8 @@ from vllm.config import ModelConfig from vllm.engine.async_llm_engine import AsyncLLMEngine +# yapf conflicts with isort for this block +# yapf: disable from vllm.entrypoints.openai.protocol import (CompletionLogProbs, CompletionRequest, CompletionResponse, diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 7e7484530c07..f7e45565df1f 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -10,6 +10,8 @@ from vllm.config import ModelConfig from vllm.engine.async_llm_engine import AsyncLLMEngine +# yapf conflicts with isort for this block +# yapf: disable from vllm.entrypoints.openai.protocol import (ChatCompletionLogProb, ChatCompletionLogProbs, ChatCompletionRequest, @@ -116,8 +118,10 @@ def _create_completion_logprobs( out_top_logprobs.append(None) else: token_logprob = step_top_logprobs[token_id].logprob - assert len(step_top_logprobs) == num_output_top_logprobs, \ - "Failed to set SamplingParams.logprob" + assert len(step_top_logprobs) <= num_output_top_logprobs + 1, ( + f"Failed to set SamplingParams.logprob. Expected at most: " + f"{num_output_top_logprobs + 1}; received length: " + f"{len(step_top_logprobs)}") token = step_top_logprobs[token_id].decoded_token assert token is not None From bbd4415e893a60effc30e552504b4648af4aa00b Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 24 May 2024 14:50:51 +0000 Subject: [PATCH 03/16] Fix and refine tests --- tests/entrypoints/test_openai_server.py | 56 +++++++++++++++++++++-- vllm/entrypoints/openai/serving_engine.py | 6 +-- 2 files changed, 54 insertions(+), 8 deletions(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index d2e5abd7af43..16c5440b73b2 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -183,6 +183,27 @@ async def test_single_completion(server, client: openai.AsyncOpenAI, "model_name", [MODEL_NAME, "zephyr-lora", "zephyr-lora2"], ) +async def test_no_logprobs(server, client: openai.AsyncOpenAI, + model_name: str): + # test using token IDs + completion = await client.completions.create( + model=MODEL_NAME, + prompt=[0, 0, 0, 0, 0], + max_tokens=5, + temperature=0.0, + logprobs=None, + ) + choice = completion.choices[0] + assert choice.logprobs is not None + assert choice.logprobs.token_logprobs is not None + assert choice.logprobs.top_logprobs is None + + +@pytest.mark.parametrize( + # just test 1 lora hereafter + "model_name", + [MODEL_NAME, "zephyr-lora"], +) async def test_zero_logprobs(server, client: openai.AsyncOpenAI, model_name: str): # test using token IDs @@ -196,11 +217,37 @@ async def test_zero_logprobs(server, client: openai.AsyncOpenAI, choice = completion.choices[0] assert choice.logprobs is not None assert choice.logprobs.token_logprobs is not None - assert choice.logprobs.top_logprobs is None + assert choice.logprobs.top_logprobs is not None + assert len(choice.logprobs.top_logprobs) <= 1 + + +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME, "zephyr-lora"], +) +async def test_no_logprobs_chat(server, client: openai.AsyncOpenAI, + model_name: str): + messages = [{ + "role": "system", + "content": "you are a helpful assistant" + }, { + "role": "user", + "content": "what is 1+1?" + }] + + chat_completion = await client.chat.completions.create(model=model_name, + messages=messages, + max_tokens=5, + temperature=0.0, + logprobs=False) + + choice = chat_completion.choices[0] + assert choice.logprobs is not None + assert choice.logprobs.content is not None + assert len(choice.logprobs.content[0].top_logprobs) == 0 @pytest.mark.parametrize( - # just test 1 lora hereafter "model_name", [MODEL_NAME, "zephyr-lora"], ) @@ -224,7 +271,7 @@ async def test_zero_logprobs_chat(server, client: openai.AsyncOpenAI, choice = chat_completion.choices[0] assert choice.logprobs is not None assert choice.logprobs.content is not None - assert choice.logprobs.content[0].top_logprobs is None + assert len(choice.logprobs.content[0].top_logprobs) <= 1 @pytest.mark.parametrize( @@ -253,7 +300,7 @@ async def test_single_chat_session(server, client: openai.AsyncOpenAI, assert chat_completion.choices[0].logprobs is not None assert chat_completion.choices[0].logprobs.content is not None assert len( - chat_completion.choices[0].logprobs.content[0].top_logprobs) == 5 + chat_completion.choices[0].logprobs.content[0].top_logprobs) <= 6 message = chat_completion.choices[0].message assert message.content is not None and len(message.content) >= 10 assert message.role == "assistant" @@ -326,7 +373,6 @@ async def test_too_many_logprobs(server, client: openai.AsyncOpenAI, @pytest.mark.parametrize( - # just test 1 lora hereafter "model_name", [MODEL_NAME, "zephyr-lora"], ) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index f7e45565df1f..f584e784af5c 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -117,11 +117,11 @@ def _create_completion_logprobs( out_token_logprobs.append(None) out_top_logprobs.append(None) else: + # There can be up to logprobs+1 elements in the response token_logprob = step_top_logprobs[token_id].logprob assert len(step_top_logprobs) <= num_output_top_logprobs + 1, ( - f"Failed to set SamplingParams.logprob. Expected at most: " - f"{num_output_top_logprobs + 1}; received length: " - f"{len(step_top_logprobs)}") + f"Expected at most {num_output_top_logprobs + 1} logprobs, " + f"but received {len(step_top_logprobs)} logprobs") token = step_top_logprobs[token_id].decoded_token assert token is not None From 504dd492a443401096dfe8744ad0fa09efb1e6fb Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 24 May 2024 16:06:00 +0000 Subject: [PATCH 04/16] Fix incorrect parameters to `_create_chat_logprobs` --- vllm/entrypoints/openai/serving_chat.py | 14 ++++++++++---- vllm/entrypoints/openai/serving_completion.py | 3 +++ vllm/entrypoints/openai/serving_engine.py | 7 ++----- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 9b7fafea0552..a459c5b4c263 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -276,11 +276,14 @@ async def chat_completion_stream_generator( top_logprobs = output.logprobs[ previous_num_tokens[i]:] if output.logprobs else None - if request.logprobs: + if request.logprobs and request.top_logprobs is not None: + assert top_logprobs is not None, ( + "top_logprobs must be provided when logprobs " + "is requested") logprobs = self._create_chat_logprobs( token_ids=delta_token_ids, top_logprobs=top_logprobs, - num_output_top_logprobs=request.logprobs, + num_output_top_logprobs=request.top_logprobs, initial_text_offset=len(previous_texts[i]), ) else: @@ -363,11 +366,14 @@ async def chat_completion_full_generator( token_ids = output.token_ids top_logprobs = output.logprobs - if request.logprobs: + if request.logprobs and request.top_logprobs is not None: + assert top_logprobs is not None, ( + "top_logprobs must be provided when logprobs " + "is requested") logprobs = self._create_chat_logprobs( token_ids=token_ids, top_logprobs=top_logprobs, - num_output_top_logprobs=request.logprobs, + num_output_top_logprobs=request.top_logprobs, ) else: logprobs = None diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 2cf972113b76..2d5fb4aaacfa 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -233,6 +233,9 @@ async def completion_stream_generator( i]:] if output.logprobs else None if request.logprobs is not None: + assert top_logprobs is not None, ( + "top_logprobs must be provided when logprobs " + "is requested") logprobs = self._create_completion_logprobs( token_ids=delta_token_ids, top_logprobs=top_logprobs, diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index f584e784af5c..d894fe5423fc 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -92,14 +92,11 @@ def _assert_not_none(self, v: Optional[T]) -> T: def _create_completion_logprobs( self, token_ids: GenericSequence[int], - top_logprobs: Optional[GenericSequence[Optional[Dict[int, Logprob]]]], + top_logprobs: GenericSequence[Optional[Dict[int, Logprob]]], num_output_top_logprobs: int, initial_text_offset: int = 0, ) -> CompletionLogProbs: """Create logprobs for OpenAI Completion API.""" - if top_logprobs is None: - top_logprobs = [] - _assert_not_none = self._assert_not_none out_text_offset: List[int] = [] @@ -155,7 +152,7 @@ def _token_to_int_array(self, token: str) -> List[int]: def _create_chat_logprobs( self, token_ids: GenericSequence[int], - top_logprobs: Optional[GenericSequence[Optional[Dict[int, Logprob]]]], + top_logprobs: GenericSequence[Optional[Dict[int, Logprob]]], num_output_top_logprobs: int, initial_text_offset: int = 0, ) -> ChatCompletionLogProbs: From 390e93d819df189f3df1aa834983aeb5b3706643 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 24 May 2024 16:44:18 +0000 Subject: [PATCH 05/16] Allow `logprobs=True` when `top_logprobs=0` or `top_logprobs=None` (#4795) --- vllm/entrypoints/openai/protocol.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index dc2e3496ee2a..79e34afff6ae 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -192,9 +192,6 @@ class ChatCompletionRequest(OpenAIBaseModel): # doc: end-chat-completion-extra-params def to_sampling_params(self) -> SamplingParams: - if self.logprobs and not self.top_logprobs: - raise ValueError("Top logprobs must be set when logprobs is.") - logits_processors = None if self.logit_bias: From cbed5ecd5c6d4816ae0da07efdfa869cb4ac021f Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 24 May 2024 15:03:11 +0000 Subject: [PATCH 06/16] Refine tests and fix them --- tests/async_engine/test_openapi_server_ray.py | 15 ++-- tests/entrypoints/test_openai_server.py | 89 ++++++++++++++----- 2 files changed, 75 insertions(+), 29 deletions(-) diff --git a/tests/async_engine/test_openapi_server_ray.py b/tests/async_engine/test_openapi_server_ray.py index ace4c53916c7..bfec09cb58bf 100644 --- a/tests/async_engine/test_openapi_server_ray.py +++ b/tests/async_engine/test_openapi_server_ray.py @@ -90,13 +90,14 @@ async def test_single_chat_session(server, client: openai.AsyncOpenAI): logprobs=True, top_logprobs=5) assert chat_completion.id is not None - assert chat_completion.choices is not None and len( - chat_completion.choices) == 1 - assert chat_completion.choices[0].message is not None - assert chat_completion.choices[0].logprobs is not None - assert chat_completion.choices[0].logprobs.top_logprobs is not None - assert len(chat_completion.choices[0].logprobs.top_logprobs[0]) == 5 - message = chat_completion.choices[0].message + assert len(chat_completion.choices) == 1 + + choice = chat_completion.choices[0] + assert choice.finish_reason == "length" + assert chat_completion.usage == openai.types.CompletionUsage( + completion_tokens=10, prompt_tokens=13, total_tokens=23) + + message = choice.message assert message.content is not None and len(message.content) >= 10 assert message.role == "assistant" messages.append({"role": "assistant", "content": message.content}) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 16c5440b73b2..52f5da584190 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -161,9 +161,10 @@ async def test_single_completion(server, client: openai.AsyncOpenAI, assert completion.id is not None assert completion.choices is not None and len(completion.choices) == 1 - assert completion.choices[0].text is not None and len( - completion.choices[0].text) >= 5 - assert completion.choices[0].finish_reason == "length" + + choice = completion.choices[0] + assert len(choice.text) >= 5 + assert choice.finish_reason == "length" assert completion.usage == openai.types.CompletionUsage( completion_tokens=5, prompt_tokens=6, total_tokens=11) @@ -174,8 +175,7 @@ async def test_single_completion(server, client: openai.AsyncOpenAI, max_tokens=5, temperature=0.0, ) - assert completion.choices[0].text is not None and len( - completion.choices[0].text) >= 5 + assert len(completion.choices[0].text) >= 5 @pytest.mark.parametrize( @@ -194,9 +194,7 @@ async def test_no_logprobs(server, client: openai.AsyncOpenAI, logprobs=None, ) choice = completion.choices[0] - assert choice.logprobs is not None - assert choice.logprobs.token_logprobs is not None - assert choice.logprobs.top_logprobs is None + assert choice.logprobs is None @pytest.mark.parametrize( @@ -218,13 +216,35 @@ async def test_zero_logprobs(server, client: openai.AsyncOpenAI, assert choice.logprobs is not None assert choice.logprobs.token_logprobs is not None assert choice.logprobs.top_logprobs is not None - assert len(choice.logprobs.top_logprobs) <= 1 + assert len(choice.logprobs.top_logprobs[0]) <= 1 @pytest.mark.parametrize( "model_name", [MODEL_NAME, "zephyr-lora"], ) +async def test_some_logprobs(server, client: openai.AsyncOpenAI, + model_name: str): + # test using token IDs + completion = await client.completions.create( + model=MODEL_NAME, + prompt=[0, 0, 0, 0, 0], + max_tokens=5, + temperature=0.0, + logprobs=5, + ) + choice = completion.choices[0] + assert choice.logprobs is not None + assert choice.logprobs.token_logprobs is not None + assert choice.logprobs.top_logprobs is not None + assert len(choice.logprobs.top_logprobs[0]) <= 6 + + +@pytest.mark.parametrize( + # first test base model, then test loras + "model_name", + [MODEL_NAME, "zephyr-lora", "zephyr-lora2"], +) async def test_no_logprobs_chat(server, client: openai.AsyncOpenAI, model_name: str): messages = [{ @@ -242,12 +262,11 @@ async def test_no_logprobs_chat(server, client: openai.AsyncOpenAI, logprobs=False) choice = chat_completion.choices[0] - assert choice.logprobs is not None - assert choice.logprobs.content is not None - assert len(choice.logprobs.content[0].top_logprobs) == 0 + assert choice.logprobs is None @pytest.mark.parametrize( + # just test 1 lora hereafter "model_name", [MODEL_NAME, "zephyr-lora"], ) @@ -274,6 +293,33 @@ async def test_zero_logprobs_chat(server, client: openai.AsyncOpenAI, assert len(choice.logprobs.content[0].top_logprobs) <= 1 +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME, "zephyr-lora"], +) +async def test_some_logprobs_chat(server, client: openai.AsyncOpenAI, + model_name: str): + messages = [{ + "role": "system", + "content": "you are a helpful assistant" + }, { + "role": "user", + "content": "what is 1+1?" + }] + + chat_completion = await client.chat.completions.create(model=model_name, + messages=messages, + max_tokens=5, + temperature=0.0, + logprobs=True, + top_logprobs=5) + + choice = chat_completion.choices[0] + assert choice.logprobs is not None + assert choice.logprobs.content is not None + assert len(choice.logprobs.content[0].top_logprobs) <= 6 + + @pytest.mark.parametrize( "model_name", [MODEL_NAME, "zephyr-lora"], @@ -296,12 +342,13 @@ async def test_single_chat_session(server, client: openai.AsyncOpenAI, top_logprobs=5) assert chat_completion.id is not None assert len(chat_completion.choices) == 1 - assert chat_completion.choices[0].message is not None - assert chat_completion.choices[0].logprobs is not None - assert chat_completion.choices[0].logprobs.content is not None - assert len( - chat_completion.choices[0].logprobs.content[0].top_logprobs) <= 6 - message = chat_completion.choices[0].message + + choice = chat_completion.choices[0] + assert choice.finish_reason == "length" + assert chat_completion.usage == openai.types.CompletionUsage( + completion_tokens=10, prompt_tokens=37, total_tokens=47) + + message = choice.message assert message.content is not None and len(message.content) >= 10 assert message.role == "assistant" messages.append({"role": "assistant", "content": message.content}) @@ -802,10 +849,8 @@ async def test_guided_choice_chat_logprobs(server, client: openai.AsyncOpenAI, top_logprobs = chat_completion.choices[0].logprobs.content[0].top_logprobs # -9999.0 is the minimum logprob returned by OpenAI - assert all( - isinstance(logprob, float) and logprob >= -9999.0 - for token_dict in top_logprobs - for token, logprob in token_dict.items()) + for item in top_logprobs: + assert item.logprob >= -9999.0, f"Failed (top_logprobs={top_logprobs})" async def test_response_format_json_object(server, client: openai.AsyncOpenAI): From a72b33cdec45e8e91874041710fba4e6bb4f0bb1 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 28 May 2024 13:10:08 +0000 Subject: [PATCH 07/16] Use stricter test for Chat Completions API --- tests/entrypoints/test_openai_server.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 52f5da584190..3d85f252b7c1 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -290,7 +290,7 @@ async def test_zero_logprobs_chat(server, client: openai.AsyncOpenAI, choice = chat_completion.choices[0] assert choice.logprobs is not None assert choice.logprobs.content is not None - assert len(choice.logprobs.content[0].top_logprobs) <= 1 + assert len(choice.logprobs.content[0].top_logprobs) == 1 @pytest.mark.parametrize( @@ -317,7 +317,7 @@ async def test_some_logprobs_chat(server, client: openai.AsyncOpenAI, choice = chat_completion.choices[0] assert choice.logprobs is not None assert choice.logprobs.content is not None - assert len(choice.logprobs.content[0].top_logprobs) <= 6 + assert len(choice.logprobs.content[0].top_logprobs) == 5 @pytest.mark.parametrize( From 5ed37cd266fce52ccc6696001f9b7d671521e651 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 30 May 2024 10:42:50 +0000 Subject: [PATCH 08/16] Update tests --- tests/entrypoints/test_openai_server.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 7736a6202ea3..c4ca9a4eb84e 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -375,7 +375,10 @@ async def test_some_logprobs_chat(server, client: openai.AsyncOpenAI, @pytest.mark.asyncio -@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME, "zephyr-lora"], +) async def test_too_many_chat_logprobs(server, client: openai.AsyncOpenAI, model_name: str): messages = [{ From edeb3f69a6c0db25844046be42e4fb745a5f8ba7 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 30 May 2024 10:43:13 +0000 Subject: [PATCH 09/16] Apply formatter --- vllm/entrypoints/openai/serving_engine.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 1be1da44c9bb..abf97b678e33 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -1,8 +1,7 @@ import json from dataclasses import dataclass from http import HTTPStatus -from typing import Any, Dict, List, Optional -from typing import Tuple, TypeVar, Union +from typing import Any, Dict, List, Optional, Tuple, TypeVar, Union from pydantic import Field from typing_extensions import Annotated From 6584a5169b0648543597e7d0ec37a0cdd51e5fab Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 30 May 2024 10:45:49 +0000 Subject: [PATCH 10/16] Remove unused typevar --- vllm/entrypoints/openai/serving_engine.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index abf97b678e33..228e780b5720 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -1,7 +1,7 @@ import json from dataclasses import dataclass from http import HTTPStatus -from typing import Any, Dict, List, Optional, Tuple, TypeVar, Union +from typing import Any, Dict, List, Optional, Tuple, Union from pydantic import Field from typing_extensions import Annotated @@ -22,8 +22,6 @@ logger = init_logger(__name__) -T = TypeVar("T") - @dataclass class LoRAModulePath: From 2b6b3d8d646c365a3762effc775f170cd327e32e Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 30 May 2024 10:46:19 +0000 Subject: [PATCH 11/16] Remove unnecessary disable --- vllm/entrypoints/openai/serving_engine.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 228e780b5720..066acdf1c019 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -8,8 +8,6 @@ from vllm.config import ModelConfig from vllm.engine.async_llm_engine import AsyncLLMEngine -# yapf conflicts with isort for this block -# yapf: disable from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, CompletionRequest, EmbeddingRequest, ErrorResponse, From fcf4d6fafb555f9ed46a3ebe3a137b638bf17cdd Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 30 May 2024 10:53:14 +0000 Subject: [PATCH 12/16] Update `test_single_chat_session` --- tests/entrypoints/test_openai_server.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index c4ca9a4eb84e..46e3da6c6ba3 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -439,15 +439,14 @@ async def test_single_chat_session(server, client: openai.AsyncOpenAI, logprobs=True, top_logprobs=5) assert chat_completion.id is not None - assert chat_completion.choices is not None and len( - chat_completion.choices) == 1 - assert chat_completion.choices[0].message is not None - assert chat_completion.choices[0].logprobs is not None - assert chat_completion.choices[0].logprobs.content[ - 0].top_logprobs is not None - assert len( - chat_completion.choices[0].logprobs.content[0].top_logprobs) == 5 - message = chat_completion.choices[0].message + assert len(chat_completion.choices) == 1 + + choice = chat_completion.choices[0] + assert choice.finish_reason == "length" + assert chat_completion.usage == openai.types.CompletionUsage( + completion_tokens=10, prompt_tokens=37, total_tokens=47) + + message = choice.message assert message.content is not None and len(message.content) >= 10 assert message.role == "assistant" messages.append({"role": "assistant", "content": message.content}) From fed335bdf42476fad439d8284d9147ac441155a5 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 1 Jun 2024 02:48:44 +0000 Subject: [PATCH 13/16] Use strict equality tests for length, and remove unnecessary non-null checks --- tests/async_engine/test_openapi_server_ray.py | 8 ++--- tests/entrypoints/test_openai_server.py | 29 ++++++++----------- tests/tensorizer_loader/test_tensorizer.py | 5 ++-- 3 files changed, 17 insertions(+), 25 deletions(-) diff --git a/tests/async_engine/test_openapi_server_ray.py b/tests/async_engine/test_openapi_server_ray.py index b8b7ec5e630f..940d1406dc97 100644 --- a/tests/async_engine/test_openapi_server_ray.py +++ b/tests/async_engine/test_openapi_server_ray.py @@ -55,9 +55,8 @@ async def test_single_completion(server, client: openai.AsyncOpenAI): temperature=0.0) assert completion.id is not None - assert completion.choices is not None and len(completion.choices) == 1 - assert completion.choices[0].text is not None and len( - completion.choices[0].text) >= 5 + assert len(completion.choices) == 1 + assert len(completion.choices[0].text) == 5 assert completion.choices[0].finish_reason == "length" assert completion.usage == openai.types.CompletionUsage( completion_tokens=5, prompt_tokens=6, total_tokens=11) @@ -69,8 +68,7 @@ async def test_single_completion(server, client: openai.AsyncOpenAI): max_tokens=5, temperature=0.0, ) - assert completion.choices[0].text is not None and len( - completion.choices[0].text) >= 5 + assert len(completion.choices[0].text) == 5 @pytest.mark.asyncio diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 46e3da6c6ba3..ee643f039ba9 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -169,7 +169,7 @@ async def test_single_completion(server, client: openai.AsyncOpenAI, assert completion.choices is not None and len(completion.choices) == 1 choice = completion.choices[0] - assert len(choice.text) >= 5 + assert len(choice.text) == 5 assert choice.finish_reason == "length" assert completion.usage == openai.types.CompletionUsage( completion_tokens=5, prompt_tokens=6, total_tokens=11) @@ -181,7 +181,7 @@ async def test_single_completion(server, client: openai.AsyncOpenAI, max_tokens=5, temperature=0.0, ) - assert len(completion.choices[0].text) >= 5 + assert len(completion.choices[0].text) == 5 @pytest.mark.asyncio @@ -287,8 +287,7 @@ async def test_too_many_completion_logprobs(server, client: openai.AsyncOpenAI, max_tokens=5, temperature=0.0, ) - completion = completion.choices[0].text - assert completion is not None and len(completion) >= 0 + assert len(completion.choices[0].text) >= 0 @pytest.mark.asyncio @@ -620,8 +619,7 @@ async def test_logits_bias(server, client: openai.AsyncOpenAI): logit_bias={str(token_id): 100}, seed=42, ) - assert completion.choices[0].text is not None and len( - completion.choices[0].text) >= 5 + assert len(completion.choices[0].text) == 5 response_tokens = tokenizer(completion.choices[0].text, add_special_tokens=False)["input_ids"] expected_tokens = tokenizer(tokenizer.decode([token_id] * 5), @@ -668,9 +666,8 @@ async def test_guided_json_completion(server, client: openai.AsyncOpenAI, guided_decoding_backend=guided_decoding_backend)) assert completion.id is not None - assert completion.choices is not None and len(completion.choices) == 3 + assert len(completion.choices) == 3 for i in range(3): - assert completion.choices[i].text is not None output_json = json.loads(completion.choices[i].text) jsonschema.validate(instance=output_json, schema=TEST_SCHEMA) @@ -737,9 +734,8 @@ async def test_guided_regex_completion(server, client: openai.AsyncOpenAI, guided_decoding_backend=guided_decoding_backend)) assert completion.id is not None - assert completion.choices is not None and len(completion.choices) == 3 + assert len(completion.choices) == 3 for i in range(3): - assert completion.choices[i].text is not None assert re.fullmatch(TEST_REGEX, completion.choices[i].text) is not None @@ -796,7 +792,7 @@ async def test_guided_choice_completion(server, client: openai.AsyncOpenAI, guided_decoding_backend=guided_decoding_backend)) assert completion.id is not None - assert completion.choices is not None and len(completion.choices) == 2 + assert len(completion.choices) == 2 for i in range(2): assert completion.choices[i].text in TEST_CHOICE @@ -1050,8 +1046,7 @@ async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI, prompt_text = tokenizer.decode(prompt) if isinstance(prompt, list) else prompt - assert (completion.choices[0].text is not None - and re.search(r"^" + prompt_text, completion.choices[0].text)) + assert re.search(r"^" + prompt_text, completion.choices[0].text) logprobs = completion.choices[0].logprobs assert logprobs is not None assert len(logprobs.text_offset) > 5 @@ -1100,7 +1095,7 @@ async def test_single_embedding(embedding_server, client: openai.AsyncOpenAI, encoding_format="float", ) assert embeddings.id is not None - assert embeddings.data is not None and len(embeddings.data) == 1 + assert len(embeddings.data) == 1 assert len(embeddings.data[0].embedding) == 4096 assert embeddings.usage.completion_tokens == 0 assert embeddings.usage.prompt_tokens == 9 @@ -1114,7 +1109,7 @@ async def test_single_embedding(embedding_server, client: openai.AsyncOpenAI, encoding_format="float", ) assert embeddings.id is not None - assert embeddings.data is not None and len(embeddings.data) == 1 + assert len(embeddings.data) == 1 assert len(embeddings.data[0].embedding) == 4096 assert embeddings.usage.completion_tokens == 0 assert embeddings.usage.prompt_tokens == 5 @@ -1139,7 +1134,7 @@ async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI, encoding_format="float", ) assert embeddings.id is not None - assert embeddings.data is not None and len(embeddings.data) == 3 + assert len(embeddings.data) == 3 assert len(embeddings.data[0].embedding) == 4096 # test List[List[int]] @@ -1151,7 +1146,7 @@ async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI, encoding_format="float", ) assert embeddings.id is not None - assert embeddings.data is not None and len(embeddings.data) == 4 + assert len(embeddings.data) == 4 assert len(embeddings.data[0].embedding) == 4096 assert embeddings.usage.completion_tokens == 0 assert embeddings.usage.prompt_tokens == 17 diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index 1579d53a7fe2..445000ce8cc5 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -224,9 +224,8 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path): temperature=0.0) assert completion.id is not None - assert completion.choices is not None and len(completion.choices) == 1 - assert completion.choices[0].text is not None and len( - completion.choices[0].text) >= 5 + assert len(completion.choices) == 1 + assert len(completion.choices[0].text) == 5 assert completion.choices[0].finish_reason == "length" assert completion.usage == openai.types.CompletionUsage( completion_tokens=5, prompt_tokens=6, total_tokens=11) From ecef58418adf8b6681567cd0a197c20886e0103e Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 1 Jun 2024 08:16:37 +0000 Subject: [PATCH 14/16] Revert use strict equality tests --- tests/async_engine/test_openapi_server_ray.py | 4 ++-- tests/entrypoints/test_openai_server.py | 6 +++--- tests/tensorizer_loader/test_tensorizer.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/async_engine/test_openapi_server_ray.py b/tests/async_engine/test_openapi_server_ray.py index 940d1406dc97..c25875bd1b7f 100644 --- a/tests/async_engine/test_openapi_server_ray.py +++ b/tests/async_engine/test_openapi_server_ray.py @@ -56,7 +56,7 @@ async def test_single_completion(server, client: openai.AsyncOpenAI): assert completion.id is not None assert len(completion.choices) == 1 - assert len(completion.choices[0].text) == 5 + assert len(completion.choices[0].text) >= 5 assert completion.choices[0].finish_reason == "length" assert completion.usage == openai.types.CompletionUsage( completion_tokens=5, prompt_tokens=6, total_tokens=11) @@ -68,7 +68,7 @@ async def test_single_completion(server, client: openai.AsyncOpenAI): max_tokens=5, temperature=0.0, ) - assert len(completion.choices[0].text) == 5 + assert len(completion.choices[0].text) >= 5 @pytest.mark.asyncio diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index ee643f039ba9..645ee7f9c61d 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -169,7 +169,7 @@ async def test_single_completion(server, client: openai.AsyncOpenAI, assert completion.choices is not None and len(completion.choices) == 1 choice = completion.choices[0] - assert len(choice.text) == 5 + assert len(choice.text) >= 5 assert choice.finish_reason == "length" assert completion.usage == openai.types.CompletionUsage( completion_tokens=5, prompt_tokens=6, total_tokens=11) @@ -181,7 +181,7 @@ async def test_single_completion(server, client: openai.AsyncOpenAI, max_tokens=5, temperature=0.0, ) - assert len(completion.choices[0].text) == 5 + assert len(completion.choices[0].text) >= 5 @pytest.mark.asyncio @@ -619,7 +619,7 @@ async def test_logits_bias(server, client: openai.AsyncOpenAI): logit_bias={str(token_id): 100}, seed=42, ) - assert len(completion.choices[0].text) == 5 + assert len(completion.choices[0].text) >= 5 response_tokens = tokenizer(completion.choices[0].text, add_special_tokens=False)["input_ids"] expected_tokens = tokenizer(tokenizer.decode([token_id] * 5), diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index 445000ce8cc5..171c4ac37554 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -225,7 +225,7 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path): assert completion.id is not None assert len(completion.choices) == 1 - assert len(completion.choices[0].text) == 5 + assert len(completion.choices[0].text) >= 5 assert completion.choices[0].finish_reason == "length" assert completion.usage == openai.types.CompletionUsage( completion_tokens=5, prompt_tokens=6, total_tokens=11) From 72d58e17c7135407704c70f49c1b16ea419f55a9 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 3 Jun 2024 01:44:10 +0000 Subject: [PATCH 15/16] Fix bad types caused by reassignment of same variable --- tests/entrypoints/test_openai_server.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 645ee7f9c61d..e496c8e58c5b 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -1084,14 +1084,14 @@ async def test_long_seed(server, client: openai.AsyncOpenAI): ) async def test_single_embedding(embedding_server, client: openai.AsyncOpenAI, model_name: str): - input = [ + input_texts = [ "The chef prepared a delicious meal.", ] # test single embedding embeddings = await client.embeddings.create( model=model_name, - input=input, + input=input_texts, encoding_format="float", ) assert embeddings.id is not None @@ -1102,10 +1102,10 @@ async def test_single_embedding(embedding_server, client: openai.AsyncOpenAI, assert embeddings.usage.total_tokens == 9 # test using token IDs - input = [1, 1, 1, 1, 1] + input_tokens = [1, 1, 1, 1, 1] embeddings = await client.embeddings.create( model=model_name, - input=input, + input=input_tokens, encoding_format="float", ) assert embeddings.id is not None @@ -1124,13 +1124,13 @@ async def test_single_embedding(embedding_server, client: openai.AsyncOpenAI, async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI, model_name: str): # test List[str] - inputs = [ + input_texts = [ "The cat sat on the mat.", "A feline was resting on a rug.", "Stars twinkle brightly in the night sky." ] embeddings = await client.embeddings.create( model=model_name, - input=inputs, + input=input_texts, encoding_format="float", ) assert embeddings.id is not None @@ -1138,11 +1138,11 @@ async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI, assert len(embeddings.data[0].embedding) == 4096 # test List[List[int]] - inputs = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24], - [25, 32, 64, 77]] + input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24], + [25, 32, 64, 77]] embeddings = await client.embeddings.create( model=model_name, - input=inputs, + input=input_tokens, encoding_format="float", ) assert embeddings.id is not None From 908cac42d10c145ddd7f9ca453254c8fa0cdce3e Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 4 Jun 2024 23:39:51 +0000 Subject: [PATCH 16/16] Fix confusing assertion and variable name --- vllm/entrypoints/openai/serving_chat.py | 17 ++++++------- vllm/entrypoints/openai/serving_completion.py | 25 ++++++++----------- 2 files changed, 18 insertions(+), 24 deletions(-) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 0868c221f670..b9eaf8195cd9 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -284,16 +284,15 @@ async def chat_completion_stream_generator( continue delta_token_ids = output.token_ids[previous_num_tokens[i]:] - top_logprobs = output.logprobs[ + out_logprobs = output.logprobs[ previous_num_tokens[i]:] if output.logprobs else None if request.logprobs and request.top_logprobs is not None: - assert top_logprobs is not None, ( - "top_logprobs must be provided when logprobs " - "is requested") + assert out_logprobs is not None, ( + "Did not output logprobs") logprobs = self._create_chat_logprobs( token_ids=delta_token_ids, - top_logprobs=top_logprobs, + top_logprobs=out_logprobs, num_output_top_logprobs=request.top_logprobs, ) else: @@ -387,15 +386,13 @@ async def chat_completion_full_generator( role = self.get_chat_request_role(request) for output in final_res.outputs: token_ids = output.token_ids - top_logprobs = output.logprobs + out_logprobs = output.logprobs if request.logprobs and request.top_logprobs is not None: - assert top_logprobs is not None, ( - "top_logprobs must be provided when logprobs " - "is requested") + assert out_logprobs is not None, "Did not output logprobs" logprobs = self._create_chat_logprobs( token_ids=token_ids, - top_logprobs=top_logprobs, + top_logprobs=out_logprobs, num_output_top_logprobs=request.top_logprobs, ) else: diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index c39c820f6e5b..14bb68945f7a 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -221,7 +221,7 @@ async def completion_stream_generator( # only return the prompt delta_text = res.prompt delta_token_ids = res.prompt_token_ids - top_logprobs = res.prompt_logprobs + out_logprobs = res.prompt_logprobs has_echoed[i] = True elif (request.echo and request.max_tokens > 0 and not has_echoed[i]): @@ -229,7 +229,7 @@ async def completion_stream_generator( delta_text = res.prompt + output.text delta_token_ids = (res.prompt_token_ids + output.token_ids) - top_logprobs = res.prompt_logprobs + (output.logprobs + out_logprobs = res.prompt_logprobs + (output.logprobs or []) has_echoed[i] = True else: @@ -237,16 +237,15 @@ async def completion_stream_generator( delta_text = output.text[len(previous_texts[i]):] delta_token_ids = output.token_ids[ previous_num_tokens[i]:] - top_logprobs = output.logprobs[previous_num_tokens[ + out_logprobs = output.logprobs[previous_num_tokens[ i]:] if output.logprobs else None if request.logprobs is not None: - assert top_logprobs is not None, ( - "top_logprobs must be provided when logprobs " - "is requested") + assert out_logprobs is not None, ( + "Did not output logprobs") logprobs = self._create_completion_logprobs( token_ids=delta_token_ids, - top_logprobs=top_logprobs, + top_logprobs=out_logprobs, num_output_top_logprobs=request.logprobs, initial_text_offset=len(previous_texts[i]), ) @@ -310,25 +309,23 @@ def request_output_to_completion_response( assert request.max_tokens is not None if request.echo and request.max_tokens == 0: token_ids = prompt_token_ids - top_logprobs = prompt_logprobs + out_logprobs = prompt_logprobs output_text = prompt_text elif request.echo and request.max_tokens > 0: token_ids = prompt_token_ids + output.token_ids - top_logprobs = (prompt_logprobs + output.logprobs + out_logprobs = (prompt_logprobs + output.logprobs if request.logprobs is not None else None) output_text = prompt_text + output.text else: token_ids = output.token_ids - top_logprobs = output.logprobs + out_logprobs = output.logprobs output_text = output.text if request.logprobs is not None: - assert top_logprobs is not None, ( - "top_logprobs must be provided when logprobs " - "is requested") + assert out_logprobs is not None, "Did not output logprobs" logprobs = self._create_completion_logprobs( token_ids=token_ids, - top_logprobs=top_logprobs, + top_logprobs=out_logprobs, num_output_top_logprobs=request.logprobs, ) else: