From d36ba548d36959b546937981037de6edcbc18e9b Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Fri, 1 Mar 2024 14:03:11 +0000 Subject: [PATCH 01/10] Add truncate_input_tokens to openai completions api --- vllm/entrypoints/openai/protocol.py | 2 ++ vllm/entrypoints/openai/serving_completion.py | 4 ++-- vllm/entrypoints/openai/serving_engine.py | 20 +++++++++++++++---- vllm/sampling_params.py | 8 +++++++- 4 files changed, 27 insertions(+), 7 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 26499b8d7a66..bbb732c9ba70 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -183,6 +183,7 @@ class CompletionRequest(BaseModel): guided_json: Optional[Union[str, dict, BaseModel]] = None guided_regex: Optional[str] = None guided_choice: Optional[List[str]] = None + truncate_input_tokens: Optional[int] = None def to_sampling_params(self): echo_without_generation = self.echo and self.max_tokens == 0 @@ -225,6 +226,7 @@ def logit_bias_logits_processor( include_stop_str_in_output=self.include_stop_str_in_output, length_penalty=self.length_penalty, logits_processors=logits_processors, + truncate_input_tokens=self.truncate_input_tokens, ) @model_validator(mode="before") diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 713e67793b29..c3037d109667 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -300,10 +300,10 @@ async def create_completion(self, request: CompletionRequest, for i, prompt in enumerate(prompts): if prompt_is_tokens: input_ids = self._validate_prompt_and_tokenize( - request, prompt_ids=prompt) + request, prompt_ids=prompt, truncate_input_tokens=sampling_params.truncate_input_tokens) else: input_ids = self._validate_prompt_and_tokenize( - request, prompt=prompt) + request, prompt=prompt, truncate_input_tokens=sampling_params.truncate_input_tokens) generators.append( self.engine.generate(None, diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 09945471e9af..64b244094adb 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -62,7 +62,8 @@ async def _post_init(self): self.tokenizer = get_tokenizer( engine_model_config.tokenizer, tokenizer_mode=engine_model_config.tokenizer_mode, - trust_remote_code=engine_model_config.trust_remote_code) + trust_remote_code=engine_model_config.trust_remote_code, + truncation_side="left") async def show_available_models(self) -> ModelList: """Show available models. 
Right now we only have one model.""" @@ -147,15 +148,26 @@ def _validate_prompt_and_tokenize( self, request: Union[ChatCompletionRequest, CompletionRequest], prompt: Optional[str] = None, - prompt_ids: Optional[List[int]] = None) -> List[int]: + prompt_ids: Optional[List[int]] = None, + truncate_input_tokens: Optional[int] = None) -> List[int]: if not (prompt or prompt_ids): raise ValueError("Either prompt or prompt_ids should be provided.") if (prompt and prompt_ids): raise ValueError( "Only one of prompt or prompt_ids should be provided.") - input_ids = prompt_ids if prompt_ids is not None else self.tokenizer( - prompt).input_ids + if prompt_ids is None: + tokenizer_kwargs = {} + if truncate_input_tokens is not None: + tokenizer_kwargs["truncation"] = True + tokenizer_kwargs["max_length"] = truncate_input_tokens + input_ids = self.tokenizer(prompt, **tokenizer_kwargs).input_ids + else: + if truncate_input_tokens is not None: + input_ids = prompt_ids[-truncate_input_tokens:] + else: + input_ids = prompt_ids + token_num = len(input_ids) if request.max_tokens is None: diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 8103f3c2b24b..912fb51cd966 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -91,6 +91,7 @@ class SamplingParams: tokens in the output. Defaults to True. logits_processors: List of functions that modify logits based on previously generated tokens. + truncate_input_tokens: Truncate input tokens using left-truncation. """ def __init__( @@ -118,6 +119,7 @@ def __init__( skip_special_tokens: bool = True, spaces_between_special_tokens: bool = True, logits_processors: Optional[List[LogitsProcessor]] = None, + truncate_input_tokens: Optional[int] = None, ) -> None: self.n = n self.best_of = best_of if best_of is not None else n @@ -150,6 +152,7 @@ def __init__( self.spaces_between_special_tokens = spaces_between_special_tokens self.logits_processors = logits_processors self.include_stop_str_in_output = include_stop_str_in_output + self.truncate_input_tokens = truncate_input_tokens self._verify_args() if self.use_beam_search: self._verify_beam_search() @@ -197,6 +200,8 @@ def _verify_args(self) -> None: if self.prompt_logprobs is not None and self.prompt_logprobs < 0: raise ValueError(f"prompt_logprobs must be non-negative, got " f"{self.prompt_logprobs}.") + if self.truncate_input_tokens is not None and self.truncate_input_tokens < 1: + raise ValueError(f"truncate_input_tokens must be >= 1, got {self.truncate_input_tokens}") def _verify_beam_search(self) -> None: if self.best_of == 1: @@ -276,4 +281,5 @@ def __repr__(self) -> str: f"prompt_logprobs={self.prompt_logprobs}, " f"skip_special_tokens={self.skip_special_tokens}, " "spaces_between_special_tokens=" - f"{self.spaces_between_special_tokens})") + f"{self.spaces_between_special_tokens}, " + f"truncate_input_tokens={self.truncate_input_tokens})") From 586b4a0f196b15a1b47cfd946c1ed1e37ebbe8bc Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Fri, 1 Mar 2024 14:28:31 +0000 Subject: [PATCH 02/10] Better docstring --- vllm/sampling_params.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 912fb51cd966..8df8f7a7409a 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -91,7 +91,8 @@ class SamplingParams: tokens in the output. Defaults to True. logits_processors: List of functions that modify logits based on previously generated tokens. - truncate_input_tokens: Truncate input tokens using left-truncation. 
+ truncate_input_tokens: If set to an integer k, will use only the last k + tokens from the prompt. Defaults to None (no truncation). """ def __init__( From 559e4410c0f974c641660e89cdb7c6fc6068f5b8 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Fri, 1 Mar 2024 15:03:52 +0000 Subject: [PATCH 03/10] Apply formatting --- vllm/entrypoints/openai/serving_completion.py | 10 ++++++++-- vllm/sampling_params.py | 4 +++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index c3037d109667..aceb11a3782b 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -300,10 +300,16 @@ async def create_completion(self, request: CompletionRequest, for i, prompt in enumerate(prompts): if prompt_is_tokens: input_ids = self._validate_prompt_and_tokenize( - request, prompt_ids=prompt, truncate_input_tokens=sampling_params.truncate_input_tokens) + request, + prompt_ids=prompt, + truncate_input_tokens=sampling_params. + truncate_input_tokens) else: input_ids = self._validate_prompt_and_tokenize( - request, prompt=prompt, truncate_input_tokens=sampling_params.truncate_input_tokens) + request, + prompt=prompt, + truncate_input_tokens=sampling_params. + truncate_input_tokens) generators.append( self.engine.generate(None, diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 8df8f7a7409a..88110e50bd86 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -202,7 +202,9 @@ def _verify_args(self) -> None: raise ValueError(f"prompt_logprobs must be non-negative, got " f"{self.prompt_logprobs}.") if self.truncate_input_tokens is not None and self.truncate_input_tokens < 1: - raise ValueError(f"truncate_input_tokens must be >= 1, got {self.truncate_input_tokens}") + raise ValueError( + f"truncate_input_tokens must be >= 1, got {self.truncate_input_tokens}" + ) def _verify_beam_search(self) -> None: if self.best_of == 1: From 32bc1f8ed59c961e0ff6afdd6f26d2ba75456af9 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Fri, 1 Mar 2024 19:52:13 +0100 Subject: [PATCH 04/10] Update vllm/entrypoints/openai/serving_engine.py tokenizer_kwargs: more efficient allocation Co-authored-by: Nick Hill --- vllm/entrypoints/openai/serving_engine.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 64b244094adb..eee58ba2a7c7 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -157,10 +157,9 @@ def _validate_prompt_and_tokenize( "Only one of prompt or prompt_ids should be provided.") if prompt_ids is None: - tokenizer_kwargs = {} - if truncate_input_tokens is not None: - tokenizer_kwargs["truncation"] = True - tokenizer_kwargs["max_length"] = truncate_input_tokens + tokenizer_kwargs = {} if truncate_input_tokens is not None else { + "truncation": True, "max_length": truncate_input_tokens, + } input_ids = self.tokenizer(prompt, **tokenizer_kwargs).input_ids else: if truncate_input_tokens is not None: From 4b92849513d6910f189c2a0295ead7b928e67861 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Fri, 1 Mar 2024 18:57:11 +0000 Subject: [PATCH 05/10] Use truncate_prompt_tokens naming --- vllm/entrypoints/openai/protocol.py | 4 ++-- vllm/entrypoints/openai/serving_completion.py | 8 ++++---- vllm/entrypoints/openai/serving_engine.py | 14 +++++++------- vllm/sampling_params.py | 12 
++++++------ 4 files changed, 19 insertions(+), 19 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index bbb732c9ba70..51f9659c4c52 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -183,7 +183,7 @@ class CompletionRequest(BaseModel): guided_json: Optional[Union[str, dict, BaseModel]] = None guided_regex: Optional[str] = None guided_choice: Optional[List[str]] = None - truncate_input_tokens: Optional[int] = None + truncate_prompt_tokens: Optional[int] = None def to_sampling_params(self): echo_without_generation = self.echo and self.max_tokens == 0 @@ -226,7 +226,7 @@ def logit_bias_logits_processor( include_stop_str_in_output=self.include_stop_str_in_output, length_penalty=self.length_penalty, logits_processors=logits_processors, - truncate_input_tokens=self.truncate_input_tokens, + truncate_prompt_tokens=self.truncate_prompt_tokens, ) @model_validator(mode="before") diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index aceb11a3782b..e82e43d27fc5 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -302,14 +302,14 @@ async def create_completion(self, request: CompletionRequest, input_ids = self._validate_prompt_and_tokenize( request, prompt_ids=prompt, - truncate_input_tokens=sampling_params. - truncate_input_tokens) + truncate_prompt_tokens=sampling_params. + truncate_prompt_tokens) else: input_ids = self._validate_prompt_and_tokenize( request, prompt=prompt, - truncate_input_tokens=sampling_params. - truncate_input_tokens) + truncate_prompt_tokens=sampling_params. + truncate_prompt_tokens) generators.append( self.engine.generate(None, diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index eee58ba2a7c7..0647d4fe0dd1 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -149,7 +149,7 @@ def _validate_prompt_and_tokenize( request: Union[ChatCompletionRequest, CompletionRequest], prompt: Optional[str] = None, prompt_ids: Optional[List[int]] = None, - truncate_input_tokens: Optional[int] = None) -> List[int]: + truncate_prompt_tokens: Optional[int] = None) -> List[int]: if not (prompt or prompt_ids): raise ValueError("Either prompt or prompt_ids should be provided.") if (prompt and prompt_ids): @@ -157,15 +157,15 @@ def _validate_prompt_and_tokenize( "Only one of prompt or prompt_ids should be provided.") if prompt_ids is None: - tokenizer_kwargs = {} if truncate_input_tokens is not None else { - "truncation": True, "max_length": truncate_input_tokens, + tokenizer_kwargs = {} if truncate_prompt_tokens is not None else { + "truncation": True, + "max_length": truncate_prompt_tokens, } input_ids = self.tokenizer(prompt, **tokenizer_kwargs).input_ids + elif truncate_prompt_tokens is not None: + input_ids = prompt_ids[-truncate_prompt_tokens:] else: - if truncate_input_tokens is not None: - input_ids = prompt_ids[-truncate_input_tokens:] - else: - input_ids = prompt_ids + input_ids = prompt_ids token_num = len(input_ids) diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 88110e50bd86..7870d417768a 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -91,7 +91,7 @@ class SamplingParams: tokens in the output. Defaults to True. logits_processors: List of functions that modify logits based on previously generated tokens. 
- truncate_input_tokens: If set to an integer k, will use only the last k + truncate_prompt_tokens: If set to an integer k, will use only the last k tokens from the prompt. Defaults to None (no truncation). """ @@ -120,7 +120,7 @@ def __init__( skip_special_tokens: bool = True, spaces_between_special_tokens: bool = True, logits_processors: Optional[List[LogitsProcessor]] = None, - truncate_input_tokens: Optional[int] = None, + truncate_prompt_tokens: Optional[int] = None, ) -> None: self.n = n self.best_of = best_of if best_of is not None else n @@ -153,7 +153,7 @@ def __init__( self.spaces_between_special_tokens = spaces_between_special_tokens self.logits_processors = logits_processors self.include_stop_str_in_output = include_stop_str_in_output - self.truncate_input_tokens = truncate_input_tokens + self.truncate_prompt_tokens = truncate_prompt_tokens self._verify_args() if self.use_beam_search: self._verify_beam_search() @@ -201,9 +201,9 @@ def _verify_args(self) -> None: if self.prompt_logprobs is not None and self.prompt_logprobs < 0: raise ValueError(f"prompt_logprobs must be non-negative, got " f"{self.prompt_logprobs}.") - if self.truncate_input_tokens is not None and self.truncate_input_tokens < 1: + if self.truncate_prompt_tokens is not None and self.truncate_prompt_tokens < 1: raise ValueError( - f"truncate_input_tokens must be >= 1, got {self.truncate_input_tokens}" + f"truncate_prompt_tokens must be >= 1, got {self.truncate_prompt_tokens}" ) def _verify_beam_search(self) -> None: @@ -285,4 +285,4 @@ def __repr__(self) -> str: f"skip_special_tokens={self.skip_special_tokens}, " "spaces_between_special_tokens=" f"{self.spaces_between_special_tokens}, " - f"truncate_input_tokens={self.truncate_input_tokens})") + f"truncate_prompt_tokens={self.truncate_prompt_tokens})") From b6554a81cfa56fa37ef79c6eb0cbf6f4c6d76379 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Fri, 1 Mar 2024 19:03:08 +0000 Subject: [PATCH 06/10] serving_engine.py: fix bug --- vllm/entrypoints/openai/serving_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 0647d4fe0dd1..7aca2a15fc2d 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -157,7 +157,7 @@ def _validate_prompt_and_tokenize( "Only one of prompt or prompt_ids should be provided.") if prompt_ids is None: - tokenizer_kwargs = {} if truncate_prompt_tokens is not None else { + tokenizer_kwargs = {} if truncate_prompt_tokens is None else { "truncation": True, "max_length": truncate_prompt_tokens, } From 6efc8d152b7d1bc5c79b9464b8f6038d88926c8c Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Tue, 5 Mar 2024 09:00:20 +0000 Subject: [PATCH 07/10] sampling_params: Docstring update --- vllm/sampling_params.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 7870d417768a..5a500e94725e 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -92,7 +92,8 @@ class SamplingParams: logits_processors: List of functions that modify logits based on previously generated tokens. truncate_prompt_tokens: If set to an integer k, will use only the last k - tokens from the prompt. Defaults to None (no truncation). + tokens from the prompt (i.e., left truncation). Defaults to None (i.e., + no truncation). 
""" def __init__( From 520f8bb9b0739097cd0e72ca65dd426a55defc68 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Tue, 26 Mar 2024 11:26:17 +0000 Subject: [PATCH 08/10] Use pydantic.conint --- vllm/entrypoints/openai/protocol.py | 4 ++-- vllm/entrypoints/openai/serving_engine.py | 4 +++- vllm/sampling_params.py | 3 ++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index fbe721133ee9..ddd105c2012c 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -4,7 +4,7 @@ from typing import Dict, List, Literal, Optional, Union import torch -from pydantic import BaseModel, Field, model_validator +from pydantic import BaseModel, Field, model_validator, conint from vllm.sampling_params import SamplingParams from vllm.utils import random_uuid @@ -229,7 +229,7 @@ class CompletionRequest(BaseModel): min_tokens: Optional[int] = 0 skip_special_tokens: Optional[bool] = True spaces_between_special_tokens: Optional[bool] = True - truncate_prompt_tokens: Optional[int] = None + truncate_prompt_tokens: Optional[conint(ge=1)] = None # doc: end-completion-sampling-params # doc: begin-completion-extra-params diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 858439d47c2b..16b621419351 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -4,6 +4,8 @@ from http import HTTPStatus from typing import Dict, List, Optional, Union +from pydantic import conint + from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, CompletionRequest, ErrorResponse, @@ -174,7 +176,7 @@ def _validate_prompt_and_tokenize( request: Union[ChatCompletionRequest, CompletionRequest], prompt: Optional[str] = None, prompt_ids: Optional[List[int]] = None, - truncate_prompt_tokens: Optional[int] = None) -> List[int]: + truncate_prompt_tokens: Optional[conint(ge=1)] = None) -> List[int]: if not (prompt or prompt_ids): raise ValueError("Either prompt or prompt_ids should be provided.") if (prompt and prompt_ids): diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 6aa8a942fcea..510489c5e42f 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -5,6 +5,7 @@ from typing import Callable, List, Optional, Union import torch +from pydantic import conint _SAMPLING_EPS = 1e-5 @@ -124,7 +125,7 @@ def __init__( skip_special_tokens: bool = True, spaces_between_special_tokens: bool = True, logits_processors: Optional[List[LogitsProcessor]] = None, - truncate_prompt_tokens: Optional[int] = None, + truncate_prompt_tokens: Optional[conint(ge=1)] = None, ) -> None: self.n = n self.best_of = best_of if best_of is not None else n From 08b3e191667a56c91967cd42d1b4c660ca193ba5 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Tue, 26 Mar 2024 11:39:05 +0000 Subject: [PATCH 09/10] fix formatting --- vllm/entrypoints/openai/serving_engine.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 16b621419351..ed7facd85c6d 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -176,7 +176,8 @@ def _validate_prompt_and_tokenize( request: Union[ChatCompletionRequest, CompletionRequest], prompt: Optional[str] = None, prompt_ids: Optional[List[int]] = None, - truncate_prompt_tokens: 
Optional[conint(ge=1)] = None) -> List[int]: + truncate_prompt_tokens: Optional[conint(ge=1)] = None + ) -> List[int]: if not (prompt or prompt_ids): raise ValueError("Either prompt or prompt_ids should be provided.") if (prompt and prompt_ids): From c8f3429b2649bff42036e221bb8ca9e97abe40b4 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Tue, 26 Mar 2024 11:43:36 +0000 Subject: [PATCH 10/10] Fix formatting --- vllm/entrypoints/openai/protocol.py | 2 +- vllm/sampling_params.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index ddd105c2012c..e9ab10490f96 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -4,7 +4,7 @@ from typing import Dict, List, Literal, Optional, Union import torch -from pydantic import BaseModel, Field, model_validator, conint +from pydantic import BaseModel, Field, conint, model_validator from vllm.sampling_params import SamplingParams from vllm.utils import random_uuid diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 510489c5e42f..51f34ce03203 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -95,8 +95,8 @@ class SamplingParams: logits_processors: List of functions that modify logits based on previously generated tokens. truncate_prompt_tokens: If set to an integer k, will use only the last k - tokens from the prompt (i.e., left truncation). Defaults to None (i.e., - no truncation). + tokens from the prompt (i.e., left truncation). Defaults to None + (i.e., no truncation). """ def __init__( @@ -216,10 +216,10 @@ def _verify_args(self) -> None: if self.prompt_logprobs is not None and self.prompt_logprobs < 0: raise ValueError(f"prompt_logprobs must be non-negative, got " f"{self.prompt_logprobs}.") - if self.truncate_prompt_tokens is not None and self.truncate_prompt_tokens < 1: - raise ValueError( - f"truncate_prompt_tokens must be >= 1, got {self.truncate_prompt_tokens}" - ) + if (self.truncate_prompt_tokens is not None + and self.truncate_prompt_tokens < 1): + raise ValueError(f"truncate_prompt_tokens must be >= 1, " + f"got {self.truncate_prompt_tokens}") def _verify_beam_search(self) -> None: if self.best_of == 1:
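
Taken together, the series exposes `truncate_prompt_tokens` as a per-request field on the OpenAI-compatible completions endpoint, validated as an integer >= 1 via `conint(ge=1)`. The sketch below is a hypothetical client call, not part of the patches: the server URL, model name, and prompt are placeholders, and it simply posts the new field in a standard completions request body.

```python
# Hypothetical usage sketch -- assumes a vLLM OpenAI-compatible server with
# this patch series applied is listening on localhost:8000; the model name
# and prompt are placeholders.
import requests

payload = {
    "model": "facebook/opt-125m",
    "prompt": "a very long prompt that may exceed the context window ...",
    "max_tokens": 16,
    # New field from this series: keep only the last 32 prompt tokens
    # (left truncation) before generation. Values < 1 are rejected by the
    # conint(ge=1) constraint.
    "truncate_prompt_tokens": 32,
}

resp = requests.post("http://localhost:8000/v1/completions", json=payload)
print(resp.json()["choices"][0]["text"])
```

Requests that omit the field behave as before, since the default remains `None` (no truncation).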
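
For string prompts, the truncation itself is delegated to the Hugging Face tokenizer: the serving engine constructs it with `truncation_side="left"` and calls it with `truncation=True, max_length=truncate_prompt_tokens`, while pre-tokenized prompts are sliced with `prompt_ids[-truncate_prompt_tokens:]`. The standalone sketch below (using `gpt2` purely as an example checkpoint, which the patches do not prescribe) illustrates that the two paths keep the same tokens.

```python
# Standalone illustration of left-truncation with a Hugging Face tokenizer;
# "gpt2" is only an example checkpoint, not part of the patch series.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2", truncation_side="left")

prompt = "one two three four five six seven eight"
full_ids = tok(prompt).input_ids
kept_ids = tok(prompt, truncation=True, max_length=4).input_ids

# With truncation_side="left" only the last 4 tokens survive, matching the
# prompt_ids[-4:] slice the server applies when token IDs are passed in.
assert kept_ids == full_ids[-4:]
print(full_ids, "->", kept_ids)
```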