Fix prompt token count and provide completion usage in OpenAI response (kserve#3712)

* Fix input token count and add completion usage

Signed-off-by: Sivanantham Chinnaiyan <sivanantham.chinnaiyan@ideas2it.com>

* Add max_length for test models

Signed-off-by: Sivanantham Chinnaiyan <sivanantham.chinnaiyan@ideas2it.com>

---------

Signed-off-by: Sivanantham Chinnaiyan <sivanantham.chinnaiyan@ideas2it.com>
sivanantha321 authored Jun 3, 2024
1 parent ff744c6 commit edac2c3
Showing 6 changed files with 63 additions and 16 deletions.
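
For context, the completion usage block that this change attaches to OpenAI responses is built from the CompletionUsage type re-exported further down in this diff. A minimal sketch of what it carries (field names taken from the diff; the values and the standalone usage here are illustrative only):

# Illustrative only: the usage block added to completion responses by this
# change. Values are hypothetical; the import path follows the
# types/__init__.py re-export added in this commit.
from kserve.protocol.rest.openai.types import CompletionUsage

usage = CompletionUsage(
    prompt_tokens=12,       # tokens in the prompt(s)
    completion_tokens=20,   # tokens generated by the model
    total_tokens=32,        # prompt_tokens + completion_tokens
)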
29 changes: 24 additions & 5 deletions python/huggingfaceserver/huggingfaceserver/generative_model.py
@@ -40,9 +40,11 @@
ChatCompletionRequestMessage,
Completion,
CompletionChoice,
CompletionUsage,
CreateCompletionRequest,
)
from kserve.utils.utils import generate_uuid
from kserve.constants.constants import LLM_STATS_KEY
from transformers import (
AutoConfig,
AutoModel,
TextIteratorStreamer,
set_seed,
)
from kserve.metrics import LLMStats

from .stop_sequence_stopping_criteria import StopSequenceStoppingCriteria
from .task import (
@@ -299,6 +302,13 @@ def queue_put(outputs):
0 if echo or self.is_encoder_decoder else kwargs["input_ids"].shape[-1]
)
outputs = self._model.generate(**kwargs)
stats: LLMStats = request.context[LLM_STATS_KEY]
stats.num_generation_tokens = (
outputs.shape[-1] * outputs.shape[0]
if self.is_encoder_decoder
else outputs[:, kwargs["input_ids"].shape[-1] :].shape[-1]
* outputs.shape[0]
)
outputs = self._tokenizer.batch_decode(
outputs[:, output_start:], skip_special_tokens=True
)
@@ -388,6 +398,8 @@ async def create_completion(
params = request.params
if params.prompt is None:
raise ValueError("prompt is required")
stats = LLMStats()
request.context[LLM_STATS_KEY] = stats
prompt = params.prompt
prompts = (
prompt
@@ -402,14 +414,16 @@
inputs = self._tokenizer(
prompts, padding=True, return_tensors=TensorType.PYTORCH
).to(self._device)
num_input_tokens = len(inputs["input_ids"])
num_input_tokens_per_prompt = inputs["input_ids"].shape[-1]
num_input_tokens = num_input_tokens_per_prompt * inputs["input_ids"].shape[0]
stats.num_prompt_tokens = num_input_tokens
if params.max_tokens is None:
params.max_tokens = self.max_length - num_input_tokens
if num_input_tokens + params.max_tokens > self.max_length:
params.max_tokens = self.max_length - num_input_tokens_per_prompt
if num_input_tokens_per_prompt + params.max_tokens > self.max_length:
raise ValueError(
f"This model's maximum context length is {self.max_length} tokens. "
f"However, you requested {params.max_tokens + num_input_tokens} tokens "
f"({num_input_tokens} in the messages, "
f"However, you requested {params.max_tokens + num_input_tokens_per_prompt} tokens "
f"({num_input_tokens_per_prompt} in the messages, "
f"{params.max_tokens} in the completion). "
f"Please reduce the length of the messages or completion.",
)
@@ -472,4 +486,9 @@ async def create_completion(
object="text_completion",
model=params.model,
system_fingerprint=self.system_fingerprint,
usage=CompletionUsage(
prompt_tokens=stats.num_prompt_tokens,
completion_tokens=stats.num_generation_tokens,
total_tokens=stats.num_prompt_tokens + stats.num_generation_tokens,
),
)
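
The core of the prompt-token fix above: inputs["input_ids"] is a 2-D tensor of shape (batch_size, padded_sequence_length), so the old len(inputs["input_ids"]) returned the batch size rather than a token count. A standalone sketch of the corrected arithmetic, assuming a Hugging Face tokenizer (the model name is only an example, not what the server mandates):

# Not the server code itself; a minimal reproduction of the token math.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")
inputs = tokenizer(
    ["first prompt", "a second, longer prompt"],
    padding=True,
    return_tensors="pt",
)
# input_ids has shape (batch_size, padded_seq_len)
num_input_tokens_per_prompt = inputs["input_ids"].shape[-1]
num_prompt_tokens = num_input_tokens_per_prompt * inputs["input_ids"].shape[0]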
26 changes: 16 additions & 10 deletions python/huggingfaceserver/huggingfaceserver/test_model.py
@@ -37,6 +37,7 @@ def bloom_model():
model = HuggingfaceGenerativeModel(
"bloom-560m",
model_id_or_path="bigscience/bloom-560m",
max_length=512,
dtype=torch.float32,
)
model.load()
@@ -152,9 +153,10 @@ async def test_t5(t5_model: HuggingfaceGenerativeModel):
prompt="translate from English to German: we are making words",
stream=False,
)
request = CompletionRequest(params=params)
request = CompletionRequest(params=params, context={})
response = await t5_model.create_completion(request)
assert response.choices[0].text == "wir setzen Worte"
assert response.usage.completion_tokens == 7


@pytest.mark.asyncio
@@ -165,7 +167,7 @@ async def test_t5_stopping_criteria(t5_model: HuggingfaceGenerativeModel):
stop=["setzen "],
stream=False,
)
request = CompletionRequest(params=params)
request = CompletionRequest(params=params, context={})
response = await t5_model.create_completion(request)
assert response.choices[0].text == "wir setzen"

@@ -178,7 +180,7 @@ async def test_t5_bad_params(t5_model: HuggingfaceGenerativeModel):
echo=True,
stream=False,
)
request = CompletionRequest(params=params)
request = CompletionRequest(params=params, context={})
with pytest.raises(ValueError) as err_info:
await t5_model.create_completion(request)
assert err_info.value.args[0] == "'echo' is not supported by encoder-decoder models"
@@ -313,7 +315,7 @@ async def test_bloom_completion(bloom_model: HuggingfaceGenerativeModel):
stream=False,
echo=True,
)
request = CompletionRequest(params=params)
request = CompletionRequest(params=params, context={})
response = await bloom_model.create_completion(request)
assert (
response.choices[0].text
@@ -330,7 +332,7 @@ async def test_bloom_completion_max_tokens(bloom_model: HuggingfaceGenerativeMod
echo=True,
max_tokens=100, # bloom doesn't have any field specifying context length. Our implementation would default to 2048. Testing with something longer than HF's default max_length of 20
)
request = CompletionRequest(params=params)
request = CompletionRequest(params=params, context={})
response = await bloom_model.create_completion(request)
assert (
response.choices[0].text
@@ -346,7 +348,7 @@ async def test_bloom_completion_streaming(bloom_model: HuggingfaceGenerativeMode
stream=True,
echo=False,
)
request = CompletionRequest(params=params)
request = CompletionRequest(params=params, context={})
response = await bloom_model.create_completion(request)
output = ""
async for chunk in response:
@@ -367,14 +369,18 @@ async def test_bloom_chat_completion(bloom_model: HuggingfaceGenerativeModel):
},
]
params = CreateChatCompletionRequest(
model="bloom-560m", messages=messages, stream=False, max_tokens=20
model="bloom-560m",
messages=messages,
stream=False,
max_tokens=20,
)
request = ChatCompletionRequest(params=params)
request = ChatCompletionRequest(params=params, context={})
response = await bloom_model.create_chat_completion(request)
assert (
response.choices[0].message.content
== "The first thing you need to do is to get a good idea of what you are looking for."
)
assert response.usage.completion_tokens == 20


@pytest.mark.asyncio
@@ -395,7 +401,7 @@ async def test_bloom_chat_completion_streaming(bloom_model: HuggingfaceGenerativ
stream=True,
max_tokens=20,
)
request = ChatCompletionRequest(params=params)
request = ChatCompletionRequest(params=params, context={})
response = await bloom_model.create_chat_completion(request)
output = ""
async for chunk in response:
@@ -443,7 +449,7 @@ async def test_input_padding_with_pad_token_not_specified(
stream=False,
temperature=0,
)
request = CompletionRequest(params=params)
request = CompletionRequest(params=params, context={})
response = await openai_gpt_model.create_completion(request)
assert (
response.choices[0].text
3 changes: 3 additions & 0 deletions python/kserve/kserve/constants/constants.py
@@ -71,3 +71,6 @@

# K8S metadata key constants
GENERATION = "generation"

# LLM stats map key
LLM_STATS_KEY = "llm-stats"
8 changes: 8 additions & 0 deletions python/kserve/kserve/metrics.py
@@ -13,6 +13,7 @@
# limitations under the License.

from prometheus_client import Histogram
from pydantic import BaseModel

PROM_LABELS = ["model_name"]
PRE_HIST_TIME = Histogram(
@@ -29,5 +30,12 @@
)


class LLMStats(BaseModel):
"""LLM metrics data class."""

num_prompt_tokens: int = 0
num_generation_tokens: int = 0


def get_labels(model_name):
return {PROM_LABELS[0]: model_name}
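
A rough illustration of how the pieces above fit together: create_completion builds an LLMStats instance, stores it on the request context under LLM_STATS_KEY, the generate path fills in num_generation_tokens, and the counts are then copied into CompletionUsage on the response. Simplified sketch, with a plain dict standing in for request.context:

from kserve.constants.constants import LLM_STATS_KEY
from kserve.metrics import LLMStats

context = {}                      # stands in for request.context
stats = LLMStats()
context[LLM_STATS_KEY] = stats    # set in create_completion before generating

stats.num_prompt_tokens = 12      # counted from the tokenized prompts
stats.num_generation_tokens = 20  # counted from the generated token ids
total_tokens = stats.num_prompt_tokens + stats.num_generation_tokens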
2 changes: 2 additions & 0 deletions python/kserve/kserve/protocol/rest/openai/types/__init__.py
@@ -46,6 +46,7 @@
)
from kserve.protocol.rest.openai.types.openapi import TopLogprob
from kserve.protocol.rest.openai.types.openapi import ErrorResponse
from kserve.protocol.rest.openai.types.openapi import CompletionUsage

ChatCompletionRequestMessage = Union[
ChatCompletionRequestSystemMessage,
@@ -78,4 +79,5 @@
"ErrorResponse",
"Logprobs",
"TopLogprob",
"CompletionUsage",
]
11 changes: 10 additions & 1 deletion test/e2e/predictor/test_huggingface.py
@@ -50,6 +50,8 @@ def test_huggingface_openai_chat_completions():
"27dcfa74d334bc871f3234de431e71c6eeba5dd6",
"--backend",
"huggingface",
"--max_length",
"512",
],
resources=V1ResourceRequirements(
requests={"cpu": "1", "memory": "2Gi"},
@@ -241,7 +243,14 @@ def test_huggingface_openai_text_2_text():
model_format=V1beta1ModelFormat(
name="huggingface",
),
args=["--model_id", "t5-small", "--backend", "huggingface"],
args=[
"--model_id",
"t5-small",
"--backend",
"huggingface",
"--max_length",
"512",
],
resources=V1ResourceRequirements(
requests={"cpu": "1", "memory": "2Gi"},
limits={"cpu": "1", "memory": "4Gi"},