diff --git a/python/huggingfaceserver/huggingfaceserver/generative_model.py b/python/huggingfaceserver/huggingfaceserver/generative_model.py
index a9eace4bc12..c62eb30438e 100644
--- a/python/huggingfaceserver/huggingfaceserver/generative_model.py
+++ b/python/huggingfaceserver/huggingfaceserver/generative_model.py
@@ -40,9 +40,11 @@
     ChatCompletionRequestMessage,
     Completion,
     CompletionChoice,
+    CompletionUsage,
     CreateCompletionRequest,
 )
 from kserve.utils.utils import generate_uuid
+from kserve.constants.constants import LLM_STATS_KEY
 from transformers import (
     AutoConfig,
     AutoModel,
@@ -56,6 +58,7 @@
     TextIteratorStreamer,
     set_seed,
 )
+from kserve.metrics import LLMStats
 
 from .stop_sequence_stopping_criteria import StopSequenceStoppingCriteria
 from .task import (
@@ -299,6 +302,13 @@ def queue_put(outputs):
             0 if echo or self.is_encoder_decoder else kwargs["input_ids"].shape[-1]
         )
         outputs = self._model.generate(**kwargs)
+        stats: LLMStats = request.context[LLM_STATS_KEY]
+        stats.num_generation_tokens = (
+            outputs.shape[-1] * outputs.shape[0]
+            if self.is_encoder_decoder
+            else outputs[:, kwargs["input_ids"].shape[-1] :].shape[-1]
+            * outputs.shape[0]
+        )
         outputs = self._tokenizer.batch_decode(
             outputs[:, output_start:], skip_special_tokens=True
         )
@@ -388,6 +398,8 @@ async def create_completion(
         params = request.params
         if params.prompt is None:
             raise ValueError("prompt is required")
+        stats = LLMStats()
+        request.context[LLM_STATS_KEY] = stats
         prompt = params.prompt
         prompts = (
             prompt
@@ -402,14 +414,16 @@ async def create_completion(
         inputs = self._tokenizer(
             prompts, padding=True, return_tensors=TensorType.PYTORCH
         ).to(self._device)
-        num_input_tokens = len(inputs["input_ids"])
+        num_input_tokens_per_prompt = inputs["input_ids"].shape[-1]
+        num_input_tokens = num_input_tokens_per_prompt * inputs["input_ids"].shape[0]
+        stats.num_prompt_tokens = num_input_tokens
         if params.max_tokens is None:
-            params.max_tokens = self.max_length - num_input_tokens
-        if num_input_tokens + params.max_tokens > self.max_length:
+            params.max_tokens = self.max_length - num_input_tokens_per_prompt
+        if num_input_tokens_per_prompt + params.max_tokens > self.max_length:
             raise ValueError(
                 f"This model's maximum context length is {self.max_length} tokens. "
-                f"However, you requested {params.max_tokens + num_input_tokens} tokens "
-                f"({num_input_tokens} in the messages, "
+                f"However, you requested {params.max_tokens + num_input_tokens_per_prompt} tokens "
+                f"({num_input_tokens_per_prompt} in the messages, "
                 f"{params.max_tokens} in the completion). "
                 f"Please reduce the length of the messages or completion.",
             )
@@ -472,4 +486,9 @@ async def create_completion(
             object="text_completion",
             model=params.model,
             system_fingerprint=self.system_fingerprint,
+            usage=CompletionUsage(
+                prompt_tokens=stats.num_prompt_tokens,
+                completion_tokens=stats.num_generation_tokens,
+                total_tokens=stats.num_prompt_tokens + stats.num_generation_tokens,
+            ),
         )
diff --git a/python/huggingfaceserver/huggingfaceserver/test_model.py b/python/huggingfaceserver/huggingfaceserver/test_model.py
index 96c90fa03b6..d02ba1dbe18 100644
--- a/python/huggingfaceserver/huggingfaceserver/test_model.py
+++ b/python/huggingfaceserver/huggingfaceserver/test_model.py
@@ -37,6 +37,7 @@ def bloom_model():
     model = HuggingfaceGenerativeModel(
         "bloom-560m",
         model_id_or_path="bigscience/bloom-560m",
+        max_length=512,
         dtype=torch.float32,
     )
     model.load()
@@ -152,9 +153,10 @@ async def test_t5(t5_model: HuggingfaceGenerativeModel):
         prompt="translate from English to German: we are making words",
         stream=False,
     )
-    request = CompletionRequest(params=params)
+    request = CompletionRequest(params=params, context={})
     response = await t5_model.create_completion(request)
     assert response.choices[0].text == "wir setzen Worte"
+    assert response.usage.completion_tokens == 7
 
 
 @pytest.mark.asyncio
@@ -165,7 +167,7 @@ async def test_t5_stopping_criteria(t5_model: HuggingfaceGenerativeModel):
         stop=["setzen "],
         stream=False,
     )
-    request = CompletionRequest(params=params)
+    request = CompletionRequest(params=params, context={})
     response = await t5_model.create_completion(request)
     assert response.choices[0].text == "wir setzen"
 
@@ -178,7 +180,7 @@ async def test_t5_bad_params(t5_model: HuggingfaceGenerativeModel):
         echo=True,
         stream=False,
     )
-    request = CompletionRequest(params=params)
+    request = CompletionRequest(params=params, context={})
     with pytest.raises(ValueError) as err_info:
         await t5_model.create_completion(request)
     assert err_info.value.args[0] == "'echo' is not supported by encoder-decoder models"
@@ -313,7 +315,7 @@ async def test_bloom_completion(bloom_model: HuggingfaceGenerativeModel):
         stream=False,
         echo=True,
     )
-    request = CompletionRequest(params=params)
+    request = CompletionRequest(params=params, context={})
     response = await bloom_model.create_completion(request)
     assert (
         response.choices[0].text
@@ -330,7 +332,7 @@ async def test_bloom_completion_max_tokens(bloom_model: HuggingfaceGenerativeMod
         echo=True,
         max_tokens=100,  # bloom doesn't have any field specifying context length. Our implementation would default to 2048. Testing with something longer than HF's default max_length of 20
     )
-    request = CompletionRequest(params=params)
+    request = CompletionRequest(params=params, context={})
     response = await bloom_model.create_completion(request)
     assert (
         response.choices[0].text
@@ -346,7 +348,7 @@ async def test_bloom_completion_streaming(bloom_model: HuggingfaceGenerativeMode
         stream=True,
         echo=False,
     )
-    request = CompletionRequest(params=params)
+    request = CompletionRequest(params=params, context={})
     response = await bloom_model.create_completion(request)
     output = ""
     async for chunk in response:
@@ -367,14 +369,18 @@ async def test_bloom_chat_completion(bloom_model: HuggingfaceGenerativeModel):
         },
     ]
     params = CreateChatCompletionRequest(
-        model="bloom-560m", messages=messages, stream=False, max_tokens=20
+        model="bloom-560m",
+        messages=messages,
+        stream=False,
+        max_tokens=20,
     )
-    request = ChatCompletionRequest(params=params)
+    request = ChatCompletionRequest(params=params, context={})
     response = await bloom_model.create_chat_completion(request)
     assert (
         response.choices[0].message.content
         == "The first thing you need to do is to get a good idea of what you are looking for."
     )
+    assert response.usage.completion_tokens == 20
 
 
 @pytest.mark.asyncio
@@ -395,7 +401,7 @@ async def test_bloom_chat_completion_streaming(bloom_model: HuggingfaceGenerativ
         stream=True,
         max_tokens=20,
     )
-    request = ChatCompletionRequest(params=params)
+    request = ChatCompletionRequest(params=params, context={})
     response = await bloom_model.create_chat_completion(request)
     output = ""
     async for chunk in response:
@@ -443,7 +449,7 @@ async def test_input_padding_with_pad_token_not_specified(
         stream=False,
         temperature=0,
     )
-    request = CompletionRequest(params=params)
+    request = CompletionRequest(params=params, context={})
     response = await openai_gpt_model.create_completion(request)
     assert (
         response.choices[0].text
diff --git a/python/kserve/kserve/constants/constants.py b/python/kserve/kserve/constants/constants.py
index c485ee4327c..cf1dbd95b3d 100644
--- a/python/kserve/kserve/constants/constants.py
+++ b/python/kserve/kserve/constants/constants.py
@@ -71,3 +71,6 @@
 
 # K8S metadata key constants
 GENERATION = "generation"
+
+# LLM stats map key
+LLM_STATS_KEY = "llm-stats"
diff --git a/python/kserve/kserve/metrics.py b/python/kserve/kserve/metrics.py
index aa071e8659f..7d334ecc6b8 100644
--- a/python/kserve/kserve/metrics.py
+++ b/python/kserve/kserve/metrics.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 from prometheus_client import Histogram
+from pydantic import BaseModel
 
 PROM_LABELS = ["model_name"]
 PRE_HIST_TIME = Histogram(
@@ -29,5 +30,12 @@
 )
 
 
+class LLMStats(BaseModel):
+    """LLM metrics data class."""
+
+    num_prompt_tokens: int = 0
+    num_generation_tokens: int = 0
+
+
 def get_labels(model_name):
     return {PROM_LABELS[0]: model_name}
diff --git a/python/kserve/kserve/protocol/rest/openai/types/__init__.py b/python/kserve/kserve/protocol/rest/openai/types/__init__.py
index f2e7fbcded7..3400c917014 100644
--- a/python/kserve/kserve/protocol/rest/openai/types/__init__.py
+++ b/python/kserve/kserve/protocol/rest/openai/types/__init__.py
@@ -46,6 +46,7 @@
 )
 from kserve.protocol.rest.openai.types.openapi import TopLogprob
 from kserve.protocol.rest.openai.types.openapi import ErrorResponse
+from kserve.protocol.rest.openai.types.openapi import CompletionUsage
 
 ChatCompletionRequestMessage = Union[
     ChatCompletionRequestSystemMessage,
@@ -78,4 +79,5 @@
     "ErrorResponse",
     "Logprobs",
     "TopLogprob",
+    "CompletionUsage",
 ]
diff --git a/test/e2e/predictor/test_huggingface.py b/test/e2e/predictor/test_huggingface.py
index 57ec45e6155..3933d7b9396 100644
--- a/test/e2e/predictor/test_huggingface.py
+++ b/test/e2e/predictor/test_huggingface.py
@@ -50,6 +50,8 @@ def test_huggingface_openai_chat_completions():
                 "27dcfa74d334bc871f3234de431e71c6eeba5dd6",
                 "--backend",
                 "huggingface",
+                "--max_length",
+                "512",
             ],
             resources=V1ResourceRequirements(
                 requests={"cpu": "1", "memory": "2Gi"},
@@ -241,7 +243,14 @@ def test_huggingface_openai_text_2_text():
             model_format=V1beta1ModelFormat(
                 name="huggingface",
             ),
-            args=["--model_id", "t5-small", "--backend", "huggingface"],
+            args=[
+                "--model_id",
+                "t5-small",
+                "--backend",
+                "huggingface",
+                "--max_length",
+                "512",
+            ],
             resources=V1ResourceRequirements(
                 requests={"cpu": "1", "memory": "2Gi"},
                 limits={"cpu": "1", "memory": "4Gi"},
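For reference, the changes above thread one LLMStats object through request.context: create_completion seeds it under LLM_STATS_KEY, the generation path fills in the token counts, and the response builder converts it into a CompletionUsage. Below is a minimal sketch of that flow in isolation. It assumes a kserve installation that already contains these changes (LLM_STATS_KEY, LLMStats, and the CompletionUsage re-export); the token counts are placeholder values standing in for the shapes computed during generation.

# Sketch of the usage-accounting flow introduced by this diff.
from kserve.constants.constants import LLM_STATS_KEY
from kserve.metrics import LLMStats
from kserve.protocol.rest.openai.types import CompletionUsage

# 1. The completion handler seeds per-request stats in the shared request context.
context = {}
stats = LLMStats()
context[LLM_STATS_KEY] = stats

# 2. The generation path records token counts as it runs
#    (placeholders here; in the diff these come from input_ids / outputs shapes).
stats.num_prompt_tokens = 12
stats.num_generation_tokens = 20

# 3. The response builder reads the same stats object to fill the `usage` field.
usage = CompletionUsage(
    prompt_tokens=stats.num_prompt_tokens,
    completion_tokens=stats.num_generation_tokens,
    total_tokens=stats.num_prompt_tokens + stats.num_generation_tokens,
)
print(usage)

Because the stats object lives in request.context rather than in a global, concurrent requests each get their own counters, which is why the tests now construct CompletionRequest(params=params, context={}).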