diff --git a/examples/offline_inference_with_prefix.py b/examples/offline_inference_with_prefix.py
index 04c2843792a1b..3b3e0ae64a037 100644
--- a/examples/offline_inference_with_prefix.py
+++ b/examples/offline_inference_with_prefix.py
@@ -1,7 +1,8 @@
-from time import time
-
 from vllm import LLM, SamplingParams
 
+# NOTE: This is just a running example. For benchmarking purpose,
+# please see benchmarks/benchmark_prefix_caching.py
+
 # Common prefix.
 prefix = (
     "You are an expert school principal, skilled in effectively managing "
@@ -37,9 +38,7 @@
 
 # Generate texts from the prompts. The output is a list of RequestOutput objects
 # that contain the prompt, generated text, and other information.
-start_time_regular = time()
 outputs = regular_llm.generate(generating_prompts, sampling_params)
-duration_regular = time() - start_time_regular
 
 regular_generated_texts = []
 # Print the outputs.
@@ -55,9 +54,7 @@
 prefix_cached_llm.generate(generating_prompts[0], sampling_params)
 
 # Generate with prefix caching.
-start_time_cached = time()
 outputs = prefix_cached_llm.generate(generating_prompts, sampling_params)
-duration_cached = time() - start_time_cached
 
 print("Results with `enable_prefix_caching`")
 
@@ -77,6 +74,3 @@
     for i in range(len(prompts))
 ])
 print(f"Generated answers are the same: {generated_same}")
-
-speedup = round(duration_regular / duration_cached, 2)
-print(f"Speed up of cached generation compared to the regular is: {speedup}")
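
The patch drops the ad-hoc wall-clock timing from the example and defers benchmarking to benchmarks/benchmark_prefix_caching.py. For context, the removed comparison can still be reproduced as a separate standalone script. The sketch below is a minimal, hypothetical version of that, not part of the patch: the facebook/opt-125m model name, the gpu_memory_utilization value, the placeholder prompts, and the use of time.perf_counter are assumptions made for illustration. For meaningful numbers, use the benchmark script the new NOTE points to.

# Minimal sketch (not part of the patch): time regular vs. prefix-cached
# generation outside the example script.
# Assumptions: facebook/opt-125m as the model, gpu_memory_utilization=0.4 so
# both engines fit on one GPU, and two placeholder prompts sharing a prefix.
from time import perf_counter

from vllm import LLM, SamplingParams

prefix = "You are an expert school principal. "
prompts = [prefix + "Draft a welcome speech.", prefix + "Plan a staff meeting."]
sampling_params = SamplingParams(temperature=0.0, max_tokens=64)


def timed_generate(llm, prompts, sampling_params):
    """Run generate() once and return the elapsed wall-clock seconds."""
    start = perf_counter()
    llm.generate(prompts, sampling_params)
    return perf_counter() - start


# Baseline engine without prefix caching.
regular_llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.4)
duration_regular = timed_generate(regular_llm, prompts, sampling_params)

# Engine with prefix caching; warm up on one prompt so the shared prefix is
# already in the KV cache before the timed run, as the example itself does.
cached_llm = LLM(model="facebook/opt-125m",
                 enable_prefix_caching=True,
                 gpu_memory_utilization=0.4)
cached_llm.generate(prompts[0], sampling_params)
duration_cached = timed_generate(cached_llm, prompts, sampling_params)

print(f"Speedup with prefix caching: {duration_regular / duration_cached:.2f}x")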