[Misc] Improved prefix cache example (vllm-project#9077)
Imss27 authored Oct 4, 2024
1 parent 185f8f6 commit 0d04ab0
Showing 1 changed file with 3 additions and 9 deletions.
12 changes: 3 additions & 9 deletions examples/offline_inference_with_prefix.py
@@ -1,7 +1,8 @@
-from time import time
-
 from vllm import LLM, SamplingParams
 
+# NOTE: This is just a running example. For benchmarking purpose,
+# please see benchmarks/benchmark_prefix_caching.py
+
 # Common prefix.
 prefix = (
     "You are an expert school principal, skilled in effectively managing "
@@ -37,9 +38,7 @@

 # Generate texts from the prompts. The output is a list of RequestOutput objects
 # that contain the prompt, generated text, and other information.
-start_time_regular = time()
 outputs = regular_llm.generate(generating_prompts, sampling_params)
-duration_regular = time() - start_time_regular
 
 regular_generated_texts = []
 # Print the outputs.
@@ -55,9 +54,7 @@
 prefix_cached_llm.generate(generating_prompts[0], sampling_params)
 
 # Generate with prefix caching.
-start_time_cached = time()
 outputs = prefix_cached_llm.generate(generating_prompts, sampling_params)
-duration_cached = time() - start_time_cached
 
 print("Results with `enable_prefix_caching`")

@@ -77,6 +74,3 @@
     for i in range(len(prompts))
 ])
 print(f"Generated answers are the same: {generated_same}")
-
-speedup = round(duration_regular / duration_cached, 2)
-print(f"Speed up of cached generation compared to the regular is: {speedup}")
