[Misc] Improved prefix cache example (vllm-project#9077)
Imss27 authored Oct 4, 2024
1 parent 185f8f6 commit 0d04ab0
Showing 1 changed file with 3 additions and 9 deletions.
12 changes: 3 additions & 9 deletions examples/offline_inference_with_prefix.py
@@ -1,7 +1,8 @@
-from time import time
-
 from vllm import LLM, SamplingParams
 
+# NOTE: This is just a running example. For benchmarking purpose,
+# please see benchmarks/benchmark_prefix_caching.py
+
 # Common prefix.
 prefix = (
     "You are an expert school principal, skilled in effectively managing "
@@ -37,9 +38,7 @@

 # Generate texts from the prompts. The output is a list of RequestOutput objects
 # that contain the prompt, generated text, and other information.
-start_time_regular = time()
 outputs = regular_llm.generate(generating_prompts, sampling_params)
-duration_regular = time() - start_time_regular
 
 regular_generated_texts = []
 # Print the outputs.
@@ -55,9 +54,7 @@
 prefix_cached_llm.generate(generating_prompts[0], sampling_params)
 
 # Generate with prefix caching.
-start_time_cached = time()
 outputs = prefix_cached_llm.generate(generating_prompts, sampling_params)
-duration_cached = time() - start_time_cached
 
 print("Results with `enable_prefix_caching`")

@@ -77,6 +74,3 @@
     for i in range(len(prompts))
 ])
 print(f"Generated answers are the same: {generated_same}")
-
-speedup = round(duration_regular / duration_cached, 2)
-print(f"Speed up of cached generation compared to the regular is: {speedup}")
