Move the speculative-decoding offline_inference example to a separate file

vllm-project · njhill · Jun 21, 2024 · May 20, 2024 · May 20, 2024 · May 20, 2024
commit e7742e75953702f9f302c3da714ec93fa03fc831
diff --git a/examples/offline_inference.py b/examples/offline_inference.py
@@ -1,35 +1,20 @@
-import time
-
 from vllm import LLM, SamplingParams
 
-template = ("Below is an instruction that describes a task. Write a response "
-            "that appropriately completes the request.\n\n### Instruction:\n{}"
-            "\n\n### Response:")
-
 # Sample prompts.
 prompts = [
     "Hello, my name is",
     "The president of the United States is",
     "The capital of France is",
     "The future of AI is",
 ]
-prompts = [template.format(prompt) for prompt in prompts]
 # Create a sampling params object.
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 
 # Create an LLM.
-llm = LLM(model="ibm-granite/granite-7b-instruct",
-          use_v2_block_manager=True,
-          enforce_eager=True,
-          speculative_model="ibm-granite/granite-7b-instruct-accelerator",
-          num_speculative_tokens=5)
+llm = LLM(model="facebook/opt-125m")
 # Generate texts from the prompts. The output is a list of RequestOutput objects
 # that contain the prompt, generated text, and other information.
 outputs = llm.generate(prompts, sampling_params)
-start = time.time()
-outputs = llm.generate(prompts, sampling_params)
-end = time.time()
-print((end - start) / sum([len(o.outputs[0].token_ids) for o in outputs]))
 # Print the outputs.
 for output in outputs:
     prompt = output.prompt

diff --git a/examples/offline_inference_mlpspeculator.py b/examples/offline_inference_mlpspeculator.py
@@ -0,0 +1,60 @@
+import gc
+import time
+from typing import List
+
+from vllm import LLM, SamplingParams
+
+
+def time_generation(llm: LLM, prompts: List[str],
+                    sampling_params: SamplingParams) -> None:
+    """Print seconds per generated token, then each generated text.
+
+    Two untimed warmup passes run first; the third is timed with the
+    monotonic time.perf_counter(). Only outputs[0] of each request is used."""
+    for _ in range(2):
+        llm.generate(prompts, sampling_params)
+    start = time.perf_counter()
+    outputs = llm.generate(prompts, sampling_params)
+    elapsed = time.perf_counter() - start
+    print(elapsed / sum(len(o.outputs[0].token_ids) for o in outputs))
+    for output in outputs:
+        print(f"text: {output.outputs[0].text!r}")
+
+
+if __name__ == "__main__":
+    # Target model used for both timed runs.
+    model_name = "meta-llama/Llama-2-13b-chat-hf"
+
+    # Wrap each raw request in the instruction/response prompt format.
+    template = (
+        "Below is an instruction that describes a task. Write a response "
+        "that appropriately completes the request.\n\n### Instruction:\n{}"
+        "\n\n### Response:\n")
+    requests = ["Write about the president of the United States."]
+    prompts = [template.format(request) for request in requests]
+
+    # Sampling settings shared by both timed runs.
+    sampling_params = SamplingParams(temperature=0.0, max_tokens=200)
+
+    # Baseline: the target model on its own, without a speculative model.
+    llm = LLM(model=model_name)
+
+    print("Without speculation")
+    time_generation(llm, prompts, sampling_params)
+
+    # Drop the baseline engine before building the next one
+    # (presumably so its memory can be reclaimed).
+    del llm
+    gc.collect()
+
+    # Same target model, now paired with a speculative model.
+    speculation_options = dict(
+        speculative_model="ibm-fms/llama-13b-accelerator",
+        # These are currently required for MLPSpeculator decoding
+        use_v2_block_manager=True,
+        enforce_eager=True,
+    )
+    llm = LLM(model=model_name, **speculation_options)
+
+    print("With speculation")
+    time_generation(llm, prompts, sampling_params)