Skip to content

Commit

Permalink
review comments
Browse files Browse the repository at this point in the history
  • Loading branch information
alexm-neuralmagic committed Aug 29, 2024
1 parent e26b18b commit 942bc12
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 4 deletions.
6 changes: 6 additions & 0 deletions vllm/engine/llm_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -1256,12 +1256,18 @@ def _process_model_outputs(self,
"""Apply the model output to the sequences in the scheduled seq groups.
virtual_engine: The engine id to operate on
is_async: Indicates whether this postprocessor runs in
parallel with the GPU forward pass and is processing
tokens from the previous step. If this is true, then
no tokens need to be appended since it is already done
externally (before the next schedule() call)
sampler_output: Used with multi-step execution to provide the
sampler output of each step
is_last_output: Used with multi-step execution to indicate
the last step (of each multi-step group)
Returns RequestOutputs that can be returned to the client.
"""
now = time.time()
Expand Down
10 changes: 6 additions & 4 deletions vllm/worker/multi_step_model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -375,10 +375,12 @@ def execute_model(

# Pythonize the output and block if needed since it is the last step
if model_input.is_last_step:
outputs = self._final_process_outputs(
model_input,
model_input.frozen_model_input.async_callback. # type: ignore
keywords["output_proc_callback"]) # type: ignore
assert model_input.frozen_model_input is not None
async_callback = model_input.frozen_model_input.async_callback # type: ignore
output_proc_callback = async_callback.keywords[
"output_proc_callback"] if async_callback is not None else None
outputs = self._final_process_outputs(model_input,
output_proc_callback)
return outputs

# should be [SamplerOutput]
Expand Down

0 comments on commit 942bc12

Please sign in to comment.