diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 74503923073e..92c02072593e 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -1256,12 +1256,18 @@ def _process_model_outputs(self,
         """Apply the model output to the sequences in the scheduled seq groups.
 
         virtual_engine: The engine id to operate on
+
         is_async: Indicates whether this postprocessor runs in
             parallel with the GPU forward pass and is processing
             tokens from the previous step. If this is true, then
             no tokens need to be appended since it is already done
             externally (before the next schedule() call)
 
+        sampler_output: Used with multi-step execution to provide
+            sampler_output of each step
+        is_last_output: Used with multi-step execution to indicate
+            the last step (of each multi-step group)
+
         Returns RequestOutputs that can be returned to the client.
         """
         now = time.time()
diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py
index 32df0d43b316..1c1075200613 100644
--- a/vllm/worker/multi_step_model_runner.py
+++ b/vllm/worker/multi_step_model_runner.py
@@ -375,10 +375,12 @@ def execute_model(
         # Pythonize the output and block if needed since it is the last step
         if model_input.is_last_step:
-            outputs = self._final_process_outputs(
-                model_input,
-                model_input.frozen_model_input.async_callback.  # type: ignore
-                keywords["output_proc_callback"])  # type: ignore
+            assert model_input.frozen_model_input is not None
+            async_callback = model_input.frozen_model_input.async_callback  # type: ignore
+            output_proc_callback = async_callback.keywords[
+                "output_proc_callback"] if async_callback is not None else None
+            outputs = self._final_process_outputs(model_input,
+                                                  output_proc_callback)
             return outputs  # should be [SamplerOutput]
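
Context on why the `keywords` lookup is safe: the `async_callback` stored on the frozen model input is a `functools.partial`, so the engine-level `output_proc_callback` it was bound with can be recovered from the partial's `.keywords` attribute, and the new `if async_callback is not None` guard covers the case where async output processing is disabled and no callback was attached. Below is a minimal, self-contained sketch of that pattern; the `_async_process_outputs` stand-in is illustrative and does not reproduce vLLM's actual signature.

```python
import functools
from typing import Callable, Optional

def _async_process_outputs(output_proc_callback: Callable[[], None]) -> None:
    # Stand-in for the worker-side hook that drives output processing; the
    # real vLLM callback takes more arguments, this only illustrates the shape.
    output_proc_callback()

# When async output processing is enabled, the callback attached to the frozen
# model input is a functools.partial, so its bound keyword arguments remain
# inspectable through the `.keywords` attribute.
async_callback: Optional[functools.partial] = functools.partial(
    _async_process_outputs,
    output_proc_callback=lambda: print("process outputs"))

# The pattern from the patched execute_model(): recover the engine-level
# callback from the partial, or fall back to None when async output processing
# is disabled and no callback was ever attached.
output_proc_callback = (async_callback.keywords["output_proc_callback"]
                        if async_callback is not None else None)
```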