diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index c57e6cd71640..5cdb490e305f 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -1202,10 +1202,11 @@ def _can_append_slots(self, seq_group: SequenceGroup,
             seq_group=seq_group, num_lookahead_slots=num_lookahead_slots)
 
     def _allow_async_output_proc(self, seq_group: SequenceGroup) -> bool:
-        # TODO: does it work with parallel sampling?
-        no_beam_search = seq_group.sampling_params is None or (
+        # async_output_proc is allowed only when we have a single sequence
+        # in the sequence group (best_of == 1) or no sampling params at all
+        is_single_seq = seq_group.sampling_params is None or (
             seq_group.sampling_params.best_of == 1)
-        return no_beam_search
+        return is_single_seq
 
     def schedule(
             self