From ed6c21837fc57a1d4d1b5990f42d007f9b934f9b Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Tue, 30 Jul 2024 14:32:12 -0700
Subject: [PATCH] [core][misc] improve free_finished_seq_groups (#6865)

Co-authored-by: Woosuk Kwon
---
 vllm/core/scheduler.py | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index 5b7b569c3e08d..5cdf1d15c31e1 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -313,6 +313,7 @@ def __init__(
         # Sequence groups finished requests ids since last step iteration.
         # It lets the model know that any state associated with these requests
         # can and must be released after the current step.
+        # This is used to evict the finished requests from the Mamba cache.
         self._finished_requests_ids: List[str] = list()
         # Time at previous scheduling step
         self.prev_time = 0.0
@@ -374,6 +375,7 @@ def abort_seq_group(self, request_id: Union[str, Iterable[str]]) -> None:
             for aborted_group in aborted_groups:
                 # Remove the sequence group from the state queue.
                 state_queue.remove(aborted_group)
+                # Remove the aborted request from the Mamba cache.
                 self._finished_requests_ids.append(aborted_group.request_id)
                 for seq in aborted_group.get_seqs():
                     if seq.is_finished():
@@ -1057,13 +1059,16 @@ def free_seq(self, seq: Sequence) -> None:
         self.block_manager.free(seq)
 
     def free_finished_seq_groups(self) -> None:
-        for queue in [self.running, self.swapped, self.waiting]:
-            self._finished_requests_ids += [
-                seq_group.request_id for seq_group in queue
-                if seq_group.is_finished()
-            ]
-        self.running = deque(seq_group for seq_group in self.running
-                             if not seq_group.is_finished())
+        remaining: Deque[SequenceGroup] = deque()
+        for seq_group in self.running:
+            if seq_group.is_finished():
+                # Add the finished requests to the finished requests list.
+                # This list will be used to update the Mamba cache in the
+                # next step.
+                self._finished_requests_ids.append(seq_group.request_id)
+            else:
+                remaining.append(seq_group)
+        self.running = remaining
 
     def _allocate_and_set_running(self, seq_group: SequenceGroup) -> None:
         self.block_manager.allocate(seq_group)