From 7eb0e0d7a42b3ac64a7912faf1f2822601da5f2a Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 15 May 2024 09:44:51 -0400 Subject: [PATCH 01/47] added block manager tests --- tests/core/test_block_manager.py | 132 ++++++++++++++++++++++++++++++- 1 file changed, 131 insertions(+), 1 deletion(-) diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py index 22a9f0cf47d32..6b2fa21f2ef46 100644 --- a/tests/core/test_block_manager.py +++ b/tests/core/test_block_manager.py @@ -12,7 +12,7 @@ from vllm.sequence import Logprob, Sequence, SequenceGroup, SequenceStatus from vllm.utils import Device -from .utils import create_dummy_prompt +from .utils import create_dummy_prompt, create_dummy_prompt_encoder_decoder def test_block_allocator_allocate(): @@ -89,6 +89,34 @@ def test_allocate(): block_manager.allocate(seq_group) assert block_manager.can_allocate(seq_group) != AllocStatus.OK +def test_allocate_encoder_decoder(): + block_size = 4 + num_cpu_blocks = 4 + num_gpu_blocks = 4 + block_req_per_seq_group = 2 + block_manager = BlockSpaceManagerV1(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0) + + # Allocate same sequence group to all available gpu blocks. + for i in range(num_gpu_blocks//block_req_per_seq_group): + _, _, seq_group = create_dummy_prompt_encoder_decoder(str(i), block_size, block_size) + assert block_manager.can_allocate(seq_group) + block_manager.allocate(seq_group) + assert block_manager.can_allocate(seq_group) != AllocStatus.OK + + # Allocate same sequence group to all available gpu blocks. + # Use watermark to reserve one gpu block. + block_manager = BlockSpaceManagerV1(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=1 / num_gpu_blocks) + for i in range((num_gpu_blocks - 1)//block_req_per_seq_group): + _, _, seq_group = create_dummy_prompt_encoder_decoder(str(i), block_size//2, block_size//2) + assert block_manager.can_allocate(seq_group) + block_manager.allocate(seq_group) + assert block_manager.can_allocate(seq_group) != AllocStatus.OK def test_append_slot_single_seq(): block_size = 4 @@ -240,6 +268,58 @@ def test_swap(): assert before_cpu_blocks + len(cpu_blocks) == after_cpu_blocks assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks) +def test_swap_encoder_decoder(): + block_size = 4 + num_cpu_blocks = 4 + num_gpu_blocks = 4 + block_manager = BlockSpaceManagerV1(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0) + + decoder_prompt, encoder_prompt, seq_group = create_dummy_prompt_encoder_decoder("1", + decoder_prompt_length=block_size, + encoder_prompt_length=block_size) + decoder_prompt.status = SequenceStatus.WAITING + encoder_prompt.status = SequenceStatus.WAITING + block_manager.allocate(seq_group) + + # Emulate a forward pass by appending a single token. + # The block manager then knows how many unprocessed + # tokens will be written in the next forward pass. + token_id = 0 + decoder_prompt.status = SequenceStatus.RUNNING + decoder_prompt.append_token_id(token_id, {token_id: Logprob(0.0)}) + + # Swap encoder/decoder seq group from GPU -> CPU. 
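+    # [Editor's note -- annotation added in review, not part of the patch:
+    # an encoder/decoder group owns two block tables, the decoder's
+    # per-sequence self-attention table and a single cross-attention table
+    # shared by the whole group, so the test concatenates both and checks
+    # that the swap mapping covers every one of those blocks.]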
+ decoder_gpu_blocks = block_manager.get_block_table(decoder_prompt) + encoder_gpu_blocks = block_manager.get_encoder_block_table(seq_group) + gpu_blocks = decoder_gpu_blocks + encoder_gpu_blocks + assert block_manager.can_swap_out(seq_group) + before_cpu_blocks = block_manager.get_num_free_cpu_blocks() + before_gpu_blocks = block_manager.get_num_free_gpu_blocks() + mapping = block_manager.swap_out(seq_group) + assert [x[0] for x in mapping] == gpu_blocks + #assert list(mapping.keys()) == gpu_blocks + after_cpu_blocks = block_manager.get_num_free_cpu_blocks() + after_gpu_blocks = block_manager.get_num_free_gpu_blocks() + assert before_cpu_blocks == after_cpu_blocks + len(gpu_blocks) + assert before_gpu_blocks + len(gpu_blocks) == after_gpu_blocks + decoder_prompt.status = SequenceStatus.SWAPPED + + # Swap decoder seq group from CPU -> GPU. + decoder_cpu_blocks = block_manager.get_block_table(decoder_prompt) + encoder_cpu_blocks = block_manager.get_encoder_block_table(seq_group) + cpu_blocks = decoder_cpu_blocks + encoder_cpu_blocks + assert block_manager.can_swap_in(seq_group) == AllocStatus.OK + before_cpu_blocks = block_manager.get_num_free_cpu_blocks() + before_gpu_blocks = block_manager.get_num_free_gpu_blocks() + mapping = block_manager.swap_in(seq_group) + assert [x[0] for x in mapping] == cpu_blocks + after_cpu_blocks = block_manager.get_num_free_cpu_blocks() + after_gpu_blocks = block_manager.get_num_free_gpu_blocks() + assert before_cpu_blocks + len(cpu_blocks) == after_cpu_blocks + assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks) def test_free(): block_size = 4 @@ -264,6 +344,34 @@ def test_free(): with pytest.raises(KeyError): block_manager.get_block_table(prompt) +def test_free_encoder_decoder(): + block_size = 4 + num_cpu_blocks = 4 + num_gpu_blocks = 4 + block_manager = BlockSpaceManagerV1(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0) + + decoder_prompt, encoder_prompt, seq_group = create_dummy_prompt_encoder_decoder("1", + decoder_prompt_length=block_size//2, + encoder_prompt_length=block_size//2) + block_manager.allocate(seq_group) + + # Free allocated seq. + decoder_prompt_blocks = len(block_manager.get_block_table(decoder_prompt)) + encoder_prompt_blocks = len(block_manager.get_encoder_block_table(seq_group)) + prompt_blocks = decoder_prompt_blocks + encoder_prompt_blocks + before_blocks = block_manager.get_num_free_gpu_blocks() + block_manager.free(decoder_prompt) + block_manager.free_encoder(seq_group) + after_blocks = block_manager.get_num_free_gpu_blocks() + assert after_blocks == before_blocks + prompt_blocks + + # Block table for freed encoder & decoder seq's are deleted. + with pytest.raises(KeyError): + block_manager.get_block_table(decoder_prompt) + block_manager.get_block_table(encoder_prompt) def test_reset(): block_size = 4 @@ -285,6 +393,28 @@ def test_reset(): block_manager.reset() assert block_manager.get_num_free_gpu_blocks() == original_blocks +def test_reset_encoder_decoder(): + block_size = 4 + num_cpu_blocks = 4 + num_gpu_blocks = 4 + block_req_per_seq_group = 2 + block_manager = BlockSpaceManagerV1(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0) + + # Allocate same seq group on all available gpu blocks. 
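+    # [Editor's note -- annotation, not part of the patch: each group here
+    # needs one block for its block_size-token decoder prompt and one for
+    # its encoder prompt, i.e. block_req_per_seq_group == 2, so the
+    # num_gpu_blocks // 2 == 2 iterations below exactly exhaust the pool.]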
+ original_blocks = block_manager.get_num_free_gpu_blocks() + for i in range(num_gpu_blocks//block_req_per_seq_group): + _, _, seq_group = create_dummy_prompt_encoder_decoder(f"{i}", + decoder_prompt_length=block_size, + encoder_prompt_length=block_size) + block_manager.allocate(seq_group) + assert block_manager.get_num_free_gpu_blocks() == 0 + + # Resetting block manager frees all allocated blocks. + block_manager.reset() + assert block_manager.get_num_free_gpu_blocks() == original_blocks def test_sliding_window_multi_seq(): """ From 6e41c39b24e8bdcff76ebbab0b95e16c0603e0b3 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 15 May 2024 09:52:03 -0400 Subject: [PATCH 02/47] passing block manager encoder/decoder test --- tests/core/utils.py | 29 ++++++++ vllm/core/block_manager_v1.py | 130 ++++++++++++++++++++++++++++++++-- vllm/sequence.py | 12 ++++ 3 files changed, 166 insertions(+), 5 deletions(-) diff --git a/tests/core/utils.py b/tests/core/utils.py index 8fb13177a2d6c..170bf9fff3dd2 100644 --- a/tests/core/utils.py +++ b/tests/core/utils.py @@ -32,6 +32,35 @@ def create_dummy_prompt( return prompt, seq_group +def create_dummy_prompt_encoder_decoder( + request_id: str, + decoder_prompt_length: int, + encoder_prompt_length: int, + block_size: Optional[int] = None, + lora_request: Optional[LoRARequest] = None, + use_beam_search: bool = False, + best_of: int = 1, +) -> Tuple[Sequence, SequenceGroup]: + if not block_size: + block_size = decoder_prompt_length + + # Create dummy prompt sequence with tokens 0...block_size-1 + # and prompt "0 ... block_size". + decoder_prompt_tokens = list(range(decoder_prompt_length)) + decoder_prompt_str = " ".join([str(t) for t in decoder_prompt_tokens]) + decoder_prompt = Sequence(int(request_id), decoder_prompt_str, decoder_prompt_tokens, block_size) + encoder_prompt_tokens = list(reversed(list(range(encoder_prompt_length)))) + encoder_prompt_str = " ".join([str(t) for t in encoder_prompt_tokens]) + encoder_prompt = Sequence(int(request_id), encoder_prompt_str, encoder_prompt_tokens, block_size) + seq_group = SequenceGroup( + request_id=request_id, + seqs=[decoder_prompt], + sampling_params=SamplingParams(use_beam_search=use_beam_search, best_of=best_of), + arrival_time=time.time(), + lora_request=lora_request, + encoder_seq=encoder_prompt) + + return decoder_prompt, encoder_prompt, seq_group def create_seq_group( seq_prompt_len: int = 1024, diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py index 52a170d79e4e7..bd2ccbbb86572 100644 --- a/vllm/core/block_manager_v1.py +++ b/vllm/core/block_manager_v1.py @@ -255,12 +255,23 @@ def __init__( Device.CPU, block_size, num_cpu_blocks) # Mapping: seq_id -> BlockTable. self.block_tables: Dict[int, BlockTable] = {} + # Mapping: req_id -> BlockTable + # Note that each SequenceGroup has a unique + # request ID + self.encoder_block_tables: Dict[str, BlockTable] = {} + + def get_seq_num_required_blocks(self, seq: Sequence) -> int: + if seq is None: + return 0 + return len(seq.logical_token_blocks) def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: # FIXME(woosuk): Here we assume that all sequences in the group share # the same prompt. This may not be true for preempted sequences. 
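         # [Editor's note -- annotation, not part of the patch: the change
         # below budgets for both block tables of an encoder/decoder request.
         # For example, with block_size=4, a 4-token decoder prompt plus a
         # 4-token encoder prompt require 1 + 1 = 2 blocks, and that total
         # is what gets compared against the free-block watermark.]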
- seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0] - num_required_blocks = len(seq.logical_token_blocks) + + decoder_num_required_blocks = self.get_seq_num_required_blocks(seq_group.get_seqs(status=SequenceStatus.WAITING)[0]) + encoder_num_required_blocks = self.get_seq_num_required_blocks(seq_group.get_encoder_seq()) + num_required_blocks = decoder_num_required_blocks+encoder_num_required_blocks if self.block_sliding_window is not None: num_required_blocks = min(num_required_blocks, @@ -276,9 +287,9 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: else: return AllocStatus.LATER - def allocate(self, seq_group: SequenceGroup) -> None: + def allocate_decoder(self, seq_group: SequenceGroup) -> None: # NOTE: Here we assume that all sequences in the group have the same - # prompt. + # decoder prompt. seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0] # Allocate new physical token blocks that will store the prompt tokens. @@ -301,10 +312,46 @@ def allocate(self, seq_group: SequenceGroup) -> None: block.ref_count = seq_group.num_seqs() block_table.append(block) - # Assign the block table for each sequence. + # Assign the decoder block table for each sequence. for seq in seq_group.get_seqs(status=SequenceStatus.WAITING): self.block_tables[seq.seq_id] = block_table.copy() + def allocate_encoder(self, seq_group: SequenceGroup) -> None: + # NOTE: Here we assume that all sequences in the group have the same + # encoder prompt. + seq = seq_group.get_encoder_seq() + + # Allocate new physical token blocks that will store the prompt tokens. + block_table: BlockTable = [] + if seq is None: + # Assign empty encoder block table for the SequenceGroup + self.encoder_block_tables[seq_group.request_id] = block_table + else: + num_prompt_blocks = len(seq.logical_token_blocks) + for logical_idx in range(num_prompt_blocks): + if (self.block_sliding_window is not None + and logical_idx >= self.block_sliding_window): + block = block_table[logical_idx % self.block_sliding_window] + # Set the reference counts of the token blocks. + block.ref_count = seq_group.num_seqs() + elif self.enable_caching: + block = self.gpu_allocator.allocate( + seq.hash_of_block(logical_idx), + seq.num_hashed_tokens_of_block(logical_idx)) + else: + block = self.gpu_allocator.allocate() + # Set the reference counts of the token blocks. + # TODO: feature not supported with encoder/decoder + block.ref_count = seq_group.num_seqs() + block_table.append(block) + + # Assign the encoder block table for the SequenceGroup. + self.encoder_block_tables[seq_group.request_id] = block_table + + def allocate(self, seq_group: SequenceGroup) -> None: + self.allocate_decoder(seq_group) + self.allocate_encoder(seq_group) + def can_append_slots(self, seq_group: SequenceGroup, num_lookahead_slots: int = 0) -> bool: @@ -445,11 +492,15 @@ def _get_physical_blocks( self, seq_group: SequenceGroup) -> List[PhysicalTokenBlock]: # NOTE: Here, we assume that the physical blocks are only shared by # the sequences in the same group. 
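         # [Editor's note -- annotation, not part of the patch: the
         # cross-attention table is keyed by request_id rather than seq_id,
         # since a SequenceGroup has exactly one encoder sequence; its blocks
         # therefore join the swappable set once per group, not per sequence.]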
+ request_id = seq_group.request_id blocks: Set[PhysicalTokenBlock] = set() for seq in seq_group.get_seqs(): if seq.is_finished(): continue blocks.update(self.block_tables[seq.seq_id]) + # Encoder blocks + if seq_group.encoder_seq is not None: + blocks.update(self.encoder_block_tables[request_id]) return list(blocks) def can_swap_in(self, @@ -459,6 +510,8 @@ def can_swap_in(self, ), "BlockSpaceManagerV1 does not support lookahead allocation" blocks = self._get_physical_blocks(seq_group) num_swapped_seqs = seq_group.num_seqs(status=SequenceStatus.SWAPPED) + if seq_group.encoder_seq is not None: + num_swapped_seqs += 1 num_free_blocks = self.gpu_allocator.get_num_free_blocks() # NOTE: Conservatively, we assume that every sequence will allocate # at least one free block right after the swap-in. @@ -477,6 +530,8 @@ def swap_in(self, assert (num_lookahead_slots == 0 ), "BlockSpaceManagerV1 does not support lookahead allocation" + request_id = seq_group.request_id + # CPU block -> GPU block. # dict is efficient in lookup `if cpu_block in mapping` mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock] = {} @@ -497,6 +552,23 @@ def swap_in(self, self.cpu_allocator.free(cpu_block) self.block_tables[seq.seq_id] = new_block_table + if seq_group.encoder_seq is not None: + new_block_table: BlockTable = [] + block_table = self.encoder_block_tables[request_id] + + for cpu_block in block_table: + if cpu_block in mapping: + gpu_block = mapping[cpu_block] + gpu_block.ref_count += 1 + else: + gpu_block = self.gpu_allocator.allocate( + cpu_block.block_hash, cpu_block.num_hashed_tokens) + mapping[cpu_block] = gpu_block + new_block_table.append(gpu_block) + # Free the CPU block swapped in to GPU. + self.cpu_allocator.free(cpu_block) + self.encoder_block_tables[request_id] = new_block_table + block_number_mapping = { cpu_block.block_number: gpu_block.block_number for cpu_block, gpu_block in mapping.items() @@ -509,6 +581,8 @@ def can_swap_out(self, seq_group: SequenceGroup) -> bool: return len(blocks) <= self.cpu_allocator.get_num_free_blocks() def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: + request_id = seq_group.request_id + # GPU block -> CPU block. # dict is efficient in lookup `if gpu_block in mapping` mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock] = {} @@ -529,6 +603,23 @@ def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: self.gpu_allocator.free(gpu_block) self.block_tables[seq.seq_id] = new_block_table + if seq_group.encoder_seq is not None: + new_block_table: BlockTable = [] + block_table = self.encoder_block_tables[request_id] + + for gpu_block in block_table: + if gpu_block in mapping: + cpu_block = mapping[gpu_block] + cpu_block.ref_count += 1 + else: + cpu_block = self.cpu_allocator.allocate( + gpu_block.block_hash, gpu_block.num_hashed_tokens) + mapping[gpu_block] = cpu_block + new_block_table.append(cpu_block) + # Free the GPU block swapped out to CPU. + self.gpu_allocator.free(gpu_block) + self.encoder_block_tables[request_id] = new_block_table + block_number_mapping = { gpu_block.block_number: cpu_block.block_number for gpu_block, cpu_block in mapping.items() @@ -559,15 +650,32 @@ def free(self, seq: Sequence) -> None: self._free_block_table(block_table) del self.block_tables[seq.seq_id] + def free_encoder(self, seq_group: SequenceGroup) -> None: + if seq_group.request_id not in self.encoder_block_tables: + # Already freed or hasn't ben scheduled yet. 
+ return + block_table = self.encoder_block_tables[seq_group.request_id] + self._free_block_table(block_table) + del self.encoder_block_tables[seq_group.request_id] + def reset(self) -> None: + # Free decoder block tables for block_table in self.block_tables.values(): self._free_block_table(block_table) self.block_tables.clear() + # Free encoder block tables + for block_table in self.encoder_block_tables.values(): + self._free_block_table(block_table) + self.encoder_block_tables.clear() def get_block_table(self, seq: Sequence) -> List[int]: block_table = self.block_tables[seq.seq_id] return [block.block_number for block in block_table] + def get_encoder_block_table(self, seq_group: SequenceGroup) -> List[int]: + block_table = self.encoder_block_tables[seq_group.request_id] + return [block.block_number for block in block_table] + def get_num_free_gpu_blocks(self) -> int: return self.gpu_allocator.get_num_free_blocks() @@ -586,6 +694,18 @@ def access_all_blocks_in_seq( for block in block_table: block.last_accessed = access_time + def access_all_encoder_blocks_in_seq_group( + self, + seq_group: SequenceGroup, + access_time: float, + ) -> None: + if self.enable_caching: + # Update the last accessed time of all the blocks accessed + # in this step. + block_table = self.encoder_block_tables[seq_group.request_id] + for block in block_table: + block.last_accessed = access_time + def compute_full_blocks_in_seq(self, seq: Sequence): if seq.seq_id not in self.block_tables: return diff --git a/vllm/sequence.py b/vllm/sequence.py index aa759448d82b1..ca2de3ef0d774 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -420,6 +420,7 @@ class SequenceGroup: for an embedding model. pooling_params: The pooling parameters used to generate the pooling for an embedding model. + encoder_seq: Optional, the single encoder sequence. """ def __init__( @@ -432,6 +433,7 @@ def __init__( multi_modal_data: Optional[MultiModalData] = None, embeddings: Optional[List[float]] = None, pooling_params: Optional[PoolingParams] = None, + encoder_seq: Optional[Sequence] = None, ) -> None: self.request_id = request_id self.seqs_dict = {seq.seq_id: seq for seq in seqs} @@ -447,6 +449,7 @@ def __init__( self.multi_modal_data = multi_modal_data self.embeddings = embeddings self.pooling_params = pooling_params + self.encoder_seq = encoder_seq @property def prompt(self) -> str: @@ -524,6 +527,9 @@ def get_seqs( seq for seq in self.seqs_dict.values() if seq.status == status ] + def get_encoder_seq(self) -> Sequence: + return self.encoder_seq + def get_unfinished_seqs(self) -> List[Sequence]: return [ seq for seq in self.seqs_dict.values() if not seq.is_finished() @@ -607,6 +613,8 @@ class SequenceGroupMetadata: used in prefix caching. state: Internal state tied to this sequence group. multi_modal_data: Multi modal data. + encoder_seq_data: Optional, the sequence data for the single encoder prompt. + encoder_block_table: Optional, the block table for the single encoder prompt. 
""" def __init__( @@ -623,6 +631,8 @@ def __init__( computed_block_nums: Optional[List[int]] = None, state: Optional[SequenceGroupState] = None, multi_modal_data: Optional[MultiModalData] = None, + encoder_seq_data: Optional[SequenceData] = None, + encoder_block_table: Optional[Dict[int, List[int]]] = None, ) -> None: self.request_id = request_id self.is_prompt = is_prompt @@ -634,6 +644,8 @@ def __init__( self.computed_block_nums = computed_block_nums self.multi_modal_data = multi_modal_data self.state = SequenceGroupState() if state is None else state + self.encoder_seq_data = encoder_seq_data + self.encoder_block_table = encoder_block_table self._token_chunk_size = token_chunk_size self.do_sample = do_sample From f04ee73114eb50dbf03cb1d2a9ecd238705db035 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 15 May 2024 14:22:04 -0400 Subject: [PATCH 03/47] block manager v2 changes to pass test_can_allocate_seq_group_encoder_decoder --- tests/core/block/test_block_manager_v2.py | 49 ++++++++++++++++++++++- tests/core/utils.py | 49 +++++++++++++++++++++++ vllm/core/block_manager_v2.py | 6 +++ 3 files changed, 103 insertions(+), 1 deletion(-) diff --git a/tests/core/block/test_block_manager_v2.py b/tests/core/block/test_block_manager_v2.py index 1e8e4ccdfb151..6cb2f3708199f 100644 --- a/tests/core/block/test_block_manager_v2.py +++ b/tests/core/block/test_block_manager_v2.py @@ -5,7 +5,7 @@ from vllm.sequence import Logprob, SequenceStatus from vllm.utils import chunk_list -from ..utils import create_seq_group +from ..utils import create_seq_group, create_seq_group_encoder_decoder @pytest.mark.parametrize("block_size", [16]) @@ -52,6 +52,53 @@ def test_can_allocate_seq_group(block_size: int, num_seqs_per_group: int, assert can_allocate_result == AllocStatus.LATER +@pytest.mark.parametrize("block_size", [16]) +@pytest.mark.parametrize("num_gpu_blocks", [16, 80, 160]) +@pytest.mark.parametrize("num_seqs_per_group", [1, 4]) +@pytest.mark.parametrize("watermark", [0.0, 0.5]) +def test_can_allocate_seq_group_encoder_decoder(block_size: int, num_seqs_per_group: int, + num_gpu_blocks: int, watermark: float): + block_manager = BlockSpaceManagerV2( + block_size=block_size, + num_gpu_blocks=num_gpu_blocks, + num_cpu_blocks=1024, + watermark=watermark, + ) + num_watermark_blocks = int(watermark * num_gpu_blocks) + + num_output_blocks_per_seq = 1 + + # NOTE: This should be num_output_blocks_per_seq * num_seqs_per_group, but + # the current implementation assumes all seqs are new prompts / don't have + # different output lens. 
+ num_output_blocks = num_output_blocks_per_seq + + for bdx,num_prompt_blocks in enumerate(range(1, num_gpu_blocks - num_output_blocks)): + num_encoder_blocks_per_seq = num_prompt_blocks + + seq_group = create_seq_group_encoder_decoder( + seq_prompt_len=block_size * num_prompt_blocks, + seq_output_lens=[ + block_size * num_output_blocks_per_seq + for _ in range(num_seqs_per_group) + ], + request_id=str(bdx) + ) + + assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks + + can_allocate_result = block_manager.can_allocate(seq_group) + + num_required_blocks = num_prompt_blocks + num_output_blocks + num_encoder_blocks_per_seq + + if num_gpu_blocks - num_required_blocks < num_watermark_blocks: + assert can_allocate_result == AllocStatus.NEVER + elif num_gpu_blocks >= num_required_blocks: + assert can_allocate_result == AllocStatus.OK + else: + assert can_allocate_result == AllocStatus.LATER + + @pytest.mark.parametrize("block_size", [1, 8]) @pytest.mark.parametrize("prompt_len", [1, 7, 8]) @pytest.mark.parametrize("num_slots_to_append", [1, 8, 129]) diff --git a/tests/core/utils.py b/tests/core/utils.py index 170bf9fff3dd2..91930457bd25b 100644 --- a/tests/core/utils.py +++ b/tests/core/utils.py @@ -102,5 +102,54 @@ def create_seq_group( return seq_group +def create_seq_group_encoder_decoder( + seq_prompt_len: int = 1024, + seq_output_lens: Iterable[int] = (128, ), + request_id: str = '0', + seq_id_start: int = 0, + sampling_params: Optional[SamplingParams] = None) -> SequenceGroup: + + assert len(seq_output_lens) > 0 + + if sampling_params is None: + sampling_params = SamplingParams() + + prompt_token_ids = [0] * seq_prompt_len + + seqs = [] + for seq_id_offset, output_len in enumerate(seq_output_lens): + seq = Sequence( + seq_id=seq_id_start + seq_id_offset, + prompt="", + prompt_token_ids=prompt_token_ids, + block_size=16, + ) + + for i in range(output_len): + seq.append_token_id( + token_id=i, + logprobs={i: Logprob(0.0)}, + ) + seqs.append(seq) + + # Encoder sequence + encoder_seq = Sequence( + seq_id=seq_id_start + len(seq_output_lens), + prompt="", + prompt_token_ids=prompt_token_ids, + block_size=16, + ) + + seq_group = SequenceGroup( + request_id=request_id, + seqs=seqs, + sampling_params=sampling_params, + arrival_time=time.time(), + encoder_seq=encoder_seq + ) + + return seq_group + + def round_up_to_next_block(seq_len: int, block_size: int) -> int: return (seq_len + block_size - 1) // block_size diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index f0bc96564050a..06bfbba78dce6 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -96,6 +96,12 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: block_size=self.block_size, ) + if seq_group.encoder_seq is not None: + num_required_blocks += BlockTable.get_num_required_blocks( + seq_group.encoder_seq.get_token_ids(), + block_size=self.block_size, + ) + assert self.block_sliding_window is None if self.block_sliding_window is not None: num_required_blocks = min(num_required_blocks, From 07bbd8ac4c44f50f42137350ee928483842d02ee Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 15 May 2024 14:47:47 -0400 Subject: [PATCH 04/47] block manager v2 support for encoder/decoder --- vllm/core/block_manager_v1.py | 9 ++---- vllm/core/block_manager_v2.py | 59 ++++++++++++++++++++++++++++++++++- 2 files changed, 61 insertions(+), 7 deletions(-) diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py index bd2ccbbb86572..812d1ee3197a5 100644 --- 
a/vllm/core/block_manager_v1.py +++ b/vllm/core/block_manager_v1.py @@ -319,14 +319,11 @@ def allocate_decoder(self, seq_group: SequenceGroup) -> None: def allocate_encoder(self, seq_group: SequenceGroup) -> None: # NOTE: Here we assume that all sequences in the group have the same # encoder prompt. - seq = seq_group.get_encoder_seq() # Allocate new physical token blocks that will store the prompt tokens. - block_table: BlockTable = [] - if seq is None: - # Assign empty encoder block table for the SequenceGroup - self.encoder_block_tables[seq_group.request_id] = block_table - else: + seq = seq_group.get_encoder_seq() + if seq is not None: + block_table: BlockTable = [] num_prompt_blocks = len(seq.logical_token_blocks) for logical_idx in range(num_prompt_blocks): if (self.block_sliding_window is not None diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 06bfbba78dce6..2f7a11bacc1a1 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -10,6 +10,7 @@ from vllm.utils import Device SeqId = int +EncoderSeqId = str class BlockSpaceManagerV2(BlockSpaceManager): @@ -85,6 +86,7 @@ def __init__( ) self.block_tables: Dict[SeqId, BlockTable] = {} + self.encoder_block_tables: Dict[EncoderSeqId, BlockTable] = {} def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: # FIXME(woosuk): Here we assume that all sequences in the group share @@ -119,7 +121,7 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: else: return AllocStatus.LATER - def allocate(self, seq_group: SequenceGroup) -> None: + def allocate_decoder(self, seq_group: SequenceGroup) -> None: waiting_seqs = seq_group.get_seqs(status=SequenceStatus.WAITING) assert not (set(seq.seq_id for seq in waiting_seqs) & self.block_tables.keys()), "block table already exists" @@ -140,6 +142,28 @@ def allocate(self, seq_group: SequenceGroup) -> None: for seq in waiting_seqs[1:]: self.block_tables[seq.seq_id] = block_table.fork() + def allocate_encoder(self, seq_group: SequenceGroup) -> None: + # NOTE: Here we assume that all sequences in the group have the same + # prompt. + request_id = seq_group.request_id + seq = seq_group.encoder_seq + + assert not (request_id in self.encoder_block_tables), "block table already exists" + + seq = seq_group.get_encoder_seq() + if seq is not None: + block_table = BlockTable( + block_size=self.block_size, + block_allocator=self.block_allocator, + ) + assert self.block_sliding_window is None + block_table.allocate(seq.get_token_ids()) + self.encoder_block_tables[request_id] = block_table + + def allocate(self, seq_group: SequenceGroup) -> None: + self.allocate_decoder(seq_group) + self.allocate_encoder(seq_group) + def can_append_slots(self, seq_group: SequenceGroup, num_lookahead_slots: int) -> bool: """Determine if there is enough space in the GPU KV cache to continue @@ -193,12 +217,29 @@ def free(self, seq: Sequence) -> None: self.block_tables[seq.seq_id].free() del self.block_tables[seq.seq_id] + def free_encoder(self, seq_group: SequenceGroup) -> None: + request_id = seq_group.request_id + if request_id not in self.encoder_block_tables: + # Already freed or hasn't ben scheduled yet. 
+ return + self.encoder_block_tables[request_id].free() + del self.encoder_block_tables[request_id] + + del self.encoder_block_tables[seq_group.request_id] + def get_block_table(self, seq: Sequence) -> List[int]: assert seq.seq_id in self.block_tables block_ids = self.block_tables[seq.seq_id].physical_block_ids assert all(b is not None for b in block_ids) return block_ids # type: ignore + def get_encoder_block_table(self, seq_group: SequenceGroup) -> List[int]: + request_id = seq_group.request_id + assert request_id in self.encoder_block_tables + block_ids = self.block_tables[request_id].physical_block_ids + assert all(b is not None for b in block_ids) + return block_ids + def access_all_blocks_in_seq(self, seq: Sequence, now: float): # Update the last accessed time of all the blocks accessed # in this step. @@ -215,6 +256,22 @@ def access_all_blocks_in_seq(self, seq: Sequence, now: float): block_ids, # type: ignore now) + def access_all_encoder_blocks_in_seq_group( + self, + seq_group: SequenceGroup, + now: float, + ) -> None: + if self.enable_caching: + # Update the last accessed time of all the blocks accessed + # in this step. + block_table = self.encoder_block_tables[seq_group.request_id] + block_ids = [] + for block_id in block_table.physical_block_ids: + block_ids.append(block_id) + self.block_allocator.mark_blocks_as_accessed( + block_ids, # type: ignore + now) + def mark_blocks_as_computed(self, seq_group: SequenceGroup): # The only need for mark block as computed is for prefix caching, # while currently we could determine whether one block is computed From 3e95602f9c408f82628e881f30540ac82b3cb5f7 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 15 May 2024 15:11:35 -0400 Subject: [PATCH 05/47] renamed encoder to cross in block manager v2, regarding block tables --- vllm/core/block_manager_v2.py | 32 ++++++++++++++++---------------- vllm/sequence.py | 6 +++--- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 2f7a11bacc1a1..426612f615508 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -86,7 +86,7 @@ def __init__( ) self.block_tables: Dict[SeqId, BlockTable] = {} - self.encoder_block_tables: Dict[EncoderSeqId, BlockTable] = {} + self.cross_block_tables: Dict[EncoderSeqId, BlockTable] = {} def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: # FIXME(woosuk): Here we assume that all sequences in the group share @@ -121,7 +121,7 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: else: return AllocStatus.LATER - def allocate_decoder(self, seq_group: SequenceGroup) -> None: + def allocate_self_block_tables(self, seq_group: SequenceGroup) -> None: waiting_seqs = seq_group.get_seqs(status=SequenceStatus.WAITING) assert not (set(seq.seq_id for seq in waiting_seqs) & self.block_tables.keys()), "block table already exists" @@ -142,13 +142,13 @@ def allocate_decoder(self, seq_group: SequenceGroup) -> None: for seq in waiting_seqs[1:]: self.block_tables[seq.seq_id] = block_table.fork() - def allocate_encoder(self, seq_group: SequenceGroup) -> None: + def allocate_cross_block_table(self, seq_group: SequenceGroup) -> None: # NOTE: Here we assume that all sequences in the group have the same # prompt. 
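         # [Editor's note -- annotation, not part of the patch: "cross"
         # refers to the cross-attention KV cache; a request gets at most one
         # cross block table, allocated from the encoder prompt's token ids
         # and looked up by request_id.]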
request_id = seq_group.request_id seq = seq_group.encoder_seq - assert not (request_id in self.encoder_block_tables), "block table already exists" + assert not (request_id in self.cross_block_tables), "block table already exists" seq = seq_group.get_encoder_seq() if seq is not None: @@ -158,11 +158,11 @@ def allocate_encoder(self, seq_group: SequenceGroup) -> None: ) assert self.block_sliding_window is None block_table.allocate(seq.get_token_ids()) - self.encoder_block_tables[request_id] = block_table + self.cross_block_tables[request_id] = block_table def allocate(self, seq_group: SequenceGroup) -> None: - self.allocate_decoder(seq_group) - self.allocate_encoder(seq_group) + self.allocate_self_block_tables(seq_group) + self.allocate_cross_block_table(seq_group) def can_append_slots(self, seq_group: SequenceGroup, num_lookahead_slots: int) -> bool: @@ -217,15 +217,15 @@ def free(self, seq: Sequence) -> None: self.block_tables[seq.seq_id].free() del self.block_tables[seq.seq_id] - def free_encoder(self, seq_group: SequenceGroup) -> None: + def free_cross(self, seq_group: SequenceGroup) -> None: request_id = seq_group.request_id - if request_id not in self.encoder_block_tables: + if request_id not in self.cross_block_tables: # Already freed or hasn't ben scheduled yet. return - self.encoder_block_tables[request_id].free() - del self.encoder_block_tables[request_id] + self.cross_block_tables[request_id].free() + del self.cross_block_tables[request_id] - del self.encoder_block_tables[seq_group.request_id] + del self.cross_block_tables[seq_group.request_id] def get_block_table(self, seq: Sequence) -> List[int]: assert seq.seq_id in self.block_tables @@ -233,9 +233,9 @@ def get_block_table(self, seq: Sequence) -> List[int]: assert all(b is not None for b in block_ids) return block_ids # type: ignore - def get_encoder_block_table(self, seq_group: SequenceGroup) -> List[int]: + def get_cross_block_table(self, seq_group: SequenceGroup) -> List[int]: request_id = seq_group.request_id - assert request_id in self.encoder_block_tables + assert request_id in self.cross_block_tables block_ids = self.block_tables[request_id].physical_block_ids assert all(b is not None for b in block_ids) return block_ids @@ -256,7 +256,7 @@ def access_all_blocks_in_seq(self, seq: Sequence, now: float): block_ids, # type: ignore now) - def access_all_encoder_blocks_in_seq_group( + def access_all_cross_blocks_in_seq_group( self, seq_group: SequenceGroup, now: float, @@ -264,7 +264,7 @@ def access_all_encoder_blocks_in_seq_group( if self.enable_caching: # Update the last accessed time of all the blocks accessed # in this step. - block_table = self.encoder_block_tables[seq_group.request_id] + block_table = self.cross_block_tables[seq_group.request_id] block_ids = [] for block_id in block_table.physical_block_ids: block_ids.append(block_id) diff --git a/vllm/sequence.py b/vllm/sequence.py index ca2de3ef0d774..a73e70c1ae69d 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -614,7 +614,7 @@ class SequenceGroupMetadata: state: Internal state tied to this sequence group. multi_modal_data: Multi modal data. encoder_seq_data: Optional, the sequence data for the single encoder prompt. - encoder_block_table: Optional, the block table for the single encoder prompt. + cross_block_table: Optional, the cross-attention block table associated with the single encoder prompt. 
""" def __init__( @@ -632,7 +632,7 @@ def __init__( state: Optional[SequenceGroupState] = None, multi_modal_data: Optional[MultiModalData] = None, encoder_seq_data: Optional[SequenceData] = None, - encoder_block_table: Optional[Dict[int, List[int]]] = None, + cross_block_table: Optional[Dict[int, List[int]]] = None, ) -> None: self.request_id = request_id self.is_prompt = is_prompt @@ -645,7 +645,7 @@ def __init__( self.multi_modal_data = multi_modal_data self.state = SequenceGroupState() if state is None else state self.encoder_seq_data = encoder_seq_data - self.encoder_block_table = encoder_block_table + self.cross_block_table = cross_block_table self._token_chunk_size = token_chunk_size self.do_sample = do_sample From 04f38a819445c0141246feeb6969cc4b1e67891f Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 15 May 2024 15:22:53 -0400 Subject: [PATCH 06/47] renamed encoder to cross where appropriate --- tests/core/block/test_block_manager_v2.py | 4 +- tests/core/test_block_manager.py | 12 ++--- vllm/core/block_manager_v1.py | 54 +++++++++++------------ 3 files changed, 35 insertions(+), 35 deletions(-) diff --git a/tests/core/block/test_block_manager_v2.py b/tests/core/block/test_block_manager_v2.py index 6cb2f3708199f..9b1c6cd68a15a 100644 --- a/tests/core/block/test_block_manager_v2.py +++ b/tests/core/block/test_block_manager_v2.py @@ -74,7 +74,7 @@ def test_can_allocate_seq_group_encoder_decoder(block_size: int, num_seqs_per_gr num_output_blocks = num_output_blocks_per_seq for bdx,num_prompt_blocks in enumerate(range(1, num_gpu_blocks - num_output_blocks)): - num_encoder_blocks_per_seq = num_prompt_blocks + num_cross_blocks_per_seq = num_prompt_blocks seq_group = create_seq_group_encoder_decoder( seq_prompt_len=block_size * num_prompt_blocks, @@ -89,7 +89,7 @@ def test_can_allocate_seq_group_encoder_decoder(block_size: int, num_seqs_per_gr can_allocate_result = block_manager.can_allocate(seq_group) - num_required_blocks = num_prompt_blocks + num_output_blocks + num_encoder_blocks_per_seq + num_required_blocks = num_prompt_blocks + num_output_blocks + num_cross_blocks_per_seq if num_gpu_blocks - num_required_blocks < num_watermark_blocks: assert can_allocate_result == AllocStatus.NEVER diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py index 6b2fa21f2ef46..62b7132e40462 100644 --- a/tests/core/test_block_manager.py +++ b/tests/core/test_block_manager.py @@ -293,8 +293,8 @@ def test_swap_encoder_decoder(): # Swap encoder/decoder seq group from GPU -> CPU. decoder_gpu_blocks = block_manager.get_block_table(decoder_prompt) - encoder_gpu_blocks = block_manager.get_encoder_block_table(seq_group) - gpu_blocks = decoder_gpu_blocks + encoder_gpu_blocks + cross_gpu_blocks = block_manager.get_cross_block_table(seq_group) + gpu_blocks = decoder_gpu_blocks + cross_gpu_blocks assert block_manager.can_swap_out(seq_group) before_cpu_blocks = block_manager.get_num_free_cpu_blocks() before_gpu_blocks = block_manager.get_num_free_gpu_blocks() @@ -309,8 +309,8 @@ def test_swap_encoder_decoder(): # Swap decoder seq group from CPU -> GPU. 
decoder_cpu_blocks = block_manager.get_block_table(decoder_prompt) - encoder_cpu_blocks = block_manager.get_encoder_block_table(seq_group) - cpu_blocks = decoder_cpu_blocks + encoder_cpu_blocks + cross_cpu_blocks = block_manager.get_cross_block_table(seq_group) + cpu_blocks = decoder_cpu_blocks + cross_cpu_blocks assert block_manager.can_swap_in(seq_group) == AllocStatus.OK before_cpu_blocks = block_manager.get_num_free_cpu_blocks() before_gpu_blocks = block_manager.get_num_free_gpu_blocks() @@ -360,11 +360,11 @@ def test_free_encoder_decoder(): # Free allocated seq. decoder_prompt_blocks = len(block_manager.get_block_table(decoder_prompt)) - encoder_prompt_blocks = len(block_manager.get_encoder_block_table(seq_group)) + encoder_prompt_blocks = len(block_manager.get_cross_block_table(seq_group)) prompt_blocks = decoder_prompt_blocks + encoder_prompt_blocks before_blocks = block_manager.get_num_free_gpu_blocks() block_manager.free(decoder_prompt) - block_manager.free_encoder(seq_group) + block_manager.free_cross(seq_group) after_blocks = block_manager.get_num_free_gpu_blocks() assert after_blocks == before_blocks + prompt_blocks diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py index 812d1ee3197a5..11a52b3618b44 100644 --- a/vllm/core/block_manager_v1.py +++ b/vllm/core/block_manager_v1.py @@ -258,7 +258,7 @@ def __init__( # Mapping: req_id -> BlockTable # Note that each SequenceGroup has a unique # request ID - self.encoder_block_tables: Dict[str, BlockTable] = {} + self.cross_block_tables: Dict[str, BlockTable] = {} def get_seq_num_required_blocks(self, seq: Sequence) -> int: if seq is None: @@ -269,9 +269,9 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: # FIXME(woosuk): Here we assume that all sequences in the group share # the same prompt. This may not be true for preempted sequences. - decoder_num_required_blocks = self.get_seq_num_required_blocks(seq_group.get_seqs(status=SequenceStatus.WAITING)[0]) - encoder_num_required_blocks = self.get_seq_num_required_blocks(seq_group.get_encoder_seq()) - num_required_blocks = decoder_num_required_blocks+encoder_num_required_blocks + self_num_required_blocks = self.get_seq_num_required_blocks(seq_group.get_seqs(status=SequenceStatus.WAITING)[0]) + cross_num_required_blocks = self.get_seq_num_required_blocks(seq_group.get_encoder_seq()) + num_required_blocks = self_num_required_blocks+cross_num_required_blocks if self.block_sliding_window is not None: num_required_blocks = min(num_required_blocks, @@ -287,7 +287,7 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: else: return AllocStatus.LATER - def allocate_decoder(self, seq_group: SequenceGroup) -> None: + def allocate_self_block_tables(self, seq_group: SequenceGroup) -> None: # NOTE: Here we assume that all sequences in the group have the same # decoder prompt. seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0] @@ -316,7 +316,7 @@ def allocate_decoder(self, seq_group: SequenceGroup) -> None: for seq in seq_group.get_seqs(status=SequenceStatus.WAITING): self.block_tables[seq.seq_id] = block_table.copy() - def allocate_encoder(self, seq_group: SequenceGroup) -> None: + def allocate_cross_block_table(self, seq_group: SequenceGroup) -> None: # NOTE: Here we assume that all sequences in the group have the same # encoder prompt. 
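# [Editor's note -- the hunks below complete for v1 the rename begun in
# PATCH 05 for v2: encoder_block_tables -> cross_block_tables,
# free_encoder -> free_cross, get_encoder_block_table ->
# get_cross_block_table, and access_all_encoder_blocks_in_seq_group ->
# access_all_cross_blocks_in_seq_group.]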
@@ -342,12 +342,12 @@ def allocate_encoder(self, seq_group: SequenceGroup) -> None: block.ref_count = seq_group.num_seqs() block_table.append(block) - # Assign the encoder block table for the SequenceGroup. - self.encoder_block_tables[seq_group.request_id] = block_table + # Assign the cross-attention block table for the SequenceGroup. + self.cross_block_tables[seq_group.request_id] = block_table def allocate(self, seq_group: SequenceGroup) -> None: - self.allocate_decoder(seq_group) - self.allocate_encoder(seq_group) + self.allocate_self_block_tables(seq_group) + self.allocate_cross_block_table(seq_group) def can_append_slots(self, seq_group: SequenceGroup, @@ -495,9 +495,9 @@ def _get_physical_blocks( if seq.is_finished(): continue blocks.update(self.block_tables[seq.seq_id]) - # Encoder blocks + # Cross-attention blocks if seq_group.encoder_seq is not None: - blocks.update(self.encoder_block_tables[request_id]) + blocks.update(self.cross_block_tables[request_id]) return list(blocks) def can_swap_in(self, @@ -551,7 +551,7 @@ def swap_in(self, if seq_group.encoder_seq is not None: new_block_table: BlockTable = [] - block_table = self.encoder_block_tables[request_id] + block_table = self.cross_block_tables[request_id] for cpu_block in block_table: if cpu_block in mapping: @@ -564,7 +564,7 @@ def swap_in(self, new_block_table.append(gpu_block) # Free the CPU block swapped in to GPU. self.cpu_allocator.free(cpu_block) - self.encoder_block_tables[request_id] = new_block_table + self.cross_block_tables[request_id] = new_block_table block_number_mapping = { cpu_block.block_number: gpu_block.block_number @@ -602,7 +602,7 @@ def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: if seq_group.encoder_seq is not None: new_block_table: BlockTable = [] - block_table = self.encoder_block_tables[request_id] + block_table = self.cross_block_tables[request_id] for gpu_block in block_table: if gpu_block in mapping: @@ -615,7 +615,7 @@ def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: new_block_table.append(cpu_block) # Free the GPU block swapped out to CPU. self.gpu_allocator.free(gpu_block) - self.encoder_block_tables[request_id] = new_block_table + self.cross_block_tables[request_id] = new_block_table block_number_mapping = { gpu_block.block_number: cpu_block.block_number @@ -647,30 +647,30 @@ def free(self, seq: Sequence) -> None: self._free_block_table(block_table) del self.block_tables[seq.seq_id] - def free_encoder(self, seq_group: SequenceGroup) -> None: - if seq_group.request_id not in self.encoder_block_tables: + def free_cross(self, seq_group: SequenceGroup) -> None: + if seq_group.request_id not in self.cross_block_tables: # Already freed or hasn't ben scheduled yet. 
return - block_table = self.encoder_block_tables[seq_group.request_id] + block_table = self.cross_block_tables[seq_group.request_id] self._free_block_table(block_table) - del self.encoder_block_tables[seq_group.request_id] + del self.cross_block_tables[seq_group.request_id] def reset(self) -> None: # Free decoder block tables for block_table in self.block_tables.values(): self._free_block_table(block_table) self.block_tables.clear() - # Free encoder block tables - for block_table in self.encoder_block_tables.values(): + # Free cross-attention block tables + for block_table in self.cross_block_tables.values(): self._free_block_table(block_table) - self.encoder_block_tables.clear() + self.cross_block_tables.clear() def get_block_table(self, seq: Sequence) -> List[int]: block_table = self.block_tables[seq.seq_id] return [block.block_number for block in block_table] - def get_encoder_block_table(self, seq_group: SequenceGroup) -> List[int]: - block_table = self.encoder_block_tables[seq_group.request_id] + def get_cross_block_table(self, seq_group: SequenceGroup) -> List[int]: + block_table = self.cross_block_tables[seq_group.request_id] return [block.block_number for block in block_table] def get_num_free_gpu_blocks(self) -> int: @@ -691,7 +691,7 @@ def access_all_blocks_in_seq( for block in block_table: block.last_accessed = access_time - def access_all_encoder_blocks_in_seq_group( + def access_all_cross_blocks_in_seq_group( self, seq_group: SequenceGroup, access_time: float, @@ -699,7 +699,7 @@ def access_all_encoder_blocks_in_seq_group( if self.enable_caching: # Update the last accessed time of all the blocks accessed # in this step. - block_table = self.encoder_block_tables[seq_group.request_id] + block_table = self.cross_block_tables[seq_group.request_id] for block in block_table: block.last_accessed = access_time From 2dcd663d40bdcc1cf2aca19b9cec64395ac6d528 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 15 May 2024 15:45:12 -0400 Subject: [PATCH 07/47] formatting --- tests/core/block/test_block_manager_v2.py | 16 +++++--- tests/core/test_block_manager.py | 43 +++++++++++++++------- tests/core/utils.py | 45 ++++++++++++----------- vllm/core/block_manager_v1.py | 18 +++++---- vllm/core/block_manager_v2.py | 8 ++-- vllm/sequence.py | 9 +++-- 6 files changed, 85 insertions(+), 54 deletions(-) diff --git a/tests/core/block/test_block_manager_v2.py b/tests/core/block/test_block_manager_v2.py index 9b1c6cd68a15a..06c3389cfa0f0 100644 --- a/tests/core/block/test_block_manager_v2.py +++ b/tests/core/block/test_block_manager_v2.py @@ -56,8 +56,10 @@ def test_can_allocate_seq_group(block_size: int, num_seqs_per_group: int, @pytest.mark.parametrize("num_gpu_blocks", [16, 80, 160]) @pytest.mark.parametrize("num_seqs_per_group", [1, 4]) @pytest.mark.parametrize("watermark", [0.0, 0.5]) -def test_can_allocate_seq_group_encoder_decoder(block_size: int, num_seqs_per_group: int, - num_gpu_blocks: int, watermark: float): +def test_can_allocate_seq_group_encoder_decoder(block_size: int, + num_seqs_per_group: int, + num_gpu_blocks: int, + watermark: float): block_manager = BlockSpaceManagerV2( block_size=block_size, num_gpu_blocks=num_gpu_blocks, @@ -73,7 +75,8 @@ def test_can_allocate_seq_group_encoder_decoder(block_size: int, num_seqs_per_gr # different output lens. 
num_output_blocks = num_output_blocks_per_seq - for bdx,num_prompt_blocks in enumerate(range(1, num_gpu_blocks - num_output_blocks)): + for bdx, num_prompt_blocks in enumerate( + range(1, num_gpu_blocks - num_output_blocks)): num_cross_blocks_per_seq = num_prompt_blocks seq_group = create_seq_group_encoder_decoder( @@ -82,14 +85,15 @@ def test_can_allocate_seq_group_encoder_decoder(block_size: int, num_seqs_per_gr block_size * num_output_blocks_per_seq for _ in range(num_seqs_per_group) ], - request_id=str(bdx) - ) + request_id=str(bdx)) assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks can_allocate_result = block_manager.can_allocate(seq_group) - num_required_blocks = num_prompt_blocks + num_output_blocks + num_cross_blocks_per_seq + num_required_blocks = num_prompt_blocks + \ + num_output_blocks + \ + num_cross_blocks_per_seq if num_gpu_blocks - num_required_blocks < num_watermark_blocks: assert can_allocate_result == AllocStatus.NEVER diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py index 62b7132e40462..d6ab246699903 100644 --- a/tests/core/test_block_manager.py +++ b/tests/core/test_block_manager.py @@ -89,6 +89,7 @@ def test_allocate(): block_manager.allocate(seq_group) assert block_manager.can_allocate(seq_group) != AllocStatus.OK + def test_allocate_encoder_decoder(): block_size = 4 num_cpu_blocks = 4 @@ -100,8 +101,9 @@ def test_allocate_encoder_decoder(): watermark=0) # Allocate same sequence group to all available gpu blocks. - for i in range(num_gpu_blocks//block_req_per_seq_group): - _, _, seq_group = create_dummy_prompt_encoder_decoder(str(i), block_size, block_size) + for i in range(num_gpu_blocks // block_req_per_seq_group): + _, _, seq_group = create_dummy_prompt_encoder_decoder( + str(i), block_size, block_size) assert block_manager.can_allocate(seq_group) block_manager.allocate(seq_group) assert block_manager.can_allocate(seq_group) != AllocStatus.OK @@ -112,12 +114,14 @@ def test_allocate_encoder_decoder(): num_cpu_blocks, num_gpu_blocks, watermark=1 / num_gpu_blocks) - for i in range((num_gpu_blocks - 1)//block_req_per_seq_group): - _, _, seq_group = create_dummy_prompt_encoder_decoder(str(i), block_size//2, block_size//2) + for i in range((num_gpu_blocks - 1) // block_req_per_seq_group): + _, _, seq_group = create_dummy_prompt_encoder_decoder( + str(i), block_size // 2, block_size // 2) assert block_manager.can_allocate(seq_group) block_manager.allocate(seq_group) assert block_manager.can_allocate(seq_group) != AllocStatus.OK + def test_append_slot_single_seq(): block_size = 4 num_cpu_blocks = 4 @@ -268,6 +272,7 @@ def test_swap(): assert before_cpu_blocks + len(cpu_blocks) == after_cpu_blocks assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks) + def test_swap_encoder_decoder(): block_size = 4 num_cpu_blocks = 4 @@ -277,9 +282,11 @@ def test_swap_encoder_decoder(): num_gpu_blocks, watermark=0) - decoder_prompt, encoder_prompt, seq_group = create_dummy_prompt_encoder_decoder("1", - decoder_prompt_length=block_size, - encoder_prompt_length=block_size) + decoder_prompt, encoder_prompt, seq_group = \ + create_dummy_prompt_encoder_decoder( + "1", + decoder_prompt_length=block_size, + encoder_prompt_length=block_size) decoder_prompt.status = SequenceStatus.WAITING encoder_prompt.status = SequenceStatus.WAITING block_manager.allocate(seq_group) @@ -321,6 +328,7 @@ def test_swap_encoder_decoder(): assert before_cpu_blocks + len(cpu_blocks) == after_cpu_blocks assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks) 
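     # [Editor's note -- annotation, not part of the patch: the swap asserts
     # encode a conservation law: blocks that leave one device's free list
     # reappear on the other's, e.g. after swapping len(cpu_blocks) blocks
     # back in, before_cpu_blocks + len(cpu_blocks) == after_cpu_blocks and
     # before_gpu_blocks == after_gpu_blocks + len(cpu_blocks).]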
+ def test_free(): block_size = 4 num_cpu_blocks = 4 @@ -344,6 +352,7 @@ def test_free(): with pytest.raises(KeyError): block_manager.get_block_table(prompt) + def test_free_encoder_decoder(): block_size = 4 num_cpu_blocks = 4 @@ -353,9 +362,11 @@ def test_free_encoder_decoder(): num_gpu_blocks, watermark=0) - decoder_prompt, encoder_prompt, seq_group = create_dummy_prompt_encoder_decoder("1", - decoder_prompt_length=block_size//2, - encoder_prompt_length=block_size//2) + decoder_prompt, encoder_prompt, seq_group = \ + create_dummy_prompt_encoder_decoder( + "1", + decoder_prompt_length=block_size // 2, + encoder_prompt_length=block_size // 2) block_manager.allocate(seq_group) # Free allocated seq. @@ -373,6 +384,7 @@ def test_free_encoder_decoder(): block_manager.get_block_table(decoder_prompt) block_manager.get_block_table(encoder_prompt) + def test_reset(): block_size = 4 num_cpu_blocks = 4 @@ -393,6 +405,7 @@ def test_reset(): block_manager.reset() assert block_manager.get_num_free_gpu_blocks() == original_blocks + def test_reset_encoder_decoder(): block_size = 4 num_cpu_blocks = 4 @@ -405,10 +418,11 @@ def test_reset_encoder_decoder(): # Allocate same seq group on all available gpu blocks. original_blocks = block_manager.get_num_free_gpu_blocks() - for i in range(num_gpu_blocks//block_req_per_seq_group): - _, _, seq_group = create_dummy_prompt_encoder_decoder(f"{i}", - decoder_prompt_length=block_size, - encoder_prompt_length=block_size) + for i in range(num_gpu_blocks // block_req_per_seq_group): + _, _, seq_group = create_dummy_prompt_encoder_decoder( + f"{i}", + decoder_prompt_length=block_size, + encoder_prompt_length=block_size) block_manager.allocate(seq_group) assert block_manager.get_num_free_gpu_blocks() == 0 @@ -416,6 +430,7 @@ def test_reset_encoder_decoder(): block_manager.reset() assert block_manager.get_num_free_gpu_blocks() == original_blocks + def test_sliding_window_multi_seq(): """ Tests that memory allocation and deallocation is handled diff --git a/tests/core/utils.py b/tests/core/utils.py index 91930457bd25b..376af0f0eac4f 100644 --- a/tests/core/utils.py +++ b/tests/core/utils.py @@ -32,6 +32,7 @@ def create_dummy_prompt( return prompt, seq_group + def create_dummy_prompt_encoder_decoder( request_id: str, decoder_prompt_length: int, @@ -48,20 +49,24 @@ def create_dummy_prompt_encoder_decoder( # and prompt "0 ... block_size". 
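     # [Editor's note -- annotation, not part of the patch: e.g. with length
     # 4 the decoder prompt is tokens [0, 1, 2, 3] ("0 1 2 3") while the
     # encoder prompt is the reversed range [3, 2, 1, 0] ("3 2 1 0"), making
     # the two dummy prompts easy to tell apart.]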
decoder_prompt_tokens = list(range(decoder_prompt_length)) decoder_prompt_str = " ".join([str(t) for t in decoder_prompt_tokens]) - decoder_prompt = Sequence(int(request_id), decoder_prompt_str, decoder_prompt_tokens, block_size) + decoder_prompt = Sequence(int(request_id), decoder_prompt_str, + decoder_prompt_tokens, block_size) encoder_prompt_tokens = list(reversed(list(range(encoder_prompt_length)))) encoder_prompt_str = " ".join([str(t) for t in encoder_prompt_tokens]) - encoder_prompt = Sequence(int(request_id), encoder_prompt_str, encoder_prompt_tokens, block_size) - seq_group = SequenceGroup( - request_id=request_id, - seqs=[decoder_prompt], - sampling_params=SamplingParams(use_beam_search=use_beam_search, best_of=best_of), - arrival_time=time.time(), - lora_request=lora_request, - encoder_seq=encoder_prompt) + encoder_prompt = Sequence(int(request_id), encoder_prompt_str, + encoder_prompt_tokens, block_size) + seq_group = SequenceGroup(request_id=request_id, + seqs=[decoder_prompt], + sampling_params=SamplingParams( + use_beam_search=use_beam_search, + best_of=best_of), + arrival_time=time.time(), + lora_request=lora_request, + encoder_seq=encoder_prompt) return decoder_prompt, encoder_prompt, seq_group + def create_seq_group( seq_prompt_len: int = 1024, seq_output_lens: Iterable[int] = (128, ), @@ -134,20 +139,18 @@ def create_seq_group_encoder_decoder( # Encoder sequence encoder_seq = Sequence( - seq_id=seq_id_start + len(seq_output_lens), - prompt="", - prompt_token_ids=prompt_token_ids, - block_size=16, - ) - - seq_group = SequenceGroup( - request_id=request_id, - seqs=seqs, - sampling_params=sampling_params, - arrival_time=time.time(), - encoder_seq=encoder_seq + seq_id=seq_id_start + len(seq_output_lens), + prompt="", + prompt_token_ids=prompt_token_ids, + block_size=16, ) + seq_group = SequenceGroup(request_id=request_id, + seqs=seqs, + sampling_params=sampling_params, + arrival_time=time.time(), + encoder_seq=encoder_seq) + return seq_group diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py index 11a52b3618b44..03eba2e80c78d 100644 --- a/vllm/core/block_manager_v1.py +++ b/vllm/core/block_manager_v1.py @@ -263,15 +263,18 @@ def __init__( def get_seq_num_required_blocks(self, seq: Sequence) -> int: if seq is None: return 0 - return len(seq.logical_token_blocks) + return len(seq.logical_token_blocks) def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: # FIXME(woosuk): Here we assume that all sequences in the group share # the same prompt. This may not be true for preempted sequences. 
- self_num_required_blocks = self.get_seq_num_required_blocks(seq_group.get_seqs(status=SequenceStatus.WAITING)[0]) - cross_num_required_blocks = self.get_seq_num_required_blocks(seq_group.get_encoder_seq()) - num_required_blocks = self_num_required_blocks+cross_num_required_blocks + self_num_required_blocks = self.get_seq_num_required_blocks( + seq_group.get_seqs(status=SequenceStatus.WAITING)[0]) + cross_num_required_blocks = self.get_seq_num_required_blocks( + seq_group.get_encoder_seq()) + num_required_blocks = self_num_required_blocks + \ + cross_num_required_blocks if self.block_sliding_window is not None: num_required_blocks = min(num_required_blocks, @@ -328,7 +331,8 @@ def allocate_cross_block_table(self, seq_group: SequenceGroup) -> None: for logical_idx in range(num_prompt_blocks): if (self.block_sliding_window is not None and logical_idx >= self.block_sliding_window): - block = block_table[logical_idx % self.block_sliding_window] + block = block_table[logical_idx % + self.block_sliding_window] # Set the reference counts of the token blocks. block.ref_count = seq_group.num_seqs() elif self.enable_caching: @@ -550,7 +554,7 @@ def swap_in(self, self.block_tables[seq.seq_id] = new_block_table if seq_group.encoder_seq is not None: - new_block_table: BlockTable = [] + new_block_table = [] block_table = self.cross_block_tables[request_id] for cpu_block in block_table: @@ -601,7 +605,7 @@ def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: self.block_tables[seq.seq_id] = new_block_table if seq_group.encoder_seq is not None: - new_block_table: BlockTable = [] + new_block_table = [] block_table = self.cross_block_tables[request_id] for gpu_block in block_table: diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 426612f615508..4ae3361e7b234 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -148,7 +148,9 @@ def allocate_cross_block_table(self, seq_group: SequenceGroup) -> None: request_id = seq_group.request_id seq = seq_group.encoder_seq - assert not (request_id in self.cross_block_tables), "block table already exists" + assert (request_id + not in self.cross_block_tables), \ + "block table already exists" seq = seq_group.get_encoder_seq() if seq is not None: @@ -236,9 +238,9 @@ def get_block_table(self, seq: Sequence) -> List[int]: def get_cross_block_table(self, seq_group: SequenceGroup) -> List[int]: request_id = seq_group.request_id assert request_id in self.cross_block_tables - block_ids = self.block_tables[request_id].physical_block_ids + block_ids = self.cross_block_tables[request_id].physical_block_ids assert all(b is not None for b in block_ids) - return block_ids + return block_ids # type: ignore def access_all_blocks_in_seq(self, seq: Sequence, now: float): # Update the last accessed time of all the blocks accessed diff --git a/vllm/sequence.py b/vllm/sequence.py index a73e70c1ae69d..a11c411876ea8 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -528,7 +528,7 @@ def get_seqs( ] def get_encoder_seq(self) -> Sequence: - return self.encoder_seq + return self.encoder_seq # type: ignore def get_unfinished_seqs(self) -> List[Sequence]: return [ @@ -613,8 +613,11 @@ class SequenceGroupMetadata: used in prefix caching. state: Internal state tied to this sequence group. multi_modal_data: Multi modal data. - encoder_seq_data: Optional, the sequence data for the single encoder prompt. - cross_block_table: Optional, the cross-attention block table associated with the single encoder prompt. 
+        encoder_seq_data: Optional, the sequence data
+                          for the single encoder prompt.
+        cross_block_table: Optional, the cross-attention
+                           block table associated with
+                           the single encoder prompt.
     """

     def __init__(

From 2ced012a3e51a77abbbab2268d88730fdffa4a3f Mon Sep 17 00:00:00 2001
From: Andrew Feldman
Date: Tue, 21 May 2024 06:19:19 -0400
Subject: [PATCH 08/47] fix wording nits (ben->been, decoder->encoder/decoder)

---
 tests/core/test_block_manager.py | 2 +-
 vllm/core/block_manager_v2.py    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py
index d6ab246699903..81e3444815d4e 100644
--- a/tests/core/test_block_manager.py
+++ b/tests/core/test_block_manager.py
@@ -314,7 +314,7 @@ def test_swap_encoder_decoder():
     assert before_gpu_blocks + len(gpu_blocks) == after_gpu_blocks
     decoder_prompt.status = SequenceStatus.SWAPPED

-    # Swap decoder seq group from CPU -> GPU.
+    # Swap encoder/decoder seq group from CPU -> GPU.
     decoder_cpu_blocks = block_manager.get_block_table(decoder_prompt)
     cross_cpu_blocks = block_manager.get_cross_block_table(seq_group)
     cpu_blocks = decoder_cpu_blocks + cross_cpu_blocks
diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py
index 4ae3361e7b234..978acd915b69b 100644
--- a/vllm/core/block_manager_v2.py
+++ b/vllm/core/block_manager_v2.py
@@ -222,7 +222,7 @@ def free(self, seq: Sequence) -> None:
     def free_cross(self, seq_group: SequenceGroup) -> None:
         request_id = seq_group.request_id
         if request_id not in self.cross_block_tables:
-            # Already freed or hasn't ben scheduled yet.
+            # Already freed or hasn't been scheduled yet.
             return
         self.cross_block_tables[request_id].free()
         del self.cross_block_tables[request_id]

From 8286b4cfbe57001767617a9ee33066945f6baa3d Mon Sep 17 00:00:00 2001
From: Andrew Feldman
Date: Wed, 22 May 2024 13:46:58 -0400
Subject: [PATCH 09/47] changed two block manager tests to construct fake
 prompts that are equal in length to the block size, rather than half the
 block size (which had been the case)

---
 tests/core/test_block_manager.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py
index 81e3444815d4e..9dc1c88819b70 100644
--- a/tests/core/test_block_manager.py
+++ b/tests/core/test_block_manager.py
@@ -116,7 +116,7 @@ def test_allocate_encoder_decoder():
                                         watermark=1 / num_gpu_blocks)
     for i in range((num_gpu_blocks - 1) // block_req_per_seq_group):
         _, _, seq_group = create_dummy_prompt_encoder_decoder(
-            str(i), block_size // 2, block_size // 2)
+            str(i), block_size, block_size)
         assert block_manager.can_allocate(seq_group)
         block_manager.allocate(seq_group)
     assert block_manager.can_allocate(seq_group) != AllocStatus.OK
@@ -365,8 +365,8 @@ def test_free_encoder_decoder():
     decoder_prompt, encoder_prompt, seq_group = \
         create_dummy_prompt_encoder_decoder(
             "1",
-            decoder_prompt_length=block_size // 2,
-            encoder_prompt_length=block_size // 2)
+            decoder_prompt_length=block_size,
+            encoder_prompt_length=block_size)
     block_manager.allocate(seq_group)

     # Free allocated seq.
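A quick sanity check of the block arithmetic these tests rely on (a
minimal sketch, assuming block_size = 4 as in the tests above):

    import math

    block_size = 4
    # One full-block decoder prompt plus one full-block encoder prompt:
    blocks_per_group = (math.ceil(block_size / block_size) +
                        math.ceil(block_size / block_size))
    assert blocks_per_group == 2  # matches block_req_per_seq_group

With full-block prompts the per-group block demand is exact, which is
what lets the tests divide num_gpu_blocks by block_req_per_seq_group.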
From eba551cd7e1d53911cb392d773eec05cfe40cc4f Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 22 May 2024 13:50:04 -0400 Subject: [PATCH 10/47] keyword args for dummy prompt construction in block manager encoder/decoder tests --- tests/core/test_block_manager.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py index 9dc1c88819b70..19dfc09dbb001 100644 --- a/tests/core/test_block_manager.py +++ b/tests/core/test_block_manager.py @@ -103,7 +103,9 @@ def test_allocate_encoder_decoder(): # Allocate same sequence group to all available gpu blocks. for i in range(num_gpu_blocks // block_req_per_seq_group): _, _, seq_group = create_dummy_prompt_encoder_decoder( - str(i), block_size, block_size) + str(i), + decoder_prompt_length=block_size, + decoder_prompt_length=block_size) assert block_manager.can_allocate(seq_group) block_manager.allocate(seq_group) assert block_manager.can_allocate(seq_group) != AllocStatus.OK @@ -116,7 +118,9 @@ def test_allocate_encoder_decoder(): watermark=1 / num_gpu_blocks) for i in range((num_gpu_blocks - 1) // block_req_per_seq_group): _, _, seq_group = create_dummy_prompt_encoder_decoder( - str(i), block_size, block_size) + str(i), + decoder_prompt_length=block_size, + decoder_prompt_length=block_size) assert block_manager.can_allocate(seq_group) block_manager.allocate(seq_group) assert block_manager.can_allocate(seq_group) != AllocStatus.OK From a7c8b192cd7c6e6c815caf5acbbd4ed24b16925d Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 22 May 2024 14:00:05 -0400 Subject: [PATCH 11/47] bugfix - decoder prompt kwarg repeated in lieu of encoder prompt kwarg --- tests/core/test_block_manager.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py index 19dfc09dbb001..29956ff028143 100644 --- a/tests/core/test_block_manager.py +++ b/tests/core/test_block_manager.py @@ -73,7 +73,7 @@ def test_allocate(): # Allocate same sequence group to all available gpu blocks. 
for i in range(num_gpu_blocks):
         _, seq_group = create_dummy_prompt(str(i), block_size)
-        assert block_manager.can_allocate(seq_group)
+        assert block_manager.can_allocate(seq_group) == AllocStatus.OK
         block_manager.allocate(seq_group)
     assert block_manager.can_allocate(seq_group) != AllocStatus.OK

@@ -85,7 +85,7 @@ def test_allocate():
                                         watermark=1 / num_gpu_blocks)
     for i in range(num_gpu_blocks - 1):
         _, seq_group = create_dummy_prompt(str(i), block_size)
-        assert block_manager.can_allocate(seq_group)
+        assert block_manager.can_allocate(seq_group) == AllocStatus.OK
         block_manager.allocate(seq_group)
     assert block_manager.can_allocate(seq_group) != AllocStatus.OK

@@ -105,8 +105,8 @@ def test_allocate_encoder_decoder():
         _, _, seq_group = create_dummy_prompt_encoder_decoder(
             str(i),
             decoder_prompt_length=block_size,
-            decoder_prompt_length=block_size)
-        assert block_manager.can_allocate(seq_group)
+            encoder_prompt_length=block_size)
+        assert block_manager.can_allocate(seq_group) == AllocStatus.OK
         block_manager.allocate(seq_group)
     assert block_manager.can_allocate(seq_group) != AllocStatus.OK

@@ -120,8 +120,8 @@ def test_allocate_encoder_decoder():
         _, _, seq_group = create_dummy_prompt_encoder_decoder(
             str(i),
             decoder_prompt_length=block_size,
-            decoder_prompt_length=block_size)
-        assert block_manager.can_allocate(seq_group)
+            encoder_prompt_length=block_size)
+        assert block_manager.can_allocate(seq_group) == AllocStatus.OK
         block_manager.allocate(seq_group)
     assert block_manager.can_allocate(seq_group) != AllocStatus.OK

From 9feb994966e365fac63bbec526cafb24cf00dcde Mon Sep 17 00:00:00 2001
From: Andrew Feldman
Date: Wed, 22 May 2024 14:09:42 -0400
Subject: [PATCH 12/47] In the block manager test which used a with block to
 detect an error, created a second with block for the encoder-related call,
 which previously shared a with block with the corresponding
 decoder-related call

---
 tests/core/test_block_manager.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py
index 29956ff028143..808b0a5e651eb 100644
--- a/tests/core/test_block_manager.py
+++ b/tests/core/test_block_manager.py
@@ -386,6 +386,9 @@ def test_free_encoder_decoder():
     # Block table for freed encoder & decoder seq's are deleted.
     with pytest.raises(KeyError):
         block_manager.get_block_table(decoder_prompt)
+
+    # Block table for the freed encoder seq is deleted.
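+    # (Each lookup needs its own with block: a single shared block
+    # would stop at the first KeyError and leave the second lookup
+    # unchecked.)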
+ with pytest.raises(KeyError): block_manager.get_block_table(encoder_prompt) From 5eb0032bfaaf5bc43fab66f1fc8bea30045915b7 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 22 May 2024 14:17:50 -0400 Subject: [PATCH 13/47] refactoring block manager v1/v2 swap in/swap out functions --- vllm/core/block_manager_v1.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py index 03eba2e80c78d..119e444df1b11 100644 --- a/vllm/core/block_manager_v1.py +++ b/vllm/core/block_manager_v1.py @@ -570,12 +570,7 @@ def swap_in(self, self.cpu_allocator.free(cpu_block) self.cross_block_tables[request_id] = new_block_table - block_number_mapping = { - cpu_block.block_number: gpu_block.block_number - for cpu_block, gpu_block in mapping.items() - } - # convert to list of tuples once here - return list(block_number_mapping.items()) + return [(cpu_block.block_number, gpu_block.block_number) for cpu_block, gpu_block in mapping.items()] def can_swap_out(self, seq_group: SequenceGroup) -> bool: blocks = self._get_physical_blocks(seq_group) @@ -621,12 +616,7 @@ def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: self.gpu_allocator.free(gpu_block) self.cross_block_tables[request_id] = new_block_table - block_number_mapping = { - gpu_block.block_number: cpu_block.block_number - for gpu_block, cpu_block in mapping.items() - } - # convert to list of tuples once here - return list(block_number_mapping.items()) + return [(cpu_block.block_number, gpu_block.block_number) for cpu_block, gpu_block in mapping.items()] def _free_block_table(self, block_table: BlockTable) -> None: # when using a sliding window, each seq will only use up From 0644cde2aced6d7fb6c279025b2a4a3d8f5625d2 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 22 May 2024 14:23:50 -0400 Subject: [PATCH 14/47] formatting; changed blocktable type specifier from Dict to List[int] --- tests/core/test_block_manager.py | 6 +++--- vllm/core/block_manager_v1.py | 6 ++++-- vllm/sequence.py | 6 +++--- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py index 808b0a5e651eb..cdaf2f22115e8 100644 --- a/tests/core/test_block_manager.py +++ b/tests/core/test_block_manager.py @@ -103,7 +103,7 @@ def test_allocate_encoder_decoder(): # Allocate same sequence group to all available gpu blocks. 
for i in range(num_gpu_blocks // block_req_per_seq_group): _, _, seq_group = create_dummy_prompt_encoder_decoder( - str(i), + str(i), decoder_prompt_length=block_size, encoder_prompt_length=block_size) assert block_manager.can_allocate(seq_group) == AllocStatus.OK @@ -118,8 +118,8 @@ def test_allocate_encoder_decoder(): watermark=1 / num_gpu_blocks) for i in range((num_gpu_blocks - 1) // block_req_per_seq_group): _, _, seq_group = create_dummy_prompt_encoder_decoder( - str(i), - decoder_prompt_length=block_size, + str(i), + decoder_prompt_length=block_size, encoder_prompt_length=block_size) assert block_manager.can_allocate(seq_group) == AllocStatus.OK block_manager.allocate(seq_group) diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py index 119e444df1b11..2482cf17956f2 100644 --- a/vllm/core/block_manager_v1.py +++ b/vllm/core/block_manager_v1.py @@ -570,7 +570,8 @@ def swap_in(self, self.cpu_allocator.free(cpu_block) self.cross_block_tables[request_id] = new_block_table - return [(cpu_block.block_number, gpu_block.block_number) for cpu_block, gpu_block in mapping.items()] + return [(cpu_block.block_number, gpu_block.block_number) + for cpu_block, gpu_block in mapping.items()] def can_swap_out(self, seq_group: SequenceGroup) -> bool: blocks = self._get_physical_blocks(seq_group) @@ -616,7 +617,8 @@ def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: self.gpu_allocator.free(gpu_block) self.cross_block_tables[request_id] = new_block_table - return [(cpu_block.block_number, gpu_block.block_number) for cpu_block, gpu_block in mapping.items()] + return [(cpu_block.block_number, gpu_block.block_number) + for cpu_block, gpu_block in mapping.items()] def _free_block_table(self, block_table: BlockTable) -> None: # when using a sliding window, each seq will only use up diff --git a/vllm/sequence.py b/vllm/sequence.py index a11c411876ea8..6b07a00f09c6f 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -527,8 +527,8 @@ def get_seqs( seq for seq in self.seqs_dict.values() if seq.status == status ] - def get_encoder_seq(self) -> Sequence: - return self.encoder_seq # type: ignore + def get_encoder_seq(self) -> Optional[Sequence]: + return self.encoder_seq def get_unfinished_seqs(self) -> List[Sequence]: return [ @@ -635,7 +635,7 @@ def __init__( state: Optional[SequenceGroupState] = None, multi_modal_data: Optional[MultiModalData] = None, encoder_seq_data: Optional[SequenceData] = None, - cross_block_table: Optional[Dict[int, List[int]]] = None, + cross_block_table: Optional[List[int]] = None, ) -> None: self.request_id = request_id self.is_prompt = is_prompt From 19ed7413e315ce665cc07722d72fb874a362fafd Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 22 May 2024 14:39:50 -0400 Subject: [PATCH 15/47] prefixed internal method with _ --- vllm/core/block_manager_v1.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py index 2482cf17956f2..648ff843fd4e5 100644 --- a/vllm/core/block_manager_v1.py +++ b/vllm/core/block_manager_v1.py @@ -260,7 +260,7 @@ def __init__( # request ID self.cross_block_tables: Dict[str, BlockTable] = {} - def get_seq_num_required_blocks(self, seq: Sequence) -> int: + def _get_seq_num_required_blocks(self, seq: Sequence) -> int: if seq is None: return 0 return len(seq.logical_token_blocks) @@ -269,9 +269,9 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: # FIXME(woosuk): Here we assume that all sequences in the group share # 
the same prompt. This may not be true for preempted sequences. - self_num_required_blocks = self.get_seq_num_required_blocks( + self_num_required_blocks = self._get_seq_num_required_blocks( seq_group.get_seqs(status=SequenceStatus.WAITING)[0]) - cross_num_required_blocks = self.get_seq_num_required_blocks( + cross_num_required_blocks = self._get_seq_num_required_blocks( seq_group.get_encoder_seq()) num_required_blocks = self_num_required_blocks + \ cross_num_required_blocks From a5579729928c4151e501138f82340c0afa2dc327 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 22 May 2024 17:47:19 -0400 Subject: [PATCH 16/47] refactored self-/cross-attention allocation functions into a single helper function --- vllm/core/block_manager_v1.py | 57 ++++++++++++----------------------- 1 file changed, 19 insertions(+), 38 deletions(-) diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py index 648ff843fd4e5..9f08d4a7939aa 100644 --- a/vllm/core/block_manager_v1.py +++ b/vllm/core/block_manager_v1.py @@ -290,11 +290,7 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: else: return AllocStatus.LATER - def allocate_self_block_tables(self, seq_group: SequenceGroup) -> None: - # NOTE: Here we assume that all sequences in the group have the same - # decoder prompt. - seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0] - + def _allocate_sequence(self, seq: Sequence, ref_count: int) -> BlockTable: # Allocate new physical token blocks that will store the prompt tokens. num_prompt_blocks = len(seq.logical_token_blocks) @@ -304,7 +300,7 @@ def allocate_self_block_tables(self, seq_group: SequenceGroup) -> None: and logical_idx >= self.block_sliding_window): block = block_table[logical_idx % self.block_sliding_window] # Set the reference counts of the token blocks. - block.ref_count = seq_group.num_seqs() + block.ref_count = ref_count #seq_group.num_seqs() elif self.enable_caching: block = self.gpu_allocator.allocate( seq.hash_of_block(logical_idx), @@ -312,47 +308,32 @@ def allocate_self_block_tables(self, seq_group: SequenceGroup) -> None: else: block = self.gpu_allocator.allocate() # Set the reference counts of the token blocks. - block.ref_count = seq_group.num_seqs() + block.ref_count = ref_count #seq_group.num_seqs() block_table.append(block) - # Assign the decoder block table for each sequence. - for seq in seq_group.get_seqs(status=SequenceStatus.WAITING): - self.block_tables[seq.seq_id] = block_table.copy() + return block_table - def allocate_cross_block_table(self, seq_group: SequenceGroup) -> None: + def allocate(self, seq_group: SequenceGroup) -> None: + # Allocate decoder sequences + # # NOTE: Here we assume that all sequences in the group have the same - # encoder prompt. + # decoder prompt. + seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0] + block_table: BlockTable = self._allocate_sequence(seq, seq_group.num_seqs()) - # Allocate new physical token blocks that will store the prompt tokens. - seq = seq_group.get_encoder_seq() - if seq is not None: - block_table: BlockTable = [] - num_prompt_blocks = len(seq.logical_token_blocks) - for logical_idx in range(num_prompt_blocks): - if (self.block_sliding_window is not None - and logical_idx >= self.block_sliding_window): - block = block_table[logical_idx % - self.block_sliding_window] - # Set the reference counts of the token blocks. 
- block.ref_count = seq_group.num_seqs() - elif self.enable_caching: - block = self.gpu_allocator.allocate( - seq.hash_of_block(logical_idx), - seq.num_hashed_tokens_of_block(logical_idx)) - else: - block = self.gpu_allocator.allocate() - # Set the reference counts of the token blocks. - # TODO: feature not supported with encoder/decoder - block.ref_count = seq_group.num_seqs() - block_table.append(block) + # Assign the self-attention block tables for each sequence. + for seq in seq_group.get_seqs(status=SequenceStatus.WAITING): + self.block_tables[seq.seq_id] = block_table.copy() + # Allocate encoder sequence + encoder_seq = seq_group.get_encoder_seq() + if encoder_seq is not None: + # A SequenceGroup has only a single encoder sequence (at most), + # thus allocate with a ref count of 1 + block_table: BlockTable = self._allocate_sequence(encoder_seq, 1) # Assign the cross-attention block table for the SequenceGroup. self.cross_block_tables[seq_group.request_id] = block_table - def allocate(self, seq_group: SequenceGroup) -> None: - self.allocate_self_block_tables(seq_group) - self.allocate_cross_block_table(seq_group) - def can_append_slots(self, seq_group: SequenceGroup, num_lookahead_slots: int = 0) -> bool: From e48bebf727ae67ffbdff206d168eab3e77b988da Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 22 May 2024 17:59:44 -0400 Subject: [PATCH 17/47] Refactored block manager v2 self-/cross-block-table alloc functions together --- vllm/core/block_manager_v2.py | 38 ++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 978acd915b69b..a8085f54ac79d 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -121,7 +121,18 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: else: return AllocStatus.LATER - def allocate_self_block_tables(self, seq_group: SequenceGroup) -> None: + def _allocate_sequence(self, seq: Sequence) -> BlockTable: + block_table = BlockTable( + block_size=self.block_size, + block_allocator=self.block_allocator, + ) + assert self.block_sliding_window is None + block_table.allocate(seq.get_token_ids()) + + return block_table + + def allocate(self, seq_group: SequenceGroup) -> None: + # Allocate self-attention block tables for decoder sequences waiting_seqs = seq_group.get_seqs(status=SequenceStatus.WAITING) assert not (set(seq.seq_id for seq in waiting_seqs) & self.block_tables.keys()), "block table already exists" @@ -129,43 +140,34 @@ def allocate_self_block_tables(self, seq_group: SequenceGroup) -> None: # NOTE: Here we assume that all sequences in the group have the same # prompt. seq = waiting_seqs[0] - - block_table = BlockTable( - block_size=self.block_size, - block_allocator=self.block_allocator, - ) - assert self.block_sliding_window is None - block_table.allocate(seq.get_token_ids()) + block_table: BlockTable = self._allocate_sequence(seq) self.block_tables[seq.seq_id] = block_table # Assign the block table for each sequence. for seq in waiting_seqs[1:]: self.block_tables[seq.seq_id] = block_table.fork() - def allocate_cross_block_table(self, seq_group: SequenceGroup) -> None: + # Allocate cross-attention block table for encoder sequence + # # NOTE: Here we assume that all sequences in the group have the same - # prompt. + # encoder prompt. 
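+        # The cross-attention block table is keyed by request ID rather
+        # than sequence ID: a group has at most one encoder sequence,
+        # shared by all of its decoder sequences, so each group adds a
+        # single entry to self.cross_block_tables however many entries
+        # it has in self.block_tables.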
request_id = seq_group.request_id

-        seq = seq_group.encoder_seq
+        encoder_seq = seq_group.encoder_seq
         assert (request_id
                 not in self.cross_block_tables), \
             "block table already exists"

-        seq = seq_group.get_encoder_seq()
-        if seq is not None:
+        encoder_seq = seq_group.get_encoder_seq()
+        if encoder_seq is not None:
             block_table = BlockTable(
                 block_size=self.block_size,
                 block_allocator=self.block_allocator,
             )
             assert self.block_sliding_window is None
-            block_table.allocate(seq.get_token_ids())
+            block_table.allocate(encoder_seq.get_token_ids())
             self.cross_block_tables[request_id] = block_table

-    def allocate(self, seq_group: SequenceGroup) -> None:
-        self.allocate_self_block_tables(seq_group)
-        self.allocate_cross_block_table(seq_group)
-
     def can_append_slots(self, seq_group: SequenceGroup,
                          num_lookahead_slots: int) -> bool:
         """Determine if there is enough space in the GPU KV cache to continue

From ac2da978c786d998247cfe55a3d2a788109b71e4 Mon Sep 17 00:00:00 2001
From: Andrew Feldman
Date: Wed, 22 May 2024 18:11:58 -0400
Subject: [PATCH 18/47] formatting

---
 vllm/core/block_manager_v1.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py
index 9f08d4a7939aa..fa53b3cd33229 100644
--- a/vllm/core/block_manager_v1.py
+++ b/vllm/core/block_manager_v1.py
@@ -319,7 +319,8 @@ def allocate(self, seq_group: SequenceGroup) -> None:
         # NOTE: Here we assume that all sequences in the group have the same
         # decoder prompt.
         seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0]
-        block_table: BlockTable = self._allocate_sequence(seq, seq_group.num_seqs())
+        block_table: BlockTable = \
+            self._allocate_sequence(seq, seq_group.num_seqs())

         # Assign the self-attention block tables for each sequence.
         for seq in seq_group.get_seqs(status=SequenceStatus.WAITING):
@@ -330,7 +331,7 @@ def allocate(self, seq_group: SequenceGroup) -> None:
         if encoder_seq is not None:
             # A SequenceGroup has only a single encoder sequence (at most),
             # thus allocate with a ref count of 1
-            block_table: BlockTable = self._allocate_sequence(encoder_seq, 1)
+            block_table = self._allocate_sequence(encoder_seq, 1)

             # Assign the cross-attention block table for the SequenceGroup.
             self.cross_block_tables[seq_group.request_id] = block_table

From e985a2f05080a0e311f52adf119447993322541f Mon Sep 17 00:00:00 2001
From: Andrew Feldman
Date: Wed, 22 May 2024 18:40:07 -0400
Subject: [PATCH 19/47] refactored out block manager v1 swap_in/swap_out
 helper functions

---
 vllm/core/block_manager_v1.py | 116 ++++++++++++++++------------------
 1 file changed, 54 insertions(+), 62 deletions(-)

diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py
index fa53b3cd33229..dd6d8d702fae0 100644
--- a/vllm/core/block_manager_v1.py
+++ b/vllm/core/block_manager_v1.py
@@ -300,7 +300,7 @@ def _allocate_sequence(self, seq: Sequence, ref_count: int) -> BlockTable:
                 and logical_idx >= self.block_sliding_window):
                 block = block_table[logical_idx % self.block_sliding_window]
                 # Set the reference counts of the token blocks.
- block.ref_count = ref_count #seq_group.num_seqs() + block.ref_count = ref_count #seq_group.num_seqs() block_table.append(block) return block_table @@ -507,6 +507,26 @@ def can_swap_in(self, else: return AllocStatus.LATER + def _swap_in_block_table( + self, block_table: BlockTable, + mapping: Dict[PhysicalTokenBlock, + PhysicalTokenBlock]) -> BlockTable: + new_block_table = [] + + for cpu_block in block_table: + if cpu_block in mapping: + gpu_block = mapping[cpu_block] + gpu_block.ref_count += 1 + else: + gpu_block = self.gpu_allocator.allocate( + cpu_block.block_hash, cpu_block.num_hashed_tokens) + mapping[cpu_block] = gpu_block + new_block_table.append(gpu_block) + # Free the CPU block swapped in to GPU. + self.cpu_allocator.free(cpu_block) + + return new_block_table + def swap_in(self, seq_group: SequenceGroup, num_lookahead_slots: int = 0) -> List[Tuple[int, int]]: @@ -519,38 +539,14 @@ def swap_in(self, # dict is efficient in lookup `if cpu_block in mapping` mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock] = {} for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): - new_block_table: BlockTable = [] - block_table = self.block_tables[seq.seq_id] - - for cpu_block in block_table: - if cpu_block in mapping: - gpu_block = mapping[cpu_block] - gpu_block.ref_count += 1 - else: - gpu_block = self.gpu_allocator.allocate( - cpu_block.block_hash, cpu_block.num_hashed_tokens) - mapping[cpu_block] = gpu_block - new_block_table.append(gpu_block) - # Free the CPU block swapped in to GPU. - self.cpu_allocator.free(cpu_block) - self.block_tables[seq.seq_id] = new_block_table + self.block_tables[seq.seq_id] = \ + self._swap_in_block_table(self.block_tables[seq.seq_id], + mapping) if seq_group.encoder_seq is not None: - new_block_table = [] - block_table = self.cross_block_tables[request_id] - - for cpu_block in block_table: - if cpu_block in mapping: - gpu_block = mapping[cpu_block] - gpu_block.ref_count += 1 - else: - gpu_block = self.gpu_allocator.allocate( - cpu_block.block_hash, cpu_block.num_hashed_tokens) - mapping[cpu_block] = gpu_block - new_block_table.append(gpu_block) - # Free the CPU block swapped in to GPU. - self.cpu_allocator.free(cpu_block) - self.cross_block_tables[request_id] = new_block_table + self.cross_block_tables[request_id] = \ + self._swap_in_block_table(self.cross_block_tables[request_id], + mapping) return [(cpu_block.block_number, gpu_block.block_number) for cpu_block, gpu_block in mapping.items()] @@ -559,6 +555,26 @@ def can_swap_out(self, seq_group: SequenceGroup) -> bool: blocks = self._get_physical_blocks(seq_group) return len(blocks) <= self.cpu_allocator.get_num_free_blocks() + def _swap_out_block_table( + self, block_table: BlockTable, + mapping: Dict[PhysicalTokenBlock, + PhysicalTokenBlock]) -> BlockTable: + + new_block_table: BlockTable = [] + for gpu_block in block_table: + if gpu_block in mapping: + cpu_block = mapping[gpu_block] + cpu_block.ref_count += 1 + else: + cpu_block = self.cpu_allocator.allocate( + gpu_block.block_hash, gpu_block.num_hashed_tokens) + mapping[gpu_block] = cpu_block + new_block_table.append(cpu_block) + # Free the GPU block swapped out to CPU. 
+            self.gpu_allocator.free(gpu_block)
+
+        return new_block_table
+
     def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
         request_id = seq_group.request_id

         # GPU block -> CPU block.
         # dict is efficient in lookup `if gpu_block in mapping`
         mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock] = {}
         for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
-            new_block_table: BlockTable = []
-            block_table = self.block_tables[seq.seq_id]
-
-            for gpu_block in block_table:
-                if gpu_block in mapping:
-                    cpu_block = mapping[gpu_block]
-                    cpu_block.ref_count += 1
-                else:
-                    cpu_block = self.cpu_allocator.allocate(
-                        gpu_block.block_hash, gpu_block.num_hashed_tokens)
-                    mapping[gpu_block] = cpu_block
-                new_block_table.append(cpu_block)
-                # Free the GPU block swapped out to CPU.
-                self.gpu_allocator.free(gpu_block)
-            self.block_tables[seq.seq_id] = new_block_table
+            self.block_tables[seq.seq_id] = \
+                self._swap_out_block_table(self.block_tables[seq.seq_id],
+                                           mapping)

         if seq_group.encoder_seq is not None:
-            new_block_table = []
-            block_table = self.cross_block_tables[request_id]
-
-            for gpu_block in block_table:
-                if gpu_block in mapping:
-                    cpu_block = mapping[gpu_block]
-                    cpu_block.ref_count += 1
-                else:
-                    cpu_block = self.cpu_allocator.allocate(
-                        gpu_block.block_hash, gpu_block.num_hashed_tokens)
-                    mapping[gpu_block] = cpu_block
-                new_block_table.append(cpu_block)
-                # Free the GPU block swapped out to CPU.
-                self.gpu_allocator.free(gpu_block)
-            self.cross_block_tables[request_id] = new_block_table
+            self.cross_block_tables[request_id] = \
+                self._swap_out_block_table(self.cross_block_tables[request_id],
+                                           mapping)

         return [(cpu_block.block_number, gpu_block.block_number)
                 for cpu_block, gpu_block in mapping.items()]

From 98c5863ef946dbd52221b6b83517e483f48b3848 Mon Sep 17 00:00:00 2001
From: Andrew Feldman
Date: Wed, 22 May 2024 18:53:40 -0400
Subject: [PATCH 20/47] Helper function avoids prefix caching code in
 encoder/decoder scenarios; alloc method asserts no prefix caching + enc/dec;
 refactoring

---
 vllm/core/block_manager_v1.py | 36 +++++++++++++++++------------------
 vllm/core/block_manager_v2.py | 16 ----------------
 2 files changed, 18 insertions(+), 34 deletions(-)

diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py
index dd6d8d702fae0..40274bd29e9b0 100644
--- a/vllm/core/block_manager_v1.py
+++ b/vllm/core/block_manager_v1.py
@@ -290,7 +290,10 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus:
         else:
             return AllocStatus.LATER

-    def _allocate_sequence(self, seq: Sequence, ref_count: int) -> BlockTable:
+    def _allocate_sequence(self, \
+                           seq: Sequence, \
+                           ref_count: int, \
+                           decoder_only: bool = True) -> BlockTable:
         # Allocate new physical token blocks that will store the prompt tokens.
         num_prompt_blocks = len(seq.logical_token_blocks)

@@ -300,27 +303,36 @@ def _allocate_sequence(self, seq: Sequence, ref_count: int) -> BlockTable:
                 and logical_idx >= self.block_sliding_window):
                 block = block_table[logical_idx % self.block_sliding_window]
                 # Set the reference counts of the token blocks.
-                block.ref_count = ref_count #seq_group.num_seqs()
-            elif self.enable_caching:
+                block.ref_count = ref_count
+            elif decoder_only and self.enable_caching:
                 block = self.gpu_allocator.allocate(
                     seq.hash_of_block(logical_idx),
                     seq.num_hashed_tokens_of_block(logical_idx))
             else:
                 block = self.gpu_allocator.allocate()
                 # Set the reference counts of the token blocks.
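                # (For decoder prompt blocks, ref_count is
                # seq_group.num_seqs(), since every waiting sequence
                # shares the prompt; the encoder prompt is unique to the
                # group, so its blocks are allocated with ref_count = 1.)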
- block.ref_count = ref_count #seq_group.num_seqs() + block.ref_count = ref_count block_table.append(block) return block_table def allocate(self, seq_group: SequenceGroup) -> None: + decoder_only = \ + seq_group.get_encoder_seq() is None + + assert decoder_only or (not self.enable_caching), \ + "Automatic prefix caching currently not " + \ + "supported for encoder/decoder models." + # Allocate decoder sequences # # NOTE: Here we assume that all sequences in the group have the same # decoder prompt. seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0] block_table: BlockTable = \ - self._allocate_sequence(seq, seq_group.num_seqs()) + self._allocate_sequence(seq, + seq_group.num_seqs(), + decoder_only) # Assign the self-attention block tables for each sequence. for seq in seq_group.get_seqs(status=SequenceStatus.WAITING): @@ -331,7 +343,7 @@ def allocate(self, seq_group: SequenceGroup) -> None: if encoder_seq is not None: # A SequenceGroup has only a single encoder sequence (at most), # thus allocate with a ref count of 1 - block_table = self._allocate_sequence(encoder_seq, 1) + block_table = self._allocate_sequence(encoder_seq, 1, decoder_only) # Assign the cross-attention block table for the SequenceGroup. self.cross_block_tables[seq_group.request_id] = block_table @@ -661,18 +673,6 @@ def access_all_blocks_in_seq( for block in block_table: block.last_accessed = access_time - def access_all_cross_blocks_in_seq_group( - self, - seq_group: SequenceGroup, - access_time: float, - ) -> None: - if self.enable_caching: - # Update the last accessed time of all the blocks accessed - # in this step. - block_table = self.cross_block_tables[seq_group.request_id] - for block in block_table: - block.last_accessed = access_time - def compute_full_blocks_in_seq(self, seq: Sequence): if seq.seq_id not in self.block_tables: return diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index a8085f54ac79d..31d1a60657832 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -260,22 +260,6 @@ def access_all_blocks_in_seq(self, seq: Sequence, now: float): block_ids, # type: ignore now) - def access_all_cross_blocks_in_seq_group( - self, - seq_group: SequenceGroup, - now: float, - ) -> None: - if self.enable_caching: - # Update the last accessed time of all the blocks accessed - # in this step. 
-            block_table = self.cross_block_tables[seq_group.request_id]
-            block_ids = []
-            for block_id in block_table.physical_block_ids:
-                block_ids.append(block_id)
-            self.block_allocator.mark_blocks_as_accessed(
-                block_ids,  # type: ignore
-                now)
-
     def mark_blocks_as_computed(self, seq_group: SequenceGroup):
         # The only need for mark block as computed is for prefix caching,
         # while currently we could determine whether one block is computed

From 84f5510a0a4e7d0b81b32e772e1cf710be83112b Mon Sep 17 00:00:00 2001
From: Andrew Feldman
Date: Thu, 23 May 2024 13:49:30 -0400
Subject: [PATCH 21/47] block manager v1 NotImplementedErrors for sliding
 window and automatic prefix caching

---
 vllm/core/block_manager_v1.py | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py
index 40274bd29e9b0..d5da128f1a691 100644
--- a/vllm/core/block_manager_v1.py
+++ b/vllm/core/block_manager_v1.py
@@ -277,6 +277,11 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus:
             cross_num_required_blocks

         if self.block_sliding_window is not None:
+            if seq_group.get_encoder_seq() is not None:
+                raise NotImplementedError(
+                    "Sliding window attention for encoder/decoder models " + \
+                    "is not currently supported.")
+
             num_required_blocks = min(num_required_blocks,
                                       self.block_sliding_window)
         num_free_gpu_blocks = self.gpu_allocator.get_num_free_blocks()
@@ -320,9 +325,16 @@ def allocate(self, seq_group: SequenceGroup) -> None:
         decoder_only = \
             seq_group.get_encoder_seq() is None

-        assert decoder_only or (not self.enable_caching), \
-            "Automatic prefix caching currently not " + \
-            "supported for encoder/decoder models."
+        if (self.block_sliding_window is not None) and \
+            (not decoder_only):
+            raise NotImplementedError(
+                "Sliding window attention for encoder/decoder models " + \
+                "is not currently supported.")
+
+        if self.enable_caching and (not decoder_only):
+            raise NotImplementedError(
+                "Automatic prefix caching currently not " + \
+                "supported for encoder/decoder models.")

From cc61959d2075816ee49fa7a802e3c2240e737546 Mon Sep 17 00:00:00 2001
From: Andrew Feldman
Date: Thu, 23 May 2024 13:56:11 -0400
Subject: [PATCH 22/47] Fixes: remove duplicate cross-block-table deletion;
 reuse _allocate_sequence() for the encoder sequence

---
 vllm/core/block_manager_v2.py | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py
index 31d1a60657832..9c6466de468e5 100644
--- a/vllm/core/block_manager_v2.py
+++ b/vllm/core/block_manager_v2.py
@@ -152,7 +152,6 @@ def allocate(self, seq_group: SequenceGroup) -> None:
         # NOTE: Here we assume that all sequences in the group have the same
         # encoder prompt.
request_id = seq_group.request_id - encoder_seq = seq_group.encoder_seq assert (request_id not in self.cross_block_tables), \ @@ -160,12 +159,7 @@ def allocate(self, seq_group: SequenceGroup) -> None: encoder_seq = seq_group.get_encoder_seq() if encoder_seq is not None: - block_table = BlockTable( - block_size=self.block_size, - block_allocator=self.block_allocator, - ) - assert self.block_sliding_window is None - block_table.allocate(encoder_seq.get_token_ids()) + block_table: BlockTable = self._allocate_sequence(encoder_seq) self.cross_block_tables[request_id] = block_table def can_append_slots(self, seq_group: SequenceGroup, @@ -229,8 +223,6 @@ def free_cross(self, seq_group: SequenceGroup) -> None: self.cross_block_tables[request_id].free() del self.cross_block_tables[request_id] - del self.cross_block_tables[seq_group.request_id] - def get_block_table(self, seq: Sequence) -> List[int]: assert seq.seq_id in self.block_tables block_ids = self.block_tables[seq.seq_id].physical_block_ids From dcb9abe115cfd6bfa8f2131c645cbc0bb6acb2ab Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 23 May 2024 13:58:30 -0400 Subject: [PATCH 23/47] formatting --- vllm/core/block_manager_v1.py | 2 +- vllm/core/block_manager_v2.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py index d5da128f1a691..95e9e5e20940d 100644 --- a/vllm/core/block_manager_v1.py +++ b/vllm/core/block_manager_v1.py @@ -281,7 +281,7 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: raise NotImplementedError( "Sliding window attention for encoder/decoder models " + \ "is not currently supported.") - + num_required_blocks = min(num_required_blocks, self.block_sliding_window) num_free_gpu_blocks = self.gpu_allocator.get_num_free_blocks() diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 9c6466de468e5..b89f1cd05d1c1 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -159,7 +159,7 @@ def allocate(self, seq_group: SequenceGroup) -> None: encoder_seq = seq_group.get_encoder_seq() if encoder_seq is not None: - block_table: BlockTable = self._allocate_sequence(encoder_seq) + block_table = self._allocate_sequence(encoder_seq) self.cross_block_tables[request_id] = block_table def can_append_slots(self, seq_group: SequenceGroup, From e8c40fcf152c5d2f6514830644c8eb683eee7aa9 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 23 May 2024 17:08:00 -0400 Subject: [PATCH 24/47] explanatory comment --- vllm/sequence.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/vllm/sequence.py b/vllm/sequence.py index 6b07a00f09c6f..a456ecc111e4c 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -613,11 +613,15 @@ class SequenceGroupMetadata: used in prefix caching. state: Internal state tied to this sequence group. multi_modal_data: Multi modal data. - encoder_seq_data: Optional, the sequence data - for the single encoder prompt. - cross_block_table: Optional, the cross-attention - block table associated with - the single encoder prompt. + encoder_seq_data: Optional sequence data for encoder prompt + (SequenceGroup.encoder_seq). Should be None + unless you are working with an encoder/decoder + model. + cross_block_table: Optional cross-attention block table associated + with the encoder prompt + (SequenceGroup.encoder_seq). Should be None + unless you are working with an encoder/decoder + model. 
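+
+        Example (illustrative values): for an encoder/decoder request
+        with a four-token encoder prompt and block_size = 4,
+        encoder_seq_data holds those four token ids and
+        cross_block_table could be a single-element list such as [0],
+        naming the one physical block that backs the cross-attention
+        KV cache.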
""" def __init__( From 5ccb70be1209521d0aa1e3d7cae7bf7707ac2fd8 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 23 May 2024 17:18:03 -0400 Subject: [PATCH 25/47] various fixes according to reviews --- vllm/core/block_manager_v1.py | 2 +- vllm/core/block_manager_v2.py | 14 ++++++++++++++ vllm/sequence.py | 3 ++- 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py index 95e9e5e20940d..1c81edb7a2df3 100644 --- a/vllm/core/block_manager_v1.py +++ b/vllm/core/block_manager_v1.py @@ -352,7 +352,7 @@ def allocate(self, seq_group: SequenceGroup) -> None: # Allocate encoder sequence encoder_seq = seq_group.get_encoder_seq() - if encoder_seq is not None: + if not decoder_only: # A SequenceGroup has only a single encoder sequence (at most), # thus allocate with a ref count of 1 block_table = self._allocate_sequence(encoder_seq, 1, decoder_only) diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index b89f1cd05d1c1..f094bf99e3201 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -132,6 +132,9 @@ def _allocate_sequence(self, seq: Sequence) -> BlockTable: return block_table def allocate(self, seq_group: SequenceGroup) -> None: + decoder_only = \ + seq_group.get_encoder_seq() is None + # Allocate self-attention block tables for decoder sequences waiting_seqs = seq_group.get_seqs(status=SequenceStatus.WAITING) assert not (set(seq.seq_id for seq in waiting_seqs) @@ -157,6 +160,17 @@ def allocate(self, seq_group: SequenceGroup) -> None: not in self.cross_block_tables), \ "block table already exists" + if (self.block_sliding_window is not None) and \ + (not decoder_only): + raise NotImplementedError( + "Sliding window attention for encoder/decoder models " + \ + "is not currently supported.") + + if self.enable_caching and (not decoder_only): + raise NotImplementedError( + "Automatic prefix caching currently not " + \ + "supported for encoder/decoder models.") + encoder_seq = seq_group.get_encoder_seq() if encoder_seq is not None: block_table = self._allocate_sequence(encoder_seq) diff --git a/vllm/sequence.py b/vllm/sequence.py index a456ecc111e4c..9c8fcccab75ae 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -420,7 +420,8 @@ class SequenceGroup: for an embedding model. pooling_params: The pooling parameters used to generate the pooling for an embedding model. - encoder_seq: Optional, the single encoder sequence. + encoder_seq: Optional, the single encoder sequence. Should be None + unless you are working with an encoder/decoder model. 
""" def __init__( From dfcc28b19188a11c74aee06265051eb8fbbe599f Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 23 May 2024 17:22:41 -0400 Subject: [PATCH 26/47] slight refactoring --- vllm/core/block_manager_v1.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py index 1c81edb7a2df3..2daf45182bba9 100644 --- a/vllm/core/block_manager_v1.py +++ b/vllm/core/block_manager_v1.py @@ -322,8 +322,8 @@ def _allocate_sequence(self, \ return block_table def allocate(self, seq_group: SequenceGroup) -> None: - decoder_only = \ - seq_group.get_encoder_seq() is None + encoder_seq = seq_group.get_encoder_seq() + decoder_only = encoder_seq is None if (self.block_sliding_window is not None) and \ (not decoder_only): @@ -351,7 +351,6 @@ def allocate(self, seq_group: SequenceGroup) -> None: self.block_tables[seq.seq_id] = block_table.copy() # Allocate encoder sequence - encoder_seq = seq_group.get_encoder_seq() if not decoder_only: # A SequenceGroup has only a single encoder sequence (at most), # thus allocate with a ref count of 1 From 8d3ad05a9f7d568f16eea6e090f6803869fc5443 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 23 May 2024 17:26:54 -0400 Subject: [PATCH 27/47] small refactor --- vllm/core/block_manager_v2.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index f094bf99e3201..6e02359f51782 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -132,8 +132,9 @@ def _allocate_sequence(self, seq: Sequence) -> BlockTable: return block_table def allocate(self, seq_group: SequenceGroup) -> None: + encoder_seq = seq_group.get_encoder_seq() decoder_only = \ - seq_group.get_encoder_seq() is None + encoder_seq is None # Allocate self-attention block tables for decoder sequences waiting_seqs = seq_group.get_seqs(status=SequenceStatus.WAITING) @@ -171,8 +172,7 @@ def allocate(self, seq_group: SequenceGroup) -> None: "Automatic prefix caching currently not " + \ "supported for encoder/decoder models.") - encoder_seq = seq_group.get_encoder_seq() - if encoder_seq is not None: + if not decoder_only: block_table = self._allocate_sequence(encoder_seq) self.cross_block_tables[request_id] = block_table From 5a7697976a964cf23d6141d9e432abb63d3f9e9d Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 23 May 2024 17:34:34 -0400 Subject: [PATCH 28/47] replaced all encoder_seq is not None with not decoder_only --- vllm/core/block_manager_v1.py | 19 +++++++++++++++---- vllm/core/block_manager_v2.py | 5 ++++- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py index 2daf45182bba9..2e5d531565379 100644 --- a/vllm/core/block_manager_v1.py +++ b/vllm/core/block_manager_v1.py @@ -496,6 +496,10 @@ def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: def _get_physical_blocks( self, seq_group: SequenceGroup) -> List[PhysicalTokenBlock]: + encoder_seq = seq_group.get_encoder_seq() + decoder_only = \ + encoder_seq is None + # NOTE: Here, we assume that the physical blocks are only shared by # the sequences in the same group. 
request_id = seq_group.request_id @@ -505,7 +509,7 @@ def _get_physical_blocks( continue blocks.update(self.block_tables[seq.seq_id]) # Cross-attention blocks - if seq_group.encoder_seq is not None: + if not decoder_only: blocks.update(self.cross_block_tables[request_id]) return list(blocks) @@ -514,9 +518,12 @@ def can_swap_in(self, num_lookahead_slots: int = 0) -> AllocStatus: assert (num_lookahead_slots == 0 ), "BlockSpaceManagerV1 does not support lookahead allocation" + encoder_seq = seq_group.get_encoder_seq() + decoder_only = encoder_seq is None + blocks = self._get_physical_blocks(seq_group) num_swapped_seqs = seq_group.num_seqs(status=SequenceStatus.SWAPPED) - if seq_group.encoder_seq is not None: + if not decoder_only: num_swapped_seqs += 1 num_free_blocks = self.gpu_allocator.get_num_free_blocks() # NOTE: Conservatively, we assume that every sequence will allocate @@ -556,6 +563,8 @@ def swap_in(self, assert (num_lookahead_slots == 0 ), "BlockSpaceManagerV1 does not support lookahead allocation" + encoder_seq = seq_group.get_encoder_seq() + decoder_only = encoder_seq is None request_id = seq_group.request_id # CPU block -> GPU block. @@ -566,7 +575,7 @@ def swap_in(self, self._swap_in_block_table(self.block_tables[seq.seq_id], mapping) - if seq_group.encoder_seq is not None: + if not decoder_only: self.cross_block_tables[request_id] = \ self._swap_in_block_table(self.cross_block_tables[request_id], mapping) @@ -600,6 +609,8 @@ def _swap_out_block_table( def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: request_id = seq_group.request_id + encoder_seq = seq_group.get_encoder_seq() + decoder_only = encoder_seq is None # GPU block -> CPU block. # dict is efficient in lookup `if gpu_block in mapping` @@ -609,7 +620,7 @@ def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: self._swap_out_block_table(self.block_tables[seq.seq_id], mapping) - if seq_group.encoder_seq is not None: + if not decoder_only: self.cross_block_tables[request_id] = \ self._swap_out_block_table(self.cross_block_tables[request_id], mapping) diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 6e02359f51782..a8090c1f93b5a 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -91,6 +91,9 @@ def __init__( def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: # FIXME(woosuk): Here we assume that all sequences in the group share # the same prompt. This may not be true for preempted sequences. 
+ encoder_seq = seq_group.get_encoder_seq() + decoder_only = encoder_seq is None + seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0] num_required_blocks = BlockTable.get_num_required_blocks( @@ -98,7 +101,7 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: block_size=self.block_size, ) - if seq_group.encoder_seq is not None: + if not decoder_only: num_required_blocks += BlockTable.get_num_required_blocks( seq_group.encoder_seq.get_token_ids(), block_size=self.block_size, From 09ae4adb656b79897d62d28015f968b0c7471d8e Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 23 May 2024 17:51:23 -0400 Subject: [PATCH 29/47] added is_encoder_decoder() method to sequence group --- vllm/core/block_manager_v1.py | 36 ++++++++++++++--------------------- vllm/core/block_manager_v2.py | 16 ++++++---------- vllm/sequence.py | 3 +++ 3 files changed, 23 insertions(+), 32 deletions(-) diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py index 2e5d531565379..69a280c8bf9c6 100644 --- a/vllm/core/block_manager_v1.py +++ b/vllm/core/block_manager_v1.py @@ -277,7 +277,7 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: cross_num_required_blocks if self.block_sliding_window is not None: - if seq_group.get_encoder_seq() is not None: + if seq_group.is_encoder_decoder(): raise NotImplementedError( "Sliding window attention for encoder/decoder models " + \ "is not currently supported.") @@ -298,7 +298,7 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: def _allocate_sequence(self, \ seq: Sequence, \ ref_count: int, \ - decoder_only: bool = True) -> BlockTable: + is_encoder_decoder: bool = True) -> BlockTable: # Allocate new physical token blocks that will store the prompt tokens. num_prompt_blocks = len(seq.logical_token_blocks) @@ -309,7 +309,7 @@ def _allocate_sequence(self, \ block = block_table[logical_idx % self.block_sliding_window] # Set the reference counts of the token blocks. block.ref_count = ref_count - elif decoder_only and self.enable_caching: + elif not is_encoder_decoder and self.enable_caching: block = self.gpu_allocator.allocate( seq.hash_of_block(logical_idx), seq.num_hashed_tokens_of_block(logical_idx)) @@ -323,15 +323,15 @@ def _allocate_sequence(self, \ def allocate(self, seq_group: SequenceGroup) -> None: encoder_seq = seq_group.get_encoder_seq() - decoder_only = encoder_seq is None + is_encoder_decoder = seq_group.is_encoder_decoder() if (self.block_sliding_window is not None) and \ - (not decoder_only): + is_encoder_decoder: raise NotImplementedError( "Sliding window attention for encoder/decoder models " + \ "is not currently supported.") - if self.enable_caching and (not decoder_only): + if self.enable_caching and is_encoder_decoder: raise NotImplementedError( "Automatic prefix caching currently not " + \ "supported for encoder/decoder models.") @@ -344,17 +344,18 @@ def allocate(self, seq_group: SequenceGroup) -> None: block_table: BlockTable = \ self._allocate_sequence(seq, seq_group.num_seqs(), - decoder_only) + is_encoder_decoder) # Assign the self-attention block tables for each sequence. 
for seq in seq_group.get_seqs(status=SequenceStatus.WAITING): self.block_tables[seq.seq_id] = block_table.copy() # Allocate encoder sequence - if not decoder_only: + if is_encoder_decoder: # A SequenceGroup has only a single encoder sequence (at most), # thus allocate with a ref count of 1 - block_table = self._allocate_sequence(encoder_seq, 1, decoder_only) + block_table = self._allocate_sequence(encoder_seq, 1, + is_encoder_decoder) # Assign the cross-attention block table for the SequenceGroup. self.cross_block_tables[seq_group.request_id] = block_table @@ -496,9 +497,6 @@ def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: def _get_physical_blocks( self, seq_group: SequenceGroup) -> List[PhysicalTokenBlock]: - encoder_seq = seq_group.get_encoder_seq() - decoder_only = \ - encoder_seq is None # NOTE: Here, we assume that the physical blocks are only shared by # the sequences in the same group. @@ -509,7 +507,7 @@ def _get_physical_blocks( continue blocks.update(self.block_tables[seq.seq_id]) # Cross-attention blocks - if not decoder_only: + if seq_group.is_encoder_decoder(): blocks.update(self.cross_block_tables[request_id]) return list(blocks) @@ -518,12 +516,10 @@ def can_swap_in(self, num_lookahead_slots: int = 0) -> AllocStatus: assert (num_lookahead_slots == 0 ), "BlockSpaceManagerV1 does not support lookahead allocation" - encoder_seq = seq_group.get_encoder_seq() - decoder_only = encoder_seq is None blocks = self._get_physical_blocks(seq_group) num_swapped_seqs = seq_group.num_seqs(status=SequenceStatus.SWAPPED) - if not decoder_only: + if seq_group.is_encoder_decoder(): num_swapped_seqs += 1 num_free_blocks = self.gpu_allocator.get_num_free_blocks() # NOTE: Conservatively, we assume that every sequence will allocate @@ -563,8 +559,6 @@ def swap_in(self, assert (num_lookahead_slots == 0 ), "BlockSpaceManagerV1 does not support lookahead allocation" - encoder_seq = seq_group.get_encoder_seq() - decoder_only = encoder_seq is None request_id = seq_group.request_id # CPU block -> GPU block. @@ -575,7 +569,7 @@ def swap_in(self, self._swap_in_block_table(self.block_tables[seq.seq_id], mapping) - if not decoder_only: + if seq_group.is_encoder_decoder(): self.cross_block_tables[request_id] = \ self._swap_in_block_table(self.cross_block_tables[request_id], mapping) @@ -609,8 +603,6 @@ def _swap_out_block_table( def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: request_id = seq_group.request_id - encoder_seq = seq_group.get_encoder_seq() - decoder_only = encoder_seq is None # GPU block -> CPU block. # dict is efficient in lookup `if gpu_block in mapping` @@ -620,7 +612,7 @@ def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: self._swap_out_block_table(self.block_tables[seq.seq_id], mapping) - if not decoder_only: + if seq_group.is_encoder_decoder(): self.cross_block_tables[request_id] = \ self._swap_out_block_table(self.cross_block_tables[request_id], mapping) diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index a8090c1f93b5a..0dd2ffcd182ec 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -91,19 +91,16 @@ def __init__( def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: # FIXME(woosuk): Here we assume that all sequences in the group share # the same prompt. This may not be true for preempted sequences. 
- encoder_seq = seq_group.get_encoder_seq() - decoder_only = encoder_seq is None seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0] - num_required_blocks = BlockTable.get_num_required_blocks( seq.get_token_ids(), block_size=self.block_size, ) - if not decoder_only: + if seq_group.is_encoder_decoder(): num_required_blocks += BlockTable.get_num_required_blocks( - seq_group.encoder_seq.get_token_ids(), + seq_group.get_encoder_seq().get_token_ids(), block_size=self.block_size, ) @@ -136,8 +133,7 @@ def _allocate_sequence(self, seq: Sequence) -> BlockTable: def allocate(self, seq_group: SequenceGroup) -> None: encoder_seq = seq_group.get_encoder_seq() - decoder_only = \ - encoder_seq is None + is_encoder_decoder = seq_group.is_encoder_decoder() # Allocate self-attention block tables for decoder sequences waiting_seqs = seq_group.get_seqs(status=SequenceStatus.WAITING) @@ -165,17 +161,17 @@ def allocate(self, seq_group: SequenceGroup) -> None: "block table already exists" if (self.block_sliding_window is not None) and \ - (not decoder_only): + is_encoder_decoder: raise NotImplementedError( "Sliding window attention for encoder/decoder models " + \ "is not currently supported.") - if self.enable_caching and (not decoder_only): + if self.enable_caching and is_encoder_decoder: raise NotImplementedError( "Automatic prefix caching currently not " + \ "supported for encoder/decoder models.") - if not decoder_only: + if is_encoder_decoder: block_table = self._allocate_sequence(encoder_seq) self.cross_block_tables[request_id] = block_table diff --git a/vllm/sequence.py b/vllm/sequence.py index 9c8fcccab75ae..ad6c8d54974c3 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -528,6 +528,9 @@ def get_seqs( seq for seq in self.seqs_dict.values() if seq.status == status ] + def is_encoder_decoder(self) -> bool: + return self.encoder_seq is not None + def get_encoder_seq(self) -> Optional[Sequence]: return self.encoder_seq From ecd1a998579ac171ce1936444fe9f7c8a6a09c92 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 23 May 2024 18:59:03 -0400 Subject: [PATCH 30/47] tests for NotImplemented errors when encoder/decoder models are used with prefix cache or SWA --- tests/core/block/test_block_manager_v2.py | 103 +++++++++++++++++++++- tests/core/test_block_manager.py | 64 +++++++++++++- vllm/core/block_manager_v1.py | 29 +++--- vllm/core/block_manager_v2.py | 28 ++++-- 4 files changed, 205 insertions(+), 19 deletions(-) diff --git a/tests/core/block/test_block_manager_v2.py b/tests/core/block/test_block_manager_v2.py index 06c3389cfa0f0..cf423d292a25e 100644 --- a/tests/core/block/test_block_manager_v2.py +++ b/tests/core/block/test_block_manager_v2.py @@ -1,6 +1,8 @@ import pytest -from vllm.core.block_manager_v2 import BlockSpaceManagerV2 +from vllm.core.block_manager_v2 import (BlockSpaceManagerV2, + str_not_impl_enc_dec_prefix_cache, + str_not_impl_enc_dec_swa) from vllm.core.interfaces import AllocStatus from vllm.sequence import Logprob, SequenceStatus from vllm.utils import chunk_list @@ -103,6 +105,105 @@ def test_can_allocate_seq_group_encoder_decoder(block_size: int, assert can_allocate_result == AllocStatus.LATER +@pytest.mark.parametrize("block_size", [16]) +@pytest.mark.parametrize("num_gpu_blocks", [16]) +@pytest.mark.parametrize("num_seqs_per_group", [1]) +@pytest.mark.parametrize("watermark", [0.0, 0.5]) +def test_allocate_encoder_decoder_fails_with_swa(block_size: int, + num_seqs_per_group: int, + num_gpu_blocks: int, + watermark: float): + ''' + SWA short for Sliding Window 
Attention.
+
+    At the time of writing, block manager v2 does not support SWA.
+
+    However, even when SWA is implemented for block manager v2,
+    there will still most likely be a separate workstream required
+    to enable SWA for encoder/decoder models.
+
+    Therefore, this test enforces that one of the following cases
+    holds true:
+    1. Block manager v2 does not support SWA at all (true at the time
+       of writing)
+    2. Block manager v2 fails with NotImplementedError when SWA is enabled
+       AND a SequenceGroup with an encoder sequence (i.e. in support of an
+       encoder/decoder model) is passed into can_allocate() as an argument
+
+    The setup for this test is a stripped-down version of
+    test_can_allocate_seq_group_encoder_decoder()
+    '''
+
+    with pytest.raises((NotImplementedError, AssertionError)) as exc_info:
+        block_manager = BlockSpaceManagerV2(
+            block_size=block_size,
+            num_gpu_blocks=num_gpu_blocks,
+            num_cpu_blocks=1024,
+            watermark=watermark,
+            sliding_window=5  # SWA
+        )
+
+        num_output_blocks_per_seq = 1
+        num_prompt_blocks = 1
+        num_output_blocks = num_output_blocks_per_seq
+        seq_group = create_seq_group_encoder_decoder(
+            seq_prompt_len=block_size * num_prompt_blocks,
+            seq_output_lens=[
+                block_size * num_output_blocks_per_seq
+                for _ in range(num_seqs_per_group)
+            ],
+            request_id="0")
+
+        assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks
+        block_manager.can_allocate(seq_group)
+
+    # Assert that either
+    # 1. Block manager v2 constructor fails with assertion that sliding window
+    #    is not yet supported (most likely near-term outcome at time of
+    #    writing), or
+    # 2. can_allocate() fails with NotImplementedError due to combiantion of
+    #    encoder/decoder and sliding window attention
+    if isinstance(exc_info.value, NotImplementedError):
+        assert str(exc_info.value) == str_not_impl_enc_dec_swa
+    elif isinstance(exc_info.value, AssertionError):
+        assert str(exc_info.value) == "Sliding window not yet supported"
+
+
+@pytest.mark.parametrize("block_size", [16])
+@pytest.mark.parametrize("num_gpu_blocks", [16])
+@pytest.mark.parametrize("num_seqs_per_group", [1])
+@pytest.mark.parametrize("watermark", [0.0, 0.5])
+def test_allocate_encoder_decoder_fails_with_prefix_cache(
+        block_size: int, num_seqs_per_group: int, num_gpu_blocks: int,
+        watermark: float):
+
+    block_manager = BlockSpaceManagerV2(
+        block_size=block_size,
+        num_gpu_blocks=num_gpu_blocks,
+        num_cpu_blocks=1024,
+        watermark=watermark,
+        enable_caching=True  # Prefix cache
+    )
+
+    num_output_blocks_per_seq = 1
+    num_prompt_blocks = 1
+    num_output_blocks = num_output_blocks_per_seq
+    seq_group = create_seq_group_encoder_decoder(
+        seq_prompt_len=block_size * num_prompt_blocks,
+        seq_output_lens=[
+            block_size * num_output_blocks_per_seq
+            for _ in range(num_seqs_per_group)
+        ],
+        request_id="0")
+
+    assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks
+
+    # Assert that can_allocate() fails with NotImplementedError
+    # due to combination of encoder/decoder and prefix cache
+    with pytest.raises(NotImplementedError) as exc_info:
+        block_manager.can_allocate(seq_group)
+    assert str(exc_info.value) == str_not_impl_enc_dec_prefix_cache
+
+
 @pytest.mark.parametrize("block_size", [1, 8])
 @pytest.mark.parametrize("prompt_len", [1, 7, 8])
 @pytest.mark.parametrize("num_slots_to_append", [1, 8, 129])
diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py
index cdaf2f22115e8..6039f568fcf1e 100644
--- a/tests/core/test_block_manager.py
+++ b/tests/core/test_block_manager.py
@@ -7,7 +7,9 @@ from vllm import 
SamplingParams from vllm.block import PhysicalTokenBlock from vllm.core.block_manager_v1 import (BlockSpaceManagerV1, - UncachedBlockAllocator) + UncachedBlockAllocator, + str_not_impl_enc_dec_prefix_cache, + str_not_impl_enc_dec_swa) from vllm.core.interfaces import AllocStatus from vllm.sequence import Logprob, Sequence, SequenceGroup, SequenceStatus from vllm.utils import Device @@ -126,6 +128,66 @@ def test_allocate_encoder_decoder(): assert block_manager.can_allocate(seq_group) != AllocStatus.OK +def test_allocate_encoder_decoder_fails_with_swa(): + # SWA short for sliding window attention + + block_size = 4 + num_cpu_blocks = 4 + num_gpu_blocks = 4 + block_manager = BlockSpaceManagerV1(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0, + sliding_window=5) # swa + + # Allocate same sequence group to all available gpu blocks. + _, _, seq_group = create_dummy_prompt_encoder_decoder( + "0", + decoder_prompt_length=block_size, + encoder_prompt_length=block_size) + + # Assert that can_allocate() fails due to SWA + with pytest.raises(NotImplementedError) as exc_info: + block_manager.can_allocate(seq_group) + + assert str(exc_info.value) == str_not_impl_enc_dec_swa + + # Assert that allocate() fails due to SWA + with pytest.raises(NotImplementedError) as exc_info: + block_manager.allocate(seq_group) + + assert str(exc_info.value) == str_not_impl_enc_dec_swa + + +def test_allocate_encoder_decoder_fails_with_prefix_caching(): + block_size = 4 + num_cpu_blocks = 4 + num_gpu_blocks = 4 + block_manager = BlockSpaceManagerV1(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0, + enable_caching=True) # Prefix cache + + # Allocate same sequence group to all available gpu blocks. + _, _, seq_group = create_dummy_prompt_encoder_decoder( + "0", + decoder_prompt_length=block_size, + encoder_prompt_length=block_size) + + # Assert that can_allocate() fails due to prefix caching + with pytest.raises(NotImplementedError) as exc_info: + block_manager.can_allocate(seq_group) + + assert str(exc_info.value) == str_not_impl_enc_dec_prefix_cache + + # Assert that allocate() fails due to prefix caching + with pytest.raises(NotImplementedError) as exc_info: + block_manager.allocate(seq_group) + + assert str(exc_info.value) == str_not_impl_enc_dec_prefix_cache + + def test_append_slot_single_seq(): block_size = 4 num_cpu_blocks = 4 diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py index 69a280c8bf9c6..904b12cd97b01 100644 --- a/vllm/core/block_manager_v1.py +++ b/vllm/core/block_manager_v1.py @@ -15,6 +15,17 @@ from vllm.utils import Device logger = init_logger(__name__) +''' +Exception strings for non-implemented encoder/decoder scenarios +''' + +str_not_impl_enc_dec_swa = \ + "Sliding window attention for encoder/decoder models " + \ + "is not currently supported." + +str_not_impl_enc_dec_prefix_cache = \ + "Prefix caching for encoder/decoder models " + \ + "is not currently supported." class BlockAllocatorBase(ABC): @@ -269,6 +280,10 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: # FIXME(woosuk): Here we assume that all sequences in the group share # the same prompt. This may not be true for preempted sequences. 
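The two tests above assert on the exact exception message by comparing against the same module-level string that the implementation raises. A self-contained sketch of that idiom (hypothetical names, runnable under pytest; not code from the patch):

    import pytest

    # Hypothetical shared message constant; the patch itself uses
    # str_not_impl_enc_dec_swa / str_not_impl_enc_dec_prefix_cache.
    STR_NOT_IMPL_DEMO = "demo feature is not currently supported."

    def use_demo_feature() -> None:
        raise NotImplementedError(STR_NOT_IMPL_DEMO)

    def test_use_demo_feature_fails() -> None:
        with pytest.raises(NotImplementedError) as exc_info:
            use_demo_feature()
        # Comparing against the shared constant keeps the test and the
        # implementation from silently drifting apart.
        assert str(exc_info.value) == STR_NOT_IMPL_DEMO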
+ is_encoder_decoder = seq_group.is_encoder_decoder() + if self.enable_caching and is_encoder_decoder: + raise NotImplementedError(str_not_impl_enc_dec_prefix_cache) + self_num_required_blocks = self._get_seq_num_required_blocks( seq_group.get_seqs(status=SequenceStatus.WAITING)[0]) cross_num_required_blocks = self._get_seq_num_required_blocks( @@ -277,10 +292,8 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: cross_num_required_blocks if self.block_sliding_window is not None: - if seq_group.is_encoder_decoder(): - raise NotImplementedError( - "Sliding window attention for encoder/decoder models " + \ - "is not currently supported.") + if is_encoder_decoder: + raise NotImplementedError(str_not_impl_enc_dec_swa) num_required_blocks = min(num_required_blocks, self.block_sliding_window) @@ -327,14 +340,10 @@ def allocate(self, seq_group: SequenceGroup) -> None: if (self.block_sliding_window is not None) and \ is_encoder_decoder: - raise NotImplementedError( - "Sliding window attention for encoder/decoder models " + \ - "is not currently supported.") + raise NotImplementedError(str_not_impl_enc_dec_swa) if self.enable_caching and is_encoder_decoder: - raise NotImplementedError( - "Automatic prefix caching currently not " + \ - "supported for encoder/decoder models.") + raise NotImplementedError(str_not_impl_enc_dec_prefix_cache) # Allocate decoder sequences # diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 0dd2ffcd182ec..d2dadd9a63dc2 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -8,6 +8,17 @@ from vllm.core.interfaces import AllocStatus, BlockSpaceManager from vllm.sequence import Sequence, SequenceGroup, SequenceStatus from vllm.utils import Device +''' +Exception strings for non-implemented encoder/decoder scenarios +''' + +str_not_impl_enc_dec_swa = \ + "Sliding window attention for encoder/decoder models " + \ + "is not currently supported." + +str_not_impl_enc_dec_prefix_cache = \ + "Prefix caching for encoder/decoder models " + \ + "is not currently supported." SeqId = int EncoderSeqId = str @@ -92,13 +103,20 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: # FIXME(woosuk): Here we assume that all sequences in the group share # the same prompt. This may not be true for preempted sequences. 
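Note the ordering in the can_allocate() hunk above: the unsupported-combination guard runs before any block accounting, so callers see a NotImplementedError rather than a misleading OK/LATER verdict. A reduced sketch of that control flow (a toy manager under assumed names, not vLLM's API):

    STR_NOT_IMPL_DEMO = "this feature combination is not currently supported."

    class TinyBlockManager:
        def __init__(self, enable_caching: bool, num_free_blocks: int) -> None:
            self.enable_caching = enable_caching
            self.num_free_blocks = num_free_blocks

        def can_allocate(self, is_encoder_decoder: bool,
                         num_required_blocks: int) -> bool:
            # Fail fast on unsupported configurations before doing
            # any free-block arithmetic.
            if self.enable_caching and is_encoder_decoder:
                raise NotImplementedError(STR_NOT_IMPL_DEMO)
            return num_required_blocks <= self.num_free_blocks

    mgr = TinyBlockManager(enable_caching=False, num_free_blocks=4)
    assert mgr.can_allocate(is_encoder_decoder=True, num_required_blocks=3)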
+ is_encoder_decoder = seq_group.is_encoder_decoder() + if self.enable_caching and is_encoder_decoder: + raise NotImplementedError(str_not_impl_enc_dec_prefix_cache) + + if self.block_sliding_window is not None and is_encoder_decoder: + raise NotImplementedError(str_not_impl_enc_dec_swa) + seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0] num_required_blocks = BlockTable.get_num_required_blocks( seq.get_token_ids(), block_size=self.block_size, ) - if seq_group.is_encoder_decoder(): + if is_encoder_decoder: num_required_blocks += BlockTable.get_num_required_blocks( seq_group.get_encoder_seq().get_token_ids(), block_size=self.block_size, @@ -162,14 +180,10 @@ def allocate(self, seq_group: SequenceGroup) -> None: if (self.block_sliding_window is not None) and \ is_encoder_decoder: - raise NotImplementedError( - "Sliding window attention for encoder/decoder models " + \ - "is not currently supported.") + raise NotImplementedError(str_not_impl_enc_dec_swa) if self.enable_caching and is_encoder_decoder: - raise NotImplementedError( - "Automatic prefix caching currently not " + \ - "supported for encoder/decoder models.") + raise NotImplementedError(str_not_impl_enc_dec_prefix_cache) if is_encoder_decoder: block_table = self._allocate_sequence(encoder_seq) From d3935f73b5038ba7acc75fff07282b7f7fda6ed5 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 23 May 2024 19:05:36 -0400 Subject: [PATCH 31/47] rename tests --- tests/core/block/test_block_manager_v2.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/core/block/test_block_manager_v2.py b/tests/core/block/test_block_manager_v2.py index cf423d292a25e..c893bc8f4209e 100644 --- a/tests/core/block/test_block_manager_v2.py +++ b/tests/core/block/test_block_manager_v2.py @@ -109,10 +109,10 @@ def test_can_allocate_seq_group_encoder_decoder(block_size: int, @pytest.mark.parametrize("num_gpu_blocks", [16]) @pytest.mark.parametrize("num_seqs_per_group", [1]) @pytest.mark.parametrize("watermark", [0.0, 0.5]) -def test_allocate_encoder_decoder_fails_with_swa(block_size: int, - num_seqs_per_group: int, - num_gpu_blocks: int, - watermark: float): +def test_can_allocate_encoder_decoder_fails_with_swa(block_size: int, + num_seqs_per_group: int, + num_gpu_blocks: int, + watermark: float): ''' SWA short for Sliding Window Attention. @@ -172,7 +172,7 @@ def test_allocate_encoder_decoder_fails_with_swa(block_size: int, @pytest.mark.parametrize("num_gpu_blocks", [16]) @pytest.mark.parametrize("num_seqs_per_group", [1]) @pytest.mark.parametrize("watermark", [0.0, 0.5]) -def test_allocate_encoder_decoder_fails_with_prefix_cache( +def test_can_allocate_encoder_decoder_fails_with_prefix_cache( block_size: int, num_seqs_per_group: int, num_gpu_blocks: int, watermark: float): From e6a7125383488af42dd5020b65824394c9c112e9 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 23 May 2024 19:10:35 -0400 Subject: [PATCH 32/47] spelling error --- tests/core/block/test_block_manager_v2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/core/block/test_block_manager_v2.py b/tests/core/block/test_block_manager_v2.py index c893bc8f4209e..19ea89d01ca7a 100644 --- a/tests/core/block/test_block_manager_v2.py +++ b/tests/core/block/test_block_manager_v2.py @@ -160,7 +160,7 @@ def test_can_allocate_encoder_decoder_fails_with_swa(block_size: int, # 1. Block manager v2 constructor fails with assertion that sliding window # is not yet supported (most likely near-term outcome at time of # writing), or - # 2. 
can_allocate() fails with NotImplementedError due to combiantion of + # 2. can_allocate() fails with NotImplementedError due to combination of # encoder/decoder and sliding window attention if isinstance(exc_info.value, NotImplementedError): assert str(exc_info.value) == str_not_impl_enc_dec_swa From 68b476203ba9c8342e3f6ba5d9db5e7d369a7a52 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 23 May 2024 19:14:25 -0400 Subject: [PATCH 33/47] isort --- vllm/core/block_manager_v2.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index d2dadd9a63dc2..b43f39a8ffaef 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -8,6 +8,7 @@ from vllm.core.interfaces import AllocStatus, BlockSpaceManager from vllm.sequence import Sequence, SequenceGroup, SequenceStatus from vllm.utils import Device + ''' Exception strings for non-implemented encoder/decoder scenarios ''' From a80325dcbe4af189e3542f00ffe92a11a7243e92 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Sat, 25 May 2024 21:45:13 -0400 Subject: [PATCH 34/47] return output of SequenceGroup constructor --- tests/core/utils.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/tests/core/utils.py b/tests/core/utils.py index 376af0f0eac4f..fb53b6cc5e18b 100644 --- a/tests/core/utils.py +++ b/tests/core/utils.py @@ -145,14 +145,11 @@ def create_seq_group_encoder_decoder( block_size=16, ) - seq_group = SequenceGroup(request_id=request_id, - seqs=seqs, - sampling_params=sampling_params, - arrival_time=time.time(), - encoder_seq=encoder_seq) - - return seq_group - + return SequenceGroup(request_id=request_id, + seqs=seqs, + sampling_params=sampling_params, + arrival_time=time.time(), + encoder_seq=encoder_seq) def round_up_to_next_block(seq_len: int, block_size: int) -> int: return (seq_len + block_size - 1) // block_size From 8b387767512a657fd0051c674f4a594159b67eee Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Sat, 25 May 2024 21:56:25 -0400 Subject: [PATCH 35/47] capitalize constants --- tests/core/block/test_block_manager_v2.py | 8 ++++---- tests/core/test_block_manager.py | 12 ++++++------ vllm/core/block_manager_v1.py | 17 ++++++++--------- vllm/core/block_manager_v2.py | 12 ++++++------ 4 files changed, 24 insertions(+), 25 deletions(-) diff --git a/tests/core/block/test_block_manager_v2.py b/tests/core/block/test_block_manager_v2.py index 19ea89d01ca7a..3aed0c58bd264 100644 --- a/tests/core/block/test_block_manager_v2.py +++ b/tests/core/block/test_block_manager_v2.py @@ -1,8 +1,8 @@ import pytest from vllm.core.block_manager_v2 import (BlockSpaceManagerV2, - str_not_impl_enc_dec_prefix_cache, - str_not_impl_enc_dec_swa) + STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE, + STR_NOT_IMPL_ENC_DEC_SWA) from vllm.core.interfaces import AllocStatus from vllm.sequence import Logprob, SequenceStatus from vllm.utils import chunk_list @@ -163,7 +163,7 @@ def test_can_allocate_encoder_decoder_fails_with_swa(block_size: int, # 2. 
can_allocate() fails with NotImplementedError due to combination of # encoder/decoder and sliding window attention if isinstance(exc_info.value, NotImplementedError): - assert str(exc_info.value) == str_not_impl_enc_dec_swa + assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_SWA elif isinstance(exc_info.value, AssertionError): assert str(exc_info.value) == "Sliding window not yet supported" @@ -201,7 +201,7 @@ def test_can_allocate_encoder_decoder_fails_with_prefix_cache( # due to combination of encoder/decoder and prefix cache with pytest.raises(NotImplementedError) as exc_info: block_manager.can_allocate(seq_group) - assert str(exc_info.value) == str_not_impl_enc_dec_prefix_cache + assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE @pytest.mark.parametrize("block_size", [1, 8]) diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py index 6039f568fcf1e..7e487a021d3c2 100644 --- a/tests/core/test_block_manager.py +++ b/tests/core/test_block_manager.py @@ -8,8 +8,8 @@ from vllm.block import PhysicalTokenBlock from vllm.core.block_manager_v1 import (BlockSpaceManagerV1, UncachedBlockAllocator, - str_not_impl_enc_dec_prefix_cache, - str_not_impl_enc_dec_swa) + STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE, + STR_NOT_IMPL_ENC_DEC_SWA) from vllm.core.interfaces import AllocStatus from vllm.sequence import Logprob, Sequence, SequenceGroup, SequenceStatus from vllm.utils import Device @@ -150,13 +150,13 @@ def test_allocate_encoder_decoder_fails_with_swa(): with pytest.raises(NotImplementedError) as exc_info: block_manager.can_allocate(seq_group) - assert str(exc_info.value) == str_not_impl_enc_dec_swa + assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_SWA # Assert that allocate() fails due to SWA with pytest.raises(NotImplementedError) as exc_info: block_manager.allocate(seq_group) - assert str(exc_info.value) == str_not_impl_enc_dec_swa + assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_SWA def test_allocate_encoder_decoder_fails_with_prefix_caching(): @@ -179,13 +179,13 @@ def test_allocate_encoder_decoder_fails_with_prefix_caching(): with pytest.raises(NotImplementedError) as exc_info: block_manager.can_allocate(seq_group) - assert str(exc_info.value) == str_not_impl_enc_dec_prefix_cache + assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE # Assert that allocate() fails due to prefix caching with pytest.raises(NotImplementedError) as exc_info: block_manager.allocate(seq_group) - assert str(exc_info.value) == str_not_impl_enc_dec_prefix_cache + assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE def test_append_slot_single_seq(): diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py index 904b12cd97b01..312690ee45893 100644 --- a/vllm/core/block_manager_v1.py +++ b/vllm/core/block_manager_v1.py @@ -19,11 +19,11 @@ Exception strings for non-implemented encoder/decoder scenarios ''' -str_not_impl_enc_dec_swa = \ +STR_NOT_IMPL_ENC_DEC_SWA = \ "Sliding window attention for encoder/decoder models " + \ "is not currently supported." -str_not_impl_enc_dec_prefix_cache = \ +STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE = \ "Prefix caching for encoder/decoder models " + \ "is not currently supported." 
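Patch 35 above renames the shared message strings to follow the PEP 8 convention that module-level constants use UPPER_SNAKE_CASE, signaling to readers and linters that the values are not meant to be rebound. Illustrative only, with a made-up name:

    # Before: lower_snake_case reads like an ordinary, rebindable variable.
    str_not_impl_example = "example feature is not currently supported."

    # After: UPPER_SNAKE_CASE marks a module-level constant, per PEP 8.
    STR_NOT_IMPL_EXAMPLE = "example feature is not currently supported."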
@@ -272,9 +272,8 @@ def __init__( self.cross_block_tables: Dict[str, BlockTable] = {} def _get_seq_num_required_blocks(self, seq: Sequence) -> int: - if seq is None: - return 0 - return len(seq.logical_token_blocks) + return 0 if seq is None \ + else len(seq.logical_token_blocks) def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: # FIXME(woosuk): Here we assume that all sequences in the group share @@ -282,7 +281,7 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: is_encoder_decoder = seq_group.is_encoder_decoder() if self.enable_caching and is_encoder_decoder: - raise NotImplementedError(str_not_impl_enc_dec_prefix_cache) + raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE) self_num_required_blocks = self._get_seq_num_required_blocks( seq_group.get_seqs(status=SequenceStatus.WAITING)[0]) @@ -293,7 +292,7 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: if self.block_sliding_window is not None: if is_encoder_decoder: - raise NotImplementedError(str_not_impl_enc_dec_swa) + raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_SWA) num_required_blocks = min(num_required_blocks, self.block_sliding_window) @@ -340,10 +339,10 @@ def allocate(self, seq_group: SequenceGroup) -> None: if (self.block_sliding_window is not None) and \ is_encoder_decoder: - raise NotImplementedError(str_not_impl_enc_dec_swa) + raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_SWA) if self.enable_caching and is_encoder_decoder: - raise NotImplementedError(str_not_impl_enc_dec_prefix_cache) + raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE) # Allocate decoder sequences # diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index b43f39a8ffaef..6113561032dd1 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -13,11 +13,11 @@ Exception strings for non-implemented encoder/decoder scenarios ''' -str_not_impl_enc_dec_swa = \ +STR_NOT_IMPL_ENC_DEC_SWA = \ "Sliding window attention for encoder/decoder models " + \ "is not currently supported." -str_not_impl_enc_dec_prefix_cache = \ +STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE = \ "Prefix caching for encoder/decoder models " + \ "is not currently supported." 
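The _get_seq_num_required_blocks() helper above returns 0 when no encoder sequence exists, and the decoder (self-attention) and encoder (cross-attention) requirements are simply summed. The per-sequence count is, in effect, the same ceiling division that round_up_to_next_block() in tests/core/utils.py implements; a small worked sketch with hypothetical lengths:

    def num_required_blocks(num_tokens: int, block_size: int) -> int:
        # Ceiling division: a partially filled block still occupies
        # a whole physical block.
        return (num_tokens + block_size - 1) // block_size

    BLOCK_SIZE = 4
    decoder_tokens = 9  # hypothetical decoder prompt length
    encoder_tokens = 4  # hypothetical encoder prompt length

    total = (num_required_blocks(decoder_tokens, BLOCK_SIZE) +
             num_required_blocks(encoder_tokens, BLOCK_SIZE))
    assert total == 3 + 1  # 9 tokens -> 3 blocks, 4 tokens -> 1 block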
@@ -106,10 +106,10 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: is_encoder_decoder = seq_group.is_encoder_decoder() if self.enable_caching and is_encoder_decoder: - raise NotImplementedError(str_not_impl_enc_dec_prefix_cache) + raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE) if self.block_sliding_window is not None and is_encoder_decoder: - raise NotImplementedError(str_not_impl_enc_dec_swa) + raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_SWA) seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0] num_required_blocks = BlockTable.get_num_required_blocks( @@ -181,10 +181,10 @@ def allocate(self, seq_group: SequenceGroup) -> None: if (self.block_sliding_window is not None) and \ is_encoder_decoder: - raise NotImplementedError(str_not_impl_enc_dec_swa) + raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_SWA) if self.enable_caching and is_encoder_decoder: - raise NotImplementedError(str_not_impl_enc_dec_prefix_cache) + raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE) if is_encoder_decoder: block_table = self._allocate_sequence(encoder_seq) From f39c3132af87d410507644c9ea86aec1156f3533 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Sat, 25 May 2024 22:20:06 -0400 Subject: [PATCH 36/47] refactored swap-block-table functionality --- vllm/core/block_manager_v1.py | 68 +++++++++++++++-------------------- 1 file changed, 29 insertions(+), 39 deletions(-) diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py index 312690ee45893..90a485b39e9d6 100644 --- a/vllm/core/block_manager_v1.py +++ b/vllm/core/block_manager_v1.py @@ -541,23 +541,25 @@ def can_swap_in(self, else: return AllocStatus.LATER - def _swap_in_block_table( + def _swap_block_table( self, block_table: BlockTable, + src_allocator: BlockAllocatorBase, + dest_allocator: BlockAllocatorBase, mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock]) -> BlockTable: new_block_table = [] - for cpu_block in block_table: - if cpu_block in mapping: - gpu_block = mapping[cpu_block] - gpu_block.ref_count += 1 + for from_block in block_table: + if from_block in mapping: + to_block = mapping[from_block] + to_block.ref_count += 1 else: - gpu_block = self.gpu_allocator.allocate( - cpu_block.block_hash, cpu_block.num_hashed_tokens) - mapping[cpu_block] = gpu_block - new_block_table.append(gpu_block) - # Free the CPU block swapped in to GPU. - self.cpu_allocator.free(cpu_block) + to_block = dest_allocator.allocate( + from_block.block_hash, from_block.num_hashed_tokens) + mapping[from_block] = to_block + new_block_table.append(to_block) + # Free the source block swapped in to destination. 
+ src_allocator.free(from_block) return new_block_table @@ -574,13 +576,17 @@ def swap_in(self, mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock] = {} for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): self.block_tables[seq.seq_id] = \ - self._swap_in_block_table(self.block_tables[seq.seq_id], - mapping) + self._swap_block_table(self.block_tables[seq.seq_id], + self.cpu_allocator, + self.gpu_allocator, + mapping) if seq_group.is_encoder_decoder(): self.cross_block_tables[request_id] = \ - self._swap_in_block_table(self.cross_block_tables[request_id], - mapping) + self._swap_block_table(self.cross_block_tables[request_id], + self.cpu_allocator, + self.gpu_allocator, + mapping) return [(cpu_block.block_number, gpu_block.block_number) for cpu_block, gpu_block in mapping.items()] @@ -589,26 +595,6 @@ def can_swap_out(self, seq_group: SequenceGroup) -> bool: blocks = self._get_physical_blocks(seq_group) return len(blocks) <= self.cpu_allocator.get_num_free_blocks() - def _swap_out_block_table( - self, block_table: BlockTable, - mapping: Dict[PhysicalTokenBlock, - PhysicalTokenBlock]) -> BlockTable: - - new_block_table: BlockTable = [] - for gpu_block in block_table: - if gpu_block in mapping: - cpu_block = mapping[gpu_block] - cpu_block.ref_count += 1 - else: - cpu_block = self.cpu_allocator.allocate( - gpu_block.block_hash, gpu_block.num_hashed_tokens) - mapping[gpu_block] = cpu_block - new_block_table.append(cpu_block) - # Free the GPU block swapped out to CPU. - self.gpu_allocator.free(gpu_block) - - return new_block_table - def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: request_id = seq_group.request_id @@ -617,13 +603,17 @@ def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock] = {} for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): self.block_tables[seq.seq_id] = \ - self._swap_out_block_table(self.block_tables[seq.seq_id], - mapping) + self._swap_block_table(self.block_tables[seq.seq_id], + self.gpu_allocator, + self.cpu_allocator, + mapping) if seq_group.is_encoder_decoder(): self.cross_block_tables[request_id] = \ - self._swap_out_block_table(self.cross_block_tables[request_id], - mapping) + self._swap_block_table(self.cross_block_tables[request_id], + self.gpu_allocator, + self.cpu_allocator, + mapping) return [(cpu_block.block_number, gpu_block.block_number) for cpu_block, gpu_block in mapping.items()] From 90b5a0e5303c937e56c5b8893fc0cbaeb985ac3f Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Sat, 25 May 2024 22:51:09 -0400 Subject: [PATCH 37/47] Refactored block manager + enc dec + unsupported feature checks into utils --- tests/core/block/test_block_manager_v2.py | 6 ++-- tests/core/test_block_manager.py | 6 ++-- tests/core/utils.py | 1 + vllm/core/block/utils.py | 41 +++++++++++++++++++++++ vllm/core/block_manager_v1.py | 34 ++++--------------- vllm/core/block_manager_v2.py | 35 ++++--------------- 6 files changed, 60 insertions(+), 63 deletions(-) create mode 100644 vllm/core/block/utils.py diff --git a/tests/core/block/test_block_manager_v2.py b/tests/core/block/test_block_manager_v2.py index 3aed0c58bd264..f1488916b508a 100644 --- a/tests/core/block/test_block_manager_v2.py +++ b/tests/core/block/test_block_manager_v2.py @@ -1,8 +1,8 @@ import pytest -from vllm.core.block_manager_v2 import (BlockSpaceManagerV2, - STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE, - STR_NOT_IMPL_ENC_DEC_SWA) +from vllm.core.block.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE, + 
STR_NOT_IMPL_ENC_DEC_SWA)
+from vllm.core.block_manager_v2 import BlockSpaceManagerV2
 from vllm.core.interfaces import AllocStatus
 from vllm.sequence import Logprob, SequenceStatus
 from vllm.utils import chunk_list
diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py
index 7e487a021d3c2..2264fe80c9c03 100644
--- a/tests/core/test_block_manager.py
+++ b/tests/core/test_block_manager.py
@@ -6,10 +6,10 @@ from vllm import SamplingParams
 from vllm.block import PhysicalTokenBlock
+from vllm.core.block.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE,
+                                   STR_NOT_IMPL_ENC_DEC_SWA)
 from vllm.core.block_manager_v1 import (BlockSpaceManagerV1,
-                                        UncachedBlockAllocator,
-                                        STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE,
-                                        STR_NOT_IMPL_ENC_DEC_SWA)
+                                        UncachedBlockAllocator)
 from vllm.core.interfaces import AllocStatus
 from vllm.sequence import Logprob, Sequence, SequenceGroup, SequenceStatus
 from vllm.utils import Device
diff --git a/tests/core/utils.py b/tests/core/utils.py
index fb53b6cc5e18b..7ac565c0eccf1 100644
--- a/tests/core/utils.py
+++ b/tests/core/utils.py
@@ -151,5 +151,6 @@ def create_seq_group_encoder_decoder(
                          arrival_time=time.time(),
                          encoder_seq=encoder_seq)
 
+
 def round_up_to_next_block(seq_len: int, block_size: int) -> int:
     return (seq_len + block_size - 1) // block_size
diff --git a/vllm/core/block/utils.py b/vllm/core/block/utils.py
new file mode 100644
index 0000000000000..6599011771cea
--- /dev/null
+++ b/vllm/core/block/utils.py
@@ -0,0 +1,41 @@
+"""Block manager utils."""
+from typing import Union
+
+from vllm.core.block_manager_v1 import BlockSpaceManagerV1
+from vllm.core.block_manager_v2 import BlockSpaceManagerV2
+from vllm.sequence import SequenceGroup
+
+'''
+Exception strings for non-implemented block manager encoder/decoder scenarios
+'''
+
+STR_NOT_IMPL_ENC_DEC_SWA = \
+    "Sliding window attention for encoder/decoder models " + \
+    "is not currently supported."
+
+STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE = \
+    "Prefix caching for encoder/decoder models " + \
+    "is not currently supported."
+
+def check_no_caching_or_swa_for_blckmgr_encdec(
+        block_mgr: Union[BlockSpaceManagerV1,
+                         BlockSpaceManagerV2],
+        seq_group: SequenceGroup) -> None:
+    '''
+    Enforce that prefix caching & sliding-window attention (SWA)
+    are currently unsupported *specifically* for encoder/decoder models.
+
+    Raises NotImplementedError if an unsupported scenario is detected.
+ + Arguments: + + * block_mgr: BlockSpaceManager instance + * seq_group: SequenceGroup passed to block_mgr + ''' + + if seq_group.is_encoder_decoder(): + if block_mgr.block_sliding_window is not None: + raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_SWA) + + if block_mgr.enable_caching: + raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE) \ No newline at end of file diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py index 90a485b39e9d6..fa64b96a5e7dc 100644 --- a/vllm/core/block_manager_v1.py +++ b/vllm/core/block_manager_v1.py @@ -8,6 +8,7 @@ from typing import Set, Tuple from vllm.block import BlockTable, PhysicalTokenBlock +from vllm.core.block.utils import check_no_caching_or_swa_for_blckmgr_encdec from vllm.core.evictor_v1 import EvictionPolicy, Evictor, make_evictor from vllm.core.interfaces import AllocStatus, BlockSpaceManager from vllm.logger import init_logger @@ -15,17 +16,6 @@ from vllm.utils import Device logger = init_logger(__name__) -''' -Exception strings for non-implemented encoder/decoder scenarios -''' - -STR_NOT_IMPL_ENC_DEC_SWA = \ - "Sliding window attention for encoder/decoder models " + \ - "is not currently supported." - -STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE = \ - "Prefix caching for encoder/decoder models " + \ - "is not currently supported." class BlockAllocatorBase(ABC): @@ -279,9 +269,7 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: # FIXME(woosuk): Here we assume that all sequences in the group share # the same prompt. This may not be true for preempted sequences. - is_encoder_decoder = seq_group.is_encoder_decoder() - if self.enable_caching and is_encoder_decoder: - raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE) + check_no_caching_or_swa_for_blckmgr_encdec(self, seq_group) self_num_required_blocks = self._get_seq_num_required_blocks( seq_group.get_seqs(status=SequenceStatus.WAITING)[0]) @@ -291,8 +279,6 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: cross_num_required_blocks if self.block_sliding_window is not None: - if is_encoder_decoder: - raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_SWA) num_required_blocks = min(num_required_blocks, self.block_sliding_window) @@ -334,15 +320,8 @@ def _allocate_sequence(self, \ return block_table def allocate(self, seq_group: SequenceGroup) -> None: - encoder_seq = seq_group.get_encoder_seq() is_encoder_decoder = seq_group.is_encoder_decoder() - - if (self.block_sliding_window is not None) and \ - is_encoder_decoder: - raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_SWA) - - if self.enable_caching and is_encoder_decoder: - raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE) + check_no_caching_or_swa_for_blckmgr_encdec(self, seq_group) # Allocate decoder sequences # @@ -362,8 +341,8 @@ def allocate(self, seq_group: SequenceGroup) -> None: if is_encoder_decoder: # A SequenceGroup has only a single encoder sequence (at most), # thus allocate with a ref count of 1 - block_table = self._allocate_sequence(encoder_seq, 1, - is_encoder_decoder) + block_table = self._allocate_sequence(seq_group.get_encoder_seq(), + 1, is_encoder_decoder) # Assign the cross-attention block table for the SequenceGroup. 
self.cross_block_tables[seq_group.request_id] = block_table @@ -542,8 +521,7 @@ def can_swap_in(self, return AllocStatus.LATER def _swap_block_table( - self, block_table: BlockTable, - src_allocator: BlockAllocatorBase, + self, block_table: BlockTable, src_allocator: BlockAllocatorBase, dest_allocator: BlockAllocatorBase, mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock]) -> BlockTable: diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 6113561032dd1..246ab9c297c5b 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -5,22 +5,11 @@ from vllm.core.block.block_table import BlockTable from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator +from vllm.core.block.utils import check_no_caching_or_swa_for_blckmgr_encdec from vllm.core.interfaces import AllocStatus, BlockSpaceManager from vllm.sequence import Sequence, SequenceGroup, SequenceStatus from vllm.utils import Device -''' -Exception strings for non-implemented encoder/decoder scenarios -''' - -STR_NOT_IMPL_ENC_DEC_SWA = \ - "Sliding window attention for encoder/decoder models " + \ - "is not currently supported." - -STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE = \ - "Prefix caching for encoder/decoder models " + \ - "is not currently supported." - SeqId = int EncoderSeqId = str @@ -104,12 +93,7 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: # FIXME(woosuk): Here we assume that all sequences in the group share # the same prompt. This may not be true for preempted sequences. - is_encoder_decoder = seq_group.is_encoder_decoder() - if self.enable_caching and is_encoder_decoder: - raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE) - - if self.block_sliding_window is not None and is_encoder_decoder: - raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_SWA) + check_no_caching_or_swa_for_blckmgr_encdec(self, seq_group) seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0] num_required_blocks = BlockTable.get_num_required_blocks( @@ -117,7 +101,7 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: block_size=self.block_size, ) - if is_encoder_decoder: + if seq_group.is_encoder_decoder(): num_required_blocks += BlockTable.get_num_required_blocks( seq_group.get_encoder_seq().get_token_ids(), block_size=self.block_size, @@ -151,8 +135,6 @@ def _allocate_sequence(self, seq: Sequence) -> BlockTable: return block_table def allocate(self, seq_group: SequenceGroup) -> None: - encoder_seq = seq_group.get_encoder_seq() - is_encoder_decoder = seq_group.is_encoder_decoder() # Allocate self-attention block tables for decoder sequences waiting_seqs = seq_group.get_seqs(status=SequenceStatus.WAITING) @@ -179,15 +161,10 @@ def allocate(self, seq_group: SequenceGroup) -> None: not in self.cross_block_tables), \ "block table already exists" - if (self.block_sliding_window is not None) and \ - is_encoder_decoder: - raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_SWA) - - if self.enable_caching and is_encoder_decoder: - raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE) + check_no_caching_or_swa_for_blckmgr_encdec(self, seq_group) - if is_encoder_decoder: - block_table = self._allocate_sequence(encoder_seq) + if seq_group.is_encoder_decoder(): + block_table = self._allocate_sequence(seq_group.get_encoder_seq()) self.cross_block_tables[request_id] = block_table def can_append_slots(self, seq_group: SequenceGroup, From 9ee2582172b2b273ede9cb0e3ced9d9f197ecc0b Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Sat, 25 May 2024 22:57:02 -0400 
Subject: [PATCH 38/47] removed circular import --- vllm/core/block/utils.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/vllm/core/block/utils.py b/vllm/core/block/utils.py index 6599011771cea..14b99496b12dc 100644 --- a/vllm/core/block/utils.py +++ b/vllm/core/block/utils.py @@ -1,10 +1,5 @@ """Block manager utils.""" -from typing import Union - -from vllm.core.block_manager_v1 import BlockSpaceManagerV1 -from vllm.core.block_manager_v2 import BlockSpaceManagerV2 from vllm.sequence import SequenceGroup - ''' Exception strings for non-implemented block manager encoder/decoder scenarios ''' @@ -17,10 +12,9 @@ "Prefix caching for encoder/decoder models " + \ "is not currently supported." + def check_no_caching_or_swa_for_blckmgr_encdec( - block_mgr: Union[BlockSpaceManagerV1, - BlockSpaceManagerV2], - seq_group: SequenceGroup) -> None: + block_mgr, seq_group: SequenceGroup) -> None: ''' Enforce that prefix caching & sliding-window attention (SWA) are currently unsupported *specifically* for encoder/decoder models. @@ -38,4 +32,4 @@ def check_no_caching_or_swa_for_blckmgr_encdec( raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_SWA) if block_mgr.enable_caching: - raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE) \ No newline at end of file + raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE) From 5d0ac231b751466771f25e9275acede785bf4344 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Sat, 25 May 2024 22:58:09 -0400 Subject: [PATCH 39/47] apparently isort has to run last? --- vllm/core/block/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/core/block/utils.py b/vllm/core/block/utils.py index 14b99496b12dc..4113f7e52b84f 100644 --- a/vllm/core/block/utils.py +++ b/vllm/core/block/utils.py @@ -1,5 +1,6 @@ """Block manager utils.""" from vllm.sequence import SequenceGroup + ''' Exception strings for non-implemented block manager encoder/decoder scenarios ''' From 1bcc949c7c4634da50d80d7bc4b47185e6ac6f18 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Sun, 26 May 2024 12:20:12 -0400 Subject: [PATCH 40/47] slight name change --- vllm/core/block/utils.py | 2 +- vllm/core/block_manager_v1.py | 6 +++--- vllm/core/block_manager_v2.py | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/vllm/core/block/utils.py b/vllm/core/block/utils.py index 4113f7e52b84f..3dee7ff16dd84 100644 --- a/vllm/core/block/utils.py +++ b/vllm/core/block/utils.py @@ -14,7 +14,7 @@ "is not currently supported." -def check_no_caching_or_swa_for_blckmgr_encdec( +def check_no_caching_or_swa_for_blockmgr_encdec( block_mgr, seq_group: SequenceGroup) -> None: ''' Enforce that prefix caching & sliding-window attention (SWA) diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py index fa64b96a5e7dc..201cba309f6ef 100644 --- a/vllm/core/block_manager_v1.py +++ b/vllm/core/block_manager_v1.py @@ -8,7 +8,7 @@ from typing import Set, Tuple from vllm.block import BlockTable, PhysicalTokenBlock -from vllm.core.block.utils import check_no_caching_or_swa_for_blckmgr_encdec +from vllm.core.block.utils import check_no_caching_or_swa_for_blockmgr_encdec from vllm.core.evictor_v1 import EvictionPolicy, Evictor, make_evictor from vllm.core.interfaces import AllocStatus, BlockSpaceManager from vllm.logger import init_logger @@ -269,7 +269,7 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: # FIXME(woosuk): Here we assume that all sequences in the group share # the same prompt. 
This may not be true for preempted sequences. - check_no_caching_or_swa_for_blckmgr_encdec(self, seq_group) + check_no_caching_or_swa_for_blockmgr_encdec(self, seq_group) self_num_required_blocks = self._get_seq_num_required_blocks( seq_group.get_seqs(status=SequenceStatus.WAITING)[0]) @@ -321,7 +321,7 @@ def _allocate_sequence(self, \ def allocate(self, seq_group: SequenceGroup) -> None: is_encoder_decoder = seq_group.is_encoder_decoder() - check_no_caching_or_swa_for_blckmgr_encdec(self, seq_group) + check_no_caching_or_swa_for_blockmgr_encdec(self, seq_group) # Allocate decoder sequences # diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 246ab9c297c5b..6185a65983d3a 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -5,7 +5,7 @@ from vllm.core.block.block_table import BlockTable from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator -from vllm.core.block.utils import check_no_caching_or_swa_for_blckmgr_encdec +from vllm.core.block.utils import check_no_caching_or_swa_for_blockmgr_encdec from vllm.core.interfaces import AllocStatus, BlockSpaceManager from vllm.sequence import Sequence, SequenceGroup, SequenceStatus from vllm.utils import Device @@ -93,7 +93,7 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: # FIXME(woosuk): Here we assume that all sequences in the group share # the same prompt. This may not be true for preempted sequences. - check_no_caching_or_swa_for_blckmgr_encdec(self, seq_group) + check_no_caching_or_swa_for_blockmgr_encdec(self, seq_group) seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0] num_required_blocks = BlockTable.get_num_required_blocks( @@ -161,7 +161,7 @@ def allocate(self, seq_group: SequenceGroup) -> None: not in self.cross_block_tables), \ "block table already exists" - check_no_caching_or_swa_for_blckmgr_encdec(self, seq_group) + check_no_caching_or_swa_for_blockmgr_encdec(self, seq_group) if seq_group.is_encoder_decoder(): block_table = self._allocate_sequence(seq_group.get_encoder_seq()) From 1bece71b45331ed5e371a3842e5a1bba5fe7a160 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 28 May 2024 12:27:47 -0400 Subject: [PATCH 41/47] wip merge --- vllm/core/block_manager_v2.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index b19f4b184db94..cad42ab3c1ba2 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -138,7 +138,6 @@ def _allocate_sequence(self, seq: Sequence) -> BlockTable: block_allocator=self.block_allocator, max_block_sliding_window=self.max_block_sliding_window, ) - assert self.block_sliding_window is None block_table.allocate(seq.get_token_ids()) return block_table From 1d882ca8d5825ab68988740e81796abadd083b06 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 28 May 2024 12:38:45 -0400 Subject: [PATCH 42/47] fixed utils to correctly handle encoder/decoder unsupported scenarios --- vllm/core/block/utils.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/vllm/core/block/utils.py b/vllm/core/block/utils.py index 3dee7ff16dd84..dd9345ab52d40 100644 --- a/vllm/core/block/utils.py +++ b/vllm/core/block/utils.py @@ -13,6 +13,26 @@ "Prefix caching for encoder/decoder models " + \ "is not currently supported." +def _get_block_mgr_sliding_window_attr(block_mgr): + ''' + BlockManagerV1 and BlockManagerV2 have slightly different + members related to sliding window attention (SWA). 
This + function extracts the appropriate member to use for determining + whether SWA is enabled. + + Arguments: + + * block_mgr: BlockManagerV1 or BlockManagerV2 instance + ''' + + if hasattr(block_mgr, 'block_sliding_window'): + return block_mgr.block_sliding_window + if hasattr(block_mgr, 'max_block_sliding_window'): + return block_mgr.max_block_sliding_window + + raise AttributeError("Block manager instance has neither " + \ + "block_sliding_window nor " + \ + "max_block_sliding_window attributes.") def check_no_caching_or_swa_for_blockmgr_encdec( block_mgr, seq_group: SequenceGroup) -> None: @@ -29,7 +49,7 @@ def check_no_caching_or_swa_for_blockmgr_encdec( ''' if seq_group.is_encoder_decoder(): - if block_mgr.block_sliding_window is not None: + if _get_block_mgr_sliding_window_attr(block_mgr) is not None: raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_SWA) if block_mgr.enable_caching: From dfd94692e0b35343e64aace3cd4a496564be5809 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 28 May 2024 12:39:17 -0400 Subject: [PATCH 43/47] formatting --- vllm/core/block/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/core/block/utils.py b/vllm/core/block/utils.py index dd9345ab52d40..c582ab270473c 100644 --- a/vllm/core/block/utils.py +++ b/vllm/core/block/utils.py @@ -13,6 +13,7 @@ "Prefix caching for encoder/decoder models " + \ "is not currently supported." + def _get_block_mgr_sliding_window_attr(block_mgr): ''' BlockManagerV1 and BlockManagerV2 have slightly different @@ -34,6 +35,7 @@ def _get_block_mgr_sliding_window_attr(block_mgr): "block_sliding_window nor " + \ "max_block_sliding_window attributes.") + def check_no_caching_or_swa_for_blockmgr_encdec( block_mgr, seq_group: SequenceGroup) -> None: ''' From 611df433882c1e10235084426d63fd817466dd19 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 28 May 2024 22:27:41 -0400 Subject: [PATCH 44/47] yapf fix --- vllm/core/block/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/core/block/utils.py b/vllm/core/block/utils.py index c582ab270473c..4da5a965616ac 100644 --- a/vllm/core/block/utils.py +++ b/vllm/core/block/utils.py @@ -1,6 +1,5 @@ """Block manager utils.""" from vllm.sequence import SequenceGroup - ''' Exception strings for non-implemented block manager encoder/decoder scenarios ''' From 8ee49dde309a93fd309f0117f74cde4949e958e4 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 28 May 2024 22:30:12 -0400 Subject: [PATCH 45/47] yapf fix --- vllm/core/block/utils.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/core/block/utils.py b/vllm/core/block/utils.py index 4da5a965616ac..2c412a8f472e0 100644 --- a/vllm/core/block/utils.py +++ b/vllm/core/block/utils.py @@ -1,8 +1,7 @@ """Block manager utils.""" from vllm.sequence import SequenceGroup -''' -Exception strings for non-implemented block manager encoder/decoder scenarios -''' + +# Exception strings for non-implemented block manager enc/dec scenarios STR_NOT_IMPL_ENC_DEC_SWA = \ "Sliding window attention for encoder/decoder models " + \ From 039c25eb6661f2aa89b4239235451f2c6f61d63d Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 28 May 2024 23:03:44 -0400 Subject: [PATCH 46/47] upstream merge --- tests/core/utils.py | 36 +++++++++++++++++++++++++++--------- vllm/core/block/utils.py | 1 + 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/tests/core/utils.py b/tests/core/utils.py index 1ccc5c3cc0a8e..cd2045b8a1889 100644 --- a/tests/core/utils.py +++ b/tests/core/utils.py @@ -55,12 
+55,24 @@ def create_dummy_prompt_encoder_decoder( # and prompt "0 ... block_size". decoder_prompt_tokens = list(range(decoder_prompt_length)) decoder_prompt_str = " ".join([str(t) for t in decoder_prompt_tokens]) - decoder_prompt = Sequence(int(request_id), decoder_prompt_str, - decoder_prompt_tokens, block_size) + + decoder_prompt = Sequence(int(request_id), + inputs={ + "prompt": decoder_prompt_str, + "prompt_token_ids": decoder_prompt_tokens, + "multi_modal_data": None, + }, + block_size=block_size) + encoder_prompt_tokens = list(reversed(list(range(encoder_prompt_length)))) encoder_prompt_str = " ".join([str(t) for t in encoder_prompt_tokens]) - encoder_prompt = Sequence(int(request_id), encoder_prompt_str, - encoder_prompt_tokens, block_size) + encoder_prompt = Sequence(int(request_id), + inputs={ + "prompt": encoder_prompt_str, + "prompt_token_ids": encoder_prompt_tokens, + "multi_modal_data": None, + }, + block_size=block_size) seq_group = SequenceGroup(request_id=request_id, seqs=[decoder_prompt], sampling_params=SamplingParams( @@ -134,8 +146,11 @@ def create_seq_group_encoder_decoder( for seq_id_offset, output_len in enumerate(seq_output_lens): seq = Sequence( seq_id=seq_id_start + seq_id_offset, - prompt="", - prompt_token_ids=prompt_token_ids, + inputs={ + "prompt": "", + "prompt_token_ids": prompt_token_ids, + "multi_modal_data": None, + }, block_size=16, ) @@ -149,8 +164,11 @@ def create_seq_group_encoder_decoder( # Encoder sequence encoder_seq = Sequence( seq_id=seq_id_start + len(seq_output_lens), - prompt="", - prompt_token_ids=prompt_token_ids, + inputs={ + "prompt": "", + "prompt_token_ids": prompt_token_ids, + "multi_modal_data": None, + }, block_size=16, ) @@ -162,4 +180,4 @@ def create_seq_group_encoder_decoder( def round_up_to_next_block(seq_len: int, block_size: int) -> int: - return (seq_len + block_size - 1) // block_size + return (seq_len + block_size - 1) // block_size \ No newline at end of file diff --git a/vllm/core/block/utils.py b/vllm/core/block/utils.py index 4da5a965616ac..c582ab270473c 100644 --- a/vllm/core/block/utils.py +++ b/vllm/core/block/utils.py @@ -1,5 +1,6 @@ """Block manager utils.""" from vllm.sequence import SequenceGroup + ''' Exception strings for non-implemented block manager encoder/decoder scenarios ''' From 8e9ef5bb5ae7bc3ece7ae527e591df093ff7f31e Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 28 May 2024 23:06:08 -0400 Subject: [PATCH 47/47] fix formatting issue --- vllm/core/block/utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vllm/core/block/utils.py b/vllm/core/block/utils.py index c582ab270473c..372bfb5ed2f9e 100644 --- a/vllm/core/block/utils.py +++ b/vllm/core/block/utils.py @@ -1,9 +1,7 @@ """Block manager utils.""" from vllm.sequence import SequenceGroup -''' -Exception strings for non-implemented block manager encoder/decoder scenarios -''' +# Exception strings for non-implemented block manager encoder/decoder scenarios STR_NOT_IMPL_ENC_DEC_SWA = \ "Sliding window attention for encoder/decoder models " + \