From 7eb0e0d7a42b3ac64a7912faf1f2822601da5f2a Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 15 May 2024 09:44:51 -0400 Subject: [PATCH 01/47] added block manager tests --- tests/core/test_block_manager.py | 132 ++++++++++++++++++++++++++++++- 1 file changed, 131 insertions(+), 1 deletion(-) diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py index 22a9f0cf47d32..6b2fa21f2ef46 100644 --- a/tests/core/test_block_manager.py +++ b/tests/core/test_block_manager.py @@ -12,7 +12,7 @@ from vllm.sequence import Logprob, Sequence, SequenceGroup, SequenceStatus from vllm.utils import Device -from .utils import create_dummy_prompt +from .utils import create_dummy_prompt, create_dummy_prompt_encoder_decoder def test_block_allocator_allocate(): @@ -89,6 +89,34 @@ def test_allocate(): block_manager.allocate(seq_group) assert block_manager.can_allocate(seq_group) != AllocStatus.OK +def test_allocate_encoder_decoder(): + block_size = 4 + num_cpu_blocks = 4 + num_gpu_blocks = 4 + block_req_per_seq_group = 2 + block_manager = BlockSpaceManagerV1(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0) + + # Allocate same sequence group to all available gpu blocks. + for i in range(num_gpu_blocks//block_req_per_seq_group): + _, _, seq_group = create_dummy_prompt_encoder_decoder(str(i), block_size, block_size) + assert block_manager.can_allocate(seq_group) + block_manager.allocate(seq_group) + assert block_manager.can_allocate(seq_group) != AllocStatus.OK + + # Allocate same sequence group to all available gpu blocks. + # Use watermark to reserve one gpu block. + block_manager = BlockSpaceManagerV1(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=1 / num_gpu_blocks) + for i in range((num_gpu_blocks - 1)//block_req_per_seq_group): + _, _, seq_group = create_dummy_prompt_encoder_decoder(str(i), block_size//2, block_size//2) + assert block_manager.can_allocate(seq_group) + block_manager.allocate(seq_group) + assert block_manager.can_allocate(seq_group) != AllocStatus.OK def test_append_slot_single_seq(): block_size = 4 @@ -240,6 +268,58 @@ def test_swap(): assert before_cpu_blocks + len(cpu_blocks) == after_cpu_blocks assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks) +def test_swap_encoder_decoder(): + block_size = 4 + num_cpu_blocks = 4 + num_gpu_blocks = 4 + block_manager = BlockSpaceManagerV1(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0) + + decoder_prompt, encoder_prompt, seq_group = create_dummy_prompt_encoder_decoder("1", + decoder_prompt_length=block_size, + encoder_prompt_length=block_size) + decoder_prompt.status = SequenceStatus.WAITING + encoder_prompt.status = SequenceStatus.WAITING + block_manager.allocate(seq_group) + + # Emulate a forward pass by appending a single token. + # The block manager then knows how many unprocessed + # tokens will be written in the next forward pass. + token_id = 0 + decoder_prompt.status = SequenceStatus.RUNNING + decoder_prompt.append_token_id(token_id, {token_id: Logprob(0.0)}) + + # Swap encoder/decoder seq group from GPU -> CPU. 
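+    # [Editor's note -- annotation added in review, not part of the patch:
+    # an encoder/decoder group owns two block tables, the decoder's
+    # per-sequence self-attention table and a single cross-attention table
+    # shared by the whole group, so the test concatenates both and checks
+    # that the swap mapping covers every one of those blocks.]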
+ decoder_gpu_blocks = block_manager.get_block_table(decoder_prompt) + encoder_gpu_blocks = block_manager.get_encoder_block_table(seq_group) + gpu_blocks = decoder_gpu_blocks + encoder_gpu_blocks + assert block_manager.can_swap_out(seq_group) + before_cpu_blocks = block_manager.get_num_free_cpu_blocks() + before_gpu_blocks = block_manager.get_num_free_gpu_blocks() + mapping = block_manager.swap_out(seq_group) + assert [x[0] for x in mapping] == gpu_blocks + #assert list(mapping.keys()) == gpu_blocks + after_cpu_blocks = block_manager.get_num_free_cpu_blocks() + after_gpu_blocks = block_manager.get_num_free_gpu_blocks() + assert before_cpu_blocks == after_cpu_blocks + len(gpu_blocks) + assert before_gpu_blocks + len(gpu_blocks) == after_gpu_blocks + decoder_prompt.status = SequenceStatus.SWAPPED + + # Swap decoder seq group from CPU -> GPU. + decoder_cpu_blocks = block_manager.get_block_table(decoder_prompt) + encoder_cpu_blocks = block_manager.get_encoder_block_table(seq_group) + cpu_blocks = decoder_cpu_blocks + encoder_cpu_blocks + assert block_manager.can_swap_in(seq_group) == AllocStatus.OK + before_cpu_blocks = block_manager.get_num_free_cpu_blocks() + before_gpu_blocks = block_manager.get_num_free_gpu_blocks() + mapping = block_manager.swap_in(seq_group) + assert [x[0] for x in mapping] == cpu_blocks + after_cpu_blocks = block_manager.get_num_free_cpu_blocks() + after_gpu_blocks = block_manager.get_num_free_gpu_blocks() + assert before_cpu_blocks + len(cpu_blocks) == after_cpu_blocks + assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks) def test_free(): block_size = 4 @@ -264,6 +344,34 @@ def test_free(): with pytest.raises(KeyError): block_manager.get_block_table(prompt) +def test_free_encoder_decoder(): + block_size = 4 + num_cpu_blocks = 4 + num_gpu_blocks = 4 + block_manager = BlockSpaceManagerV1(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0) + + decoder_prompt, encoder_prompt, seq_group = create_dummy_prompt_encoder_decoder("1", + decoder_prompt_length=block_size//2, + encoder_prompt_length=block_size//2) + block_manager.allocate(seq_group) + + # Free allocated seq. + decoder_prompt_blocks = len(block_manager.get_block_table(decoder_prompt)) + encoder_prompt_blocks = len(block_manager.get_encoder_block_table(seq_group)) + prompt_blocks = decoder_prompt_blocks + encoder_prompt_blocks + before_blocks = block_manager.get_num_free_gpu_blocks() + block_manager.free(decoder_prompt) + block_manager.free_encoder(seq_group) + after_blocks = block_manager.get_num_free_gpu_blocks() + assert after_blocks == before_blocks + prompt_blocks + + # Block table for freed encoder & decoder seq's are deleted. + with pytest.raises(KeyError): + block_manager.get_block_table(decoder_prompt) + block_manager.get_block_table(encoder_prompt) def test_reset(): block_size = 4 @@ -285,6 +393,28 @@ def test_reset(): block_manager.reset() assert block_manager.get_num_free_gpu_blocks() == original_blocks +def test_reset_encoder_decoder(): + block_size = 4 + num_cpu_blocks = 4 + num_gpu_blocks = 4 + block_req_per_seq_group = 2 + block_manager = BlockSpaceManagerV1(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0) + + # Allocate same seq group on all available gpu blocks. 
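+    # [Editor's note -- annotation, not part of the patch: each group here
+    # needs one block for its block_size-token decoder prompt and one for
+    # its encoder prompt, i.e. block_req_per_seq_group == 2, so the
+    # num_gpu_blocks // 2 == 2 iterations below exactly exhaust the pool.]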
+ original_blocks = block_manager.get_num_free_gpu_blocks() + for i in range(num_gpu_blocks//block_req_per_seq_group): + _, _, seq_group = create_dummy_prompt_encoder_decoder(f"{i}", + decoder_prompt_length=block_size, + encoder_prompt_length=block_size) + block_manager.allocate(seq_group) + assert block_manager.get_num_free_gpu_blocks() == 0 + + # Resetting block manager frees all allocated blocks. + block_manager.reset() + assert block_manager.get_num_free_gpu_blocks() == original_blocks def test_sliding_window_multi_seq(): """ From 6e41c39b24e8bdcff76ebbab0b95e16c0603e0b3 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 15 May 2024 09:52:03 -0400 Subject: [PATCH 02/47] passing block manager encoder/decoder test --- tests/core/utils.py | 29 ++++++++ vllm/core/block_manager_v1.py | 130 ++++++++++++++++++++++++++++++++-- vllm/sequence.py | 12 ++++ 3 files changed, 166 insertions(+), 5 deletions(-) diff --git a/tests/core/utils.py b/tests/core/utils.py index 8fb13177a2d6c..170bf9fff3dd2 100644 --- a/tests/core/utils.py +++ b/tests/core/utils.py @@ -32,6 +32,35 @@ def create_dummy_prompt( return prompt, seq_group +def create_dummy_prompt_encoder_decoder( + request_id: str, + decoder_prompt_length: int, + encoder_prompt_length: int, + block_size: Optional[int] = None, + lora_request: Optional[LoRARequest] = None, + use_beam_search: bool = False, + best_of: int = 1, +) -> Tuple[Sequence, SequenceGroup]: + if not block_size: + block_size = decoder_prompt_length + + # Create dummy prompt sequence with tokens 0...block_size-1 + # and prompt "0 ... block_size". + decoder_prompt_tokens = list(range(decoder_prompt_length)) + decoder_prompt_str = " ".join([str(t) for t in decoder_prompt_tokens]) + decoder_prompt = Sequence(int(request_id), decoder_prompt_str, decoder_prompt_tokens, block_size) + encoder_prompt_tokens = list(reversed(list(range(encoder_prompt_length)))) + encoder_prompt_str = " ".join([str(t) for t in encoder_prompt_tokens]) + encoder_prompt = Sequence(int(request_id), encoder_prompt_str, encoder_prompt_tokens, block_size) + seq_group = SequenceGroup( + request_id=request_id, + seqs=[decoder_prompt], + sampling_params=SamplingParams(use_beam_search=use_beam_search, best_of=best_of), + arrival_time=time.time(), + lora_request=lora_request, + encoder_seq=encoder_prompt) + + return decoder_prompt, encoder_prompt, seq_group def create_seq_group( seq_prompt_len: int = 1024, diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py index 52a170d79e4e7..bd2ccbbb86572 100644 --- a/vllm/core/block_manager_v1.py +++ b/vllm/core/block_manager_v1.py @@ -255,12 +255,23 @@ def __init__( Device.CPU, block_size, num_cpu_blocks) # Mapping: seq_id -> BlockTable. self.block_tables: Dict[int, BlockTable] = {} + # Mapping: req_id -> BlockTable + # Note that each SequenceGroup has a unique + # request ID + self.encoder_block_tables: Dict[str, BlockTable] = {} + + def get_seq_num_required_blocks(self, seq: Sequence) -> int: + if seq is None: + return 0 + return len(seq.logical_token_blocks) def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: # FIXME(woosuk): Here we assume that all sequences in the group share # the same prompt. This may not be true for preempted sequences. 
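         # [Editor's note -- annotation, not part of the patch: the change
         # below budgets for both block tables of an encoder/decoder request.
         # For example, with block_size=4, a 4-token decoder prompt plus a
         # 4-token encoder prompt require 1 + 1 = 2 blocks, and that total
         # is what gets compared against the free-block watermark.]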
- seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0] - num_required_blocks = len(seq.logical_token_blocks) + + decoder_num_required_blocks = self.get_seq_num_required_blocks(seq_group.get_seqs(status=SequenceStatus.WAITING)[0]) + encoder_num_required_blocks = self.get_seq_num_required_blocks(seq_group.get_encoder_seq()) + num_required_blocks = decoder_num_required_blocks+encoder_num_required_blocks if self.block_sliding_window is not None: num_required_blocks = min(num_required_blocks, @@ -276,9 +287,9 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: else: return AllocStatus.LATER - def allocate(self, seq_group: SequenceGroup) -> None: + def allocate_decoder(self, seq_group: SequenceGroup) -> None: # NOTE: Here we assume that all sequences in the group have the same - # prompt. + # decoder prompt. seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0] # Allocate new physical token blocks that will store the prompt tokens. @@ -301,10 +312,46 @@ def allocate(self, seq_group: SequenceGroup) -> None: block.ref_count = seq_group.num_seqs() block_table.append(block) - # Assign the block table for each sequence. + # Assign the decoder block table for each sequence. for seq in seq_group.get_seqs(status=SequenceStatus.WAITING): self.block_tables[seq.seq_id] = block_table.copy() + def allocate_encoder(self, seq_group: SequenceGroup) -> None: + # NOTE: Here we assume that all sequences in the group have the same + # encoder prompt. + seq = seq_group.get_encoder_seq() + + # Allocate new physical token blocks that will store the prompt tokens. + block_table: BlockTable = [] + if seq is None: + # Assign empty encoder block table for the SequenceGroup + self.encoder_block_tables[seq_group.request_id] = block_table + else: + num_prompt_blocks = len(seq.logical_token_blocks) + for logical_idx in range(num_prompt_blocks): + if (self.block_sliding_window is not None + and logical_idx >= self.block_sliding_window): + block = block_table[logical_idx % self.block_sliding_window] + # Set the reference counts of the token blocks. + block.ref_count = seq_group.num_seqs() + elif self.enable_caching: + block = self.gpu_allocator.allocate( + seq.hash_of_block(logical_idx), + seq.num_hashed_tokens_of_block(logical_idx)) + else: + block = self.gpu_allocator.allocate() + # Set the reference counts of the token blocks. + # TODO: feature not supported with encoder/decoder + block.ref_count = seq_group.num_seqs() + block_table.append(block) + + # Assign the encoder block table for the SequenceGroup. + self.encoder_block_tables[seq_group.request_id] = block_table + + def allocate(self, seq_group: SequenceGroup) -> None: + self.allocate_decoder(seq_group) + self.allocate_encoder(seq_group) + def can_append_slots(self, seq_group: SequenceGroup, num_lookahead_slots: int = 0) -> bool: @@ -445,11 +492,15 @@ def _get_physical_blocks( self, seq_group: SequenceGroup) -> List[PhysicalTokenBlock]: # NOTE: Here, we assume that the physical blocks are only shared by # the sequences in the same group. 
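         # [Editor's note -- annotation, not part of the patch: the
         # cross-attention table is keyed by request_id rather than seq_id,
         # since a SequenceGroup has exactly one encoder sequence; its blocks
         # therefore join the swappable set once per group, not per sequence.]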
+ request_id = seq_group.request_id blocks: Set[PhysicalTokenBlock] = set() for seq in seq_group.get_seqs(): if seq.is_finished(): continue blocks.update(self.block_tables[seq.seq_id]) + # Encoder blocks + if seq_group.encoder_seq is not None: + blocks.update(self.encoder_block_tables[request_id]) return list(blocks) def can_swap_in(self, @@ -459,6 +510,8 @@ def can_swap_in(self, ), "BlockSpaceManagerV1 does not support lookahead allocation" blocks = self._get_physical_blocks(seq_group) num_swapped_seqs = seq_group.num_seqs(status=SequenceStatus.SWAPPED) + if seq_group.encoder_seq is not None: + num_swapped_seqs += 1 num_free_blocks = self.gpu_allocator.get_num_free_blocks() # NOTE: Conservatively, we assume that every sequence will allocate # at least one free block right after the swap-in. @@ -477,6 +530,8 @@ def swap_in(self, assert (num_lookahead_slots == 0 ), "BlockSpaceManagerV1 does not support lookahead allocation" + request_id = seq_group.request_id + # CPU block -> GPU block. # dict is efficient in lookup `if cpu_block in mapping` mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock] = {} @@ -497,6 +552,23 @@ def swap_in(self, self.cpu_allocator.free(cpu_block) self.block_tables[seq.seq_id] = new_block_table + if seq_group.encoder_seq is not None: + new_block_table: BlockTable = [] + block_table = self.encoder_block_tables[request_id] + + for cpu_block in block_table: + if cpu_block in mapping: + gpu_block = mapping[cpu_block] + gpu_block.ref_count += 1 + else: + gpu_block = self.gpu_allocator.allocate( + cpu_block.block_hash, cpu_block.num_hashed_tokens) + mapping[cpu_block] = gpu_block + new_block_table.append(gpu_block) + # Free the CPU block swapped in to GPU. + self.cpu_allocator.free(cpu_block) + self.encoder_block_tables[request_id] = new_block_table + block_number_mapping = { cpu_block.block_number: gpu_block.block_number for cpu_block, gpu_block in mapping.items() @@ -509,6 +581,8 @@ def can_swap_out(self, seq_group: SequenceGroup) -> bool: return len(blocks) <= self.cpu_allocator.get_num_free_blocks() def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: + request_id = seq_group.request_id + # GPU block -> CPU block. # dict is efficient in lookup `if gpu_block in mapping` mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock] = {} @@ -529,6 +603,23 @@ def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: self.gpu_allocator.free(gpu_block) self.block_tables[seq.seq_id] = new_block_table + if seq_group.encoder_seq is not None: + new_block_table: BlockTable = [] + block_table = self.encoder_block_tables[request_id] + + for gpu_block in block_table: + if gpu_block in mapping: + cpu_block = mapping[gpu_block] + cpu_block.ref_count += 1 + else: + cpu_block = self.cpu_allocator.allocate( + gpu_block.block_hash, gpu_block.num_hashed_tokens) + mapping[gpu_block] = cpu_block + new_block_table.append(cpu_block) + # Free the GPU block swapped out to CPU. + self.gpu_allocator.free(gpu_block) + self.encoder_block_tables[request_id] = new_block_table + block_number_mapping = { gpu_block.block_number: cpu_block.block_number for gpu_block, cpu_block in mapping.items() @@ -559,15 +650,32 @@ def free(self, seq: Sequence) -> None: self._free_block_table(block_table) del self.block_tables[seq.seq_id] + def free_encoder(self, seq_group: SequenceGroup) -> None: + if seq_group.request_id not in self.encoder_block_tables: + # Already freed or hasn't ben scheduled yet. 
+ return + block_table = self.encoder_block_tables[seq_group.request_id] + self._free_block_table(block_table) + del self.encoder_block_tables[seq_group.request_id] + def reset(self) -> None: + # Free decoder block tables for block_table in self.block_tables.values(): self._free_block_table(block_table) self.block_tables.clear() + # Free encoder block tables + for block_table in self.encoder_block_tables.values(): + self._free_block_table(block_table) + self.encoder_block_tables.clear() def get_block_table(self, seq: Sequence) -> List[int]: block_table = self.block_tables[seq.seq_id] return [block.block_number for block in block_table] + def get_encoder_block_table(self, seq_group: SequenceGroup) -> List[int]: + block_table = self.encoder_block_tables[seq_group.request_id] + return [block.block_number for block in block_table] + def get_num_free_gpu_blocks(self) -> int: return self.gpu_allocator.get_num_free_blocks() @@ -586,6 +694,18 @@ def access_all_blocks_in_seq( for block in block_table: block.last_accessed = access_time + def access_all_encoder_blocks_in_seq_group( + self, + seq_group: SequenceGroup, + access_time: float, + ) -> None: + if self.enable_caching: + # Update the last accessed time of all the blocks accessed + # in this step. + block_table = self.encoder_block_tables[seq_group.request_id] + for block in block_table: + block.last_accessed = access_time + def compute_full_blocks_in_seq(self, seq: Sequence): if seq.seq_id not in self.block_tables: return diff --git a/vllm/sequence.py b/vllm/sequence.py index aa759448d82b1..ca2de3ef0d774 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -420,6 +420,7 @@ class SequenceGroup: for an embedding model. pooling_params: The pooling parameters used to generate the pooling for an embedding model. + encoder_seq: Optional, the single encoder sequence. """ def __init__( @@ -432,6 +433,7 @@ def __init__( multi_modal_data: Optional[MultiModalData] = None, embeddings: Optional[List[float]] = None, pooling_params: Optional[PoolingParams] = None, + encoder_seq: Optional[Sequence] = None, ) -> None: self.request_id = request_id self.seqs_dict = {seq.seq_id: seq for seq in seqs} @@ -447,6 +449,7 @@ def __init__( self.multi_modal_data = multi_modal_data self.embeddings = embeddings self.pooling_params = pooling_params + self.encoder_seq = encoder_seq @property def prompt(self) -> str: @@ -524,6 +527,9 @@ def get_seqs( seq for seq in self.seqs_dict.values() if seq.status == status ] + def get_encoder_seq(self) -> Sequence: + return self.encoder_seq + def get_unfinished_seqs(self) -> List[Sequence]: return [ seq for seq in self.seqs_dict.values() if not seq.is_finished() @@ -607,6 +613,8 @@ class SequenceGroupMetadata: used in prefix caching. state: Internal state tied to this sequence group. multi_modal_data: Multi modal data. + encoder_seq_data: Optional, the sequence data for the single encoder prompt. + encoder_block_table: Optional, the block table for the single encoder prompt. 
""" def __init__( @@ -623,6 +631,8 @@ def __init__( computed_block_nums: Optional[List[int]] = None, state: Optional[SequenceGroupState] = None, multi_modal_data: Optional[MultiModalData] = None, + encoder_seq_data: Optional[SequenceData] = None, + encoder_block_table: Optional[Dict[int, List[int]]] = None, ) -> None: self.request_id = request_id self.is_prompt = is_prompt @@ -634,6 +644,8 @@ def __init__( self.computed_block_nums = computed_block_nums self.multi_modal_data = multi_modal_data self.state = SequenceGroupState() if state is None else state + self.encoder_seq_data = encoder_seq_data + self.encoder_block_table = encoder_block_table self._token_chunk_size = token_chunk_size self.do_sample = do_sample From f04ee73114eb50dbf03cb1d2a9ecd238705db035 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 15 May 2024 14:22:04 -0400 Subject: [PATCH 03/47] block manager v2 changes to pass test_can_allocate_seq_group_encoder_decoder --- tests/core/block/test_block_manager_v2.py | 49 ++++++++++++++++++++++- tests/core/utils.py | 49 +++++++++++++++++++++++ vllm/core/block_manager_v2.py | 6 +++ 3 files changed, 103 insertions(+), 1 deletion(-) diff --git a/tests/core/block/test_block_manager_v2.py b/tests/core/block/test_block_manager_v2.py index 1e8e4ccdfb151..6cb2f3708199f 100644 --- a/tests/core/block/test_block_manager_v2.py +++ b/tests/core/block/test_block_manager_v2.py @@ -5,7 +5,7 @@ from vllm.sequence import Logprob, SequenceStatus from vllm.utils import chunk_list -from ..utils import create_seq_group +from ..utils import create_seq_group, create_seq_group_encoder_decoder @pytest.mark.parametrize("block_size", [16]) @@ -52,6 +52,53 @@ def test_can_allocate_seq_group(block_size: int, num_seqs_per_group: int, assert can_allocate_result == AllocStatus.LATER +@pytest.mark.parametrize("block_size", [16]) +@pytest.mark.parametrize("num_gpu_blocks", [16, 80, 160]) +@pytest.mark.parametrize("num_seqs_per_group", [1, 4]) +@pytest.mark.parametrize("watermark", [0.0, 0.5]) +def test_can_allocate_seq_group_encoder_decoder(block_size: int, num_seqs_per_group: int, + num_gpu_blocks: int, watermark: float): + block_manager = BlockSpaceManagerV2( + block_size=block_size, + num_gpu_blocks=num_gpu_blocks, + num_cpu_blocks=1024, + watermark=watermark, + ) + num_watermark_blocks = int(watermark * num_gpu_blocks) + + num_output_blocks_per_seq = 1 + + # NOTE: This should be num_output_blocks_per_seq * num_seqs_per_group, but + # the current implementation assumes all seqs are new prompts / don't have + # different output lens. 
+ num_output_blocks = num_output_blocks_per_seq + + for bdx,num_prompt_blocks in enumerate(range(1, num_gpu_blocks - num_output_blocks)): + num_encoder_blocks_per_seq = num_prompt_blocks + + seq_group = create_seq_group_encoder_decoder( + seq_prompt_len=block_size * num_prompt_blocks, + seq_output_lens=[ + block_size * num_output_blocks_per_seq + for _ in range(num_seqs_per_group) + ], + request_id=str(bdx) + ) + + assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks + + can_allocate_result = block_manager.can_allocate(seq_group) + + num_required_blocks = num_prompt_blocks + num_output_blocks + num_encoder_blocks_per_seq + + if num_gpu_blocks - num_required_blocks < num_watermark_blocks: + assert can_allocate_result == AllocStatus.NEVER + elif num_gpu_blocks >= num_required_blocks: + assert can_allocate_result == AllocStatus.OK + else: + assert can_allocate_result == AllocStatus.LATER + + @pytest.mark.parametrize("block_size", [1, 8]) @pytest.mark.parametrize("prompt_len", [1, 7, 8]) @pytest.mark.parametrize("num_slots_to_append", [1, 8, 129]) diff --git a/tests/core/utils.py b/tests/core/utils.py index 170bf9fff3dd2..91930457bd25b 100644 --- a/tests/core/utils.py +++ b/tests/core/utils.py @@ -102,5 +102,54 @@ def create_seq_group( return seq_group +def create_seq_group_encoder_decoder( + seq_prompt_len: int = 1024, + seq_output_lens: Iterable[int] = (128, ), + request_id: str = '0', + seq_id_start: int = 0, + sampling_params: Optional[SamplingParams] = None) -> SequenceGroup: + + assert len(seq_output_lens) > 0 + + if sampling_params is None: + sampling_params = SamplingParams() + + prompt_token_ids = [0] * seq_prompt_len + + seqs = [] + for seq_id_offset, output_len in enumerate(seq_output_lens): + seq = Sequence( + seq_id=seq_id_start + seq_id_offset, + prompt="", + prompt_token_ids=prompt_token_ids, + block_size=16, + ) + + for i in range(output_len): + seq.append_token_id( + token_id=i, + logprobs={i: Logprob(0.0)}, + ) + seqs.append(seq) + + # Encoder sequence + encoder_seq = Sequence( + seq_id=seq_id_start + len(seq_output_lens), + prompt="", + prompt_token_ids=prompt_token_ids, + block_size=16, + ) + + seq_group = SequenceGroup( + request_id=request_id, + seqs=seqs, + sampling_params=sampling_params, + arrival_time=time.time(), + encoder_seq=encoder_seq + ) + + return seq_group + + def round_up_to_next_block(seq_len: int, block_size: int) -> int: return (seq_len + block_size - 1) // block_size diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index f0bc96564050a..06bfbba78dce6 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -96,6 +96,12 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: block_size=self.block_size, ) + if seq_group.encoder_seq is not None: + num_required_blocks += BlockTable.get_num_required_blocks( + seq_group.encoder_seq.get_token_ids(), + block_size=self.block_size, + ) + assert self.block_sliding_window is None if self.block_sliding_window is not None: num_required_blocks = min(num_required_blocks, From 07bbd8ac4c44f50f42137350ee928483842d02ee Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 15 May 2024 14:47:47 -0400 Subject: [PATCH 04/47] block manager v2 support for encoder/decoder --- vllm/core/block_manager_v1.py | 9 ++---- vllm/core/block_manager_v2.py | 59 ++++++++++++++++++++++++++++++++++- 2 files changed, 61 insertions(+), 7 deletions(-) diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py index bd2ccbbb86572..812d1ee3197a5 100644 --- 
a/vllm/core/block_manager_v1.py +++ b/vllm/core/block_manager_v1.py @@ -319,14 +319,11 @@ def allocate_decoder(self, seq_group: SequenceGroup) -> None: def allocate_encoder(self, seq_group: SequenceGroup) -> None: # NOTE: Here we assume that all sequences in the group have the same # encoder prompt. - seq = seq_group.get_encoder_seq() # Allocate new physical token blocks that will store the prompt tokens. - block_table: BlockTable = [] - if seq is None: - # Assign empty encoder block table for the SequenceGroup - self.encoder_block_tables[seq_group.request_id] = block_table - else: + seq = seq_group.get_encoder_seq() + if seq is not None: + block_table: BlockTable = [] num_prompt_blocks = len(seq.logical_token_blocks) for logical_idx in range(num_prompt_blocks): if (self.block_sliding_window is not None diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 06bfbba78dce6..2f7a11bacc1a1 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -10,6 +10,7 @@ from vllm.utils import Device SeqId = int +EncoderSeqId = str class BlockSpaceManagerV2(BlockSpaceManager): @@ -85,6 +86,7 @@ def __init__( ) self.block_tables: Dict[SeqId, BlockTable] = {} + self.encoder_block_tables: Dict[EncoderSeqId, BlockTable] = {} def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: # FIXME(woosuk): Here we assume that all sequences in the group share @@ -119,7 +121,7 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: else: return AllocStatus.LATER - def allocate(self, seq_group: SequenceGroup) -> None: + def allocate_decoder(self, seq_group: SequenceGroup) -> None: waiting_seqs = seq_group.get_seqs(status=SequenceStatus.WAITING) assert not (set(seq.seq_id for seq in waiting_seqs) & self.block_tables.keys()), "block table already exists" @@ -140,6 +142,28 @@ def allocate(self, seq_group: SequenceGroup) -> None: for seq in waiting_seqs[1:]: self.block_tables[seq.seq_id] = block_table.fork() + def allocate_encoder(self, seq_group: SequenceGroup) -> None: + # NOTE: Here we assume that all sequences in the group have the same + # prompt. + request_id = seq_group.request_id + seq = seq_group.encoder_seq + + assert not (request_id in self.encoder_block_tables), "block table already exists" + + seq = seq_group.get_encoder_seq() + if seq is not None: + block_table = BlockTable( + block_size=self.block_size, + block_allocator=self.block_allocator, + ) + assert self.block_sliding_window is None + block_table.allocate(seq.get_token_ids()) + self.encoder_block_tables[request_id] = block_table + + def allocate(self, seq_group: SequenceGroup) -> None: + self.allocate_decoder(seq_group) + self.allocate_encoder(seq_group) + def can_append_slots(self, seq_group: SequenceGroup, num_lookahead_slots: int) -> bool: """Determine if there is enough space in the GPU KV cache to continue @@ -193,12 +217,29 @@ def free(self, seq: Sequence) -> None: self.block_tables[seq.seq_id].free() del self.block_tables[seq.seq_id] + def free_encoder(self, seq_group: SequenceGroup) -> None: + request_id = seq_group.request_id + if request_id not in self.encoder_block_tables: + # Already freed or hasn't ben scheduled yet. 
+ return + self.encoder_block_tables[request_id].free() + del self.encoder_block_tables[request_id] + + del self.encoder_block_tables[seq_group.request_id] + def get_block_table(self, seq: Sequence) -> List[int]: assert seq.seq_id in self.block_tables block_ids = self.block_tables[seq.seq_id].physical_block_ids assert all(b is not None for b in block_ids) return block_ids # type: ignore + def get_encoder_block_table(self, seq_group: SequenceGroup) -> List[int]: + request_id = seq_group.request_id + assert request_id in self.encoder_block_tables + block_ids = self.block_tables[request_id].physical_block_ids + assert all(b is not None for b in block_ids) + return block_ids + def access_all_blocks_in_seq(self, seq: Sequence, now: float): # Update the last accessed time of all the blocks accessed # in this step. @@ -215,6 +256,22 @@ def access_all_blocks_in_seq(self, seq: Sequence, now: float): block_ids, # type: ignore now) + def access_all_encoder_blocks_in_seq_group( + self, + seq_group: SequenceGroup, + now: float, + ) -> None: + if self.enable_caching: + # Update the last accessed time of all the blocks accessed + # in this step. + block_table = self.encoder_block_tables[seq_group.request_id] + block_ids = [] + for block_id in block_table.physical_block_ids: + block_ids.append(block_id) + self.block_allocator.mark_blocks_as_accessed( + block_ids, # type: ignore + now) + def mark_blocks_as_computed(self, seq_group: SequenceGroup): # The only need for mark block as computed is for prefix caching, # while currently we could determine whether one block is computed From 3e95602f9c408f82628e881f30540ac82b3cb5f7 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 15 May 2024 15:11:35 -0400 Subject: [PATCH 05/47] renamed encoder to cross in block manager v2, regarding block tables --- vllm/core/block_manager_v2.py | 32 ++++++++++++++++---------------- vllm/sequence.py | 6 +++--- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 2f7a11bacc1a1..426612f615508 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -86,7 +86,7 @@ def __init__( ) self.block_tables: Dict[SeqId, BlockTable] = {} - self.encoder_block_tables: Dict[EncoderSeqId, BlockTable] = {} + self.cross_block_tables: Dict[EncoderSeqId, BlockTable] = {} def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: # FIXME(woosuk): Here we assume that all sequences in the group share @@ -121,7 +121,7 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: else: return AllocStatus.LATER - def allocate_decoder(self, seq_group: SequenceGroup) -> None: + def allocate_self_block_tables(self, seq_group: SequenceGroup) -> None: waiting_seqs = seq_group.get_seqs(status=SequenceStatus.WAITING) assert not (set(seq.seq_id for seq in waiting_seqs) & self.block_tables.keys()), "block table already exists" @@ -142,13 +142,13 @@ def allocate_decoder(self, seq_group: SequenceGroup) -> None: for seq in waiting_seqs[1:]: self.block_tables[seq.seq_id] = block_table.fork() - def allocate_encoder(self, seq_group: SequenceGroup) -> None: + def allocate_cross_block_table(self, seq_group: SequenceGroup) -> None: # NOTE: Here we assume that all sequences in the group have the same # prompt. 
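         # [Editor's note -- annotation, not part of the patch: "cross"
         # refers to the cross-attention KV cache; a request gets at most one
         # cross block table, allocated from the encoder prompt's token ids
         # and looked up by request_id.]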
request_id = seq_group.request_id seq = seq_group.encoder_seq - assert not (request_id in self.encoder_block_tables), "block table already exists" + assert not (request_id in self.cross_block_tables), "block table already exists" seq = seq_group.get_encoder_seq() if seq is not None: @@ -158,11 +158,11 @@ def allocate_encoder(self, seq_group: SequenceGroup) -> None: ) assert self.block_sliding_window is None block_table.allocate(seq.get_token_ids()) - self.encoder_block_tables[request_id] = block_table + self.cross_block_tables[request_id] = block_table def allocate(self, seq_group: SequenceGroup) -> None: - self.allocate_decoder(seq_group) - self.allocate_encoder(seq_group) + self.allocate_self_block_tables(seq_group) + self.allocate_cross_block_table(seq_group) def can_append_slots(self, seq_group: SequenceGroup, num_lookahead_slots: int) -> bool: @@ -217,15 +217,15 @@ def free(self, seq: Sequence) -> None: self.block_tables[seq.seq_id].free() del self.block_tables[seq.seq_id] - def free_encoder(self, seq_group: SequenceGroup) -> None: + def free_cross(self, seq_group: SequenceGroup) -> None: request_id = seq_group.request_id - if request_id not in self.encoder_block_tables: + if request_id not in self.cross_block_tables: # Already freed or hasn't ben scheduled yet. return - self.encoder_block_tables[request_id].free() - del self.encoder_block_tables[request_id] + self.cross_block_tables[request_id].free() + del self.cross_block_tables[request_id] - del self.encoder_block_tables[seq_group.request_id] + del self.cross_block_tables[seq_group.request_id] def get_block_table(self, seq: Sequence) -> List[int]: assert seq.seq_id in self.block_tables @@ -233,9 +233,9 @@ def get_block_table(self, seq: Sequence) -> List[int]: assert all(b is not None for b in block_ids) return block_ids # type: ignore - def get_encoder_block_table(self, seq_group: SequenceGroup) -> List[int]: + def get_cross_block_table(self, seq_group: SequenceGroup) -> List[int]: request_id = seq_group.request_id - assert request_id in self.encoder_block_tables + assert request_id in self.cross_block_tables block_ids = self.block_tables[request_id].physical_block_ids assert all(b is not None for b in block_ids) return block_ids @@ -256,7 +256,7 @@ def access_all_blocks_in_seq(self, seq: Sequence, now: float): block_ids, # type: ignore now) - def access_all_encoder_blocks_in_seq_group( + def access_all_cross_blocks_in_seq_group( self, seq_group: SequenceGroup, now: float, @@ -264,7 +264,7 @@ def access_all_encoder_blocks_in_seq_group( if self.enable_caching: # Update the last accessed time of all the blocks accessed # in this step. - block_table = self.encoder_block_tables[seq_group.request_id] + block_table = self.cross_block_tables[seq_group.request_id] block_ids = [] for block_id in block_table.physical_block_ids: block_ids.append(block_id) diff --git a/vllm/sequence.py b/vllm/sequence.py index ca2de3ef0d774..a73e70c1ae69d 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -614,7 +614,7 @@ class SequenceGroupMetadata: state: Internal state tied to this sequence group. multi_modal_data: Multi modal data. encoder_seq_data: Optional, the sequence data for the single encoder prompt. - encoder_block_table: Optional, the block table for the single encoder prompt. + cross_block_table: Optional, the cross-attention block table associated with the single encoder prompt. 
""" def __init__( @@ -632,7 +632,7 @@ def __init__( state: Optional[SequenceGroupState] = None, multi_modal_data: Optional[MultiModalData] = None, encoder_seq_data: Optional[SequenceData] = None, - encoder_block_table: Optional[Dict[int, List[int]]] = None, + cross_block_table: Optional[Dict[int, List[int]]] = None, ) -> None: self.request_id = request_id self.is_prompt = is_prompt @@ -645,7 +645,7 @@ def __init__( self.multi_modal_data = multi_modal_data self.state = SequenceGroupState() if state is None else state self.encoder_seq_data = encoder_seq_data - self.encoder_block_table = encoder_block_table + self.cross_block_table = cross_block_table self._token_chunk_size = token_chunk_size self.do_sample = do_sample From 04f38a819445c0141246feeb6969cc4b1e67891f Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 15 May 2024 15:22:53 -0400 Subject: [PATCH 06/47] renamed encoder to cross where appropriate --- tests/core/block/test_block_manager_v2.py | 4 +- tests/core/test_block_manager.py | 12 ++--- vllm/core/block_manager_v1.py | 54 +++++++++++------------ 3 files changed, 35 insertions(+), 35 deletions(-) diff --git a/tests/core/block/test_block_manager_v2.py b/tests/core/block/test_block_manager_v2.py index 6cb2f3708199f..9b1c6cd68a15a 100644 --- a/tests/core/block/test_block_manager_v2.py +++ b/tests/core/block/test_block_manager_v2.py @@ -74,7 +74,7 @@ def test_can_allocate_seq_group_encoder_decoder(block_size: int, num_seqs_per_gr num_output_blocks = num_output_blocks_per_seq for bdx,num_prompt_blocks in enumerate(range(1, num_gpu_blocks - num_output_blocks)): - num_encoder_blocks_per_seq = num_prompt_blocks + num_cross_blocks_per_seq = num_prompt_blocks seq_group = create_seq_group_encoder_decoder( seq_prompt_len=block_size * num_prompt_blocks, @@ -89,7 +89,7 @@ def test_can_allocate_seq_group_encoder_decoder(block_size: int, num_seqs_per_gr can_allocate_result = block_manager.can_allocate(seq_group) - num_required_blocks = num_prompt_blocks + num_output_blocks + num_encoder_blocks_per_seq + num_required_blocks = num_prompt_blocks + num_output_blocks + num_cross_blocks_per_seq if num_gpu_blocks - num_required_blocks < num_watermark_blocks: assert can_allocate_result == AllocStatus.NEVER diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py index 6b2fa21f2ef46..62b7132e40462 100644 --- a/tests/core/test_block_manager.py +++ b/tests/core/test_block_manager.py @@ -293,8 +293,8 @@ def test_swap_encoder_decoder(): # Swap encoder/decoder seq group from GPU -> CPU. decoder_gpu_blocks = block_manager.get_block_table(decoder_prompt) - encoder_gpu_blocks = block_manager.get_encoder_block_table(seq_group) - gpu_blocks = decoder_gpu_blocks + encoder_gpu_blocks + cross_gpu_blocks = block_manager.get_cross_block_table(seq_group) + gpu_blocks = decoder_gpu_blocks + cross_gpu_blocks assert block_manager.can_swap_out(seq_group) before_cpu_blocks = block_manager.get_num_free_cpu_blocks() before_gpu_blocks = block_manager.get_num_free_gpu_blocks() @@ -309,8 +309,8 @@ def test_swap_encoder_decoder(): # Swap decoder seq group from CPU -> GPU. 
decoder_cpu_blocks = block_manager.get_block_table(decoder_prompt) - encoder_cpu_blocks = block_manager.get_encoder_block_table(seq_group) - cpu_blocks = decoder_cpu_blocks + encoder_cpu_blocks + cross_cpu_blocks = block_manager.get_cross_block_table(seq_group) + cpu_blocks = decoder_cpu_blocks + cross_cpu_blocks assert block_manager.can_swap_in(seq_group) == AllocStatus.OK before_cpu_blocks = block_manager.get_num_free_cpu_blocks() before_gpu_blocks = block_manager.get_num_free_gpu_blocks() @@ -360,11 +360,11 @@ def test_free_encoder_decoder(): # Free allocated seq. decoder_prompt_blocks = len(block_manager.get_block_table(decoder_prompt)) - encoder_prompt_blocks = len(block_manager.get_encoder_block_table(seq_group)) + encoder_prompt_blocks = len(block_manager.get_cross_block_table(seq_group)) prompt_blocks = decoder_prompt_blocks + encoder_prompt_blocks before_blocks = block_manager.get_num_free_gpu_blocks() block_manager.free(decoder_prompt) - block_manager.free_encoder(seq_group) + block_manager.free_cross(seq_group) after_blocks = block_manager.get_num_free_gpu_blocks() assert after_blocks == before_blocks + prompt_blocks diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py index 812d1ee3197a5..11a52b3618b44 100644 --- a/vllm/core/block_manager_v1.py +++ b/vllm/core/block_manager_v1.py @@ -258,7 +258,7 @@ def __init__( # Mapping: req_id -> BlockTable # Note that each SequenceGroup has a unique # request ID - self.encoder_block_tables: Dict[str, BlockTable] = {} + self.cross_block_tables: Dict[str, BlockTable] = {} def get_seq_num_required_blocks(self, seq: Sequence) -> int: if seq is None: @@ -269,9 +269,9 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: # FIXME(woosuk): Here we assume that all sequences in the group share # the same prompt. This may not be true for preempted sequences. - decoder_num_required_blocks = self.get_seq_num_required_blocks(seq_group.get_seqs(status=SequenceStatus.WAITING)[0]) - encoder_num_required_blocks = self.get_seq_num_required_blocks(seq_group.get_encoder_seq()) - num_required_blocks = decoder_num_required_blocks+encoder_num_required_blocks + self_num_required_blocks = self.get_seq_num_required_blocks(seq_group.get_seqs(status=SequenceStatus.WAITING)[0]) + cross_num_required_blocks = self.get_seq_num_required_blocks(seq_group.get_encoder_seq()) + num_required_blocks = self_num_required_blocks+cross_num_required_blocks if self.block_sliding_window is not None: num_required_blocks = min(num_required_blocks, @@ -287,7 +287,7 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: else: return AllocStatus.LATER - def allocate_decoder(self, seq_group: SequenceGroup) -> None: + def allocate_self_block_tables(self, seq_group: SequenceGroup) -> None: # NOTE: Here we assume that all sequences in the group have the same # decoder prompt. seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0] @@ -316,7 +316,7 @@ def allocate_decoder(self, seq_group: SequenceGroup) -> None: for seq in seq_group.get_seqs(status=SequenceStatus.WAITING): self.block_tables[seq.seq_id] = block_table.copy() - def allocate_encoder(self, seq_group: SequenceGroup) -> None: + def allocate_cross_block_table(self, seq_group: SequenceGroup) -> None: # NOTE: Here we assume that all sequences in the group have the same # encoder prompt. 
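# [Editor's note -- the hunks below complete for v1 the rename begun in
# PATCH 05 for v2: encoder_block_tables -> cross_block_tables,
# free_encoder -> free_cross, get_encoder_block_table ->
# get_cross_block_table, and access_all_encoder_blocks_in_seq_group ->
# access_all_cross_blocks_in_seq_group.]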
@@ -342,12 +342,12 @@ def allocate_encoder(self, seq_group: SequenceGroup) -> None: block.ref_count = seq_group.num_seqs() block_table.append(block) - # Assign the encoder block table for the SequenceGroup. - self.encoder_block_tables[seq_group.request_id] = block_table + # Assign the cross-attention block table for the SequenceGroup. + self.cross_block_tables[seq_group.request_id] = block_table def allocate(self, seq_group: SequenceGroup) -> None: - self.allocate_decoder(seq_group) - self.allocate_encoder(seq_group) + self.allocate_self_block_tables(seq_group) + self.allocate_cross_block_table(seq_group) def can_append_slots(self, seq_group: SequenceGroup, @@ -495,9 +495,9 @@ def _get_physical_blocks( if seq.is_finished(): continue blocks.update(self.block_tables[seq.seq_id]) - # Encoder blocks + # Cross-attention blocks if seq_group.encoder_seq is not None: - blocks.update(self.encoder_block_tables[request_id]) + blocks.update(self.cross_block_tables[request_id]) return list(blocks) def can_swap_in(self, @@ -551,7 +551,7 @@ def swap_in(self, if seq_group.encoder_seq is not None: new_block_table: BlockTable = [] - block_table = self.encoder_block_tables[request_id] + block_table = self.cross_block_tables[request_id] for cpu_block in block_table: if cpu_block in mapping: @@ -564,7 +564,7 @@ def swap_in(self, new_block_table.append(gpu_block) # Free the CPU block swapped in to GPU. self.cpu_allocator.free(cpu_block) - self.encoder_block_tables[request_id] = new_block_table + self.cross_block_tables[request_id] = new_block_table block_number_mapping = { cpu_block.block_number: gpu_block.block_number @@ -602,7 +602,7 @@ def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: if seq_group.encoder_seq is not None: new_block_table: BlockTable = [] - block_table = self.encoder_block_tables[request_id] + block_table = self.cross_block_tables[request_id] for gpu_block in block_table: if gpu_block in mapping: @@ -615,7 +615,7 @@ def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: new_block_table.append(cpu_block) # Free the GPU block swapped out to CPU. self.gpu_allocator.free(gpu_block) - self.encoder_block_tables[request_id] = new_block_table + self.cross_block_tables[request_id] = new_block_table block_number_mapping = { gpu_block.block_number: cpu_block.block_number @@ -647,30 +647,30 @@ def free(self, seq: Sequence) -> None: self._free_block_table(block_table) del self.block_tables[seq.seq_id] - def free_encoder(self, seq_group: SequenceGroup) -> None: - if seq_group.request_id not in self.encoder_block_tables: + def free_cross(self, seq_group: SequenceGroup) -> None: + if seq_group.request_id not in self.cross_block_tables: # Already freed or hasn't ben scheduled yet. 
return - block_table = self.encoder_block_tables[seq_group.request_id] + block_table = self.cross_block_tables[seq_group.request_id] self._free_block_table(block_table) - del self.encoder_block_tables[seq_group.request_id] + del self.cross_block_tables[seq_group.request_id] def reset(self) -> None: # Free decoder block tables for block_table in self.block_tables.values(): self._free_block_table(block_table) self.block_tables.clear() - # Free encoder block tables - for block_table in self.encoder_block_tables.values(): + # Free cross-attention block tables + for block_table in self.cross_block_tables.values(): self._free_block_table(block_table) - self.encoder_block_tables.clear() + self.cross_block_tables.clear() def get_block_table(self, seq: Sequence) -> List[int]: block_table = self.block_tables[seq.seq_id] return [block.block_number for block in block_table] - def get_encoder_block_table(self, seq_group: SequenceGroup) -> List[int]: - block_table = self.encoder_block_tables[seq_group.request_id] + def get_cross_block_table(self, seq_group: SequenceGroup) -> List[int]: + block_table = self.cross_block_tables[seq_group.request_id] return [block.block_number for block in block_table] def get_num_free_gpu_blocks(self) -> int: @@ -691,7 +691,7 @@ def access_all_blocks_in_seq( for block in block_table: block.last_accessed = access_time - def access_all_encoder_blocks_in_seq_group( + def access_all_cross_blocks_in_seq_group( self, seq_group: SequenceGroup, access_time: float, @@ -699,7 +699,7 @@ def access_all_encoder_blocks_in_seq_group( if self.enable_caching: # Update the last accessed time of all the blocks accessed # in this step. - block_table = self.encoder_block_tables[seq_group.request_id] + block_table = self.cross_block_tables[seq_group.request_id] for block in block_table: block.last_accessed = access_time From 2dcd663d40bdcc1cf2aca19b9cec64395ac6d528 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 15 May 2024 15:45:12 -0400 Subject: [PATCH 07/47] formatting --- tests/core/block/test_block_manager_v2.py | 16 +++++--- tests/core/test_block_manager.py | 43 +++++++++++++++------- tests/core/utils.py | 45 ++++++++++++----------- vllm/core/block_manager_v1.py | 18 +++++---- vllm/core/block_manager_v2.py | 8 ++-- vllm/sequence.py | 9 +++-- 6 files changed, 85 insertions(+), 54 deletions(-) diff --git a/tests/core/block/test_block_manager_v2.py b/tests/core/block/test_block_manager_v2.py index 9b1c6cd68a15a..06c3389cfa0f0 100644 --- a/tests/core/block/test_block_manager_v2.py +++ b/tests/core/block/test_block_manager_v2.py @@ -56,8 +56,10 @@ def test_can_allocate_seq_group(block_size: int, num_seqs_per_group: int, @pytest.mark.parametrize("num_gpu_blocks", [16, 80, 160]) @pytest.mark.parametrize("num_seqs_per_group", [1, 4]) @pytest.mark.parametrize("watermark", [0.0, 0.5]) -def test_can_allocate_seq_group_encoder_decoder(block_size: int, num_seqs_per_group: int, - num_gpu_blocks: int, watermark: float): +def test_can_allocate_seq_group_encoder_decoder(block_size: int, + num_seqs_per_group: int, + num_gpu_blocks: int, + watermark: float): block_manager = BlockSpaceManagerV2( block_size=block_size, num_gpu_blocks=num_gpu_blocks, @@ -73,7 +75,8 @@ def test_can_allocate_seq_group_encoder_decoder(block_size: int, num_seqs_per_gr # different output lens. 
num_output_blocks = num_output_blocks_per_seq - for bdx,num_prompt_blocks in enumerate(range(1, num_gpu_blocks - num_output_blocks)): + for bdx, num_prompt_blocks in enumerate( + range(1, num_gpu_blocks - num_output_blocks)): num_cross_blocks_per_seq = num_prompt_blocks seq_group = create_seq_group_encoder_decoder( @@ -82,14 +85,15 @@ def test_can_allocate_seq_group_encoder_decoder(block_size: int, num_seqs_per_gr block_size * num_output_blocks_per_seq for _ in range(num_seqs_per_group) ], - request_id=str(bdx) - ) + request_id=str(bdx)) assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks can_allocate_result = block_manager.can_allocate(seq_group) - num_required_blocks = num_prompt_blocks + num_output_blocks + num_cross_blocks_per_seq + num_required_blocks = num_prompt_blocks + \ + num_output_blocks + \ + num_cross_blocks_per_seq if num_gpu_blocks - num_required_blocks < num_watermark_blocks: assert can_allocate_result == AllocStatus.NEVER diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py index 62b7132e40462..d6ab246699903 100644 --- a/tests/core/test_block_manager.py +++ b/tests/core/test_block_manager.py @@ -89,6 +89,7 @@ def test_allocate(): block_manager.allocate(seq_group) assert block_manager.can_allocate(seq_group) != AllocStatus.OK + def test_allocate_encoder_decoder(): block_size = 4 num_cpu_blocks = 4 @@ -100,8 +101,9 @@ def test_allocate_encoder_decoder(): watermark=0) # Allocate same sequence group to all available gpu blocks. - for i in range(num_gpu_blocks//block_req_per_seq_group): - _, _, seq_group = create_dummy_prompt_encoder_decoder(str(i), block_size, block_size) + for i in range(num_gpu_blocks // block_req_per_seq_group): + _, _, seq_group = create_dummy_prompt_encoder_decoder( + str(i), block_size, block_size) assert block_manager.can_allocate(seq_group) block_manager.allocate(seq_group) assert block_manager.can_allocate(seq_group) != AllocStatus.OK @@ -112,12 +114,14 @@ def test_allocate_encoder_decoder(): num_cpu_blocks, num_gpu_blocks, watermark=1 / num_gpu_blocks) - for i in range((num_gpu_blocks - 1)//block_req_per_seq_group): - _, _, seq_group = create_dummy_prompt_encoder_decoder(str(i), block_size//2, block_size//2) + for i in range((num_gpu_blocks - 1) // block_req_per_seq_group): + _, _, seq_group = create_dummy_prompt_encoder_decoder( + str(i), block_size // 2, block_size // 2) assert block_manager.can_allocate(seq_group) block_manager.allocate(seq_group) assert block_manager.can_allocate(seq_group) != AllocStatus.OK + def test_append_slot_single_seq(): block_size = 4 num_cpu_blocks = 4 @@ -268,6 +272,7 @@ def test_swap(): assert before_cpu_blocks + len(cpu_blocks) == after_cpu_blocks assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks) + def test_swap_encoder_decoder(): block_size = 4 num_cpu_blocks = 4 @@ -277,9 +282,11 @@ def test_swap_encoder_decoder(): num_gpu_blocks, watermark=0) - decoder_prompt, encoder_prompt, seq_group = create_dummy_prompt_encoder_decoder("1", - decoder_prompt_length=block_size, - encoder_prompt_length=block_size) + decoder_prompt, encoder_prompt, seq_group = \ + create_dummy_prompt_encoder_decoder( + "1", + decoder_prompt_length=block_size, + encoder_prompt_length=block_size) decoder_prompt.status = SequenceStatus.WAITING encoder_prompt.status = SequenceStatus.WAITING block_manager.allocate(seq_group) @@ -321,6 +328,7 @@ def test_swap_encoder_decoder(): assert before_cpu_blocks + len(cpu_blocks) == after_cpu_blocks assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks) 
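     # [Editor's note -- annotation, not part of the patch: the swap asserts
     # encode a conservation law: blocks that leave one device's free list
     # reappear on the other's, e.g. after swapping len(cpu_blocks) blocks
     # back in, before_cpu_blocks + len(cpu_blocks) == after_cpu_blocks and
     # before_gpu_blocks == after_gpu_blocks + len(cpu_blocks).]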
+ def test_free(): block_size = 4 num_cpu_blocks = 4 @@ -344,6 +352,7 @@ def test_free(): with pytest.raises(KeyError): block_manager.get_block_table(prompt) + def test_free_encoder_decoder(): block_size = 4 num_cpu_blocks = 4 @@ -353,9 +362,11 @@ def test_free_encoder_decoder(): num_gpu_blocks, watermark=0) - decoder_prompt, encoder_prompt, seq_group = create_dummy_prompt_encoder_decoder("1", - decoder_prompt_length=block_size//2, - encoder_prompt_length=block_size//2) + decoder_prompt, encoder_prompt, seq_group = \ + create_dummy_prompt_encoder_decoder( + "1", + decoder_prompt_length=block_size // 2, + encoder_prompt_length=block_size // 2) block_manager.allocate(seq_group) # Free allocated seq. @@ -373,6 +384,7 @@ def test_free_encoder_decoder(): block_manager.get_block_table(decoder_prompt) block_manager.get_block_table(encoder_prompt) + def test_reset(): block_size = 4 num_cpu_blocks = 4 @@ -393,6 +405,7 @@ def test_reset(): block_manager.reset() assert block_manager.get_num_free_gpu_blocks() == original_blocks + def test_reset_encoder_decoder(): block_size = 4 num_cpu_blocks = 4 @@ -405,10 +418,11 @@ def test_reset_encoder_decoder(): # Allocate same seq group on all available gpu blocks. original_blocks = block_manager.get_num_free_gpu_blocks() - for i in range(num_gpu_blocks//block_req_per_seq_group): - _, _, seq_group = create_dummy_prompt_encoder_decoder(f"{i}", - decoder_prompt_length=block_size, - encoder_prompt_length=block_size) + for i in range(num_gpu_blocks // block_req_per_seq_group): + _, _, seq_group = create_dummy_prompt_encoder_decoder( + f"{i}", + decoder_prompt_length=block_size, + encoder_prompt_length=block_size) block_manager.allocate(seq_group) assert block_manager.get_num_free_gpu_blocks() == 0 @@ -416,6 +430,7 @@ def test_reset_encoder_decoder(): block_manager.reset() assert block_manager.get_num_free_gpu_blocks() == original_blocks + def test_sliding_window_multi_seq(): """ Tests that memory allocation and deallocation is handled diff --git a/tests/core/utils.py b/tests/core/utils.py index 91930457bd25b..376af0f0eac4f 100644 --- a/tests/core/utils.py +++ b/tests/core/utils.py @@ -32,6 +32,7 @@ def create_dummy_prompt( return prompt, seq_group + def create_dummy_prompt_encoder_decoder( request_id: str, decoder_prompt_length: int, @@ -48,20 +49,24 @@ def create_dummy_prompt_encoder_decoder( # and prompt "0 ... block_size". 
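     # [Editor's note -- annotation, not part of the patch: e.g. with length
     # 4 the decoder prompt is tokens [0, 1, 2, 3] ("0 1 2 3") while the
     # encoder prompt is the reversed range [3, 2, 1, 0] ("3 2 1 0"), making
     # the two dummy prompts easy to tell apart.]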
decoder_prompt_tokens = list(range(decoder_prompt_length)) decoder_prompt_str = " ".join([str(t) for t in decoder_prompt_tokens]) - decoder_prompt = Sequence(int(request_id), decoder_prompt_str, decoder_prompt_tokens, block_size) + decoder_prompt = Sequence(int(request_id), decoder_prompt_str, + decoder_prompt_tokens, block_size) encoder_prompt_tokens = list(reversed(list(range(encoder_prompt_length)))) encoder_prompt_str = " ".join([str(t) for t in encoder_prompt_tokens]) - encoder_prompt = Sequence(int(request_id), encoder_prompt_str, encoder_prompt_tokens, block_size) - seq_group = SequenceGroup( - request_id=request_id, - seqs=[decoder_prompt], - sampling_params=SamplingParams(use_beam_search=use_beam_search, best_of=best_of), - arrival_time=time.time(), - lora_request=lora_request, - encoder_seq=encoder_prompt) + encoder_prompt = Sequence(int(request_id), encoder_prompt_str, + encoder_prompt_tokens, block_size) + seq_group = SequenceGroup(request_id=request_id, + seqs=[decoder_prompt], + sampling_params=SamplingParams( + use_beam_search=use_beam_search, + best_of=best_of), + arrival_time=time.time(), + lora_request=lora_request, + encoder_seq=encoder_prompt) return decoder_prompt, encoder_prompt, seq_group + def create_seq_group( seq_prompt_len: int = 1024, seq_output_lens: Iterable[int] = (128, ), @@ -134,20 +139,18 @@ def create_seq_group_encoder_decoder( # Encoder sequence encoder_seq = Sequence( - seq_id=seq_id_start + len(seq_output_lens), - prompt="", - prompt_token_ids=prompt_token_ids, - block_size=16, - ) - - seq_group = SequenceGroup( - request_id=request_id, - seqs=seqs, - sampling_params=sampling_params, - arrival_time=time.time(), - encoder_seq=encoder_seq + seq_id=seq_id_start + len(seq_output_lens), + prompt="", + prompt_token_ids=prompt_token_ids, + block_size=16, ) + seq_group = SequenceGroup(request_id=request_id, + seqs=seqs, + sampling_params=sampling_params, + arrival_time=time.time(), + encoder_seq=encoder_seq) + return seq_group diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py index 11a52b3618b44..03eba2e80c78d 100644 --- a/vllm/core/block_manager_v1.py +++ b/vllm/core/block_manager_v1.py @@ -263,15 +263,18 @@ def __init__( def get_seq_num_required_blocks(self, seq: Sequence) -> int: if seq is None: return 0 - return len(seq.logical_token_blocks) + return len(seq.logical_token_blocks) def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: # FIXME(woosuk): Here we assume that all sequences in the group share # the same prompt. This may not be true for preempted sequences. 
- self_num_required_blocks = self.get_seq_num_required_blocks(seq_group.get_seqs(status=SequenceStatus.WAITING)[0]) - cross_num_required_blocks = self.get_seq_num_required_blocks(seq_group.get_encoder_seq()) - num_required_blocks = self_num_required_blocks+cross_num_required_blocks + self_num_required_blocks = self.get_seq_num_required_blocks( + seq_group.get_seqs(status=SequenceStatus.WAITING)[0]) + cross_num_required_blocks = self.get_seq_num_required_blocks( + seq_group.get_encoder_seq()) + num_required_blocks = self_num_required_blocks + \ + cross_num_required_blocks if self.block_sliding_window is not None: num_required_blocks = min(num_required_blocks, @@ -328,7 +331,8 @@ def allocate_cross_block_table(self, seq_group: SequenceGroup) -> None: for logical_idx in range(num_prompt_blocks): if (self.block_sliding_window is not None and logical_idx >= self.block_sliding_window): - block = block_table[logical_idx % self.block_sliding_window] + block = block_table[logical_idx % + self.block_sliding_window] # Set the reference counts of the token blocks. block.ref_count = seq_group.num_seqs() elif self.enable_caching: @@ -550,7 +554,7 @@ def swap_in(self, self.block_tables[seq.seq_id] = new_block_table if seq_group.encoder_seq is not None: - new_block_table: BlockTable = [] + new_block_table = [] block_table = self.cross_block_tables[request_id] for cpu_block in block_table: @@ -601,7 +605,7 @@ def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: self.block_tables[seq.seq_id] = new_block_table if seq_group.encoder_seq is not None: - new_block_table: BlockTable = [] + new_block_table = [] block_table = self.cross_block_tables[request_id] for gpu_block in block_table: diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 426612f615508..4ae3361e7b234 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -148,7 +148,9 @@ def allocate_cross_block_table(self, seq_group: SequenceGroup) -> None: request_id = seq_group.request_id seq = seq_group.encoder_seq - assert not (request_id in self.cross_block_tables), "block table already exists" + assert (request_id + not in self.cross_block_tables), \ + "block table already exists" seq = seq_group.get_encoder_seq() if seq is not None: @@ -236,9 +238,9 @@ def get_block_table(self, seq: Sequence) -> List[int]: def get_cross_block_table(self, seq_group: SequenceGroup) -> List[int]: request_id = seq_group.request_id assert request_id in self.cross_block_tables - block_ids = self.block_tables[request_id].physical_block_ids + block_ids = self.cross_block_tables[request_id].physical_block_ids assert all(b is not None for b in block_ids) - return block_ids + return block_ids # type: ignore def access_all_blocks_in_seq(self, seq: Sequence, now: float): # Update the last accessed time of all the blocks accessed diff --git a/vllm/sequence.py b/vllm/sequence.py index a73e70c1ae69d..a11c411876ea8 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -528,7 +528,7 @@ def get_seqs( ] def get_encoder_seq(self) -> Sequence: - return self.encoder_seq + return self.encoder_seq # type: ignore def get_unfinished_seqs(self) -> List[Sequence]: return [ @@ -613,8 +613,11 @@ class SequenceGroupMetadata: used in prefix caching. state: Internal state tied to this sequence group. multi_modal_data: Multi modal data. - encoder_seq_data: Optional, the sequence data for the single encoder prompt. - cross_block_table: Optional, the cross-attention block table associated with the single encoder prompt. 
+        encoder_seq_data: Optional, the sequence data
+                          for the single encoder prompt.
+        cross_block_table: Optional, the cross-attention
+                           block table associated with
+                           the single encoder prompt.
     """

     def __init__(

From 2ced012a3e51a77abbbab2268d88730fdffa4a3f Mon Sep 17 00:00:00 2001
From: Andrew Feldman
Date: Tue, 21 May 2024 06:19:19 -0400
Subject: [PATCH 08/47] fix wording nits (ben->been, decoder->encoder/decoder)

---
 tests/core/test_block_manager.py | 2 +-
 vllm/core/block_manager_v2.py    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py
index d6ab246699903..81e3444815d4e 100644
--- a/tests/core/test_block_manager.py
+++ b/tests/core/test_block_manager.py
@@ -314,7 +314,7 @@ def test_swap_encoder_decoder():
     assert before_gpu_blocks + len(gpu_blocks) == after_gpu_blocks
     decoder_prompt.status = SequenceStatus.SWAPPED

-    # Swap decoder seq group from CPU -> GPU.
+    # Swap encoder/decoder seq group from CPU -> GPU.
     decoder_cpu_blocks = block_manager.get_block_table(decoder_prompt)
     cross_cpu_blocks = block_manager.get_cross_block_table(seq_group)
     cpu_blocks = decoder_cpu_blocks + cross_cpu_blocks
diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py
index 4ae3361e7b234..978acd915b69b 100644
--- a/vllm/core/block_manager_v2.py
+++ b/vllm/core/block_manager_v2.py
@@ -222,7 +222,7 @@ def free(self, seq: Sequence) -> None:
     def free_cross(self, seq_group: SequenceGroup) -> None:
         request_id = seq_group.request_id
         if request_id not in self.cross_block_tables:
-            # Already freed or hasn't ben scheduled yet.
+            # Already freed or hasn't been scheduled yet.
             return
         self.cross_block_tables[request_id].free()
         del self.cross_block_tables[request_id]

From 8286b4cfbe57001767617a9ee33066945f6baa3d Mon Sep 17 00:00:00 2001
From: Andrew Feldman
Date: Wed, 22 May 2024 13:46:58 -0400
Subject: [PATCH 09/47] changed two block manager tests to construct fake
 prompts that are equal in length to the block size, rather than half the
 block size (which had been the case)

---
 tests/core/test_block_manager.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py
index 81e3444815d4e..9dc1c88819b70 100644
--- a/tests/core/test_block_manager.py
+++ b/tests/core/test_block_manager.py
@@ -116,7 +116,7 @@ def test_allocate_encoder_decoder():
                                         watermark=1 / num_gpu_blocks)
     for i in range((num_gpu_blocks - 1) // block_req_per_seq_group):
         _, _, seq_group = create_dummy_prompt_encoder_decoder(
-            str(i), block_size // 2, block_size // 2)
+            str(i), block_size, block_size)
         assert block_manager.can_allocate(seq_group)
         block_manager.allocate(seq_group)
     assert block_manager.can_allocate(seq_group) != AllocStatus.OK
@@ -365,8 +365,8 @@ def test_free_encoder_decoder():
     decoder_prompt, encoder_prompt, seq_group = \
         create_dummy_prompt_encoder_decoder(
             "1",
-            decoder_prompt_length=block_size // 2,
-            encoder_prompt_length=block_size // 2)
+            decoder_prompt_length=block_size,
+            encoder_prompt_length=block_size)
     block_manager.allocate(seq_group)

     # Free allocated seq.
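A quick sanity check of the block arithmetic these tests rely on (a
minimal sketch, assuming block_size = 4 as in the tests above):

    import math

    block_size = 4
    # One full-block decoder prompt plus one full-block encoder prompt:
    blocks_per_group = (math.ceil(block_size / block_size) +
                        math.ceil(block_size / block_size))
    assert blocks_per_group == 2  # matches block_req_per_seq_group

With full-block prompts the per-group block demand is exact, which is
what lets the tests divide num_gpu_blocks by block_req_per_seq_group.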
From eba551cd7e1d53911cb392d773eec05cfe40cc4f Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 22 May 2024 13:50:04 -0400 Subject: [PATCH 10/47] keyword args for dummy prompt construction in block manager encoder/decoder tests --- tests/core/test_block_manager.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py index 9dc1c88819b70..19dfc09dbb001 100644 --- a/tests/core/test_block_manager.py +++ b/tests/core/test_block_manager.py @@ -103,7 +103,9 @@ def test_allocate_encoder_decoder(): # Allocate same sequence group to all available gpu blocks. for i in range(num_gpu_blocks // block_req_per_seq_group): _, _, seq_group = create_dummy_prompt_encoder_decoder( - str(i), block_size, block_size) + str(i), + decoder_prompt_length=block_size, + decoder_prompt_length=block_size) assert block_manager.can_allocate(seq_group) block_manager.allocate(seq_group) assert block_manager.can_allocate(seq_group) != AllocStatus.OK @@ -116,7 +118,9 @@ def test_allocate_encoder_decoder(): watermark=1 / num_gpu_blocks) for i in range((num_gpu_blocks - 1) // block_req_per_seq_group): _, _, seq_group = create_dummy_prompt_encoder_decoder( - str(i), block_size, block_size) + str(i), + decoder_prompt_length=block_size, + decoder_prompt_length=block_size) assert block_manager.can_allocate(seq_group) block_manager.allocate(seq_group) assert block_manager.can_allocate(seq_group) != AllocStatus.OK From a7c8b192cd7c6e6c815caf5acbbd4ed24b16925d Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 22 May 2024 14:00:05 -0400 Subject: [PATCH 11/47] bugfix - decoder prompt kwarg repeated in lieu of encoder prompt kwarg --- tests/core/test_block_manager.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py index 19dfc09dbb001..29956ff028143 100644 --- a/tests/core/test_block_manager.py +++ b/tests/core/test_block_manager.py @@ -73,7 +73,7 @@ def test_allocate(): # Allocate same sequence group to all available gpu blocks. 
for i in range(num_gpu_blocks):
         _, seq_group = create_dummy_prompt(str(i), block_size)
-        assert block_manager.can_allocate(seq_group)
+        assert block_manager.can_allocate(seq_group) == AllocStatus.OK
         block_manager.allocate(seq_group)
     assert block_manager.can_allocate(seq_group) != AllocStatus.OK

@@ -85,7 +85,7 @@ def test_allocate():
                                         watermark=1 / num_gpu_blocks)
     for i in range(num_gpu_blocks - 1):
         _, seq_group = create_dummy_prompt(str(i), block_size)
-        assert block_manager.can_allocate(seq_group)
+        assert block_manager.can_allocate(seq_group) == AllocStatus.OK
         block_manager.allocate(seq_group)
     assert block_manager.can_allocate(seq_group) != AllocStatus.OK

@@ -105,8 +105,8 @@ def test_allocate_encoder_decoder():
         _, _, seq_group = create_dummy_prompt_encoder_decoder(
             str(i),
             decoder_prompt_length=block_size,
-            decoder_prompt_length=block_size)
-        assert block_manager.can_allocate(seq_group)
+            encoder_prompt_length=block_size)
+        assert block_manager.can_allocate(seq_group) == AllocStatus.OK
         block_manager.allocate(seq_group)
     assert block_manager.can_allocate(seq_group) != AllocStatus.OK

@@ -120,8 +120,8 @@ def test_allocate_encoder_decoder():
         _, _, seq_group = create_dummy_prompt_encoder_decoder(
             str(i),
             decoder_prompt_length=block_size,
-            decoder_prompt_length=block_size)
-        assert block_manager.can_allocate(seq_group)
+            encoder_prompt_length=block_size)
+        assert block_manager.can_allocate(seq_group) == AllocStatus.OK
         block_manager.allocate(seq_group)
     assert block_manager.can_allocate(seq_group) != AllocStatus.OK

From 9feb994966e365fac63bbec526cafb24cf00dcde Mon Sep 17 00:00:00 2001
From: Andrew Feldman
Date: Wed, 22 May 2024 14:09:42 -0400
Subject: [PATCH 12/47] In the block manager test which used a with block to
 detect an error, created a second with block for the encoder-related call,
 which previously shared a with block with the corresponding
 decoder-related call

---
 tests/core/test_block_manager.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py
index 29956ff028143..808b0a5e651eb 100644
--- a/tests/core/test_block_manager.py
+++ b/tests/core/test_block_manager.py
@@ -386,6 +386,9 @@ def test_free_encoder_decoder():
     # Block table for freed encoder & decoder seq's are deleted.
     with pytest.raises(KeyError):
         block_manager.get_block_table(decoder_prompt)
+
+    # Block table for the freed encoder seq is deleted.
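+    # (Each lookup needs its own with block: a single shared block
+    # would stop at the first KeyError and leave the second lookup
+    # unchecked.)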
+ with pytest.raises(KeyError): block_manager.get_block_table(encoder_prompt) From 5eb0032bfaaf5bc43fab66f1fc8bea30045915b7 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 22 May 2024 14:17:50 -0400 Subject: [PATCH 13/47] refactoring block manager v1/v2 swap in/swap out functions --- vllm/core/block_manager_v1.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py index 03eba2e80c78d..119e444df1b11 100644 --- a/vllm/core/block_manager_v1.py +++ b/vllm/core/block_manager_v1.py @@ -570,12 +570,7 @@ def swap_in(self, self.cpu_allocator.free(cpu_block) self.cross_block_tables[request_id] = new_block_table - block_number_mapping = { - cpu_block.block_number: gpu_block.block_number - for cpu_block, gpu_block in mapping.items() - } - # convert to list of tuples once here - return list(block_number_mapping.items()) + return [(cpu_block.block_number, gpu_block.block_number) for cpu_block, gpu_block in mapping.items()] def can_swap_out(self, seq_group: SequenceGroup) -> bool: blocks = self._get_physical_blocks(seq_group) @@ -621,12 +616,7 @@ def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: self.gpu_allocator.free(gpu_block) self.cross_block_tables[request_id] = new_block_table - block_number_mapping = { - gpu_block.block_number: cpu_block.block_number - for gpu_block, cpu_block in mapping.items() - } - # convert to list of tuples once here - return list(block_number_mapping.items()) + return [(cpu_block.block_number, gpu_block.block_number) for cpu_block, gpu_block in mapping.items()] def _free_block_table(self, block_table: BlockTable) -> None: # when using a sliding window, each seq will only use up From 0644cde2aced6d7fb6c279025b2a4a3d8f5625d2 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 22 May 2024 14:23:50 -0400 Subject: [PATCH 14/47] formatting; changed blocktable type specifier from Dict to List[int] --- tests/core/test_block_manager.py | 6 +++--- vllm/core/block_manager_v1.py | 6 ++++-- vllm/sequence.py | 6 +++--- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py index 808b0a5e651eb..cdaf2f22115e8 100644 --- a/tests/core/test_block_manager.py +++ b/tests/core/test_block_manager.py @@ -103,7 +103,7 @@ def test_allocate_encoder_decoder(): # Allocate same sequence group to all available gpu blocks. 
for i in range(num_gpu_blocks // block_req_per_seq_group): _, _, seq_group = create_dummy_prompt_encoder_decoder( - str(i), + str(i), decoder_prompt_length=block_size, encoder_prompt_length=block_size) assert block_manager.can_allocate(seq_group) == AllocStatus.OK @@ -118,8 +118,8 @@ def test_allocate_encoder_decoder(): watermark=1 / num_gpu_blocks) for i in range((num_gpu_blocks - 1) // block_req_per_seq_group): _, _, seq_group = create_dummy_prompt_encoder_decoder( - str(i), - decoder_prompt_length=block_size, + str(i), + decoder_prompt_length=block_size, encoder_prompt_length=block_size) assert block_manager.can_allocate(seq_group) == AllocStatus.OK block_manager.allocate(seq_group) diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py index 119e444df1b11..2482cf17956f2 100644 --- a/vllm/core/block_manager_v1.py +++ b/vllm/core/block_manager_v1.py @@ -570,7 +570,8 @@ def swap_in(self, self.cpu_allocator.free(cpu_block) self.cross_block_tables[request_id] = new_block_table - return [(cpu_block.block_number, gpu_block.block_number) for cpu_block, gpu_block in mapping.items()] + return [(cpu_block.block_number, gpu_block.block_number) + for cpu_block, gpu_block in mapping.items()] def can_swap_out(self, seq_group: SequenceGroup) -> bool: blocks = self._get_physical_blocks(seq_group) @@ -616,7 +617,8 @@ def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: self.gpu_allocator.free(gpu_block) self.cross_block_tables[request_id] = new_block_table - return [(cpu_block.block_number, gpu_block.block_number) for cpu_block, gpu_block in mapping.items()] + return [(cpu_block.block_number, gpu_block.block_number) + for cpu_block, gpu_block in mapping.items()] def _free_block_table(self, block_table: BlockTable) -> None: # when using a sliding window, each seq will only use up diff --git a/vllm/sequence.py b/vllm/sequence.py index a11c411876ea8..6b07a00f09c6f 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -527,8 +527,8 @@ def get_seqs( seq for seq in self.seqs_dict.values() if seq.status == status ] - def get_encoder_seq(self) -> Sequence: - return self.encoder_seq # type: ignore + def get_encoder_seq(self) -> Optional[Sequence]: + return self.encoder_seq def get_unfinished_seqs(self) -> List[Sequence]: return [ @@ -635,7 +635,7 @@ def __init__( state: Optional[SequenceGroupState] = None, multi_modal_data: Optional[MultiModalData] = None, encoder_seq_data: Optional[SequenceData] = None, - cross_block_table: Optional[Dict[int, List[int]]] = None, + cross_block_table: Optional[List[int]] = None, ) -> None: self.request_id = request_id self.is_prompt = is_prompt From 19ed7413e315ce665cc07722d72fb874a362fafd Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 22 May 2024 14:39:50 -0400 Subject: [PATCH 15/47] prefixed internal method with _ --- vllm/core/block_manager_v1.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py index 2482cf17956f2..648ff843fd4e5 100644 --- a/vllm/core/block_manager_v1.py +++ b/vllm/core/block_manager_v1.py @@ -260,7 +260,7 @@ def __init__( # request ID self.cross_block_tables: Dict[str, BlockTable] = {} - def get_seq_num_required_blocks(self, seq: Sequence) -> int: + def _get_seq_num_required_blocks(self, seq: Sequence) -> int: if seq is None: return 0 return len(seq.logical_token_blocks) @@ -269,9 +269,9 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: # FIXME(woosuk): Here we assume that all sequences in the group share # 
the same prompt. This may not be true for preempted sequences. - self_num_required_blocks = self.get_seq_num_required_blocks( + self_num_required_blocks = self._get_seq_num_required_blocks( seq_group.get_seqs(status=SequenceStatus.WAITING)[0]) - cross_num_required_blocks = self.get_seq_num_required_blocks( + cross_num_required_blocks = self._get_seq_num_required_blocks( seq_group.get_encoder_seq()) num_required_blocks = self_num_required_blocks + \ cross_num_required_blocks From a5579729928c4151e501138f82340c0afa2dc327 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 22 May 2024 17:47:19 -0400 Subject: [PATCH 16/47] refactored self-/cross-attention allocation functions into a single helper function --- vllm/core/block_manager_v1.py | 57 ++++++++++++----------------------- 1 file changed, 19 insertions(+), 38 deletions(-) diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py index 648ff843fd4e5..9f08d4a7939aa 100644 --- a/vllm/core/block_manager_v1.py +++ b/vllm/core/block_manager_v1.py @@ -290,11 +290,7 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: else: return AllocStatus.LATER - def allocate_self_block_tables(self, seq_group: SequenceGroup) -> None: - # NOTE: Here we assume that all sequences in the group have the same - # decoder prompt. - seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0] - + def _allocate_sequence(self, seq: Sequence, ref_count: int) -> BlockTable: # Allocate new physical token blocks that will store the prompt tokens. num_prompt_blocks = len(seq.logical_token_blocks) @@ -304,7 +300,7 @@ def allocate_self_block_tables(self, seq_group: SequenceGroup) -> None: and logical_idx >= self.block_sliding_window): block = block_table[logical_idx % self.block_sliding_window] # Set the reference counts of the token blocks. - block.ref_count = seq_group.num_seqs() + block.ref_count = ref_count #seq_group.num_seqs() elif self.enable_caching: block = self.gpu_allocator.allocate( seq.hash_of_block(logical_idx), @@ -312,47 +308,32 @@ def allocate_self_block_tables(self, seq_group: SequenceGroup) -> None: else: block = self.gpu_allocator.allocate() # Set the reference counts of the token blocks. - block.ref_count = seq_group.num_seqs() + block.ref_count = ref_count #seq_group.num_seqs() block_table.append(block) - # Assign the decoder block table for each sequence. - for seq in seq_group.get_seqs(status=SequenceStatus.WAITING): - self.block_tables[seq.seq_id] = block_table.copy() + return block_table - def allocate_cross_block_table(self, seq_group: SequenceGroup) -> None: + def allocate(self, seq_group: SequenceGroup) -> None: + # Allocate decoder sequences + # # NOTE: Here we assume that all sequences in the group have the same - # encoder prompt. + # decoder prompt. + seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0] + block_table: BlockTable = self._allocate_sequence(seq, seq_group.num_seqs()) - # Allocate new physical token blocks that will store the prompt tokens. - seq = seq_group.get_encoder_seq() - if seq is not None: - block_table: BlockTable = [] - num_prompt_blocks = len(seq.logical_token_blocks) - for logical_idx in range(num_prompt_blocks): - if (self.block_sliding_window is not None - and logical_idx >= self.block_sliding_window): - block = block_table[logical_idx % - self.block_sliding_window] - # Set the reference counts of the token blocks. 
- block.ref_count = seq_group.num_seqs() - elif self.enable_caching: - block = self.gpu_allocator.allocate( - seq.hash_of_block(logical_idx), - seq.num_hashed_tokens_of_block(logical_idx)) - else: - block = self.gpu_allocator.allocate() - # Set the reference counts of the token blocks. - # TODO: feature not supported with encoder/decoder - block.ref_count = seq_group.num_seqs() - block_table.append(block) + # Assign the self-attention block tables for each sequence. + for seq in seq_group.get_seqs(status=SequenceStatus.WAITING): + self.block_tables[seq.seq_id] = block_table.copy() + # Allocate encoder sequence + encoder_seq = seq_group.get_encoder_seq() + if encoder_seq is not None: + # A SequenceGroup has only a single encoder sequence (at most), + # thus allocate with a ref count of 1 + block_table: BlockTable = self._allocate_sequence(encoder_seq, 1) # Assign the cross-attention block table for the SequenceGroup. self.cross_block_tables[seq_group.request_id] = block_table - def allocate(self, seq_group: SequenceGroup) -> None: - self.allocate_self_block_tables(seq_group) - self.allocate_cross_block_table(seq_group) - def can_append_slots(self, seq_group: SequenceGroup, num_lookahead_slots: int = 0) -> bool: From e48bebf727ae67ffbdff206d168eab3e77b988da Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Wed, 22 May 2024 17:59:44 -0400 Subject: [PATCH 17/47] Refactored block manager v2 self-/cross-block-table alloc functions together --- vllm/core/block_manager_v2.py | 38 ++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 978acd915b69b..a8085f54ac79d 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -121,7 +121,18 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: else: return AllocStatus.LATER - def allocate_self_block_tables(self, seq_group: SequenceGroup) -> None: + def _allocate_sequence(self, seq: Sequence) -> BlockTable: + block_table = BlockTable( + block_size=self.block_size, + block_allocator=self.block_allocator, + ) + assert self.block_sliding_window is None + block_table.allocate(seq.get_token_ids()) + + return block_table + + def allocate(self, seq_group: SequenceGroup) -> None: + # Allocate self-attention block tables for decoder sequences waiting_seqs = seq_group.get_seqs(status=SequenceStatus.WAITING) assert not (set(seq.seq_id for seq in waiting_seqs) & self.block_tables.keys()), "block table already exists" @@ -129,43 +140,34 @@ def allocate_self_block_tables(self, seq_group: SequenceGroup) -> None: # NOTE: Here we assume that all sequences in the group have the same # prompt. seq = waiting_seqs[0] - - block_table = BlockTable( - block_size=self.block_size, - block_allocator=self.block_allocator, - ) - assert self.block_sliding_window is None - block_table.allocate(seq.get_token_ids()) + block_table: BlockTable = self._allocate_sequence(seq) self.block_tables[seq.seq_id] = block_table # Assign the block table for each sequence. for seq in waiting_seqs[1:]: self.block_tables[seq.seq_id] = block_table.fork() - def allocate_cross_block_table(self, seq_group: SequenceGroup) -> None: + # Allocate cross-attention block table for encoder sequence + # # NOTE: Here we assume that all sequences in the group have the same - # prompt. + # encoder prompt. 
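+        # The cross-attention block table is keyed by request ID rather
+        # than sequence ID: a group has at most one encoder sequence,
+        # shared by all of its decoder sequences, so each group adds a
+        # single entry to self.cross_block_tables however many entries
+        # it has in self.block_tables.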
request_id = seq_group.request_id

-        seq = seq_group.encoder_seq
+        encoder_seq = seq_group.encoder_seq
         assert (request_id
                 not in self.cross_block_tables), \
             "block table already exists"

-        seq = seq_group.get_encoder_seq()
-        if seq is not None:
+        encoder_seq = seq_group.get_encoder_seq()
+        if encoder_seq is not None:
             block_table = BlockTable(
                 block_size=self.block_size,
                 block_allocator=self.block_allocator,
             )
             assert self.block_sliding_window is None
-            block_table.allocate(seq.get_token_ids())
+            block_table.allocate(encoder_seq.get_token_ids())
             self.cross_block_tables[request_id] = block_table

-    def allocate(self, seq_group: SequenceGroup) -> None:
-        self.allocate_self_block_tables(seq_group)
-        self.allocate_cross_block_table(seq_group)
-
     def can_append_slots(self, seq_group: SequenceGroup,
                          num_lookahead_slots: int) -> bool:
         """Determine if there is enough space in the GPU KV cache to continue

From ac2da978c786d998247cfe55a3d2a788109b71e4 Mon Sep 17 00:00:00 2001
From: Andrew Feldman
Date: Wed, 22 May 2024 18:11:58 -0400
Subject: [PATCH 18/47] formatting

---
 vllm/core/block_manager_v1.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py
index 9f08d4a7939aa..fa53b3cd33229 100644
--- a/vllm/core/block_manager_v1.py
+++ b/vllm/core/block_manager_v1.py
@@ -319,7 +319,8 @@ def allocate(self, seq_group: SequenceGroup) -> None:
         # NOTE: Here we assume that all sequences in the group have the same
         # decoder prompt.
         seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0]
-        block_table: BlockTable = self._allocate_sequence(seq, seq_group.num_seqs())
+        block_table: BlockTable = \
+            self._allocate_sequence(seq, seq_group.num_seqs())

         # Assign the self-attention block tables for each sequence.
         for seq in seq_group.get_seqs(status=SequenceStatus.WAITING):
@@ -330,7 +331,7 @@ def allocate(self, seq_group: SequenceGroup) -> None:
         if encoder_seq is not None:
             # A SequenceGroup has only a single encoder sequence (at most),
             # thus allocate with a ref count of 1
-            block_table: BlockTable = self._allocate_sequence(encoder_seq, 1)
+            block_table = self._allocate_sequence(encoder_seq, 1)

             # Assign the cross-attention block table for the SequenceGroup.
             self.cross_block_tables[seq_group.request_id] = block_table

From e985a2f05080a0e311f52adf119447993322541f Mon Sep 17 00:00:00 2001
From: Andrew Feldman
Date: Wed, 22 May 2024 18:40:07 -0400
Subject: [PATCH 19/47] refactored out block manager v1 swap_in/swap_out
 helper functions

---
 vllm/core/block_manager_v1.py | 116 ++++++++++++++++------------------
 1 file changed, 54 insertions(+), 62 deletions(-)

diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py
index fa53b3cd33229..dd6d8d702fae0 100644
--- a/vllm/core/block_manager_v1.py
+++ b/vllm/core/block_manager_v1.py
@@ -300,7 +300,7 @@ def _allocate_sequence(self, seq: Sequence, ref_count: int) -> BlockTable:
                 and logical_idx >= self.block_sliding_window):
                 block = block_table[logical_idx % self.block_sliding_window]
                 # Set the reference counts of the token blocks.
- block.ref_count = ref_count #seq_group.num_seqs() + block.ref_count = ref_count #seq_group.num_seqs() block_table.append(block) return block_table @@ -507,6 +507,26 @@ def can_swap_in(self, else: return AllocStatus.LATER + def _swap_in_block_table( + self, block_table: BlockTable, + mapping: Dict[PhysicalTokenBlock, + PhysicalTokenBlock]) -> BlockTable: + new_block_table = [] + + for cpu_block in block_table: + if cpu_block in mapping: + gpu_block = mapping[cpu_block] + gpu_block.ref_count += 1 + else: + gpu_block = self.gpu_allocator.allocate( + cpu_block.block_hash, cpu_block.num_hashed_tokens) + mapping[cpu_block] = gpu_block + new_block_table.append(gpu_block) + # Free the CPU block swapped in to GPU. + self.cpu_allocator.free(cpu_block) + + return new_block_table + def swap_in(self, seq_group: SequenceGroup, num_lookahead_slots: int = 0) -> List[Tuple[int, int]]: @@ -519,38 +539,14 @@ def swap_in(self, # dict is efficient in lookup `if cpu_block in mapping` mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock] = {} for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): - new_block_table: BlockTable = [] - block_table = self.block_tables[seq.seq_id] - - for cpu_block in block_table: - if cpu_block in mapping: - gpu_block = mapping[cpu_block] - gpu_block.ref_count += 1 - else: - gpu_block = self.gpu_allocator.allocate( - cpu_block.block_hash, cpu_block.num_hashed_tokens) - mapping[cpu_block] = gpu_block - new_block_table.append(gpu_block) - # Free the CPU block swapped in to GPU. - self.cpu_allocator.free(cpu_block) - self.block_tables[seq.seq_id] = new_block_table + self.block_tables[seq.seq_id] = \ + self._swap_in_block_table(self.block_tables[seq.seq_id], + mapping) if seq_group.encoder_seq is not None: - new_block_table = [] - block_table = self.cross_block_tables[request_id] - - for cpu_block in block_table: - if cpu_block in mapping: - gpu_block = mapping[cpu_block] - gpu_block.ref_count += 1 - else: - gpu_block = self.gpu_allocator.allocate( - cpu_block.block_hash, cpu_block.num_hashed_tokens) - mapping[cpu_block] = gpu_block - new_block_table.append(gpu_block) - # Free the CPU block swapped in to GPU. - self.cpu_allocator.free(cpu_block) - self.cross_block_tables[request_id] = new_block_table + self.cross_block_tables[request_id] = \ + self._swap_in_block_table(self.cross_block_tables[request_id], + mapping) return [(cpu_block.block_number, gpu_block.block_number) for cpu_block, gpu_block in mapping.items()] @@ -559,6 +555,26 @@ def can_swap_out(self, seq_group: SequenceGroup) -> bool: blocks = self._get_physical_blocks(seq_group) return len(blocks) <= self.cpu_allocator.get_num_free_blocks() + def _swap_out_block_table( + self, block_table: BlockTable, + mapping: Dict[PhysicalTokenBlock, + PhysicalTokenBlock]) -> BlockTable: + + new_block_table: BlockTable = [] + for gpu_block in block_table: + if gpu_block in mapping: + cpu_block = mapping[gpu_block] + cpu_block.ref_count += 1 + else: + cpu_block = self.cpu_allocator.allocate( + gpu_block.block_hash, gpu_block.num_hashed_tokens) + mapping[gpu_block] = cpu_block + new_block_table.append(cpu_block) + # Free the GPU block swapped out to CPU. 
+            self.gpu_allocator.free(gpu_block)
+
+        return new_block_table
+
     def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
         request_id = seq_group.request_id

         # GPU block -> CPU block.
         # dict is efficient in lookup `if gpu_block in mapping`
         mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock] = {}
         for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
-            new_block_table: BlockTable = []
-            block_table = self.block_tables[seq.seq_id]
-
-            for gpu_block in block_table:
-                if gpu_block in mapping:
-                    cpu_block = mapping[gpu_block]
-                    cpu_block.ref_count += 1
-                else:
-                    cpu_block = self.cpu_allocator.allocate(
-                        gpu_block.block_hash, gpu_block.num_hashed_tokens)
-                    mapping[gpu_block] = cpu_block
-                new_block_table.append(cpu_block)
-                # Free the GPU block swapped out to CPU.
-                self.gpu_allocator.free(gpu_block)
-            self.block_tables[seq.seq_id] = new_block_table
+            self.block_tables[seq.seq_id] = \
+                self._swap_out_block_table(self.block_tables[seq.seq_id],
+                                           mapping)

         if seq_group.encoder_seq is not None:
-            new_block_table = []
-            block_table = self.cross_block_tables[request_id]
-
-            for gpu_block in block_table:
-                if gpu_block in mapping:
-                    cpu_block = mapping[gpu_block]
-                    cpu_block.ref_count += 1
-                else:
-                    cpu_block = self.cpu_allocator.allocate(
-                        gpu_block.block_hash, gpu_block.num_hashed_tokens)
-                    mapping[gpu_block] = cpu_block
-                new_block_table.append(cpu_block)
-                # Free the GPU block swapped out to CPU.
-                self.gpu_allocator.free(gpu_block)
-            self.cross_block_tables[request_id] = new_block_table
+            self.cross_block_tables[request_id] = \
+                self._swap_out_block_table(self.cross_block_tables[request_id],
+                                           mapping)

         return [(cpu_block.block_number, gpu_block.block_number)
                 for cpu_block, gpu_block in mapping.items()]

From 98c5863ef946dbd52221b6b83517e483f48b3848 Mon Sep 17 00:00:00 2001
From: Andrew Feldman
Date: Wed, 22 May 2024 18:53:40 -0400
Subject: [PATCH 20/47] Helper function avoids prefix caching code in
 encoder/decoder scenarios; alloc method asserts no prefix caching + enc/dec;
 refactoring

---
 vllm/core/block_manager_v1.py | 36 +++++++++++++++++------------------
 vllm/core/block_manager_v2.py | 16 ----------------
 2 files changed, 18 insertions(+), 34 deletions(-)

diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py
index dd6d8d702fae0..40274bd29e9b0 100644
--- a/vllm/core/block_manager_v1.py
+++ b/vllm/core/block_manager_v1.py
@@ -290,7 +290,10 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus:
         else:
             return AllocStatus.LATER

-    def _allocate_sequence(self, seq: Sequence, ref_count: int) -> BlockTable:
+    def _allocate_sequence(self, \
+                           seq: Sequence, \
+                           ref_count: int, \
+                           decoder_only: bool = True) -> BlockTable:
         # Allocate new physical token blocks that will store the prompt tokens.
         num_prompt_blocks = len(seq.logical_token_blocks)

@@ -300,27 +303,36 @@ def _allocate_sequence(self, seq: Sequence, ref_count: int) -> BlockTable:
                 and logical_idx >= self.block_sliding_window):
                 block = block_table[logical_idx % self.block_sliding_window]
                 # Set the reference counts of the token blocks.
-                block.ref_count = ref_count #seq_group.num_seqs()
-            elif self.enable_caching:
+                block.ref_count = ref_count
+            elif decoder_only and self.enable_caching:
                 block = self.gpu_allocator.allocate(
                     seq.hash_of_block(logical_idx),
                     seq.num_hashed_tokens_of_block(logical_idx))
             else:
                 block = self.gpu_allocator.allocate()
                 # Set the reference counts of the token blocks.
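                # (For decoder prompt blocks, ref_count is
                # seq_group.num_seqs(), since every waiting sequence
                # shares the prompt; the encoder prompt is unique to the
                # group, so its blocks are allocated with ref_count = 1.)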
- block.ref_count = ref_count #seq_group.num_seqs() + block.ref_count = ref_count block_table.append(block) return block_table def allocate(self, seq_group: SequenceGroup) -> None: + decoder_only = \ + seq_group.get_encoder_seq() is None + + assert decoder_only or (not self.enable_caching), \ + "Automatic prefix caching currently not " + \ + "supported for encoder/decoder models." + # Allocate decoder sequences # # NOTE: Here we assume that all sequences in the group have the same # decoder prompt. seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0] block_table: BlockTable = \ - self._allocate_sequence(seq, seq_group.num_seqs()) + self._allocate_sequence(seq, + seq_group.num_seqs(), + decoder_only) # Assign the self-attention block tables for each sequence. for seq in seq_group.get_seqs(status=SequenceStatus.WAITING): @@ -331,7 +343,7 @@ def allocate(self, seq_group: SequenceGroup) -> None: if encoder_seq is not None: # A SequenceGroup has only a single encoder sequence (at most), # thus allocate with a ref count of 1 - block_table = self._allocate_sequence(encoder_seq, 1) + block_table = self._allocate_sequence(encoder_seq, 1, decoder_only) # Assign the cross-attention block table for the SequenceGroup. self.cross_block_tables[seq_group.request_id] = block_table @@ -661,18 +673,6 @@ def access_all_blocks_in_seq( for block in block_table: block.last_accessed = access_time - def access_all_cross_blocks_in_seq_group( - self, - seq_group: SequenceGroup, - access_time: float, - ) -> None: - if self.enable_caching: - # Update the last accessed time of all the blocks accessed - # in this step. - block_table = self.cross_block_tables[seq_group.request_id] - for block in block_table: - block.last_accessed = access_time - def compute_full_blocks_in_seq(self, seq: Sequence): if seq.seq_id not in self.block_tables: return diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index a8085f54ac79d..31d1a60657832 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -260,22 +260,6 @@ def access_all_blocks_in_seq(self, seq: Sequence, now: float): block_ids, # type: ignore now) - def access_all_cross_blocks_in_seq_group( - self, - seq_group: SequenceGroup, - now: float, - ) -> None: - if self.enable_caching: - # Update the last accessed time of all the blocks accessed - # in this step. 
-            block_table = self.cross_block_tables[seq_group.request_id]
-            block_ids = []
-            for block_id in block_table.physical_block_ids:
-                block_ids.append(block_id)
-            self.block_allocator.mark_blocks_as_accessed(
-                block_ids,  # type: ignore
-                now)
-
     def mark_blocks_as_computed(self, seq_group: SequenceGroup):
         # The only need for mark block as computed is for prefix caching,
         # while currently we could determine whether one block is computed

From 84f5510a0a4e7d0b81b32e772e1cf710be83112b Mon Sep 17 00:00:00 2001
From: Andrew Feldman
Date: Thu, 23 May 2024 13:49:30 -0400
Subject: [PATCH 21/47] block manager v1 NotImplementedErrors for sliding
 window and automatic prefix caching

---
 vllm/core/block_manager_v1.py | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py
index 40274bd29e9b0..d5da128f1a691 100644
--- a/vllm/core/block_manager_v1.py
+++ b/vllm/core/block_manager_v1.py
@@ -277,6 +277,11 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus:
             cross_num_required_blocks

         if self.block_sliding_window is not None:
+            if seq_group.get_encoder_seq() is not None:
+                raise NotImplementedError(
+                    "Sliding window attention for encoder/decoder models " + \
+                    "is not currently supported.")
+
             num_required_blocks = min(num_required_blocks,
                                       self.block_sliding_window)
         num_free_gpu_blocks = self.gpu_allocator.get_num_free_blocks()
@@ -320,9 +325,16 @@ def allocate(self, seq_group: SequenceGroup) -> None:
         decoder_only = \
             seq_group.get_encoder_seq() is None

-        assert decoder_only or (not self.enable_caching), \
-            "Automatic prefix caching currently not " + \
-            "supported for encoder/decoder models."
+        if (self.block_sliding_window is not None) and \
+            (not decoder_only):
+            raise NotImplementedError(
+                "Sliding window attention for encoder/decoder models " + \
+                "is not currently supported.")
+
+        if self.enable_caching and (not decoder_only):
+            raise NotImplementedError(
+                "Automatic prefix caching currently not " + \
+                "supported for encoder/decoder models.")

From cc61959d2075816ee49fa7a802e3c2240e737546 Mon Sep 17 00:00:00 2001
From: Andrew Feldman
Date: Thu, 23 May 2024 13:56:11 -0400
Subject: [PATCH 22/47] Fixes: remove duplicate cross-block-table deletion;
 reuse _allocate_sequence() for the encoder sequence

---
 vllm/core/block_manager_v2.py | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py
index 31d1a60657832..9c6466de468e5 100644
--- a/vllm/core/block_manager_v2.py
+++ b/vllm/core/block_manager_v2.py
@@ -152,7 +152,6 @@ def allocate(self, seq_group: SequenceGroup) -> None:
         # NOTE: Here we assume that all sequences in the group have the same
         # encoder prompt.
request_id = seq_group.request_id - encoder_seq = seq_group.encoder_seq assert (request_id not in self.cross_block_tables), \ @@ -160,12 +159,7 @@ def allocate(self, seq_group: SequenceGroup) -> None: encoder_seq = seq_group.get_encoder_seq() if encoder_seq is not None: - block_table = BlockTable( - block_size=self.block_size, - block_allocator=self.block_allocator, - ) - assert self.block_sliding_window is None - block_table.allocate(encoder_seq.get_token_ids()) + block_table: BlockTable = self._allocate_sequence(encoder_seq) self.cross_block_tables[request_id] = block_table def can_append_slots(self, seq_group: SequenceGroup, @@ -229,8 +223,6 @@ def free_cross(self, seq_group: SequenceGroup) -> None: self.cross_block_tables[request_id].free() del self.cross_block_tables[request_id] - del self.cross_block_tables[seq_group.request_id] - def get_block_table(self, seq: Sequence) -> List[int]: assert seq.seq_id in self.block_tables block_ids = self.block_tables[seq.seq_id].physical_block_ids From dcb9abe115cfd6bfa8f2131c645cbc0bb6acb2ab Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 23 May 2024 13:58:30 -0400 Subject: [PATCH 23/47] formatting --- vllm/core/block_manager_v1.py | 2 +- vllm/core/block_manager_v2.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py index d5da128f1a691..95e9e5e20940d 100644 --- a/vllm/core/block_manager_v1.py +++ b/vllm/core/block_manager_v1.py @@ -281,7 +281,7 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: raise NotImplementedError( "Sliding window attention for encoder/decoder models " + \ "is not currently supported.") - + num_required_blocks = min(num_required_blocks, self.block_sliding_window) num_free_gpu_blocks = self.gpu_allocator.get_num_free_blocks() diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 9c6466de468e5..b89f1cd05d1c1 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -159,7 +159,7 @@ def allocate(self, seq_group: SequenceGroup) -> None: encoder_seq = seq_group.get_encoder_seq() if encoder_seq is not None: - block_table: BlockTable = self._allocate_sequence(encoder_seq) + block_table = self._allocate_sequence(encoder_seq) self.cross_block_tables[request_id] = block_table def can_append_slots(self, seq_group: SequenceGroup, From e8c40fcf152c5d2f6514830644c8eb683eee7aa9 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 23 May 2024 17:08:00 -0400 Subject: [PATCH 24/47] explanatory comment --- vllm/sequence.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/vllm/sequence.py b/vllm/sequence.py index 6b07a00f09c6f..a456ecc111e4c 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -613,11 +613,15 @@ class SequenceGroupMetadata: used in prefix caching. state: Internal state tied to this sequence group. multi_modal_data: Multi modal data. - encoder_seq_data: Optional, the sequence data - for the single encoder prompt. - cross_block_table: Optional, the cross-attention - block table associated with - the single encoder prompt. + encoder_seq_data: Optional sequence data for encoder prompt + (SequenceGroup.encoder_seq). Should be None + unless you are working with an encoder/decoder + model. + cross_block_table: Optional cross-attention block table associated + with the encoder prompt + (SequenceGroup.encoder_seq). Should be None + unless you are working with an encoder/decoder + model. 
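+
+        Example (illustrative values): for an encoder/decoder request
+        with a four-token encoder prompt and block_size = 4,
+        encoder_seq_data holds those four token ids and
+        cross_block_table could be a single-element list such as [0],
+        naming the one physical block that backs the cross-attention
+        KV cache.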
""" def __init__( From 5ccb70be1209521d0aa1e3d7cae7bf7707ac2fd8 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 23 May 2024 17:18:03 -0400 Subject: [PATCH 25/47] various fixes according to reviews --- vllm/core/block_manager_v1.py | 2 +- vllm/core/block_manager_v2.py | 14 ++++++++++++++ vllm/sequence.py | 3 ++- 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py index 95e9e5e20940d..1c81edb7a2df3 100644 --- a/vllm/core/block_manager_v1.py +++ b/vllm/core/block_manager_v1.py @@ -352,7 +352,7 @@ def allocate(self, seq_group: SequenceGroup) -> None: # Allocate encoder sequence encoder_seq = seq_group.get_encoder_seq() - if encoder_seq is not None: + if not decoder_only: # A SequenceGroup has only a single encoder sequence (at most), # thus allocate with a ref count of 1 block_table = self._allocate_sequence(encoder_seq, 1, decoder_only) diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index b89f1cd05d1c1..f094bf99e3201 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -132,6 +132,9 @@ def _allocate_sequence(self, seq: Sequence) -> BlockTable: return block_table def allocate(self, seq_group: SequenceGroup) -> None: + decoder_only = \ + seq_group.get_encoder_seq() is None + # Allocate self-attention block tables for decoder sequences waiting_seqs = seq_group.get_seqs(status=SequenceStatus.WAITING) assert not (set(seq.seq_id for seq in waiting_seqs) @@ -157,6 +160,17 @@ def allocate(self, seq_group: SequenceGroup) -> None: not in self.cross_block_tables), \ "block table already exists" + if (self.block_sliding_window is not None) and \ + (not decoder_only): + raise NotImplementedError( + "Sliding window attention for encoder/decoder models " + \ + "is not currently supported.") + + if self.enable_caching and (not decoder_only): + raise NotImplementedError( + "Automatic prefix caching currently not " + \ + "supported for encoder/decoder models.") + encoder_seq = seq_group.get_encoder_seq() if encoder_seq is not None: block_table = self._allocate_sequence(encoder_seq) diff --git a/vllm/sequence.py b/vllm/sequence.py index a456ecc111e4c..9c8fcccab75ae 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -420,7 +420,8 @@ class SequenceGroup: for an embedding model. pooling_params: The pooling parameters used to generate the pooling for an embedding model. - encoder_seq: Optional, the single encoder sequence. + encoder_seq: Optional, the single encoder sequence. Should be None + unless you are working with an encoder/decoder model. 
""" def __init__( From dfcc28b19188a11c74aee06265051eb8fbbe599f Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 23 May 2024 17:22:41 -0400 Subject: [PATCH 26/47] slight refactoring --- vllm/core/block_manager_v1.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py index 1c81edb7a2df3..2daf45182bba9 100644 --- a/vllm/core/block_manager_v1.py +++ b/vllm/core/block_manager_v1.py @@ -322,8 +322,8 @@ def _allocate_sequence(self, \ return block_table def allocate(self, seq_group: SequenceGroup) -> None: - decoder_only = \ - seq_group.get_encoder_seq() is None + encoder_seq = seq_group.get_encoder_seq() + decoder_only = encoder_seq is None if (self.block_sliding_window is not None) and \ (not decoder_only): @@ -351,7 +351,6 @@ def allocate(self, seq_group: SequenceGroup) -> None: self.block_tables[seq.seq_id] = block_table.copy() # Allocate encoder sequence - encoder_seq = seq_group.get_encoder_seq() if not decoder_only: # A SequenceGroup has only a single encoder sequence (at most), # thus allocate with a ref count of 1 From 8d3ad05a9f7d568f16eea6e090f6803869fc5443 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 23 May 2024 17:26:54 -0400 Subject: [PATCH 27/47] small refactor --- vllm/core/block_manager_v2.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index f094bf99e3201..6e02359f51782 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -132,8 +132,9 @@ def _allocate_sequence(self, seq: Sequence) -> BlockTable: return block_table def allocate(self, seq_group: SequenceGroup) -> None: + encoder_seq = seq_group.get_encoder_seq() decoder_only = \ - seq_group.get_encoder_seq() is None + encoder_seq is None # Allocate self-attention block tables for decoder sequences waiting_seqs = seq_group.get_seqs(status=SequenceStatus.WAITING) @@ -171,8 +172,7 @@ def allocate(self, seq_group: SequenceGroup) -> None: "Automatic prefix caching currently not " + \ "supported for encoder/decoder models.") - encoder_seq = seq_group.get_encoder_seq() - if encoder_seq is not None: + if not decoder_only: block_table = self._allocate_sequence(encoder_seq) self.cross_block_tables[request_id] = block_table From 5a7697976a964cf23d6141d9e432abb63d3f9e9d Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 23 May 2024 17:34:34 -0400 Subject: [PATCH 28/47] replaced all encoder_seq is not None with not decoder_only --- vllm/core/block_manager_v1.py | 19 +++++++++++++++---- vllm/core/block_manager_v2.py | 5 ++++- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py index 2daf45182bba9..2e5d531565379 100644 --- a/vllm/core/block_manager_v1.py +++ b/vllm/core/block_manager_v1.py @@ -496,6 +496,10 @@ def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: def _get_physical_blocks( self, seq_group: SequenceGroup) -> List[PhysicalTokenBlock]: + encoder_seq = seq_group.get_encoder_seq() + decoder_only = \ + encoder_seq is None + # NOTE: Here, we assume that the physical blocks are only shared by # the sequences in the same group. 
request_id = seq_group.request_id @@ -505,7 +509,7 @@ def _get_physical_blocks( continue blocks.update(self.block_tables[seq.seq_id]) # Cross-attention blocks - if seq_group.encoder_seq is not None: + if not decoder_only: blocks.update(self.cross_block_tables[request_id]) return list(blocks) @@ -514,9 +518,12 @@ def can_swap_in(self, num_lookahead_slots: int = 0) -> AllocStatus: assert (num_lookahead_slots == 0 ), "BlockSpaceManagerV1 does not support lookahead allocation" + encoder_seq = seq_group.get_encoder_seq() + decoder_only = encoder_seq is None + blocks = self._get_physical_blocks(seq_group) num_swapped_seqs = seq_group.num_seqs(status=SequenceStatus.SWAPPED) - if seq_group.encoder_seq is not None: + if not decoder_only: num_swapped_seqs += 1 num_free_blocks = self.gpu_allocator.get_num_free_blocks() # NOTE: Conservatively, we assume that every sequence will allocate @@ -556,6 +563,8 @@ def swap_in(self, assert (num_lookahead_slots == 0 ), "BlockSpaceManagerV1 does not support lookahead allocation" + encoder_seq = seq_group.get_encoder_seq() + decoder_only = encoder_seq is None request_id = seq_group.request_id # CPU block -> GPU block. @@ -566,7 +575,7 @@ def swap_in(self, self._swap_in_block_table(self.block_tables[seq.seq_id], mapping) - if seq_group.encoder_seq is not None: + if not decoder_only: self.cross_block_tables[request_id] = \ self._swap_in_block_table(self.cross_block_tables[request_id], mapping) @@ -600,6 +609,8 @@ def _swap_out_block_table( def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: request_id = seq_group.request_id + encoder_seq = seq_group.get_encoder_seq() + decoder_only = encoder_seq is None # GPU block -> CPU block. # dict is efficient in lookup `if gpu_block in mapping` @@ -609,7 +620,7 @@ def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: self._swap_out_block_table(self.block_tables[seq.seq_id], mapping) - if seq_group.encoder_seq is not None: + if not decoder_only: self.cross_block_tables[request_id] = \ self._swap_out_block_table(self.cross_block_tables[request_id], mapping) diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 6e02359f51782..a8090c1f93b5a 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -91,6 +91,9 @@ def __init__( def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: # FIXME(woosuk): Here we assume that all sequences in the group share # the same prompt. This may not be true for preempted sequences. 
+ encoder_seq = seq_group.get_encoder_seq() + decoder_only = encoder_seq is None + seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0] num_required_blocks = BlockTable.get_num_required_blocks( @@ -98,7 +101,7 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: block_size=self.block_size, ) - if seq_group.encoder_seq is not None: + if not decoder_only: num_required_blocks += BlockTable.get_num_required_blocks( seq_group.encoder_seq.get_token_ids(), block_size=self.block_size, From 09ae4adb656b79897d62d28015f968b0c7471d8e Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 23 May 2024 17:51:23 -0400 Subject: [PATCH 29/47] added is_encoder_decoder() method to sequence group --- vllm/core/block_manager_v1.py | 36 ++++++++++++++--------------------- vllm/core/block_manager_v2.py | 16 ++++++---------- vllm/sequence.py | 3 +++ 3 files changed, 23 insertions(+), 32 deletions(-) diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py index 2e5d531565379..69a280c8bf9c6 100644 --- a/vllm/core/block_manager_v1.py +++ b/vllm/core/block_manager_v1.py @@ -277,7 +277,7 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: cross_num_required_blocks if self.block_sliding_window is not None: - if seq_group.get_encoder_seq() is not None: + if seq_group.is_encoder_decoder(): raise NotImplementedError( "Sliding window attention for encoder/decoder models " + \ "is not currently supported.") @@ -298,7 +298,7 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: def _allocate_sequence(self, \ seq: Sequence, \ ref_count: int, \ - decoder_only: bool = True) -> BlockTable: + is_encoder_decoder: bool = True) -> BlockTable: # Allocate new physical token blocks that will store the prompt tokens. num_prompt_blocks = len(seq.logical_token_blocks) @@ -309,7 +309,7 @@ def _allocate_sequence(self, \ block = block_table[logical_idx % self.block_sliding_window] # Set the reference counts of the token blocks. block.ref_count = ref_count - elif decoder_only and self.enable_caching: + elif not is_encoder_decoder and self.enable_caching: block = self.gpu_allocator.allocate( seq.hash_of_block(logical_idx), seq.num_hashed_tokens_of_block(logical_idx)) @@ -323,15 +323,15 @@ def _allocate_sequence(self, \ def allocate(self, seq_group: SequenceGroup) -> None: encoder_seq = seq_group.get_encoder_seq() - decoder_only = encoder_seq is None + is_encoder_decoder = seq_group.is_encoder_decoder() if (self.block_sliding_window is not None) and \ - (not decoder_only): + is_encoder_decoder: raise NotImplementedError( "Sliding window attention for encoder/decoder models " + \ "is not currently supported.") - if self.enable_caching and (not decoder_only): + if self.enable_caching and is_encoder_decoder: raise NotImplementedError( "Automatic prefix caching currently not " + \ "supported for encoder/decoder models.") @@ -344,17 +344,18 @@ def allocate(self, seq_group: SequenceGroup) -> None: block_table: BlockTable = \ self._allocate_sequence(seq, seq_group.num_seqs(), - decoder_only) + is_encoder_decoder) # Assign the self-attention block tables for each sequence. 
for seq in seq_group.get_seqs(status=SequenceStatus.WAITING): self.block_tables[seq.seq_id] = block_table.copy() # Allocate encoder sequence - if not decoder_only: + if is_encoder_decoder: # A SequenceGroup has only a single encoder sequence (at most), # thus allocate with a ref count of 1 - block_table = self._allocate_sequence(encoder_seq, 1, decoder_only) + block_table = self._allocate_sequence(encoder_seq, 1, + is_encoder_decoder) # Assign the cross-attention block table for the SequenceGroup. self.cross_block_tables[seq_group.request_id] = block_table @@ -496,9 +497,6 @@ def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: def _get_physical_blocks( self, seq_group: SequenceGroup) -> List[PhysicalTokenBlock]: - encoder_seq = seq_group.get_encoder_seq() - decoder_only = \ - encoder_seq is None # NOTE: Here, we assume that the physical blocks are only shared by # the sequences in the same group. @@ -509,7 +507,7 @@ def _get_physical_blocks( continue blocks.update(self.block_tables[seq.seq_id]) # Cross-attention blocks - if not decoder_only: + if seq_group.is_encoder_decoder(): blocks.update(self.cross_block_tables[request_id]) return list(blocks) @@ -518,12 +516,10 @@ def can_swap_in(self, num_lookahead_slots: int = 0) -> AllocStatus: assert (num_lookahead_slots == 0 ), "BlockSpaceManagerV1 does not support lookahead allocation" - encoder_seq = seq_group.get_encoder_seq() - decoder_only = encoder_seq is None blocks = self._get_physical_blocks(seq_group) num_swapped_seqs = seq_group.num_seqs(status=SequenceStatus.SWAPPED) - if not decoder_only: + if seq_group.is_encoder_decoder(): num_swapped_seqs += 1 num_free_blocks = self.gpu_allocator.get_num_free_blocks() # NOTE: Conservatively, we assume that every sequence will allocate @@ -563,8 +559,6 @@ def swap_in(self, assert (num_lookahead_slots == 0 ), "BlockSpaceManagerV1 does not support lookahead allocation" - encoder_seq = seq_group.get_encoder_seq() - decoder_only = encoder_seq is None request_id = seq_group.request_id # CPU block -> GPU block. @@ -575,7 +569,7 @@ def swap_in(self, self._swap_in_block_table(self.block_tables[seq.seq_id], mapping) - if not decoder_only: + if seq_group.is_encoder_decoder(): self.cross_block_tables[request_id] = \ self._swap_in_block_table(self.cross_block_tables[request_id], mapping) @@ -609,8 +603,6 @@ def _swap_out_block_table( def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: request_id = seq_group.request_id - encoder_seq = seq_group.get_encoder_seq() - decoder_only = encoder_seq is None # GPU block -> CPU block. # dict is efficient in lookup `if gpu_block in mapping` @@ -620,7 +612,7 @@ def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: self._swap_out_block_table(self.block_tables[seq.seq_id], mapping) - if not decoder_only: + if seq_group.is_encoder_decoder(): self.cross_block_tables[request_id] = \ self._swap_out_block_table(self.cross_block_tables[request_id], mapping) diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index a8090c1f93b5a..0dd2ffcd182ec 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -91,19 +91,16 @@ def __init__( def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: # FIXME(woosuk): Here we assume that all sequences in the group share # the same prompt. This may not be true for preempted sequences. 
- encoder_seq = seq_group.get_encoder_seq() - decoder_only = encoder_seq is None seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0] - num_required_blocks = BlockTable.get_num_required_blocks( seq.get_token_ids(), block_size=self.block_size, ) - if not decoder_only: + if seq_group.is_encoder_decoder(): num_required_blocks += BlockTable.get_num_required_blocks( - seq_group.encoder_seq.get_token_ids(), + seq_group.get_encoder_seq().get_token_ids(), block_size=self.block_size, ) @@ -136,8 +133,7 @@ def _allocate_sequence(self, seq: Sequence) -> BlockTable: def allocate(self, seq_group: SequenceGroup) -> None: encoder_seq = seq_group.get_encoder_seq() - decoder_only = \ - encoder_seq is None + is_encoder_decoder = seq_group.is_encoder_decoder() # Allocate self-attention block tables for decoder sequences waiting_seqs = seq_group.get_seqs(status=SequenceStatus.WAITING) @@ -165,17 +161,17 @@ def allocate(self, seq_group: SequenceGroup) -> None: "block table already exists" if (self.block_sliding_window is not None) and \ - (not decoder_only): + is_encoder_decoder: raise NotImplementedError( "Sliding window attention for encoder/decoder models " + \ "is not currently supported.") - if self.enable_caching and (not decoder_only): + if self.enable_caching and is_encoder_decoder: raise NotImplementedError( "Automatic prefix caching currently not " + \ "supported for encoder/decoder models.") - if not decoder_only: + if is_encoder_decoder: block_table = self._allocate_sequence(encoder_seq) self.cross_block_tables[request_id] = block_table diff --git a/vllm/sequence.py b/vllm/sequence.py index 9c8fcccab75ae..ad6c8d54974c3 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -528,6 +528,9 @@ def get_seqs( seq for seq in self.seqs_dict.values() if seq.status == status ] + def is_encoder_decoder(self) -> bool: + return self.encoder_seq is not None + def get_encoder_seq(self) -> Optional[Sequence]: return self.encoder_seq From ecd1a998579ac171ce1936444fe9f7c8a6a09c92 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 23 May 2024 18:59:03 -0400 Subject: [PATCH 30/47] tests for NotImplemented errors when encoder/decoder models are used with prefix cache or SWA --- tests/core/block/test_block_manager_v2.py | 103 +++++++++++++++++++++- tests/core/test_block_manager.py | 64 +++++++++++++- vllm/core/block_manager_v1.py | 29 +++--- vllm/core/block_manager_v2.py | 28 ++++-- 4 files changed, 205 insertions(+), 19 deletions(-) diff --git a/tests/core/block/test_block_manager_v2.py b/tests/core/block/test_block_manager_v2.py index 06c3389cfa0f0..cf423d292a25e 100644 --- a/tests/core/block/test_block_manager_v2.py +++ b/tests/core/block/test_block_manager_v2.py @@ -1,6 +1,8 @@ import pytest -from vllm.core.block_manager_v2 import BlockSpaceManagerV2 +from vllm.core.block_manager_v2 import (BlockSpaceManagerV2, + str_not_impl_enc_dec_prefix_cache, + str_not_impl_enc_dec_swa) from vllm.core.interfaces import AllocStatus from vllm.sequence import Logprob, SequenceStatus from vllm.utils import chunk_list @@ -103,6 +105,105 @@ def test_can_allocate_seq_group_encoder_decoder(block_size: int, assert can_allocate_result == AllocStatus.LATER +@pytest.mark.parametrize("block_size", [16]) +@pytest.mark.parametrize("num_gpu_blocks", [16]) +@pytest.mark.parametrize("num_seqs_per_group", [1]) +@pytest.mark.parametrize("watermark", [0.0, 0.5]) +def test_allocate_encoder_decoder_fails_with_swa(block_size: int, + num_seqs_per_group: int, + num_gpu_blocks: int, + watermark: float): + ''' + SWA short for Sliding Window 
Attention.
+
+    At the time of writing, block manager v2 does not support SWA.
+
+    However, even when SWA is implemented for block manager v2,
+    there will still most likely be a separate workstream required
+    to enable SWA for encoder/decoder models.
+
+    Therefore, this test enforces that one of the following cases
+    holds true:
+    1. Block manager v2 does not support SWA at all (true at the time
+       of writing)
+    2. Block manager v2 fails with NotImplementedError when SWA is enabled
+       AND a SequenceGroup with an encoder sequence (i.e. in support of an
+       encoder/decoder model) is passed into can_allocate() as an argument
+
+    The setup for this test is a stripped-down version of
+    test_can_allocate_seq_group_encoder_decoder()
+    '''
+
+    with pytest.raises((NotImplementedError, AssertionError)) as exc_info:
+        block_manager = BlockSpaceManagerV2(
+            block_size=block_size,
+            num_gpu_blocks=num_gpu_blocks,
+            num_cpu_blocks=1024,
+            watermark=watermark,
+            sliding_window=5  # SWA
+        )
+
+        num_output_blocks_per_seq = 1
+        num_prompt_blocks = 1
+        num_output_blocks = num_output_blocks_per_seq
+        seq_group = create_seq_group_encoder_decoder(
+            seq_prompt_len=block_size * num_prompt_blocks,
+            seq_output_lens=[
+                block_size * num_output_blocks_per_seq
+                for _ in range(num_seqs_per_group)
+            ],
+            request_id="0")
+
+        assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks
+        block_manager.can_allocate(seq_group)
+
+    # Assert that either
+    # 1. Block manager v2 constructor fails with assertion that sliding window
+    #    is not yet supported (most likely near-term outcome at time of
+    #    writing), or
+    # 2. can_allocate() fails with NotImplementedError due to combiantion of
+    #    encoder/decoder and sliding window attention
+    if isinstance(exc_info.value, NotImplementedError):
+        assert str(exc_info.value) == str_not_impl_enc_dec_swa
+    elif isinstance(exc_info.value, AssertionError):
+        assert str(exc_info.value) == "Sliding window not yet supported"
+
+
+@pytest.mark.parametrize("block_size", [16])
+@pytest.mark.parametrize("num_gpu_blocks", [16])
+@pytest.mark.parametrize("num_seqs_per_group", [1])
+@pytest.mark.parametrize("watermark", [0.0, 0.5])
+def test_allocate_encoder_decoder_fails_with_prefix_cache(
+        block_size: int, num_seqs_per_group: int, num_gpu_blocks: int,
+        watermark: float):
+
+    block_manager = BlockSpaceManagerV2(
+        block_size=block_size,
+        num_gpu_blocks=num_gpu_blocks,
+        num_cpu_blocks=1024,
+        watermark=watermark,
+        enable_caching=True  # Prefix cache
+    )
+
+    num_output_blocks_per_seq = 1
+    num_prompt_blocks = 1
+    num_output_blocks = num_output_blocks_per_seq
+    seq_group = create_seq_group_encoder_decoder(
+        seq_prompt_len=block_size * num_prompt_blocks,
+        seq_output_lens=[
+            block_size * num_output_blocks_per_seq
+            for _ in range(num_seqs_per_group)
+        ],
+        request_id="0")
+
+    assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks
+
+    # Assert that can_allocate() fails with NotImplementedError
+    # due to combination of encoder/decoder and prefix cache
+    with pytest.raises(NotImplementedError) as exc_info:
+        block_manager.can_allocate(seq_group)
+    assert str(exc_info.value) == str_not_impl_enc_dec_prefix_cache
+
+
 @pytest.mark.parametrize("block_size", [1, 8])
 @pytest.mark.parametrize("prompt_len", [1, 7, 8])
 @pytest.mark.parametrize("num_slots_to_append", [1, 8, 129])
diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py
index cdaf2f22115e8..6039f568fcf1e 100644
--- a/tests/core/test_block_manager.py
+++ b/tests/core/test_block_manager.py
@@ -7,7 +7,9 @@ from vllm import 
SamplingParams from vllm.block import PhysicalTokenBlock from vllm.core.block_manager_v1 import (BlockSpaceManagerV1, - UncachedBlockAllocator) + UncachedBlockAllocator, + str_not_impl_enc_dec_prefix_cache, + str_not_impl_enc_dec_swa) from vllm.core.interfaces import AllocStatus from vllm.sequence import Logprob, Sequence, SequenceGroup, SequenceStatus from vllm.utils import Device @@ -126,6 +128,66 @@ def test_allocate_encoder_decoder(): assert block_manager.can_allocate(seq_group) != AllocStatus.OK +def test_allocate_encoder_decoder_fails_with_swa(): + # SWA short for sliding window attention + + block_size = 4 + num_cpu_blocks = 4 + num_gpu_blocks = 4 + block_manager = BlockSpaceManagerV1(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0, + sliding_window=5) # swa + + # Allocate same sequence group to all available gpu blocks. + _, _, seq_group = create_dummy_prompt_encoder_decoder( + "0", + decoder_prompt_length=block_size, + encoder_prompt_length=block_size) + + # Assert that can_allocate() fails due to SWA + with pytest.raises(NotImplementedError) as exc_info: + block_manager.can_allocate(seq_group) + + assert str(exc_info.value) == str_not_impl_enc_dec_swa + + # Assert that allocate() fails due to SWA + with pytest.raises(NotImplementedError) as exc_info: + block_manager.allocate(seq_group) + + assert str(exc_info.value) == str_not_impl_enc_dec_swa + + +def test_allocate_encoder_decoder_fails_with_prefix_caching(): + block_size = 4 + num_cpu_blocks = 4 + num_gpu_blocks = 4 + block_manager = BlockSpaceManagerV1(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0, + enable_caching=True) # Prefix cache + + # Allocate same sequence group to all available gpu blocks. + _, _, seq_group = create_dummy_prompt_encoder_decoder( + "0", + decoder_prompt_length=block_size, + encoder_prompt_length=block_size) + + # Assert that can_allocate() fails due to prefix caching + with pytest.raises(NotImplementedError) as exc_info: + block_manager.can_allocate(seq_group) + + assert str(exc_info.value) == str_not_impl_enc_dec_prefix_cache + + # Assert that allocate() fails due to prefix caching + with pytest.raises(NotImplementedError) as exc_info: + block_manager.allocate(seq_group) + + assert str(exc_info.value) == str_not_impl_enc_dec_prefix_cache + + def test_append_slot_single_seq(): block_size = 4 num_cpu_blocks = 4 diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py index 69a280c8bf9c6..904b12cd97b01 100644 --- a/vllm/core/block_manager_v1.py +++ b/vllm/core/block_manager_v1.py @@ -15,6 +15,17 @@ from vllm.utils import Device logger = init_logger(__name__) +''' +Exception strings for non-implemented encoder/decoder scenarios +''' + +str_not_impl_enc_dec_swa = \ + "Sliding window attention for encoder/decoder models " + \ + "is not currently supported." + +str_not_impl_enc_dec_prefix_cache = \ + "Prefix caching for encoder/decoder models " + \ + "is not currently supported." class BlockAllocatorBase(ABC): @@ -269,6 +280,10 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: # FIXME(woosuk): Here we assume that all sequences in the group share # the same prompt. This may not be true for preempted sequences. 
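The two tests above assert on the exact exception message by comparing against the same module-level string that the implementation raises. A self-contained sketch of that idiom (hypothetical names, runnable under pytest; not code from the patch):

    import pytest

    # Hypothetical shared message constant; the patch itself uses
    # str_not_impl_enc_dec_swa / str_not_impl_enc_dec_prefix_cache.
    STR_NOT_IMPL_DEMO = "demo feature is not currently supported."

    def use_demo_feature() -> None:
        raise NotImplementedError(STR_NOT_IMPL_DEMO)

    def test_use_demo_feature_fails() -> None:
        with pytest.raises(NotImplementedError) as exc_info:
            use_demo_feature()
        # Comparing against the shared constant keeps the test and the
        # implementation from silently drifting apart.
        assert str(exc_info.value) == STR_NOT_IMPL_DEMO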
+ is_encoder_decoder = seq_group.is_encoder_decoder() + if self.enable_caching and is_encoder_decoder: + raise NotImplementedError(str_not_impl_enc_dec_prefix_cache) + self_num_required_blocks = self._get_seq_num_required_blocks( seq_group.get_seqs(status=SequenceStatus.WAITING)[0]) cross_num_required_blocks = self._get_seq_num_required_blocks( @@ -277,10 +292,8 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: cross_num_required_blocks if self.block_sliding_window is not None: - if seq_group.is_encoder_decoder(): - raise NotImplementedError( - "Sliding window attention for encoder/decoder models " + \ - "is not currently supported.") + if is_encoder_decoder: + raise NotImplementedError(str_not_impl_enc_dec_swa) num_required_blocks = min(num_required_blocks, self.block_sliding_window) @@ -327,14 +340,10 @@ def allocate(self, seq_group: SequenceGroup) -> None: if (self.block_sliding_window is not None) and \ is_encoder_decoder: - raise NotImplementedError( - "Sliding window attention for encoder/decoder models " + \ - "is not currently supported.") + raise NotImplementedError(str_not_impl_enc_dec_swa) if self.enable_caching and is_encoder_decoder: - raise NotImplementedError( - "Automatic prefix caching currently not " + \ - "supported for encoder/decoder models.") + raise NotImplementedError(str_not_impl_enc_dec_prefix_cache) # Allocate decoder sequences # diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 0dd2ffcd182ec..d2dadd9a63dc2 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -8,6 +8,17 @@ from vllm.core.interfaces import AllocStatus, BlockSpaceManager from vllm.sequence import Sequence, SequenceGroup, SequenceStatus from vllm.utils import Device +''' +Exception strings for non-implemented encoder/decoder scenarios +''' + +str_not_impl_enc_dec_swa = \ + "Sliding window attention for encoder/decoder models " + \ + "is not currently supported." + +str_not_impl_enc_dec_prefix_cache = \ + "Prefix caching for encoder/decoder models " + \ + "is not currently supported." SeqId = int EncoderSeqId = str @@ -92,13 +103,20 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: # FIXME(woosuk): Here we assume that all sequences in the group share # the same prompt. This may not be true for preempted sequences. 
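Note the ordering in the can_allocate() hunk above: the unsupported-combination guard runs before any block accounting, so callers see a NotImplementedError rather than a misleading OK/LATER verdict. A reduced sketch of that control flow (a toy manager under assumed names, not vLLM's API):

    STR_NOT_IMPL_DEMO = "this feature combination is not currently supported."

    class TinyBlockManager:
        def __init__(self, enable_caching: bool, num_free_blocks: int) -> None:
            self.enable_caching = enable_caching
            self.num_free_blocks = num_free_blocks

        def can_allocate(self, is_encoder_decoder: bool,
                         num_required_blocks: int) -> bool:
            # Fail fast on unsupported configurations before doing
            # any free-block arithmetic.
            if self.enable_caching and is_encoder_decoder:
                raise NotImplementedError(STR_NOT_IMPL_DEMO)
            return num_required_blocks <= self.num_free_blocks

    mgr = TinyBlockManager(enable_caching=False, num_free_blocks=4)
    assert mgr.can_allocate(is_encoder_decoder=True, num_required_blocks=3)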
+ is_encoder_decoder = seq_group.is_encoder_decoder() + if self.enable_caching and is_encoder_decoder: + raise NotImplementedError(str_not_impl_enc_dec_prefix_cache) + + if self.block_sliding_window is not None and is_encoder_decoder: + raise NotImplementedError(str_not_impl_enc_dec_swa) + seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0] num_required_blocks = BlockTable.get_num_required_blocks( seq.get_token_ids(), block_size=self.block_size, ) - if seq_group.is_encoder_decoder(): + if is_encoder_decoder: num_required_blocks += BlockTable.get_num_required_blocks( seq_group.get_encoder_seq().get_token_ids(), block_size=self.block_size, @@ -162,14 +180,10 @@ def allocate(self, seq_group: SequenceGroup) -> None: if (self.block_sliding_window is not None) and \ is_encoder_decoder: - raise NotImplementedError( - "Sliding window attention for encoder/decoder models " + \ - "is not currently supported.") + raise NotImplementedError(str_not_impl_enc_dec_swa) if self.enable_caching and is_encoder_decoder: - raise NotImplementedError( - "Automatic prefix caching currently not " + \ - "supported for encoder/decoder models.") + raise NotImplementedError(str_not_impl_enc_dec_prefix_cache) if is_encoder_decoder: block_table = self._allocate_sequence(encoder_seq) From d3935f73b5038ba7acc75fff07282b7f7fda6ed5 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 23 May 2024 19:05:36 -0400 Subject: [PATCH 31/47] rename tests --- tests/core/block/test_block_manager_v2.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/core/block/test_block_manager_v2.py b/tests/core/block/test_block_manager_v2.py index cf423d292a25e..c893bc8f4209e 100644 --- a/tests/core/block/test_block_manager_v2.py +++ b/tests/core/block/test_block_manager_v2.py @@ -109,10 +109,10 @@ def test_can_allocate_seq_group_encoder_decoder(block_size: int, @pytest.mark.parametrize("num_gpu_blocks", [16]) @pytest.mark.parametrize("num_seqs_per_group", [1]) @pytest.mark.parametrize("watermark", [0.0, 0.5]) -def test_allocate_encoder_decoder_fails_with_swa(block_size: int, - num_seqs_per_group: int, - num_gpu_blocks: int, - watermark: float): +def test_can_allocate_encoder_decoder_fails_with_swa(block_size: int, + num_seqs_per_group: int, + num_gpu_blocks: int, + watermark: float): ''' SWA short for Sliding Window Attention. @@ -172,7 +172,7 @@ def test_allocate_encoder_decoder_fails_with_swa(block_size: int, @pytest.mark.parametrize("num_gpu_blocks", [16]) @pytest.mark.parametrize("num_seqs_per_group", [1]) @pytest.mark.parametrize("watermark", [0.0, 0.5]) -def test_allocate_encoder_decoder_fails_with_prefix_cache( +def test_can_allocate_encoder_decoder_fails_with_prefix_cache( block_size: int, num_seqs_per_group: int, num_gpu_blocks: int, watermark: float): From e6a7125383488af42dd5020b65824394c9c112e9 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 23 May 2024 19:10:35 -0400 Subject: [PATCH 32/47] spelling error --- tests/core/block/test_block_manager_v2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/core/block/test_block_manager_v2.py b/tests/core/block/test_block_manager_v2.py index c893bc8f4209e..19ea89d01ca7a 100644 --- a/tests/core/block/test_block_manager_v2.py +++ b/tests/core/block/test_block_manager_v2.py @@ -160,7 +160,7 @@ def test_can_allocate_encoder_decoder_fails_with_swa(block_size: int, # 1. Block manager v2 constructor fails with assertion that sliding window # is not yet supported (most likely near-term outcome at time of # writing), or - # 2. 
can_allocate() fails with NotImplementedError due to combiantion of + # 2. can_allocate() fails with NotImplementedError due to combination of # encoder/decoder and sliding window attention if isinstance(exc_info.value, NotImplementedError): assert str(exc_info.value) == str_not_impl_enc_dec_swa From 68b476203ba9c8342e3f6ba5d9db5e7d369a7a52 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Thu, 23 May 2024 19:14:25 -0400 Subject: [PATCH 33/47] isort --- vllm/core/block_manager_v2.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index d2dadd9a63dc2..b43f39a8ffaef 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -8,6 +8,7 @@ from vllm.core.interfaces import AllocStatus, BlockSpaceManager from vllm.sequence import Sequence, SequenceGroup, SequenceStatus from vllm.utils import Device + ''' Exception strings for non-implemented encoder/decoder scenarios ''' From a80325dcbe4af189e3542f00ffe92a11a7243e92 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Sat, 25 May 2024 21:45:13 -0400 Subject: [PATCH 34/47] return output of SequenceGroup constructor --- tests/core/utils.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/tests/core/utils.py b/tests/core/utils.py index 376af0f0eac4f..fb53b6cc5e18b 100644 --- a/tests/core/utils.py +++ b/tests/core/utils.py @@ -145,14 +145,11 @@ def create_seq_group_encoder_decoder( block_size=16, ) - seq_group = SequenceGroup(request_id=request_id, - seqs=seqs, - sampling_params=sampling_params, - arrival_time=time.time(), - encoder_seq=encoder_seq) - - return seq_group - + return SequenceGroup(request_id=request_id, + seqs=seqs, + sampling_params=sampling_params, + arrival_time=time.time(), + encoder_seq=encoder_seq) def round_up_to_next_block(seq_len: int, block_size: int) -> int: return (seq_len + block_size - 1) // block_size From 8b387767512a657fd0051c674f4a594159b67eee Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Sat, 25 May 2024 21:56:25 -0400 Subject: [PATCH 35/47] capitalize constants --- tests/core/block/test_block_manager_v2.py | 8 ++++---- tests/core/test_block_manager.py | 12 ++++++------ vllm/core/block_manager_v1.py | 17 ++++++++--------- vllm/core/block_manager_v2.py | 12 ++++++------ 4 files changed, 24 insertions(+), 25 deletions(-) diff --git a/tests/core/block/test_block_manager_v2.py b/tests/core/block/test_block_manager_v2.py index 19ea89d01ca7a..3aed0c58bd264 100644 --- a/tests/core/block/test_block_manager_v2.py +++ b/tests/core/block/test_block_manager_v2.py @@ -1,8 +1,8 @@ import pytest from vllm.core.block_manager_v2 import (BlockSpaceManagerV2, - str_not_impl_enc_dec_prefix_cache, - str_not_impl_enc_dec_swa) + STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE, + STR_NOT_IMPL_ENC_DEC_SWA) from vllm.core.interfaces import AllocStatus from vllm.sequence import Logprob, SequenceStatus from vllm.utils import chunk_list @@ -163,7 +163,7 @@ def test_can_allocate_encoder_decoder_fails_with_swa(block_size: int, # 2. 
can_allocate() fails with NotImplementedError due to combination of # encoder/decoder and sliding window attention if isinstance(exc_info.value, NotImplementedError): - assert str(exc_info.value) == str_not_impl_enc_dec_swa + assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_SWA elif isinstance(exc_info.value, AssertionError): assert str(exc_info.value) == "Sliding window not yet supported" @@ -201,7 +201,7 @@ def test_can_allocate_encoder_decoder_fails_with_prefix_cache( # due to combination of encoder/decoder and prefix cache with pytest.raises(NotImplementedError) as exc_info: block_manager.can_allocate(seq_group) - assert str(exc_info.value) == str_not_impl_enc_dec_prefix_cache + assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE @pytest.mark.parametrize("block_size", [1, 8]) diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py index 6039f568fcf1e..7e487a021d3c2 100644 --- a/tests/core/test_block_manager.py +++ b/tests/core/test_block_manager.py @@ -8,8 +8,8 @@ from vllm.block import PhysicalTokenBlock from vllm.core.block_manager_v1 import (BlockSpaceManagerV1, UncachedBlockAllocator, - str_not_impl_enc_dec_prefix_cache, - str_not_impl_enc_dec_swa) + STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE, + STR_NOT_IMPL_ENC_DEC_SWA) from vllm.core.interfaces import AllocStatus from vllm.sequence import Logprob, Sequence, SequenceGroup, SequenceStatus from vllm.utils import Device @@ -150,13 +150,13 @@ def test_allocate_encoder_decoder_fails_with_swa(): with pytest.raises(NotImplementedError) as exc_info: block_manager.can_allocate(seq_group) - assert str(exc_info.value) == str_not_impl_enc_dec_swa + assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_SWA # Assert that allocate() fails due to SWA with pytest.raises(NotImplementedError) as exc_info: block_manager.allocate(seq_group) - assert str(exc_info.value) == str_not_impl_enc_dec_swa + assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_SWA def test_allocate_encoder_decoder_fails_with_prefix_caching(): @@ -179,13 +179,13 @@ def test_allocate_encoder_decoder_fails_with_prefix_caching(): with pytest.raises(NotImplementedError) as exc_info: block_manager.can_allocate(seq_group) - assert str(exc_info.value) == str_not_impl_enc_dec_prefix_cache + assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE # Assert that allocate() fails due to prefix caching with pytest.raises(NotImplementedError) as exc_info: block_manager.allocate(seq_group) - assert str(exc_info.value) == str_not_impl_enc_dec_prefix_cache + assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE def test_append_slot_single_seq(): diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py index 904b12cd97b01..312690ee45893 100644 --- a/vllm/core/block_manager_v1.py +++ b/vllm/core/block_manager_v1.py @@ -19,11 +19,11 @@ Exception strings for non-implemented encoder/decoder scenarios ''' -str_not_impl_enc_dec_swa = \ +STR_NOT_IMPL_ENC_DEC_SWA = \ "Sliding window attention for encoder/decoder models " + \ "is not currently supported." -str_not_impl_enc_dec_prefix_cache = \ +STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE = \ "Prefix caching for encoder/decoder models " + \ "is not currently supported." 
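Patch 35 above renames the shared message strings to follow the PEP 8 convention that module-level constants use UPPER_SNAKE_CASE, signaling to readers and linters that the values are not meant to be rebound. Illustrative only, with a made-up name:

    # Before: lower_snake_case reads like an ordinary, rebindable variable.
    str_not_impl_example = "example feature is not currently supported."

    # After: UPPER_SNAKE_CASE marks a module-level constant, per PEP 8.
    STR_NOT_IMPL_EXAMPLE = "example feature is not currently supported."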
@@ -272,9 +272,8 @@ def __init__( self.cross_block_tables: Dict[str, BlockTable] = {} def _get_seq_num_required_blocks(self, seq: Sequence) -> int: - if seq is None: - return 0 - return len(seq.logical_token_blocks) + return 0 if seq is None \ + else len(seq.logical_token_blocks) def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: # FIXME(woosuk): Here we assume that all sequences in the group share @@ -282,7 +281,7 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: is_encoder_decoder = seq_group.is_encoder_decoder() if self.enable_caching and is_encoder_decoder: - raise NotImplementedError(str_not_impl_enc_dec_prefix_cache) + raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE) self_num_required_blocks = self._get_seq_num_required_blocks( seq_group.get_seqs(status=SequenceStatus.WAITING)[0]) @@ -293,7 +292,7 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: if self.block_sliding_window is not None: if is_encoder_decoder: - raise NotImplementedError(str_not_impl_enc_dec_swa) + raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_SWA) num_required_blocks = min(num_required_blocks, self.block_sliding_window) @@ -340,10 +339,10 @@ def allocate(self, seq_group: SequenceGroup) -> None: if (self.block_sliding_window is not None) and \ is_encoder_decoder: - raise NotImplementedError(str_not_impl_enc_dec_swa) + raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_SWA) if self.enable_caching and is_encoder_decoder: - raise NotImplementedError(str_not_impl_enc_dec_prefix_cache) + raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE) # Allocate decoder sequences # diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index b43f39a8ffaef..6113561032dd1 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -13,11 +13,11 @@ Exception strings for non-implemented encoder/decoder scenarios ''' -str_not_impl_enc_dec_swa = \ +STR_NOT_IMPL_ENC_DEC_SWA = \ "Sliding window attention for encoder/decoder models " + \ "is not currently supported." -str_not_impl_enc_dec_prefix_cache = \ +STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE = \ "Prefix caching for encoder/decoder models " + \ "is not currently supported." 
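The _get_seq_num_required_blocks() helper above returns 0 when no encoder sequence exists, and the decoder (self-attention) and encoder (cross-attention) requirements are simply summed. The per-sequence count is, in effect, the same ceiling division that round_up_to_next_block() in tests/core/utils.py implements; a small worked sketch with hypothetical lengths:

    def num_required_blocks(num_tokens: int, block_size: int) -> int:
        # Ceiling division: a partially filled block still occupies
        # a whole physical block.
        return (num_tokens + block_size - 1) // block_size

    BLOCK_SIZE = 4
    decoder_tokens = 9  # hypothetical decoder prompt length
    encoder_tokens = 4  # hypothetical encoder prompt length

    total = (num_required_blocks(decoder_tokens, BLOCK_SIZE) +
             num_required_blocks(encoder_tokens, BLOCK_SIZE))
    assert total == 3 + 1  # 9 tokens -> 3 blocks, 4 tokens -> 1 block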
@@ -106,10 +106,10 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: is_encoder_decoder = seq_group.is_encoder_decoder() if self.enable_caching and is_encoder_decoder: - raise NotImplementedError(str_not_impl_enc_dec_prefix_cache) + raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE) if self.block_sliding_window is not None and is_encoder_decoder: - raise NotImplementedError(str_not_impl_enc_dec_swa) + raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_SWA) seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0] num_required_blocks = BlockTable.get_num_required_blocks( @@ -181,10 +181,10 @@ def allocate(self, seq_group: SequenceGroup) -> None: if (self.block_sliding_window is not None) and \ is_encoder_decoder: - raise NotImplementedError(str_not_impl_enc_dec_swa) + raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_SWA) if self.enable_caching and is_encoder_decoder: - raise NotImplementedError(str_not_impl_enc_dec_prefix_cache) + raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE) if is_encoder_decoder: block_table = self._allocate_sequence(encoder_seq) From f39c3132af87d410507644c9ea86aec1156f3533 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Sat, 25 May 2024 22:20:06 -0400 Subject: [PATCH 36/47] refactored swap-block-table functionality --- vllm/core/block_manager_v1.py | 68 +++++++++++++++-------------------- 1 file changed, 29 insertions(+), 39 deletions(-) diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py index 312690ee45893..90a485b39e9d6 100644 --- a/vllm/core/block_manager_v1.py +++ b/vllm/core/block_manager_v1.py @@ -541,23 +541,25 @@ def can_swap_in(self, else: return AllocStatus.LATER - def _swap_in_block_table( + def _swap_block_table( self, block_table: BlockTable, + src_allocator: BlockAllocatorBase, + dest_allocator: BlockAllocatorBase, mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock]) -> BlockTable: new_block_table = [] - for cpu_block in block_table: - if cpu_block in mapping: - gpu_block = mapping[cpu_block] - gpu_block.ref_count += 1 + for from_block in block_table: + if from_block in mapping: + to_block = mapping[from_block] + to_block.ref_count += 1 else: - gpu_block = self.gpu_allocator.allocate( - cpu_block.block_hash, cpu_block.num_hashed_tokens) - mapping[cpu_block] = gpu_block - new_block_table.append(gpu_block) - # Free the CPU block swapped in to GPU. - self.cpu_allocator.free(cpu_block) + to_block = dest_allocator.allocate( + from_block.block_hash, from_block.num_hashed_tokens) + mapping[from_block] = to_block + new_block_table.append(to_block) + # Free the source block swapped in to destination. 
+ src_allocator.free(from_block) return new_block_table @@ -574,13 +576,17 @@ def swap_in(self, mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock] = {} for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): self.block_tables[seq.seq_id] = \ - self._swap_in_block_table(self.block_tables[seq.seq_id], - mapping) + self._swap_block_table(self.block_tables[seq.seq_id], + self.cpu_allocator, + self.gpu_allocator, + mapping) if seq_group.is_encoder_decoder(): self.cross_block_tables[request_id] = \ - self._swap_in_block_table(self.cross_block_tables[request_id], - mapping) + self._swap_block_table(self.cross_block_tables[request_id], + self.cpu_allocator, + self.gpu_allocator, + mapping) return [(cpu_block.block_number, gpu_block.block_number) for cpu_block, gpu_block in mapping.items()] @@ -589,26 +595,6 @@ def can_swap_out(self, seq_group: SequenceGroup) -> bool: blocks = self._get_physical_blocks(seq_group) return len(blocks) <= self.cpu_allocator.get_num_free_blocks() - def _swap_out_block_table( - self, block_table: BlockTable, - mapping: Dict[PhysicalTokenBlock, - PhysicalTokenBlock]) -> BlockTable: - - new_block_table: BlockTable = [] - for gpu_block in block_table: - if gpu_block in mapping: - cpu_block = mapping[gpu_block] - cpu_block.ref_count += 1 - else: - cpu_block = self.cpu_allocator.allocate( - gpu_block.block_hash, gpu_block.num_hashed_tokens) - mapping[gpu_block] = cpu_block - new_block_table.append(cpu_block) - # Free the GPU block swapped out to CPU. - self.gpu_allocator.free(gpu_block) - - return new_block_table - def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: request_id = seq_group.request_id @@ -617,13 +603,17 @@ def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock] = {} for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): self.block_tables[seq.seq_id] = \ - self._swap_out_block_table(self.block_tables[seq.seq_id], - mapping) + self._swap_block_table(self.block_tables[seq.seq_id], + self.gpu_allocator, + self.cpu_allocator, + mapping) if seq_group.is_encoder_decoder(): self.cross_block_tables[request_id] = \ - self._swap_out_block_table(self.cross_block_tables[request_id], - mapping) + self._swap_block_table(self.cross_block_tables[request_id], + self.gpu_allocator, + self.cpu_allocator, + mapping) return [(cpu_block.block_number, gpu_block.block_number) for cpu_block, gpu_block in mapping.items()] From 90b5a0e5303c937e56c5b8893fc0cbaeb985ac3f Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Sat, 25 May 2024 22:51:09 -0400 Subject: [PATCH 37/47] Refactored block manager + enc dec + unsupported feature checks into utils --- tests/core/block/test_block_manager_v2.py | 6 ++-- tests/core/test_block_manager.py | 6 ++-- tests/core/utils.py | 1 + vllm/core/block/utils.py | 41 +++++++++++++++++++++++ vllm/core/block_manager_v1.py | 34 ++++--------------- vllm/core/block_manager_v2.py | 35 ++++--------------- 6 files changed, 60 insertions(+), 63 deletions(-) create mode 100644 vllm/core/block/utils.py diff --git a/tests/core/block/test_block_manager_v2.py b/tests/core/block/test_block_manager_v2.py index 3aed0c58bd264..f1488916b508a 100644 --- a/tests/core/block/test_block_manager_v2.py +++ b/tests/core/block/test_block_manager_v2.py @@ -1,8 +1,8 @@ import pytest -from vllm.core.block_manager_v2 import (BlockSpaceManagerV2, - STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE, - STR_NOT_IMPL_ENC_DEC_SWA) +from vllm.core.block.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE, + 
STR_NOT_IMPL_ENC_DEC_SWA)
+from vllm.core.block_manager_v2 import BlockSpaceManagerV2
 from vllm.core.interfaces import AllocStatus
 from vllm.sequence import Logprob, SequenceStatus
 from vllm.utils import chunk_list
diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py
index 7e487a021d3c2..2264fe80c9c03 100644
--- a/tests/core/test_block_manager.py
+++ b/tests/core/test_block_manager.py
@@ -6,10 +6,10 @@ from vllm import SamplingParams
 from vllm.block import PhysicalTokenBlock
+from vllm.core.block.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE,
+                                   STR_NOT_IMPL_ENC_DEC_SWA)
 from vllm.core.block_manager_v1 import (BlockSpaceManagerV1,
-                                        UncachedBlockAllocator,
-                                        STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE,
-                                        STR_NOT_IMPL_ENC_DEC_SWA)
+                                        UncachedBlockAllocator)
 from vllm.core.interfaces import AllocStatus
 from vllm.sequence import Logprob, Sequence, SequenceGroup, SequenceStatus
 from vllm.utils import Device
diff --git a/tests/core/utils.py b/tests/core/utils.py
index fb53b6cc5e18b..7ac565c0eccf1 100644
--- a/tests/core/utils.py
+++ b/tests/core/utils.py
@@ -151,5 +151,6 @@ def create_seq_group_encoder_decoder(
                          arrival_time=time.time(),
                          encoder_seq=encoder_seq)
 
+
 def round_up_to_next_block(seq_len: int, block_size: int) -> int:
     return (seq_len + block_size - 1) // block_size
diff --git a/vllm/core/block/utils.py b/vllm/core/block/utils.py
new file mode 100644
index 0000000000000..6599011771cea
--- /dev/null
+++ b/vllm/core/block/utils.py
@@ -0,0 +1,41 @@
+"""Block manager utils."""
+from typing import Union
+
+from vllm.core.block_manager_v1 import BlockSpaceManagerV1
+from vllm.core.block_manager_v2 import BlockSpaceManagerV2
+from vllm.sequence import SequenceGroup
+
+'''
+Exception strings for non-implemented block manager encoder/decoder scenarios
+'''
+
+STR_NOT_IMPL_ENC_DEC_SWA = \
+    "Sliding window attention for encoder/decoder models " + \
+    "is not currently supported."
+
+STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE = \
+    "Prefix caching for encoder/decoder models " + \
+    "is not currently supported."
+
+def check_no_caching_or_swa_for_blckmgr_encdec(
+        block_mgr: Union[BlockSpaceManagerV1,
+                         BlockSpaceManagerV2],
+        seq_group: SequenceGroup) -> None:
+    '''
+    Enforce that prefix caching & sliding-window attention (SWA)
+    are currently unsupported *specifically* for encoder/decoder models.
+
+    Raises NotImplementedError if an unsupported scenario is detected.
+ + Arguments: + + * block_mgr: BlockSpaceManager instance + * seq_group: SequenceGroup passed to block_mgr + ''' + + if seq_group.is_encoder_decoder(): + if block_mgr.block_sliding_window is not None: + raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_SWA) + + if block_mgr.enable_caching: + raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE) \ No newline at end of file diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py index 90a485b39e9d6..fa64b96a5e7dc 100644 --- a/vllm/core/block_manager_v1.py +++ b/vllm/core/block_manager_v1.py @@ -8,6 +8,7 @@ from typing import Set, Tuple from vllm.block import BlockTable, PhysicalTokenBlock +from vllm.core.block.utils import check_no_caching_or_swa_for_blckmgr_encdec from vllm.core.evictor_v1 import EvictionPolicy, Evictor, make_evictor from vllm.core.interfaces import AllocStatus, BlockSpaceManager from vllm.logger import init_logger @@ -15,17 +16,6 @@ from vllm.utils import Device logger = init_logger(__name__) -''' -Exception strings for non-implemented encoder/decoder scenarios -''' - -STR_NOT_IMPL_ENC_DEC_SWA = \ - "Sliding window attention for encoder/decoder models " + \ - "is not currently supported." - -STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE = \ - "Prefix caching for encoder/decoder models " + \ - "is not currently supported." class BlockAllocatorBase(ABC): @@ -279,9 +269,7 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: # FIXME(woosuk): Here we assume that all sequences in the group share # the same prompt. This may not be true for preempted sequences. - is_encoder_decoder = seq_group.is_encoder_decoder() - if self.enable_caching and is_encoder_decoder: - raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE) + check_no_caching_or_swa_for_blckmgr_encdec(self, seq_group) self_num_required_blocks = self._get_seq_num_required_blocks( seq_group.get_seqs(status=SequenceStatus.WAITING)[0]) @@ -291,8 +279,6 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: cross_num_required_blocks if self.block_sliding_window is not None: - if is_encoder_decoder: - raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_SWA) num_required_blocks = min(num_required_blocks, self.block_sliding_window) @@ -334,15 +320,8 @@ def _allocate_sequence(self, \ return block_table def allocate(self, seq_group: SequenceGroup) -> None: - encoder_seq = seq_group.get_encoder_seq() is_encoder_decoder = seq_group.is_encoder_decoder() - - if (self.block_sliding_window is not None) and \ - is_encoder_decoder: - raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_SWA) - - if self.enable_caching and is_encoder_decoder: - raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE) + check_no_caching_or_swa_for_blckmgr_encdec(self, seq_group) # Allocate decoder sequences # @@ -362,8 +341,8 @@ def allocate(self, seq_group: SequenceGroup) -> None: if is_encoder_decoder: # A SequenceGroup has only a single encoder sequence (at most), # thus allocate with a ref count of 1 - block_table = self._allocate_sequence(encoder_seq, 1, - is_encoder_decoder) + block_table = self._allocate_sequence(seq_group.get_encoder_seq(), + 1, is_encoder_decoder) # Assign the cross-attention block table for the SequenceGroup. 
self.cross_block_tables[seq_group.request_id] = block_table @@ -542,8 +521,7 @@ def can_swap_in(self, return AllocStatus.LATER def _swap_block_table( - self, block_table: BlockTable, - src_allocator: BlockAllocatorBase, + self, block_table: BlockTable, src_allocator: BlockAllocatorBase, dest_allocator: BlockAllocatorBase, mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock]) -> BlockTable: diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 6113561032dd1..246ab9c297c5b 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -5,22 +5,11 @@ from vllm.core.block.block_table import BlockTable from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator +from vllm.core.block.utils import check_no_caching_or_swa_for_blckmgr_encdec from vllm.core.interfaces import AllocStatus, BlockSpaceManager from vllm.sequence import Sequence, SequenceGroup, SequenceStatus from vllm.utils import Device -''' -Exception strings for non-implemented encoder/decoder scenarios -''' - -STR_NOT_IMPL_ENC_DEC_SWA = \ - "Sliding window attention for encoder/decoder models " + \ - "is not currently supported." - -STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE = \ - "Prefix caching for encoder/decoder models " + \ - "is not currently supported." - SeqId = int EncoderSeqId = str @@ -104,12 +93,7 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: # FIXME(woosuk): Here we assume that all sequences in the group share # the same prompt. This may not be true for preempted sequences. - is_encoder_decoder = seq_group.is_encoder_decoder() - if self.enable_caching and is_encoder_decoder: - raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE) - - if self.block_sliding_window is not None and is_encoder_decoder: - raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_SWA) + check_no_caching_or_swa_for_blckmgr_encdec(self, seq_group) seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0] num_required_blocks = BlockTable.get_num_required_blocks( @@ -117,7 +101,7 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: block_size=self.block_size, ) - if is_encoder_decoder: + if seq_group.is_encoder_decoder(): num_required_blocks += BlockTable.get_num_required_blocks( seq_group.get_encoder_seq().get_token_ids(), block_size=self.block_size, @@ -151,8 +135,6 @@ def _allocate_sequence(self, seq: Sequence) -> BlockTable: return block_table def allocate(self, seq_group: SequenceGroup) -> None: - encoder_seq = seq_group.get_encoder_seq() - is_encoder_decoder = seq_group.is_encoder_decoder() # Allocate self-attention block tables for decoder sequences waiting_seqs = seq_group.get_seqs(status=SequenceStatus.WAITING) @@ -179,15 +161,10 @@ def allocate(self, seq_group: SequenceGroup) -> None: not in self.cross_block_tables), \ "block table already exists" - if (self.block_sliding_window is not None) and \ - is_encoder_decoder: - raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_SWA) - - if self.enable_caching and is_encoder_decoder: - raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE) + check_no_caching_or_swa_for_blckmgr_encdec(self, seq_group) - if is_encoder_decoder: - block_table = self._allocate_sequence(encoder_seq) + if seq_group.is_encoder_decoder(): + block_table = self._allocate_sequence(seq_group.get_encoder_seq()) self.cross_block_tables[request_id] = block_table def can_append_slots(self, seq_group: SequenceGroup, From 9ee2582172b2b273ede9cb0e3ced9d9f197ecc0b Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Sat, 25 May 2024 22:57:02 -0400 
Subject: [PATCH 38/47] removed circular import --- vllm/core/block/utils.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/vllm/core/block/utils.py b/vllm/core/block/utils.py index 6599011771cea..14b99496b12dc 100644 --- a/vllm/core/block/utils.py +++ b/vllm/core/block/utils.py @@ -1,10 +1,5 @@ """Block manager utils.""" -from typing import Union - -from vllm.core.block_manager_v1 import BlockSpaceManagerV1 -from vllm.core.block_manager_v2 import BlockSpaceManagerV2 from vllm.sequence import SequenceGroup - ''' Exception strings for non-implemented block manager encoder/decoder scenarios ''' @@ -17,10 +12,9 @@ "Prefix caching for encoder/decoder models " + \ "is not currently supported." + def check_no_caching_or_swa_for_blckmgr_encdec( - block_mgr: Union[BlockSpaceManagerV1, - BlockSpaceManagerV2], - seq_group: SequenceGroup) -> None: + block_mgr, seq_group: SequenceGroup) -> None: ''' Enforce that prefix caching & sliding-window attention (SWA) are currently unsupported *specifically* for encoder/decoder models. @@ -38,4 +32,4 @@ def check_no_caching_or_swa_for_blckmgr_encdec( raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_SWA) if block_mgr.enable_caching: - raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE) \ No newline at end of file + raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE) From 5d0ac231b751466771f25e9275acede785bf4344 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Sat, 25 May 2024 22:58:09 -0400 Subject: [PATCH 39/47] apparently isort has to run last? --- vllm/core/block/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/core/block/utils.py b/vllm/core/block/utils.py index 14b99496b12dc..4113f7e52b84f 100644 --- a/vllm/core/block/utils.py +++ b/vllm/core/block/utils.py @@ -1,5 +1,6 @@ """Block manager utils.""" from vllm.sequence import SequenceGroup + ''' Exception strings for non-implemented block manager encoder/decoder scenarios ''' From 1bcc949c7c4634da50d80d7bc4b47185e6ac6f18 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Sun, 26 May 2024 12:20:12 -0400 Subject: [PATCH 40/47] slight name change --- vllm/core/block/utils.py | 2 +- vllm/core/block_manager_v1.py | 6 +++--- vllm/core/block_manager_v2.py | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/vllm/core/block/utils.py b/vllm/core/block/utils.py index 4113f7e52b84f..3dee7ff16dd84 100644 --- a/vllm/core/block/utils.py +++ b/vllm/core/block/utils.py @@ -14,7 +14,7 @@ "is not currently supported." -def check_no_caching_or_swa_for_blckmgr_encdec( +def check_no_caching_or_swa_for_blockmgr_encdec( block_mgr, seq_group: SequenceGroup) -> None: ''' Enforce that prefix caching & sliding-window attention (SWA) diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py index fa64b96a5e7dc..201cba309f6ef 100644 --- a/vllm/core/block_manager_v1.py +++ b/vllm/core/block_manager_v1.py @@ -8,7 +8,7 @@ from typing import Set, Tuple from vllm.block import BlockTable, PhysicalTokenBlock -from vllm.core.block.utils import check_no_caching_or_swa_for_blckmgr_encdec +from vllm.core.block.utils import check_no_caching_or_swa_for_blockmgr_encdec from vllm.core.evictor_v1 import EvictionPolicy, Evictor, make_evictor from vllm.core.interfaces import AllocStatus, BlockSpaceManager from vllm.logger import init_logger @@ -269,7 +269,7 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: # FIXME(woosuk): Here we assume that all sequences in the group share # the same prompt. 
This may not be true for preempted sequences. - check_no_caching_or_swa_for_blckmgr_encdec(self, seq_group) + check_no_caching_or_swa_for_blockmgr_encdec(self, seq_group) self_num_required_blocks = self._get_seq_num_required_blocks( seq_group.get_seqs(status=SequenceStatus.WAITING)[0]) @@ -321,7 +321,7 @@ def _allocate_sequence(self, \ def allocate(self, seq_group: SequenceGroup) -> None: is_encoder_decoder = seq_group.is_encoder_decoder() - check_no_caching_or_swa_for_blckmgr_encdec(self, seq_group) + check_no_caching_or_swa_for_blockmgr_encdec(self, seq_group) # Allocate decoder sequences # diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 246ab9c297c5b..6185a65983d3a 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -5,7 +5,7 @@ from vllm.core.block.block_table import BlockTable from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator -from vllm.core.block.utils import check_no_caching_or_swa_for_blckmgr_encdec +from vllm.core.block.utils import check_no_caching_or_swa_for_blockmgr_encdec from vllm.core.interfaces import AllocStatus, BlockSpaceManager from vllm.sequence import Sequence, SequenceGroup, SequenceStatus from vllm.utils import Device @@ -93,7 +93,7 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: # FIXME(woosuk): Here we assume that all sequences in the group share # the same prompt. This may not be true for preempted sequences. - check_no_caching_or_swa_for_blckmgr_encdec(self, seq_group) + check_no_caching_or_swa_for_blockmgr_encdec(self, seq_group) seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0] num_required_blocks = BlockTable.get_num_required_blocks( @@ -161,7 +161,7 @@ def allocate(self, seq_group: SequenceGroup) -> None: not in self.cross_block_tables), \ "block table already exists" - check_no_caching_or_swa_for_blckmgr_encdec(self, seq_group) + check_no_caching_or_swa_for_blockmgr_encdec(self, seq_group) if seq_group.is_encoder_decoder(): block_table = self._allocate_sequence(seq_group.get_encoder_seq()) From 1bece71b45331ed5e371a3842e5a1bba5fe7a160 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 28 May 2024 12:27:47 -0400 Subject: [PATCH 41/47] wip merge --- vllm/core/block_manager_v2.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index b19f4b184db94..cad42ab3c1ba2 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -138,7 +138,6 @@ def _allocate_sequence(self, seq: Sequence) -> BlockTable: block_allocator=self.block_allocator, max_block_sliding_window=self.max_block_sliding_window, ) - assert self.block_sliding_window is None block_table.allocate(seq.get_token_ids()) return block_table From 1d882ca8d5825ab68988740e81796abadd083b06 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 28 May 2024 12:38:45 -0400 Subject: [PATCH 42/47] fixed utils to correctly handle encoder/decoder unsupported scenarios --- vllm/core/block/utils.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/vllm/core/block/utils.py b/vllm/core/block/utils.py index 3dee7ff16dd84..dd9345ab52d40 100644 --- a/vllm/core/block/utils.py +++ b/vllm/core/block/utils.py @@ -13,6 +13,26 @@ "Prefix caching for encoder/decoder models " + \ "is not currently supported." +def _get_block_mgr_sliding_window_attr(block_mgr): + ''' + BlockManagerV1 and BlockManagerV2 have slightly different + members related to sliding window attention (SWA). 
This + function extracts the appropriate member to use for determining + whether SWA is enabled. + + Arguments: + + * block_mgr: BlockManagerV1 or BlockManagerV2 instance + ''' + + if hasattr(block_mgr, 'block_sliding_window'): + return block_mgr.block_sliding_window + if hasattr(block_mgr, 'max_block_sliding_window'): + return block_mgr.max_block_sliding_window + + raise AttributeError("Block manager instance has neither " + \ + "block_sliding_window nor " + \ + "max_block_sliding_window attributes.") def check_no_caching_or_swa_for_blockmgr_encdec( block_mgr, seq_group: SequenceGroup) -> None: @@ -29,7 +49,7 @@ def check_no_caching_or_swa_for_blockmgr_encdec( ''' if seq_group.is_encoder_decoder(): - if block_mgr.block_sliding_window is not None: + if _get_block_mgr_sliding_window_attr(block_mgr) is not None: raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_SWA) if block_mgr.enable_caching: From dfd94692e0b35343e64aace3cd4a496564be5809 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 28 May 2024 12:39:17 -0400 Subject: [PATCH 43/47] formatting --- vllm/core/block/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/core/block/utils.py b/vllm/core/block/utils.py index dd9345ab52d40..c582ab270473c 100644 --- a/vllm/core/block/utils.py +++ b/vllm/core/block/utils.py @@ -13,6 +13,7 @@ "Prefix caching for encoder/decoder models " + \ "is not currently supported." + def _get_block_mgr_sliding_window_attr(block_mgr): ''' BlockManagerV1 and BlockManagerV2 have slightly different @@ -34,6 +35,7 @@ def _get_block_mgr_sliding_window_attr(block_mgr): "block_sliding_window nor " + \ "max_block_sliding_window attributes.") + def check_no_caching_or_swa_for_blockmgr_encdec( block_mgr, seq_group: SequenceGroup) -> None: ''' From 611df433882c1e10235084426d63fd817466dd19 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 28 May 2024 22:27:41 -0400 Subject: [PATCH 44/47] yapf fix --- vllm/core/block/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/core/block/utils.py b/vllm/core/block/utils.py index c582ab270473c..4da5a965616ac 100644 --- a/vllm/core/block/utils.py +++ b/vllm/core/block/utils.py @@ -1,6 +1,5 @@ """Block manager utils.""" from vllm.sequence import SequenceGroup - ''' Exception strings for non-implemented block manager encoder/decoder scenarios ''' From 8ee49dde309a93fd309f0117f74cde4949e958e4 Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 28 May 2024 22:30:12 -0400 Subject: [PATCH 45/47] yapf fix --- vllm/core/block/utils.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/core/block/utils.py b/vllm/core/block/utils.py index 4da5a965616ac..2c412a8f472e0 100644 --- a/vllm/core/block/utils.py +++ b/vllm/core/block/utils.py @@ -1,8 +1,7 @@ """Block manager utils.""" from vllm.sequence import SequenceGroup -''' -Exception strings for non-implemented block manager encoder/decoder scenarios -''' + +# Exception strings for non-implemented block manager enc/dec scenarios STR_NOT_IMPL_ENC_DEC_SWA = \ "Sliding window attention for encoder/decoder models " + \ From 039c25eb6661f2aa89b4239235451f2c6f61d63d Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 28 May 2024 23:03:44 -0400 Subject: [PATCH 46/47] upstream merge --- tests/core/utils.py | 36 +++++++++++++++++++++++++++--------- vllm/core/block/utils.py | 1 + 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/tests/core/utils.py b/tests/core/utils.py index 1ccc5c3cc0a8e..cd2045b8a1889 100644 --- a/tests/core/utils.py +++ b/tests/core/utils.py @@ -55,12 
+55,24 @@ def create_dummy_prompt_encoder_decoder( # and prompt "0 ... block_size". decoder_prompt_tokens = list(range(decoder_prompt_length)) decoder_prompt_str = " ".join([str(t) for t in decoder_prompt_tokens]) - decoder_prompt = Sequence(int(request_id), decoder_prompt_str, - decoder_prompt_tokens, block_size) + + decoder_prompt = Sequence(int(request_id), + inputs={ + "prompt": decoder_prompt_str, + "prompt_token_ids": decoder_prompt_tokens, + "multi_modal_data": None, + }, + block_size=block_size) + encoder_prompt_tokens = list(reversed(list(range(encoder_prompt_length)))) encoder_prompt_str = " ".join([str(t) for t in encoder_prompt_tokens]) - encoder_prompt = Sequence(int(request_id), encoder_prompt_str, - encoder_prompt_tokens, block_size) + encoder_prompt = Sequence(int(request_id), + inputs={ + "prompt": encoder_prompt_str, + "prompt_token_ids": encoder_prompt_tokens, + "multi_modal_data": None, + }, + block_size=block_size) seq_group = SequenceGroup(request_id=request_id, seqs=[decoder_prompt], sampling_params=SamplingParams( @@ -134,8 +146,11 @@ def create_seq_group_encoder_decoder( for seq_id_offset, output_len in enumerate(seq_output_lens): seq = Sequence( seq_id=seq_id_start + seq_id_offset, - prompt="", - prompt_token_ids=prompt_token_ids, + inputs={ + "prompt": "", + "prompt_token_ids": prompt_token_ids, + "multi_modal_data": None, + }, block_size=16, ) @@ -149,8 +164,11 @@ def create_seq_group_encoder_decoder( # Encoder sequence encoder_seq = Sequence( seq_id=seq_id_start + len(seq_output_lens), - prompt="", - prompt_token_ids=prompt_token_ids, + inputs={ + "prompt": "", + "prompt_token_ids": prompt_token_ids, + "multi_modal_data": None, + }, block_size=16, ) @@ -162,4 +180,4 @@ def create_seq_group_encoder_decoder( def round_up_to_next_block(seq_len: int, block_size: int) -> int: - return (seq_len + block_size - 1) // block_size + return (seq_len + block_size - 1) // block_size \ No newline at end of file diff --git a/vllm/core/block/utils.py b/vllm/core/block/utils.py index 4da5a965616ac..c582ab270473c 100644 --- a/vllm/core/block/utils.py +++ b/vllm/core/block/utils.py @@ -1,5 +1,6 @@ """Block manager utils.""" from vllm.sequence import SequenceGroup + ''' Exception strings for non-implemented block manager encoder/decoder scenarios ''' From 8e9ef5bb5ae7bc3ece7ae527e591df093ff7f31e Mon Sep 17 00:00:00 2001 From: Andrew Feldman Date: Tue, 28 May 2024 23:06:08 -0400 Subject: [PATCH 47/47] fix formatting issue --- vllm/core/block/utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vllm/core/block/utils.py b/vllm/core/block/utils.py index c582ab270473c..372bfb5ed2f9e 100644 --- a/vllm/core/block/utils.py +++ b/vllm/core/block/utils.py @@ -1,9 +1,7 @@ """Block manager utils.""" from vllm.sequence import SequenceGroup -''' -Exception strings for non-implemented block manager encoder/decoder scenarios -''' +# Exception strings for non-implemented block manager encoder/decoder scenarios STR_NOT_IMPL_ENC_DEC_SWA = \ "Sliding window attention for encoder/decoder models " + \