[Core][WIP] Add Automatic Prefix Caching to PrefixCachingBlockAllocator #4146

Closed
128 changes: 127 additions & 1 deletion tests/core/block/test_prefix_caching_block.py
@@ -380,5 +380,131 @@ def create_immutable_chain(
prev_block = allocator.allocate_immutable(
prev_block=prev_block, token_ids=block_token_ids)
blocks.append(prev_block)

return blocks

@staticmethod
@pytest.mark.parametrize("num_blocks", [1024])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("seed", list(range(20)))
def test_eviction_order(num_blocks: int, block_size: int, seed: int):
"""Verify sharing occurs by allocating two sequences that share prefixes
and incrementally freeing blocks.
"""
random.seed(seed)
allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
block_size=block_size)
num_blocks_to_consume = num_blocks + 1

token_ids = list(range(num_blocks_to_consume * block_size))

# First chain takes the first block
first_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
block_size=block_size,
token_ids=token_ids[:block_size],
allocator=allocator,
)

# There should only be one block allocated at this point
assert allocator.get_num_free_blocks() == (num_blocks - 1)

# Set the last accessed time of the first block to 1
allocator.access_all_blocks(1)

# Second chain takes the rest of the blocks
second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
block_size=block_size,
token_ids=token_ids[block_size:-block_size],
allocator=allocator,
)

# There shouldn't be any blocks left at this point
assert allocator.get_num_free_blocks() == (0)

# Free the one block in the first chain
assert len(first_chain) == 1
first_block_id = first_chain[0].block_id
allocator.free(first_chain[0])

# Set the last accessed time on all of the blocks in the second chain
# to 2
allocator.access_all_blocks(2)

# Free each block in the second chain.
for i, block in enumerate(second_chain):
allocator.free(block)

# Allocate a new block and check that it's the least recently used block
# from the first chain.
new_block = TestPrefixCachingBlockAllocator.create_immutable_chain(
block_size=block_size,
token_ids=token_ids[-block_size:],
allocator=allocator,
)

assert new_block[0].block_id == first_block_id

# Test case where two last accessed times are equal
@staticmethod
@pytest.mark.parametrize("num_blocks", [1024])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("seed", list(range(20)))
def test_eviction_order_num_tokens(num_blocks: int, block_size: int,
seed: int):
"""Verify sharing occurs by allocating two sequences that share prefixes
and incrementally freeing blocks.
"""
random.seed(seed)
allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
block_size=block_size)
num_blocks_to_consume = num_blocks + 1

token_ids = list(range(num_blocks_to_consume * block_size))

num_blocks_in_first_chain = 2
num_tokens_in_first_chain = block_size * num_blocks_in_first_chain
# First chain takes the first two blocks
first_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
block_size=block_size,
token_ids=token_ids[:num_tokens_in_first_chain],
allocator=allocator,
)
# Only the blocks in the first chain should be allocated at this point
assert allocator.get_num_free_blocks() == (num_blocks -
num_blocks_in_first_chain)

# Set the last accessed time of the first chain's blocks to 1
allocator.access_all_blocks(1)

# Second chain takes the rest of the blocks
second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
block_size=block_size,
token_ids=token_ids[num_tokens_in_first_chain:-block_size],
allocator=allocator,
)

# There shouldn't be any blocks left at this point
assert allocator.get_num_free_blocks() == (0)

assert len(first_chain) == num_blocks_in_first_chain
last_block_id = first_chain[-1].block_id
# Free each block in the first chain.
for i, block in enumerate(first_chain):
allocator.free(block)

# Set the last accessed time on all of the blocks in the second chain
# to 2
allocator.access_all_blocks(2)

# Free each block in the second chain.
for i, block in enumerate(second_chain):
allocator.free(block)

# Allocate a new block and check that it reuses the block from the first
# chain with the most tokens, since the last accessed times are equal.
new_block = TestPrefixCachingBlockAllocator.create_immutable_chain(
block_size=block_size,
token_ids=token_ids[-block_size:],
allocator=allocator,
)

assert new_block[0].block_id == last_block_id
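
Taken together, the two tests above pin down the eviction order that automatic prefix caching relies on: among freed blocks, the one with the oldest last-accessed time is reclaimed first, and when timestamps tie, the block holding more tokens goes first. Below is a minimal sketch of such a policy, using illustrative names (EvictableBlock, LRUEvictorSketch) rather than the PR's actual classes:

from dataclasses import dataclass
from typing import Dict


@dataclass
class EvictableBlock:
    block_id: int
    last_accessed: float
    num_tokens: int


class LRUEvictorSketch:
    """Sketch of an LRU evictor with a token-count tie-break (assumption)."""

    def __init__(self) -> None:
        self._free_blocks: Dict[int, EvictableBlock] = {}

    def add(self, block: EvictableBlock) -> None:
        # A block becomes an eviction candidate once it is freed.
        self._free_blocks[block.block_id] = block

    def evict(self) -> int:
        # Oldest last_accessed first; on a tie, prefer the block with more
        # tokens (mirroring test_eviction_order_num_tokens above).
        victim = min(self._free_blocks.values(),
                     key=lambda b: (b.last_accessed, -b.num_tokens))
        del self._free_blocks[victim.block_id]
        return victim.block_id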
2 changes: 1 addition & 1 deletion tests/models/test_models.py
@@ -38,7 +38,7 @@ def test_models(
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
del hf_model

vllm_model = vllm_runner(model, dtype=dtype)
vllm_model = vllm_runner(model, dtype=dtype, enable_prefix_caching=True)
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
del vllm_model

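The change above runs the model-correctness test with automatic prefix caching turned on. As a hedged usage sketch (not part of this diff), the same flag can be passed when constructing the engine directly; the model name and prompts below are illustrative:

from vllm import LLM, SamplingParams

# enable_prefix_caching switches the block manager to the prefix-caching
# allocator exercised by the tests above.
llm = LLM(model="facebook/opt-125m", enable_prefix_caching=True)
params = SamplingParams(temperature=0.0, max_tokens=32)

# Prompts that share a long common prefix can reuse cached KV blocks.
shared = "You are a helpful assistant. Answer concisely.\n\n"
outputs = llm.generate([shared + "What is a KV cache?",
                        shared + "What is paged attention?"], params)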
5 changes: 5 additions & 0 deletions vllm/core/block/cpu_gpu_block_allocator.py
@@ -195,6 +195,11 @@ def mark_blocks_as_computed(self) -> None:
device = Device.GPU
return self._allocators[device].mark_blocks_as_computed()

def access_all_blocks_in_seq(self, seq: List[int], now: float) -> None:
# Prefix caching only supported on GPU.
device = Device.GPU
return self._allocators[device].access_all_blocks_in_seq(seq, now)

def get_common_computed_block_ids(
self, seq_block_ids: List[List[int]]) -> List[int]:
# Prefix caching only supported on GPU.
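The new access_all_blocks_in_seq method simply forwards last-access updates to the GPU allocator, since prefix caching is only supported on GPU here. A hedged sketch of how a caller such as the block manager might use it once per scheduling step (the caller and argument names are assumptions, not the PR's code):

import time
from typing import List


def mark_sequence_accessed(block_allocator, seq_block_ids: List[int]) -> None:
    # Stamp every block of the running sequence with the current wall-clock
    # time so the LRU evictor treats them as recently used.
    now = time.time()
    block_allocator.access_all_blocks_in_seq(seq_block_ids, now)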