[Core] [Bugfix] Refactor block manager subsystem for better testability #3492

Merged 96 commits on Mar 28, 2024.
Commits
85fb179  logical block test (cadedaniel, Mar 11, 2024)
0f19984  sequence (cadedaniel, Mar 11, 2024)
0306a8c  notes (cadedaniel, Mar 11, 2024)
de14e54  wip (cadedaniel, Mar 11, 2024)
7d66c4a  prefix caching bug when prompt len < block size (cadedaniel, Mar 11, 2024)
e03e057  wip (cadedaniel, Mar 19, 2024)
c162283  refcount (cadedaniel, Mar 19, 2024)
99a5b59  wip (cadedaniel, Mar 19, 2024)
1fe4cbb  wip (cadedaniel, Mar 19, 2024)
5e70924  wip (cadedaniel, Mar 19, 2024)
ea94ecc  wip (cadedaniel, Mar 19, 2024)
376cdb6  wip (cadedaniel, Mar 19, 2024)
d7e122e  wip (cadedaniel, Mar 19, 2024)
2b821dc  wip (cadedaniel, Mar 19, 2024)
0a6fbd2  wip (cadedaniel, Mar 19, 2024)
658b4c5  wip (cadedaniel, Mar 19, 2024)
e976541  wip (cadedaniel, Mar 19, 2024)
085f419  content hash (cadedaniel, Mar 19, 2024)
6fc22ef  wip (cadedaniel, Mar 19, 2024)
cbea543  unused cached blocks (cadedaniel, Mar 19, 2024)
029d39a  wip (cadedaniel, Mar 19, 2024)
d2ca90b  wip (cadedaniel, Mar 21, 2024)
1eee08c  break files (cadedaniel, Mar 22, 2024)
ebe6ccf  wip (cadedaniel, Mar 23, 2024)
9dfc821  wip (cadedaniel, Mar 23, 2024)
619fb0d  wip (cadedaniel, Mar 23, 2024)
ea49f23  device aware (cadedaniel, Mar 23, 2024)
1252223  wip (cadedaniel, Mar 23, 2024)
c1e1b2f  wip (cadedaniel, Mar 23, 2024)
d0b4f20  wip0 (cadedaniel, Mar 23, 2024)
a3cffb9  wip (cadedaniel, Mar 23, 2024)
cd75992  wip (cadedaniel, Mar 24, 2024)
1d25cf2  wip (cadedaniel, Mar 24, 2024)
335a218  wip (cadedaniel, Mar 24, 2024)
960da58  wip (cadedaniel, Mar 25, 2024)
63f5dd5  fork (cadedaniel, Mar 25, 2024)
d5ebfd2  fork (cadedaniel, Mar 25, 2024)
c127343  wip (cadedaniel, Mar 25, 2024)
a20051a  remove (cadedaniel, Mar 25, 2024)
02e4154  simple generation works (cadedaniel, Mar 25, 2024)
6f88528  interfaces (cadedaniel, Mar 25, 2024)
70c3fff  wip (cadedaniel, Mar 25, 2024)
5867272  wip (cadedaniel, Mar 25, 2024)
65cfac8  wip (cadedaniel, Mar 25, 2024)
7d059a6  lint (cadedaniel, Mar 25, 2024)
c286632  lint2 (cadedaniel, Mar 25, 2024)
46bbd14  lint3 (cadedaniel, Mar 25, 2024)
2e794de  lint4 (cadedaniel, Mar 25, 2024)
2416c22  lint5 (cadedaniel, Mar 25, 2024)
558ad36  v2 config (cadedaniel, Mar 25, 2024)
3fa5b2b  lint (cadedaniel, Mar 25, 2024)
de2a5c9  Merge remote-tracking branch 'upstream/main' into block-manager-tests (cadedaniel, Mar 25, 2024)
0464d48  clean (cadedaniel, Mar 25, 2024)
6ac0318  wip (cadedaniel, Mar 25, 2024)
9fb053c  wip (cadedaniel, Mar 25, 2024)
7f33d2f  wip (cadedaniel, Mar 25, 2024)
9455a46  cow in naive (cadedaniel, Mar 25, 2024)
2f9ebac  wip (cadedaniel, Mar 25, 2024)
26b6ce7  fix cow bug (cadedaniel, Mar 25, 2024)
548aec8  cow test (cadedaniel, Mar 25, 2024)
f0025ab  wip (cadedaniel, Mar 25, 2024)
3be4040  wip (cadedaniel, Mar 25, 2024)
62a616b  wip (cadedaniel, Mar 25, 2024)
9fd6c08  wip prefix cow (cadedaniel, Mar 26, 2024)
6ded181  wip (cadedaniel, Mar 26, 2024)
95b65f1  wip (cadedaniel, Mar 26, 2024)
b03693c  wip (cadedaniel, Mar 26, 2024)
d582cb6  wip (cadedaniel, Mar 26, 2024)
70b1f60  lint (cadedaniel, Mar 26, 2024)
3ce9347  lint2 (cadedaniel, Mar 26, 2024)
ed6c2e6  Merge remote-tracking branch 'upstream/main' into block-manager-tests (cadedaniel, Mar 26, 2024)
640d7e5  isort (cadedaniel, Mar 26, 2024)
0f0daf8  fix (cadedaniel, Mar 26, 2024)
ba8acbd  wip (cadedaniel, Mar 27, 2024)
b51287c  adding to entrypoint tests (cadedaniel, Mar 27, 2024)
1f3483f  try (cadedaniel, Mar 27, 2024)
4ebc0c0  docstrings! (cadedaniel, Mar 27, 2024)
1f09fd0  wip (cadedaniel, Mar 27, 2024)
80cdc3c  more docstring / format (cadedaniel, Mar 27, 2024)
36bd93f  entrypoints (cadedaniel, Mar 27, 2024)
79dac79  model correctness test (cadedaniel, Mar 27, 2024)
b392a5d  remove (cadedaniel, Mar 27, 2024)
8d42bd7  lint (cadedaniel, Mar 27, 2024)
1b3fe9f  note (cadedaniel, Mar 27, 2024)
9680dc8  remove (cadedaniel, Mar 27, 2024)
dd4bcee  clean (cadedaniel, Mar 27, 2024)
9000b41  name (cadedaniel, Mar 27, 2024)
a2897b0  rename (cadedaniel, Mar 27, 2024)
bead69a  wip (cadedaniel, Mar 27, 2024)
132e7a3  clean (cadedaniel, Mar 27, 2024)
0d75e12  comment (cadedaniel, Mar 27, 2024)
70d1812  lint (cadedaniel, Mar 27, 2024)
321dc16  comment (cadedaniel, Mar 27, 2024)
887496b  fix test (cadedaniel, Mar 27, 2024)
f0b1bf1  empty (cadedaniel, Mar 27, 2024)
5b86297  pr feedback (cadedaniel, Mar 28, 2024)
Empty file added: tests/core/block/__init__.py
59 changes: 59 additions & 0 deletions tests/core/block/e2e/conftest.py
@@ -0,0 +1,59 @@
import contextlib
import gc

import pytest
import ray
import torch

from vllm import LLM
from vllm.model_executor.parallel_utils.parallel_state import (
    destroy_model_parallel)
from vllm.model_executor.utils import set_random_seed


def cleanup():
    destroy_model_parallel()
    with contextlib.suppress(AssertionError):
        torch.distributed.destroy_process_group()
    gc.collect()
    torch.cuda.empty_cache()
    ray.shutdown()


@pytest.fixture
def baseline_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
                           baseline_llm_kwargs, seed):
    return create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
                                baseline_llm_kwargs, seed)


@pytest.fixture
def test_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
                       test_llm_kwargs, seed):
    return create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
                                test_llm_kwargs, seed)


def create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
                         distinct_llm_kwargs, seed):
    kwargs = {
        **common_llm_kwargs,
        **per_test_common_llm_kwargs,
        **distinct_llm_kwargs,
    }

    def generator_inner():
        llm = LLM(**kwargs)

        set_random_seed(seed)

        yield llm
        del llm
        cleanup()

    def generator_outer():
        for llm in generator_inner():
            yield llm
            del llm

[Review comment, Collaborator] why do we need another level of wrapper? would it work without it? if not please comment why

[Review comment, Collaborator] I see the usage of the llm generator below but still confused since we are only yielding one llm instance

[Reply, Collaborator Author] oh good catch, not necessary

    return generator_outer()
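
Following up on the review thread above: a minimal sketch, assuming the extra generator_outer wrapper really is unnecessary, of how create_llm_generator could yield the single LLM instance directly. This is illustrative only and not part of the diff; LLM, set_random_seed, and cleanup are the names already used in this file.

def create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
                         distinct_llm_kwargs, seed):
    kwargs = {
        **common_llm_kwargs,
        **per_test_common_llm_kwargs,
        **distinct_llm_kwargs,
    }

    def generator():
        # Exactly one LLM instance is yielded per test; cleanup runs once the
        # consuming test's for-loop finishes iterating over the generator.
        llm = LLM(**kwargs)
        set_random_seed(seed)
        yield llm
        del llm
        cleanup()

    return generator()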
86 changes: 86 additions & 0 deletions tests/core/block/e2e/test_correctness.py
@@ -0,0 +1,86 @@
from itertools import cycle

import pytest

from vllm import SamplingParams


@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        # Use a small model for a fast test.
        "model": "facebook/opt-125m",

        # skip cuda graph creation for fast test.
        "enforce_eager": True,

        # Allow only 5 sequences of ~1024 tokens in worst case.
        "block_size": 16,
        "forced_num_gpu_blocks": 5 * (64 + 1),
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{
    "use_v2_block_manager": False
}])
@pytest.mark.parametrize("test_llm_kwargs", [{"use_v2_block_manager": True}])
@pytest.mark.parametrize("batch_size", [10])
@pytest.mark.parametrize("seed", [1])
def test_v1_v2_greedy_equality_with_preemption(baseline_llm_generator,
                                               test_llm_generator, batch_size):
"""Verify block manager v2 produces same outputs as block manager v1, even
when there is preemption.

This constructs two LLM, each with limited number of GPU blocks. The limit
is decided such that as the sequences in the batch grow, sequences must be
preempted and removed from cache.

If the output token ids are equivalent, then we have confidence that the KV
cache is not corrupted in the v2 block manager.

NOTE: We want a significant number of generated tokens so that any incorrect
KV mapping has time to build up error.
"""
    output_len = 1024
    temperature = 0.0

    # We want to ensure equality even with preemption.
    # We force the total block size to be 1 + cdiv(output_len, block_size)
    # so that only one sequence can fit at a time (once the sequences grow).

    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]

    prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]

    sampling_params = SamplingParams(
        max_tokens=output_len,
        ignore_eos=True,
        temperature=temperature,
    )

    print('Getting token ids from block manager v1')
    baseline_token_ids = get_token_ids_from_llm_generator(
        baseline_llm_generator, prompts, sampling_params)

    print('Getting token ids from block manager v2')
    test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
                                                      prompts, sampling_params)

    for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
                                                    test_token_ids):
        assert expected_token_ids == actual_token_ids

    assert baseline_token_ids == test_token_ids


def get_token_ids_from_llm_generator(llm_generator, prompts, sampling_params):
    for llm in llm_generator:
        outputs = llm.generate(prompts, sampling_params, use_tqdm=True)
        token_ids = [output.outputs[0].token_ids for output in outputs]
        del llm

    return token_ids
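
For reference, a small sketch of the arithmetic behind forced_num_gpu_blocks = 5 * (64 + 1) in the parametrization above, assuming the intent stated in the common_llm_kwargs comment (about 5 sequences of ~1024 tokens each). The cdiv helper below is a hypothetical stand-in for ceiling division, used only for this illustration.

def cdiv(a: int, b: int) -> int:
    # Ceiling division; hypothetical helper for this worked example only.
    return -(a // -b)

block_size = 16
output_len = 1024

# Blocks needed per sequence once it has generated output_len tokens:
# one block for the short prompt plus cdiv(1024, 16) = 64 output blocks.
blocks_per_seq = 1 + cdiv(output_len, block_size)   # 65

# Capping the GPU at 5 * 65 = 325 blocks means only ~5 fully grown sequences
# fit at once, so the batch of 10 sequences must trigger preemption.
forced_num_gpu_blocks = 5 * blocks_per_seq          # 325 == 5 * (64 + 1)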
50 changes: 50 additions & 0 deletions tests/core/block/test_block_space_manager.py
@@ -0,0 +1,50 @@
import pytest

from vllm.core.block_manager_v2 import BlockSpaceManagerV2
from vllm.core.interfaces import AllocStatus

from ..utils import create_seq_group


@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("num_gpu_blocks", [8, 40, 80])
@pytest.mark.parametrize("num_seqs_per_group", [1, 4])
@pytest.mark.parametrize("watermark", [0.0, 0.5])
def test_can_allocate_seq_group(block_size: int, num_seqs_per_group: int,
                                num_gpu_blocks: int, watermark: float):
    block_manager = BlockSpaceManagerV2(
        block_size=block_size,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=1024,
        watermark=watermark,
    )
    num_watermark_blocks = int(watermark * num_gpu_blocks)

    num_output_blocks_per_seq = 1

    # NOTE: This should be num_output_blocks_per_seq * num_seqs_per_group, but
    # the current implementation assumes all seqs are new prompts / don't have
    # different output lens.
    num_output_blocks = num_output_blocks_per_seq

    for num_prompt_blocks in range(1, num_gpu_blocks - num_output_blocks):
        seq_group = create_seq_group(
            seq_prompt_lens=block_size * num_prompt_blocks,
            seq_output_lens=[
                block_size * num_output_blocks_per_seq
                for _ in range(num_seqs_per_group)
            ],
        )

        assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks

        can_allocate_result = block_manager.can_allocate(seq_group)

        num_required_blocks = num_prompt_blocks + num_output_blocks

        if num_gpu_blocks - num_required_blocks < num_watermark_blocks:
            assert can_allocate_result == AllocStatus.NEVER
        elif num_gpu_blocks >= num_required_blocks:
            assert can_allocate_result == AllocStatus.OK
        else:
            assert can_allocate_result == AllocStatus.LATER
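
As context for the assertions above, a rough sketch of how a caller might act on the three AllocStatus values this test exercises. This is not vLLM's actual scheduler code; the block_manager.allocate(seq_group) call is assumed for illustration alongside the can_allocate(seq_group) call used in the test.

def try_schedule(block_manager, waiting_seq_groups):
    # Illustrative only: interpret can_allocate() results the way the
    # assertions above expect them to behave.
    scheduled, deferred = [], []
    for seq_group in waiting_seq_groups:
        status = block_manager.can_allocate(seq_group)
        if status == AllocStatus.OK:
            # Enough free blocks remain above the watermark; allocate now.
            block_manager.allocate(seq_group)
            scheduled.append(seq_group)
        elif status == AllocStatus.LATER:
            # Not enough free blocks right now; retry after others finish.
            deferred.append(seq_group)
        else:  # AllocStatus.NEVER
            # The request can never fit in this GPU's KV cache; reject it.
            raise ValueError("sequence group can never be allocated")
    return scheduled, deferred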