forked from mesolitica/vllm-whisper
[Core][Bugfix] Refactor block manager for better testability (vllm-pro…
1 parent 227581b · commit d522b98
Showing 30 changed files with 3,285 additions and 77 deletions.
Empty file.
@@ -0,0 +1,56 @@
import contextlib
import gc

import pytest
import ray
import torch

from vllm import LLM
from vllm.model_executor.parallel_utils.parallel_state import (
    destroy_model_parallel)
from vllm.model_executor.utils import set_random_seed


def cleanup():
    # Tear down distributed state and free GPU memory so that each test
    # starts from a clean slate.
    destroy_model_parallel()
    with contextlib.suppress(AssertionError):
        torch.distributed.destroy_process_group()
    gc.collect()
    torch.cuda.empty_cache()
    ray.shutdown()


@pytest.fixture
def baseline_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
                           baseline_llm_kwargs, seed):
    return create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
                                baseline_llm_kwargs, seed)


@pytest.fixture
def test_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
                       test_llm_kwargs, seed):
    return create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
                                test_llm_kwargs, seed)


def create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
                         distinct_llm_kwargs, seed):
    # Later kwargs take precedence over earlier ones.
    kwargs = {
        **common_llm_kwargs,
        **per_test_common_llm_kwargs,
        **distinct_llm_kwargs,
    }

    def generator_inner():
        llm = LLM(**kwargs)

        set_random_seed(seed)

        yield llm
        del llm
        cleanup()

    for llm in generator_inner():
        yield llm
        del llm
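A note on how these fixtures are consumed: `common_llm_kwargs`, `baseline_llm_kwargs`, and the other kwargs arguments are not defined as fixtures anywhere; pytest injects them from `@pytest.mark.parametrize` entries of the same name on the consuming test. A minimal sketch of such a test (the name `test_example` and its kwargs are hypothetical; the real consumers follow below):

import pytest


@pytest.mark.parametrize("common_llm_kwargs", [{"model": "facebook/opt-125m"}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("seed", [1])
def test_example(baseline_llm_generator):
    # The generator yields exactly one LLM, then runs cleanup() once the
    # loop body finishes.
    for llm in baseline_llm_generator:
        outputs = llm.generate(["Hello, my name is"], use_tqdm=False)
        assert len(outputs) == 1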
@@ -0,0 +1,86 @@
from itertools import cycle

import pytest

from vllm import SamplingParams


@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        # Use a small model for a fast test.
        "model": "facebook/opt-125m",

        # Skip cuda graph creation for fast test.
        "enforce_eager": True,

        # Allow only 5 sequences of ~1024 tokens in worst case.
        "block_size": 16,
        "forced_num_gpu_blocks": 5 * (64 + 1),
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{
    "use_v2_block_manager": False
}])
@pytest.mark.parametrize("test_llm_kwargs", [{"use_v2_block_manager": True}])
@pytest.mark.parametrize("batch_size", [10])
@pytest.mark.parametrize("seed", [1])
def test_v1_v2_greedy_equality_with_preemption(baseline_llm_generator,
                                               test_llm_generator, batch_size):
    """Verify block manager v2 produces the same outputs as block manager v1,
    even when there is preemption.

    This constructs two LLMs, each with a limited number of GPU blocks. The
    limit is chosen such that, as the sequences in the batch grow, sequences
    must be preempted and removed from the cache.

    If the output token ids are equivalent, then we have confidence that the
    KV cache is not corrupted in the v2 block manager.

    NOTE: We want a significant number of generated tokens so that any
    incorrect KV mapping has time to build up error.
    """
    output_len = 1024
    temperature = 0.0

    # We want to ensure equality even with preemption.
    # We force the total block size to be 1 + cdiv(output_len, block_size)
    # so that only one sequence can fit at a time (once the sequences grow).

    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]

    # Repeat the prompts to fill the batch.
    prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]

    sampling_params = SamplingParams(
        max_tokens=output_len,
        ignore_eos=True,
        temperature=temperature,
    )

    print('Getting token ids from block manager v1')
    baseline_token_ids = get_token_ids_from_llm_generator(
        baseline_llm_generator, prompts, sampling_params)

    print('Getting token ids from block manager v2')
    test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
                                                      prompts, sampling_params)

    for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
                                                    test_token_ids):
        assert expected_token_ids == actual_token_ids

    assert baseline_token_ids == test_token_ids


def get_token_ids_from_llm_generator(llm_generator, prompts, sampling_params):
    for llm in llm_generator:
        outputs = llm.generate(prompts, sampling_params, use_tqdm=True)
        token_ids = [output.outputs[0].token_ids for output in outputs]
        del llm

    return token_ids
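To see why the parametrization above forces preemption: with `block_size=16` and `output_len=1024`, each sequence eventually needs `cdiv(1024, 16) = 64` output blocks plus one block for its short prompt, i.e. 65 blocks, and `forced_num_gpu_blocks = 5 * 65 = 325` leaves room for at most 5 fully grown sequences while the batch holds 10. A standalone sketch of that arithmetic (not part of the diff):

def cdiv(a: int, b: int) -> int:
    # Ceiling division, as used in vllm's block accounting.
    return -(a // -b)


block_size, output_len, batch_size = 16, 1024, 10
blocks_per_seq = 1 + cdiv(output_len, block_size)  # 1 prompt block + 64 output blocks
forced_num_gpu_blocks = 5 * blocks_per_seq  # 325, matches 5 * (64 + 1) above
# All 10 sequences cannot be resident at full length, so some must be preempted.
assert batch_size * blocks_per_seq > forced_num_gpu_blocks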
@@ -0,0 +1,50 @@
import pytest

from vllm.core.block_manager_v2 import BlockSpaceManagerV2
from vllm.core.interfaces import AllocStatus

from ..utils import create_seq_group


@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("num_gpu_blocks", [8, 40, 80])
@pytest.mark.parametrize("num_seqs_per_group", [1, 4])
@pytest.mark.parametrize("watermark", [0.0, 0.5])
def test_can_allocate_seq_group(block_size: int, num_seqs_per_group: int,
                                num_gpu_blocks: int, watermark: float):
    block_manager = BlockSpaceManagerV2(
        block_size=block_size,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=1024,
        watermark=watermark,
    )
    num_watermark_blocks = int(watermark * num_gpu_blocks)

    num_output_blocks_per_seq = 1

    # NOTE: This should be num_output_blocks_per_seq * num_seqs_per_group, but
    # the current implementation assumes all seqs are new prompts / don't have
    # different output lens.
    num_output_blocks = num_output_blocks_per_seq

    for num_prompt_blocks in range(1, num_gpu_blocks - num_output_blocks):
        seq_group = create_seq_group(
            seq_prompt_lens=block_size * num_prompt_blocks,
            seq_output_lens=[
                block_size * num_output_blocks_per_seq
                for _ in range(num_seqs_per_group)
            ],
        )

        assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks

        can_allocate_result = block_manager.can_allocate(seq_group)

        num_required_blocks = num_prompt_blocks + num_output_blocks

        if num_gpu_blocks - num_required_blocks < num_watermark_blocks:
            assert can_allocate_result == AllocStatus.NEVER
        elif num_gpu_blocks >= num_required_blocks:
            assert can_allocate_result == AllocStatus.OK
        else:
            assert can_allocate_result == AllocStatus.LATER
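The three assertions encode a watermark rule for `can_allocate`: if even an empty device could not hold the request while keeping the watermark free, the group can never be scheduled; if the request fits now, it is OK; otherwise it can be retried later. A standalone sketch of that rule, inferred from the assertions above rather than copied from vllm's implementation:

from enum import Enum


class AllocStatus(Enum):
    OK = 1
    LATER = 2
    NEVER = 3


def can_allocate(num_required_blocks: int, num_free_gpu_blocks: int,
                 num_total_gpu_blocks: int,
                 num_watermark_blocks: int) -> AllocStatus:
    # Even a fully empty device could not serve this request and keep the
    # watermark free: reject it permanently.
    if num_total_gpu_blocks - num_required_blocks < num_watermark_blocks:
        return AllocStatus.NEVER
    # The request fits right now with the watermark to spare.
    if num_free_gpu_blocks - num_required_blocks >= num_watermark_blocks:
        return AllocStatus.OK
    # Otherwise, wait for running sequences to free blocks.
    return AllocStatus.LATER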