[Core] [Bugfix] Refactor block manager subsystem for better testability #3492

Merged: youkaichao merged 96 commits from cadedaniel:block-manager-tests into vllm-project:main on Mar 28, 2024.
Commits (96, all by cadedaniel):
85fb179  logical block test
0f19984  sequence
0306a8c  notes
de14e54  wip
7d66c4a  prefix caching bug when prompt len < block size
e03e057  wip
c162283  refcount
99a5b59  wip
1fe4cbb  wip
5e70924  wip
ea94ecc  wip
376cdb6  wip
d7e122e  wip
2b821dc  wip
0a6fbd2  wip
658b4c5  wip
e976541  wip
085f419  content hash
6fc22ef  wip
cbea543  unused cached blocks
029d39a  wip
d2ca90b  wip
1eee08c  break files
ebe6ccf  wip
9dfc821  wip
619fb0d  wip
ea49f23  device aware
1252223  wip
c1e1b2f  wip
d0b4f20  wip0
a3cffb9  wip
cd75992  wip
1d25cf2  wip
335a218  wip
960da58  wip
63f5dd5  fork
d5ebfd2  fork
c127343  wip
a20051a  remove
02e4154  simple generation works
6f88528  interfaces
70c3fff  wip
5867272  wip
65cfac8  wip
7d059a6  lint
c286632  lint2
46bbd14  lint3
2e794de  lint4
2416c22  lint5
558ad36  v2 config
3fa5b2b  lint
de2a5c9  Merge remote-tracking branch 'upstream/main' into block-manager-tests
0464d48  clean
6ac0318  wip
9fb053c  wip
7f33d2f  wip
9455a46  cow in naive
2f9ebac  wip
26b6ce7  fix cow bug
548aec8  cow test
f0025ab  wip
3be4040  wip
62a616b  wip
9fd6c08  wip prefix cow
6ded181  wip
95b65f1  wip
b03693c  wip
d582cb6  wip
70b1f60  lint
3ce9347  lint2
ed6c2e6  Merge remote-tracking branch 'upstream/main' into block-manager-tests
640d7e5  isort
0f0daf8  fix
ba8acbd  wip
b51287c  adding to entrypoint tests
1f3483f  try
4ebc0c0  docstrings!
1f09fd0  wip
80cdc3c  more docstring / format
36bd93f  entrypoints
79dac79  model correctness test
b392a5d  remove
8d42bd7  lint
1b3fe9f  note
9680dc8  remove
dd4bcee  clean
9000b41  name
a2897b0  rename
bead69a  wip
132e7a3  clean
0d75e12  comment
70d1812  lint
321dc16  comment
887496b  fix test
f0b1bf1  empty
5b86297  pr feedback
(One of the new files in this diff is empty.)
@@ -0,0 +1,59 @@
import contextlib
import gc

import pytest
import ray
import torch

from vllm import LLM
from vllm.model_executor.parallel_utils.parallel_state import (
    destroy_model_parallel)
from vllm.model_executor.utils import set_random_seed


def cleanup():
    destroy_model_parallel()
    with contextlib.suppress(AssertionError):
        torch.distributed.destroy_process_group()
    gc.collect()
    torch.cuda.empty_cache()
    ray.shutdown()


@pytest.fixture
def baseline_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
                           baseline_llm_kwargs, seed):
    return create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
                                baseline_llm_kwargs, seed)


@pytest.fixture
def test_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
                       test_llm_kwargs, seed):
    return create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
                                test_llm_kwargs, seed)


def create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
                         distinct_llm_kwargs, seed):
    kwargs = {
        **common_llm_kwargs,
        **per_test_common_llm_kwargs,
        **distinct_llm_kwargs,
    }

    def generator_inner():
        llm = LLM(**kwargs)

        set_random_seed(seed)

        yield llm
        del llm
        cleanup()

    def generator_outer():
        for llm in generator_inner():
            yield llm
            del llm

    return generator_outer()
@@ -0,0 +1,86 @@
from itertools import cycle

import pytest

from vllm import SamplingParams


@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        # Use a small model for a fast test.
        "model": "facebook/opt-125m",

        # Skip cuda graph creation for fast test.
        "enforce_eager": True,

        # Allow only 5 sequences of ~1024 tokens in worst case.
        "block_size": 16,
        "forced_num_gpu_blocks": 5 * (64 + 1),
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{
    "use_v2_block_manager": False
}])
@pytest.mark.parametrize("test_llm_kwargs", [{"use_v2_block_manager": True}])
@pytest.mark.parametrize("batch_size", [10])
@pytest.mark.parametrize("seed", [1])
def test_v1_v2_greedy_equality_with_preemption(baseline_llm_generator,
                                               test_llm_generator, batch_size):
    """Verify block manager v2 produces the same outputs as block manager v1,
    even when there is preemption.

    This constructs two LLMs, each with a limited number of GPU blocks. The
    limit is chosen such that as the sequences in the batch grow, sequences
    must be preempted and removed from the cache.

    If the output token ids are equivalent, then we have confidence that the
    KV cache is not corrupted in the v2 block manager.

    NOTE: We want a significant number of generated tokens so that any
    incorrect KV mapping has time to build up error.
    """
    output_len = 1024
    temperature = 0.0

    # We want to ensure equality even with preemption.
    # We force the total block size to be 1 + cdiv(output_len, block_size)
    # so that only one sequence can fit at a time (once the sequences grow).

    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]

    prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]

    sampling_params = SamplingParams(
        max_tokens=output_len,
        ignore_eos=True,
        temperature=temperature,
    )

    print('Getting token ids from block manager v1')
    baseline_token_ids = get_token_ids_from_llm_generator(
        baseline_llm_generator, prompts, sampling_params)

    print('Getting token ids from block manager v2')
    test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
                                                      prompts, sampling_params)

    for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
                                                    test_token_ids):
        assert expected_token_ids == actual_token_ids

    assert baseline_token_ids == test_token_ids


def get_token_ids_from_llm_generator(llm_generator, prompts, sampling_params):
    for llm in llm_generator:
        outputs = llm.generate(prompts, sampling_params, use_tqdm=True)
        token_ids = [output.outputs[0].token_ids for output in outputs]
        del llm

    return token_ids
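As a sanity check on the block budget used above (my arithmetic, not part of the diff): with block_size=16 and output_len=1024, each sequence needs cdiv(1024, 16) = 64 output blocks plus 1 prompt block, so forced_num_gpu_blocks = 5 * (64 + 1) = 325 leaves room for roughly five full-length sequences; with batch_size=10, sequences must be preempted as they grow. A minimal sketch (the cdiv helper is mine, mirroring the comment in the test):

```python
def cdiv(a: int, b: int) -> int:
    """Ceiling division, as referenced in the test's comment."""
    return -(-a // b)

block_size = 16
output_len = 1024

# 1 prompt block + 64 output blocks per sequence.
blocks_per_seq = 1 + cdiv(output_len, block_size)

forced_num_gpu_blocks = 5 * (64 + 1)  # 325, as in the parametrization
max_full_seqs = forced_num_gpu_blocks // blocks_per_seq

print(blocks_per_seq)  # 65
print(max_full_seqs)   # 5 -> a batch of 10 cannot all fit; preemption occurs
```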
@@ -0,0 +1,50 @@
import pytest

from vllm.core.block_manager_v2 import BlockSpaceManagerV2
from vllm.core.interfaces import AllocStatus

from ..utils import create_seq_group


@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("num_gpu_blocks", [8, 40, 80])
@pytest.mark.parametrize("num_seqs_per_group", [1, 4])
@pytest.mark.parametrize("watermark", [0.0, 0.5])
def test_can_allocate_seq_group(block_size: int, num_seqs_per_group: int,
                                num_gpu_blocks: int, watermark: float):
    block_manager = BlockSpaceManagerV2(
        block_size=block_size,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=1024,
        watermark=watermark,
    )
    num_watermark_blocks = int(watermark * num_gpu_blocks)

    num_output_blocks_per_seq = 1

    # NOTE: This should be num_output_blocks_per_seq * num_seqs_per_group, but
    # the current implementation assumes all seqs are new prompts / don't have
    # different output lens.
    num_output_blocks = num_output_blocks_per_seq

    for num_prompt_blocks in range(1, num_gpu_blocks - num_output_blocks):
        seq_group = create_seq_group(
            seq_prompt_lens=block_size * num_prompt_blocks,
            seq_output_lens=[
                block_size * num_output_blocks_per_seq
                for _ in range(num_seqs_per_group)
            ],
        )

        assert num_prompt_blocks + num_output_blocks <= num_gpu_blocks

        can_allocate_result = block_manager.can_allocate(seq_group)

        num_required_blocks = num_prompt_blocks + num_output_blocks

        if num_gpu_blocks - num_required_blocks < num_watermark_blocks:
            assert can_allocate_result == AllocStatus.NEVER
        elif num_gpu_blocks >= num_required_blocks:
            assert can_allocate_result == AllocStatus.OK
        else:
            assert can_allocate_result == AllocStatus.LATER
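The assertions at the end of this test encode the allocation policy the test expects from `can_allocate`. A minimal restatement of that rule (my sketch of the test's assertions, with a hypothetical helper name; not the actual BlockSpaceManagerV2 implementation):

```python
from vllm.core.interfaces import AllocStatus


def expected_alloc_status(num_gpu_blocks: int, num_required_blocks: int,
                          num_watermark_blocks: int) -> AllocStatus:
    # NEVER: even with every GPU block free, allocating this group would
    # leave fewer free blocks than the watermark reserves.
    if num_gpu_blocks - num_required_blocks < num_watermark_blocks:
        return AllocStatus.NEVER
    # OK: the device has enough total blocks to hold the group now.
    if num_gpu_blocks >= num_required_blocks:
        return AllocStatus.OK
    # LATER: the group could fit once other sequences free their blocks.
    return AllocStatus.LATER
```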
Review comments on the create_llm_generator wrapper:

- "why do we need another level of wrapper? would it work without it? if not please comment why"
- "I see the usage of the llm generator below but still confused since we are only yielding one llm instance"
- "oh good catch, not necessary"
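Following up on that exchange, a minimal sketch of the simplification the comments point at (my illustration, not a commit in this PR): `generator_outer` only re-yields the single LLM instance from `generator_inner`, so the fixture helper can return the inner generator directly.

```python
from vllm import LLM
from vllm.model_executor.utils import set_random_seed

# cleanup() as defined in the conftest file above.


def create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
                         distinct_llm_kwargs, seed):
    kwargs = {
        **common_llm_kwargs,
        **per_test_common_llm_kwargs,
        **distinct_llm_kwargs,
    }

    def generator_inner():
        llm = LLM(**kwargs)
        set_random_seed(seed)
        yield llm
        # Runs when the consuming for-loop resumes the generator after the
        # yield, exactly as in the two-level version.
        del llm
        cleanup()

    return generator_inner()
```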