This repository was archived by the owner on Oct 11, 2024. It is now read-only.

Upstream sync 2024 05 19 #249

Merged: 131 commits, Jun 3, 2024
Changes from 1 commit
1337ced
Disable cuda version check in vllm-openai image (#4530)
zhaoyang-star May 5, 2024
7d1afa9
[Bugfix] Fix `asyncio.Task` not being subscriptable (#4623)
DarkLight1337 May 6, 2024
76d1c0a
[CI] use ccache actions properly in release workflow (#4629)
simon-mo May 6, 2024
8c3136e
[CI] Add retry for agent lost (#4633)
cadedaniel May 6, 2024
5749888
Update lm-format-enforcer to 0.10.1 (#4631)
noamgat May 6, 2024
c6f73a2
[Kernel] Make static FP8 scaling more robust (#4570)
pcmoritz May 7, 2024
a542de1
[Core][Optimization] change python dict to pytorch tensor (#4607)
youkaichao May 7, 2024
a3ff2ae
[Build/CI] Fixing 'docker run' to re-enable AMD CI tests. (#4642)
Alexei-V-Ivanov-AMD May 7, 2024
e4ab5c6
[Bugfix] Fixed error in slice_lora_b for MergedQKVParallelLinearWithL…
FurtherAI May 7, 2024
fd69572
[Core][Optimization] change copy-on-write from dict[int, list] to lis…
youkaichao May 7, 2024
8673ad0
[Bug fix][Core] fixup ngram not setup correctly (#4551)
leiwen83 May 7, 2024
3fc0fa0
[Core][Distributed] support cpu&device in broadcast tensor dict (#4660)
youkaichao May 8, 2024
43bc7e9
[Core] Optimize sampler get_logprobs (#4594)
rkooo567 May 8, 2024
01ad752
[Kernel] Make static FP8 scaling more robust (#4570)
rkooo567 May 8, 2024
f64e4e4
[Bugfix][Kernel] allow non-power-of-2 for prefix prefill with alibi …
DefTruth May 8, 2024
e06c2d6
[Misc] Add `get_name` method to attention backends (#4685)
WoosukKwon May 8, 2024
01d4ceb
[Core] Faster startup for LoRA enabled models (#4634)
Yard1 May 8, 2024
8afd8f7
[Core][Optimization] change python dict to pytorch tensor for blocks …
youkaichao May 8, 2024
1fe8d9c
[CI/Test] fix swap test for multi gpu (#4689)
youkaichao May 8, 2024
b5967c4
[Misc] Use vllm-flash-attn instead of flash-attn (#4686)
WoosukKwon May 8, 2024
4a85263
[Dynamic Spec Decoding] Auto-disable by the running queue size (#4592)
comaniac May 8, 2024
edd9e90
[Speculative decoding] [Bugfix] Fix overallocation in ngram + spec lo…
cadedaniel May 8, 2024
32314e5
[Bugfix] Fine-tune gptq_marlin configs to be more similar to marlin (…
alexm-redhat May 9, 2024
b0d3937
[Frontend] add tok/s speed metric to llm class when using tqdm (#4400)
MahmoudAshraf97 May 9, 2024
294e480
[Frontend] Move async logic outside of constructor (#4674)
DarkLight1337 May 9, 2024
04a0387
[Misc] Remove unnecessary ModelRunner imports (#4703)
WoosukKwon May 9, 2024
fff9c2c
[Misc] Set block size at initialization & Fix test_model_runner (#4705)
WoosukKwon May 9, 2024
396a546
[ROCm] Add support for Punica kernels on AMD GPUs (#3140)
kliuae May 9, 2024
0c85c21
[Bugfix] Fix CLI arguments in OpenAI server docs (#4709)
DarkLight1337 May 9, 2024
631605d
[Bugfix] Update grafana.json (#4711)
robertgshaw2-redhat May 9, 2024
d824ab8
[Bugfix] Add logs for all model dtype casting (#4717)
mgoin May 9, 2024
9b500f3
[Model] Snowflake arctic model implementation (#4652)
sfc-gh-hazhang May 9, 2024
56c100c
[Kernel] [FP8] Improve FP8 linear layer performance (#4691)
pcmoritz May 9, 2024
0b429b8
[Kernel] Refactor FP8 kv-cache with NVIDIA float8_e4m3 support (#4535)
comaniac May 10, 2024
ca3311a
[Core][Distributed] refactor pynccl (#4591)
youkaichao May 10, 2024
4ea25ee
[Misc] Keep only one implementation of the create_dummy_prompt functi…
AllenDou May 10, 2024
cd151e1
chunked-prefill-doc-syntax (#4603)
simon-mo May 10, 2024
4b7644f
[Core]fix type annotation for `swap_blocks` (#4726)
jikunshang May 10, 2024
9aec672
[Misc] Apply a couple g++ cleanups (#4719)
stevegrubb May 10, 2024
65159a8
[Core] Fix circular reference which leaked llm instance in local dev …
rkooo567 May 10, 2024
2fc4bb4
[Bugfix] Fix CLI arguments in OpenAI server docs (#4729)
AllenDou May 10, 2024
f739bdb
[Speculative decoding] CUDA graph support (#4295)
heeju-kim2 May 10, 2024
20b780a
[CI] Nits for bad initialization of SeqGroup in testing (#4748)
robertgshaw2-redhat May 10, 2024
8a9d255
[Core][Test] fix function name typo in custom allreduce (#4750)
youkaichao May 10, 2024
9132d19
[Model][Misc] Add e5-mistral-7b-instruct and Embedding API (#3734)
CatherineSue May 11, 2024
18355a9
[Model] Add support for IBM Granite Code models (#4636)
yikangshen May 12, 2024
64367a0
[CI/Build] Tweak Marlin Nondeterminism Issues (#4713)
robertgshaw2-redhat May 13, 2024
fa95832
[CORE] Improvement in ranks code (#4718)
SwapnilDreams100 May 13, 2024
b5c4711
[Core][Distributed] refactor custom allreduce to support multiple tp …
youkaichao May 13, 2024
a92b874
[CI/Build] Move `test_utils.py` to `tests/utils.py` (#4425)
DarkLight1337 May 13, 2024
270c0c2
[Scheduler] Warning upon preemption and Swapping (#4647)
rkooo567 May 13, 2024
c944527
[Misc] Enhance attention selector (#4751)
WoosukKwon May 13, 2024
7dd2e73
[Frontend] [Core] perf: Automatically detect vLLM-tensorized model, u…
sangstar May 13, 2024
61e2bde
[Speculative decoding] Improve n-gram efficiency (#4724)
comaniac May 13, 2024
00d6bd6
[Kernel] Use flash-attn for decoding (#3648)
skrider May 13, 2024
81c2c05
[Bugfix] Fix dynamic FP8 quantization for Mixtral (#4793)
pcmoritz May 13, 2024
1d56497
[Doc] Shorten README by removing supported model list (#4796)
zhuohan123 May 13, 2024
2895ae9
[Doc] Add API reference for offline inference (#4710)
DarkLight1337 May 14, 2024
a1f43a0
[Doc] Add meetups to the doc (#4798)
zhuohan123 May 14, 2024
feed62d
[Core][Hash][Automatic Prefix caching] Accelerating the hashing funct…
KuntaiDu May 14, 2024
31c1cd3
[Bugfix][Doc] Fix CI failure in docs (#4804)
DarkLight1337 May 14, 2024
6838a99
[Core] Add MultiprocessingGPUExecutor (#4539)
njhill May 14, 2024
f246252
Add 4th meetup announcement to readme (#4817)
simon-mo May 14, 2024
bd73ad3
Revert "[Kernel] Use flash-attn for decoding (#3648)" (#4820)
rkooo567 May 15, 2024
30e935f
[Core][2/N] Model runner refactoring part 2. Combine prepare prefill …
rkooo567 May 15, 2024
e40b747
[CI/Build] Further decouple HuggingFace implementation from ours duri…
DarkLight1337 May 15, 2024
71c459f
[Bugfix] Properly set distributed_executor_backend in ParallelConfig …
zifeitong May 15, 2024
e6bc337
[Doc] Highlight the fourth meetup in the README (#4842)
zhuohan123 May 15, 2024
1b50825
[Frontend] Re-enable custom roles in Chat Completions API (#4758)
DarkLight1337 May 15, 2024
28f56b3
[Frontend] Support OpenAI batch file format (#4794)
wuisawesome May 15, 2024
e88dd2b
[Core] Implement sharded state loader (#4690)
aurickq May 16, 2024
3360031
[Speculative decoding][Re-take] Enable TP>1 speculative decoding (#4840)
comaniac May 16, 2024
0240ac9
Add marlin unit tests and marlin benchmark script (#4815)
alexm-redhat May 16, 2024
230af21
[Kernel] add bfloat16 support for gptq marlin kernel (#4788)
jinzhen-lin May 16, 2024
3426d29
[docs] Fix typo in examples filename openi -> openai (#4864)
wuisawesome May 16, 2024
de61ba7
[Frontend] Separate OpenAI Batch Runner usage from API Server (#4851)
wuisawesome May 16, 2024
28f605c
[Bugfix] Bypass authorization API token for preflight requests (#4862)
dulacp May 16, 2024
cf4926d
Add GPTQ Marlin 2:4 sparse structured support (#4790)
alexm-redhat May 16, 2024
40ce57a
Add JSON output support for benchmark_latency and benchmark_throughpu…
simon-mo May 16, 2024
273b3fe
[ROCm][AMD][Bugfix] adding a missing triton autotune config (#4845)
hongxiayang May 16, 2024
1589d50
[Core][Distributed] remove graph mode function (#4818)
youkaichao May 16, 2024
3ced8d0
[Misc] remove old comments (#4866)
youkaichao May 16, 2024
1a745a3
[Kernel] Add punica dimension for Qwen1.5-32B LoRA (#4850)
Silencioo May 16, 2024
7f372fb
[Kernel] Add w8a8 CUTLASS kernels (#4749)
tlrmchlsmth May 16, 2024
69ac7b4
[Bugfix] Fix FP8 KV cache support (#4869)
WoosukKwon May 16, 2024
e4b31f6
Support to serve vLLM on Kubernetes with LWS (#4829)
kerthcet May 16, 2024
3bf9ee0
[Frontend] OpenAI API server: Do not add bos token by default when en…
bofenghuang May 17, 2024
f2b3686
[Build/CI] Extending the set of AMD tests with Regression, Basic Corr…
Alexei-V-Ivanov-AMD May 17, 2024
3b9b8e5
[Bugfix] fix rope error when load models with different dtypes (#4835)
jinzhen-lin May 17, 2024
96e8baa
Sync huggingface modifications of qwen Moe model (#4774)
eigen2017 May 17, 2024
7af0041
[Doc] Update Ray Data distributed offline inference example (#4871)
Yard1 May 17, 2024
b1a73b5
[Bugfix] Relax tiktoken to >= 0.6.0 (#4890)
mgoin May 17, 2024
3bbe65e
[ROCm][Hardware][AMD] Adding Navi21 to fallback to naive attention if…
alexeykondrat May 18, 2024
670a8b8
[Lora] Support long context lora (#4787)
rkooo567 May 18, 2024
c79bcb7
[Bugfix][Model] Add base class for vision-language models (#4809)
DarkLight1337 May 19, 2024
7b70de3
./format
May 19, 2024
1689026
added skips to lora long context
May 19, 2024
1e984b1
format
May 19, 2024
9aef71f
added missed files
May 19, 2024
774ba57
updates check_logprobs_close.py
May 19, 2024
ab7274f
fixed tensorizer
May 19, 2024
2c8f45a
skip mosaic in strict correctness test
May 19, 2024
85ec849
format
May 19, 2024
296861b
Merge branch 'main' into upstream-sync-2024-05-19
robertgshaw2-redhat May 22, 2024
688ef6f
skipped sharded state loader
May 22, 2024
4c437ba
Merge branch 'main' into upstream-sync-2024-05-19
robertgshaw2-redhat May 27, 2024
2059e61
skip shared state loader
May 27, 2024
9642aef
updated build test to use 4 nvcc threads by default. We previously, w…
May 28, 2024
2dad479
tweaked to fix benchmark
May 28, 2024
3bdfeb4
updated workflow to run longer
May 28, 2024
3800a1c
Merge branch 'main' into upstream-sync-2024-05-19
robertgshaw2-redhat May 28, 2024
f1199dc
updated skip lists to skip sharded state loader
May 29, 2024
ee7e65a
verified that test multiproc workers is passing locally
May 29, 2024
b73a142
fixed the sampling params issue
May 29, 2024
8225ddd
fixed other sampling_params issue
May 29, 2024
c386e32
Merge branch 'main' into upstream-sync-2024-05-19
May 29, 2024
098e08a
format
May 29, 2024
7d32b8a
confirmed basic correctness test working
May 30, 2024
748d0e1
updated score for marlin 2:4
May 30, 2024
cd648c6
Merge branch 'main' into upstream-sync-2024-05-19
May 30, 2024
9785c41
Disable flaky marlin model
dbarbuzzi May 30, 2024
3507552
Increase benchmark server timeout to 15 minutes
dbarbuzzi May 30, 2024
1d6af5a
Merge branch 'main' into upstream-sync-2024-05-19
robertgshaw2-redhat May 30, 2024
96fbf17
Merge branch 'main' into upstream-sync-2024-05-19
robertgshaw2-redhat May 30, 2024
db69b5c
reduce number of prompts and models in basic server correctness
Jun 1, 2024
0654a43
Merge branch 'nm-vllm-main' into upstream-sync-2024-05-19
Jun 1, 2024
3ba575c
fixed workflows
Jun 1, 2024
43c0adc
removed basic server correctness from release
Jun 2, 2024
50ac573
Update test_compressed.py
robertgshaw2-redhat Jun 2, 2024
1802833
Update test_compressed.py (#277)
robertgshaw2-redhat Jun 2, 2024
2c52fee
nit in setup.py
Jun 3, 2024
[Dynamic Spec Decoding] Auto-disable by the running queue size (vllm-project#4592)

Co-authored-by: Cade Daniel <edacih@gmail.com>
2 people authored and Robert Shaw committed May 19, 2024
commit 4a85263089d394aba17c6e5d4c2735f0c7f05c30
13 changes: 9 additions & 4 deletions tests/samplers/test_rejection_sampler.py
@@ -42,9 +42,11 @@ def mock_causal_accepted_tensor(k: int, last_accepted_indices: torch.Tensor,
@pytest.mark.parametrize(
"which_tokens_accepted",
["all_tokens_accepted", "no_tokens_accepted", "some_tokens_accepted"])
@pytest.mark.parametrize("disable_bonus_tokens", [True, False])
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
-def test_correct_output_format(which_tokens_accepted: str, seed: int,
+def test_correct_output_format(which_tokens_accepted: str,
+                               disable_bonus_tokens: bool, seed: int,
device: str):
"""Verify the output has correct format given predetermined accepted matrix.
"""
@@ -86,7 +88,8 @@ def test_correct_output_format(which_tokens_accepted: str, seed: int,
size=(batch_size, 1),
dtype=torch.int64)

-rejection_sampler = RejectionSampler()
+rejection_sampler = RejectionSampler(
+    disable_bonus_tokens=disable_bonus_tokens)
device_rank = int(device[-1])
rejection_sampler.init_gpu_tensors(rank=device_rank)
output_token_ids = rejection_sampler._create_output( # pylint: disable=protected-access
@@ -96,9 +99,11 @@ def test_correct_output_format(which_tokens_accepted: str, seed: int,
bonus_token_ids,
)

-# Bonus tokens are currently disabled. Verify they're set to -1.
+expected_bonus_token_ids = bonus_token_ids.clone()
+# If bonus tokens disabled. Verify they are set to -1.
# See https://github.com/vllm-project/vllm/issues/4212
-expected_bonus_token_ids = bonus_token_ids.clone() * 0 - 1
+if disable_bonus_tokens:
+    expected_bonus_token_ids = expected_bonus_token_ids * 0 - 1

if which_tokens_accepted == "all_tokens_accepted":
# Expect all tokens to be equal to draft tokens.
34 changes: 34 additions & 0 deletions tests/spec_decode/e2e/test_multistep_correctness.py
@@ -536,6 +536,40 @@ def test_skip_speculation(baseline_llm_generator, test_llm_generator,
force_output_len=True)


@pytest.mark.parametrize(
"common_llm_kwargs",
[{
"model": "JackFram/llama-160m",

# Skip cuda graph recording for fast test.
"enforce_eager": True,

# Required for spec decode.
"use_v2_block_manager": True
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@pytest.mark.parametrize("test_llm_kwargs", [
{
"speculative_model": "JackFram/llama-68m",
"num_speculative_tokens": 5,
"speculative_disable_by_batch_size": 2,
},
])
@pytest.mark.parametrize("batch_size", [8])
@pytest.mark.parametrize("output_len", [10])
@pytest.mark.parametrize("seed", [1])
def test_disable_speculation(baseline_llm_generator, test_llm_generator,
batch_size: int, output_len: int):
"""Verify greedy equality when all sequences disable speculation.
"""
run_greedy_equality_correctness_test(baseline_llm_generator,
test_llm_generator,
batch_size,
max_output_len=output_len,
force_output_len=True)


@pytest.mark.parametrize(
"common_llm_kwargs",
[{
2 changes: 1 addition & 1 deletion tests/spec_decode/e2e/test_ngram_correctness.py
@@ -57,7 +57,7 @@
@pytest.mark.parametrize("output_len", [
256,
])
-@pytest.mark.parametrize("batch_size", [1, 64])
+@pytest.mark.parametrize("batch_size", [1, 32])
@pytest.mark.parametrize("seed", [1])
def test_ngram_e2e_greedy_correctness(baseline_llm_generator,
test_llm_generator, batch_size: int,
77 changes: 77 additions & 0 deletions tests/spec_decode/test_dynamic_spec_decode.py
@@ -0,0 +1,77 @@
from unittest.mock import MagicMock

import pytest
import torch

from vllm.model_executor.layers.rejection_sampler import RejectionSampler
from vllm.sequence import ExecuteModelRequest
from vllm.spec_decode.metrics import AsyncMetricsCollector
from vllm.spec_decode.multi_step_worker import MultiStepWorker
from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker
from vllm.spec_decode.top1_proposer import Top1Proposer

from .utils import create_batch, mock_worker


@pytest.mark.parametrize('queue_size', [2, 4])
@pytest.mark.parametrize('batch_size', [1, 2, 3, 6])
@pytest.mark.parametrize('k', [1, 2, 5, 7, 10])
@torch.inference_mode()
def test_disable_spec_tokens(queue_size: int, batch_size: int, k: int):
"""Verify that speculative tokens are disabled when the batch size
exceeds the threshold.
"""
disable_by_batch_size = 3

draft_worker = mock_worker(cls=MultiStepWorker)
target_worker = mock_worker()
rejection_sampler = MagicMock(spec=RejectionSampler)
metrics_collector = MagicMock(spec=AsyncMetricsCollector)
worker = SpecDecodeWorker(proposer_worker=draft_worker,
scorer_worker=target_worker,
rejection_sampler=rejection_sampler,
metrics_collector=metrics_collector,
disable_by_batch_size=disable_by_batch_size)

exception_secret = 'artificial stop'
draft_worker.get_spec_proposals.side_effect = ValueError(exception_secret)

seq_group_metadata_list, _, _ = create_batch(batch_size, k)
execute_model_req = ExecuteModelRequest(
seq_group_metadata_list=seq_group_metadata_list,
num_lookahead_slots=k,
running_queue_size=queue_size)

with pytest.raises(ValueError, match=exception_secret):
worker.execute_model(execute_model_req=execute_model_req)

# When the batch size is larger than the threshold,
# we expect no speculative tokens (0).
expected_num_spec_tokens = None if queue_size < disable_by_batch_size else 0
assert seq_group_metadata_list[
0].num_speculative_tokens == expected_num_spec_tokens

draft_worker.sampler_output.side_effect = ValueError(exception_secret)

proposer = Top1Proposer(
worker=draft_worker,
device='cpu', # not used
vocab_size=100, # not used
# Must be long enough to avoid being skipped due to length.
max_proposal_len=1024,
)

if queue_size < disable_by_batch_size:
# Should raise exception when executing the mocked draft model.
with pytest.raises(ValueError, match=exception_secret):
proposer.get_proposals(execute_model_req=ExecuteModelRequest(
seq_group_metadata_list=seq_group_metadata_list,
num_lookahead_slots=k), )
else:
# Should not execute the draft model because spec decode is disabled
# for all requests. Accordingly, the proposal length should be 0.
proposals = proposer.get_proposals(
execute_model_req=ExecuteModelRequest(
seq_group_metadata_list=seq_group_metadata_list,
num_lookahead_slots=k), )
assert proposals.proposal_lens.tolist() == [0] * batch_size
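The gating that this new test pins down can be summarized with a small standalone sketch (not part of the commit; the helper name and signature are hypothetical): once the running queue reaches the configured threshold, each request is given zero speculative tokens and the draft model is skipped for that step.

from typing import Optional

def speculative_tokens_for_step(running_queue_size: int,
                                disable_by_batch_size: Optional[int],
                                num_speculative_tokens: int) -> int:
    # Hypothetical helper mirroring the behavior asserted above.
    if (disable_by_batch_size is not None
            and running_queue_size >= disable_by_batch_size):
        return 0  # speculation disabled for this step; proposal length is 0
    return num_speculative_tokens

assert speculative_tokens_for_step(2, 3, 5) == 5  # queue below threshold
assert speculative_tokens_for_step(4, 3, 5) == 0  # queue at/above threshold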
29 changes: 24 additions & 5 deletions vllm/config.py
@@ -736,6 +736,7 @@ def maybe_create_spec_config(
speculative_max_model_len: Optional[int],
enable_chunked_prefill: bool,
use_v2_block_manager: bool,
speculative_disable_by_batch_size: Optional[int],
ngram_prompt_lookup_max: Optional[int],
ngram_prompt_lookup_min: Optional[int],
) -> Optional["SpeculativeConfig"]:
@@ -764,6 +765,9 @@ def maybe_create_spec_config(
use_v2_block_manager (bool): Whether vLLM is configured to use the
v2 block manager or not. Used for raising an error since the v2
block manager is required with spec decode.
speculative_disable_by_batch_size (Optional[int]): Disable
speculative decoding for new incoming requests when the number
of enqueue requests is larger than this value, if provided.
ngram_prompt_lookup_max (Optional[int]): Max size of ngram token
window, if provided.
ngram_prompt_lookup_min (Optional[int]): Min size of ngram token
@@ -774,7 +778,7 @@ def maybe_create_spec_config(
the necessary conditions are met, else None.
"""

-if (speculative_model is None and num_speculative_tokens is None):
+if speculative_model is None and num_speculative_tokens is None:
return None

if speculative_model is not None and num_speculative_tokens is None:
@@ -783,6 +787,12 @@ def maybe_create_spec_config(
"num_speculative_tokens to be provided, but found "
f"{speculative_model=} and {num_speculative_tokens=}.")

if (speculative_disable_by_batch_size is not None
and speculative_disable_by_batch_size < 2):
raise ValueError("Expect the batch size threshold of disabling "
"speculative decoding is > 1, but got "
f"{speculative_disable_by_batch_size=}")

assert (speculative_model is not None
and num_speculative_tokens is not None)

@@ -851,6 +861,7 @@ def maybe_create_spec_config(
draft_model_config,
draft_parallel_config,
num_speculative_tokens,
speculative_disable_by_batch_size,
ngram_prompt_lookup_max,
ngram_prompt_lookup_min,
)
@@ -920,8 +931,9 @@ def __init__(
draft_model_config: ModelConfig,
draft_parallel_config: ParallelConfig,
num_speculative_tokens: int,
-ngram_prompt_lookup_max: int,
-ngram_prompt_lookup_min: int,
+speculative_disable_by_batch_size: Optional[int],
+ngram_prompt_lookup_max: Optional[int],
+ngram_prompt_lookup_min: Optional[int],
):
"""Create a SpeculativeConfig object.

@@ -930,12 +942,19 @@ def __init__(
draft_parallel_config: ParallelConfig for the draft model.
num_speculative_tokens: The number of tokens to sample from the
draft model before scoring with the target model.
speculative_disable_by_batch_size: Disable speculative
decoding for new incoming requests when the number of
enqueue requests is larger than this value.
ngram_prompt_lookup_max: Max size of ngram token window.
ngram_prompt_lookup_min: Min size of ngram token window.
"""
self.draft_model_config = draft_model_config
self.draft_parallel_config = draft_parallel_config
self.num_speculative_tokens = num_speculative_tokens
-self.ngram_prompt_lookup_max = ngram_prompt_lookup_max
-self.ngram_prompt_lookup_min = ngram_prompt_lookup_min
+self.speculative_disable_by_batch_size = \
+    speculative_disable_by_batch_size
+self.ngram_prompt_lookup_max = ngram_prompt_lookup_max or 0
+self.ngram_prompt_lookup_min = ngram_prompt_lookup_min or 0

self._verify_args()
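As a quick standalone illustration of the validation added above (a sketch, not the vLLM API itself), thresholds below 2 are rejected, since a value of 1 would disable speculation as soon as a single request is enqueued:

from typing import Optional

def check_disable_threshold(threshold: Optional[int]) -> None:
    # Mirrors the new check in maybe_create_spec_config: None means the
    # feature is unused; otherwise the threshold must be greater than 1.
    if threshold is not None and threshold < 2:
        raise ValueError("Expect the batch size threshold of disabling "
                         "speculative decoding is > 1, but got "
                         f"{threshold=}")

check_disable_threshold(None)  # OK: auto-disable not configured
check_disable_threshold(4)     # OK
# check_disable_threshold(1)   # raises ValueError, matching the diff above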

10 changes: 10 additions & 0 deletions vllm/engine/arg_utils.py
@@ -87,6 +87,7 @@ class EngineArgs:
speculative_model: Optional[str] = None
num_speculative_tokens: Optional[int] = None
speculative_max_model_len: Optional[int] = None
speculative_disable_by_batch_size: Optional[int] = None
ngram_prompt_lookup_max: Optional[int] = None
ngram_prompt_lookup_min: Optional[int] = None

@@ -482,6 +483,13 @@ def add_cli_args(
'draft model. Sequences over this length will skip '
'speculation.')

parser.add_argument(
'--speculative-disable-by-batch-size',
type=int,
default=EngineArgs.speculative_disable_by_batch_size,
help='Disable speculative decoding for new incoming requests '
'if the number of enqueue requests is larger than this value.')

parser.add_argument(
'--ngram-prompt-lookup-max',
type=int,
@@ -575,6 +583,8 @@ def create_engine_config(self, ) -> EngineConfig:
target_dtype=self.dtype,
speculative_model=self.speculative_model,
num_speculative_tokens=self.num_speculative_tokens,
speculative_disable_by_batch_size=self.
speculative_disable_by_batch_size,
speculative_max_model_len=self.speculative_max_model_len,
enable_chunked_prefill=self.enable_chunked_prefill,
use_v2_block_manager=self.use_v2_block_manager,
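Putting the new engine argument together with the existing speculative-decoding flags, a usage sketch could look like the following; the model names and threshold are illustrative and mirror the kwargs exercised in the e2e test above, and the same setting is exposed on the CLI as --speculative-disable-by-batch-size:

from vllm import LLM, SamplingParams

# Illustrative configuration only; values mirror the e2e test kwargs above.
llm = LLM(
    model="JackFram/llama-160m",
    speculative_model="JackFram/llama-68m",
    num_speculative_tokens=5,
    use_v2_block_manager=True,            # required for spec decode
    speculative_disable_by_batch_size=4,  # fall back to plain decoding once
                                          # 4 or more requests are enqueued
)
outputs = llm.generate(["The capital of France is"],
                       SamplingParams(temperature=0.0, max_tokens=16))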
2 changes: 2 additions & 0 deletions vllm/executor/gpu_executor.py
@@ -93,6 +93,8 @@ def _init_spec_worker(self):
spec_decode_worker = SpecDecodeWorker.create_worker(
scorer_worker=target_worker,
draft_worker_kwargs=draft_worker_kwargs,
disable_by_batch_size=self.speculative_config.
speculative_disable_by_batch_size,
)

assert self.parallel_config.world_size == 1, (
11 changes: 9 additions & 2 deletions vllm/model_executor/layers/rejection_sampler.py
@@ -12,15 +12,21 @@ class RejectionSampler(nn.Module):
https://arxiv.org/pdf/2302.01318.pdf.
"""

-def __init__(self, strict_mode: bool = False):
+def __init__(self,
+             disable_bonus_tokens: bool = True,
+             strict_mode: bool = False):
"""Create a rejection sampler.

Args:
disable_bonus_tokens: Whether or not to disable the bonus token.
Required when bonus tokens would corrupt the KV cache for
proposal methods that require KV cache.
strict_mode: Whether or not to perform shape/device/dtype checks
during sampling. This catches correctness issues but adds
nontrivial latency.
"""
super().__init__()
self._disable_bonus_tokens = disable_bonus_tokens
self._strict_mode = strict_mode

# NOTE: A "bonus token" is accepted iff all proposal tokens are
@@ -312,7 +318,8 @@ def _create_output(
# proposal methods that require KV cache. We can fix it by "prefilling"
# the bonus token in the proposer. The following issue tracks the fix.
# https://github.com/vllm-project/vllm/issues/4212
-output_with_bonus_tokens[:, -1] = -1
+if self._disable_bonus_tokens:
+    output_with_bonus_tokens[:, -1] = -1

# Fill the recovered token ids.
output.mul_(~after_false_mask).add_(
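A minimal, self-contained illustration (not from the commit) of what the guarded branch above does when disable_bonus_tokens is True: the last column of the sampler output holds the bonus token for each sequence, and it is overwritten with -1 so no bonus token is emitted while the proposer's KV cache has no entry for it.

import torch

batch_size, k = 3, 4
# Fake sampler output: k accepted/recovered tokens plus one bonus slot.
output_with_bonus_tokens = torch.randint(0, 100, (batch_size, k + 1))

disable_bonus_tokens = True
if disable_bonus_tokens:
    # Same masking as in RejectionSampler._create_output above.
    output_with_bonus_tokens[:, -1] = -1

print(output_with_bonus_tokens[:, -1])  # tensor([-1, -1, -1])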
6 changes: 6 additions & 0 deletions vllm/sequence.py
@@ -612,6 +612,12 @@ def __init__(
self._token_chunk_size = token_chunk_size
self.do_sample = do_sample

# The number of speculative tokens adopted in this request.
# None means speculative decoding is not used.
# Zero means speculative decoding is disabled for some reason.
# TODO: We should maintain this state outside of the sequence group.
self.num_speculative_tokens = None

if self._token_chunk_size is None:
if is_prompt:
self._token_chunk_size = list(seq_data.values())[0].get_len()