From fbd1189136236665649fb5ce89df0d3221b5f87f Mon Sep 17 00:00:00 2001 From: Tomer Asida Date: Thu, 25 Jul 2024 15:11:24 +0300 Subject: [PATCH] feat: don't enable chunked prefill by default for long seqlen models that have seqlen agnostic cache (Jamba) --- vllm/engine/arg_utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index cd64d3345b83..bad5be491721 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -754,10 +754,14 @@ def create_engine_config(self, ) -> EngineConfig: use_sliding_window = (model_config.get_sliding_window() is not None) use_spec_decode = self.speculative_model is not None + has_seqlen_agnostic_layers = ( + model_config.contains_seqlen_agnostic_layers( + parallel_config)) if (is_gpu and not use_sliding_window and not use_spec_decode and not self.enable_lora and not self.enable_prompt_adapter - and not self.enable_prefix_caching): + and not self.enable_prefix_caching + and not has_seqlen_agnostic_layers): self.enable_chunked_prefill = True logger.warning( "Chunked prefill is enabled by default for models with "