From fbd1189136236665649fb5ce89df0d3221b5f87f Mon Sep 17 00:00:00 2001
From: Tomer Asida <tomera@ai21.com>
Date: Thu, 25 Jul 2024 15:11:24 +0300
Subject: [PATCH] feat: don't enable chunked prefill by default for long-seqlen
 models that have a seqlen-agnostic cache (Jamba)

---
 vllm/engine/arg_utils.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index cd64d3345b83..bad5be491721 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -754,10 +754,14 @@ def create_engine_config(self, ) -> EngineConfig:
                 use_sliding_window = (model_config.get_sliding_window()
                                       is not None)
                 use_spec_decode = self.speculative_model is not None
+                has_seqlen_agnostic_layers = (
+                    model_config.contains_seqlen_agnostic_layers(
+                        parallel_config))
                 if (is_gpu and not use_sliding_window and not use_spec_decode
                         and not self.enable_lora
                         and not self.enable_prompt_adapter
-                        and not self.enable_prefix_caching):
+                        and not self.enable_prefix_caching
+                        and not has_seqlen_agnostic_layers):
                     self.enable_chunked_prefill = True
                     logger.warning(
                         "Chunked prefill is enabled by default for models with "