From b96673629c82ddc880258586067a75ea9a168170 Mon Sep 17 00:00:00 2001
From: Tomer Asida
Date: Mon, 8 Jul 2024 15:50:02 +0300
Subject: [PATCH 1/3] bugfix: when working in CG mode, the batch size should be
 taken from the input_ids shape and not from the number of sequences, so that
 the mamba cache is padded to the captured CG batch sizes

---
 vllm/model_executor/models/jamba.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py
index bf330c7770d1..5aa42ec1176d 100644
--- a/vllm/model_executor/models/jamba.py
+++ b/vllm/model_executor/models/jamba.py
@@ -788,7 +788,7 @@ def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs):
             key in kwargs
             for key in ["request_ids_to_seq_ids", "finished_requests_ids"])
         request_ids_to_seq_ids = kwargs["request_ids_to_seq_ids"]
-        batch_size = len(request_ids_to_seq_ids)
+        batch_size = input_buffers['input_ids'].shape[0]
         (
             current_mamba_cache,
             indices,

From 612df99894f21515976719280722a1cf5fd1027a Mon Sep 17 00:00:00 2001
From: Tomer Asida
Date: Mon, 8 Jul 2024 16:10:24 +0300
Subject: [PATCH 2/3] Add relevant test

---
 tests/models/test_jamba.py | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/tests/models/test_jamba.py b/tests/models/test_jamba.py
index d7e3a2fc4a71..0a5fe19f80ec 100644
--- a/tests/models/test_jamba.py
+++ b/tests/models/test_jamba.py
@@ -1,5 +1,7 @@
 import pytest
 
+from vllm.worker.model_runner import _get_graph_batch_size
+
 MODELS = ["ai21labs/Jamba-tiny-random"]
 
 
@@ -32,6 +34,32 @@ def test_models(
             f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
 
 
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("max_tokens", [20])
+def test_mamba_cache_cg_padding(
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+) -> None:
+    # This test is for verifying that mamba cache is padded to CG captured
+    # batch size. If it's not, a torch RuntimeError will be raised because
+    # tensor dimensions aren't compatible
+    while len(example_prompts) == _get_graph_batch_size(len(example_prompts)):
+        example_prompts.append(example_prompts[0])
+
+    try:
+        with vllm_runner(model, dtype=dtype) as vllm_model:
+            vllm_model.generate_greedy(example_prompts, max_tokens)
+    except RuntimeError:
+        pytest.fail(
+            "Couldn't run batch size which is not equal to a Cuda Graph "
+            "captured batch size. "
+            "Could be related to mamba cache not padded correctly")
+
+
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["float"])
 def test_state_cleanup(

From e0b49e4dc6d1337c0602df238bdfbfae0e6bbe60 Mon Sep 17 00:00:00 2001
From: Tomer Asida
Date: Mon, 8 Jul 2024 16:26:28 +0300
Subject: [PATCH 3/3] rename batch_size -> cg_batch_size for clarity

---
 vllm/model_executor/models/jamba.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py
index 5aa42ec1176d..4524d8df86b9 100644
--- a/vllm/model_executor/models/jamba.py
+++ b/vllm/model_executor/models/jamba.py
@@ -788,12 +788,12 @@ def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs):
             key in kwargs
             for key in ["request_ids_to_seq_ids", "finished_requests_ids"])
         request_ids_to_seq_ids = kwargs["request_ids_to_seq_ids"]
-        batch_size = input_buffers['input_ids'].shape[0]
+        cg_batch_size = input_buffers['input_ids'].shape[0]
         (
             current_mamba_cache,
             indices,
         ) = self._prepare_current_run_mamba_cache(request_ids_to_seq_ids,
                                                   batch_size)
+                                                  cg_batch_size)
         self.current_indices = indices
         finished_requests_ids = kwargs["finished_requests_ids"]
         self._release_mamba_cache(finished_requests_ids)