
Commit ba906df: Fixing graph capture for flash decoding. (#2163)
Narsil authored and ErikKaum committed Jul 26, 2024
1 parent bec6a17 commit ba906df
Showing 1 changed file with 3 additions and 2 deletions.
server/text_generation_server/models/flash_causal_lm.py (5 changes: 3 additions & 2 deletions)
@@ -926,7 +926,7 @@ def cuda_graph_warmup(self, bs: int, max_s: int, max_bt: int):
             "slots": slots,
             "input_lengths": input_lengths,
         }
-        input_lengths = Seqlen(input_lengths=input_lengths)
+        input_lengths_ = Seqlen(input_lengths=input_lengths)
         graph = torch.cuda.CUDAGraph()
         self.cuda_graphs[bs]["graph"] = graph

@@ -939,14 +939,15 @@ def cuda_graph_warmup(self, bs: int, max_s: int, max_bt: int):
             kv_cache=self.kv_cache,
             block_tables=block_tables,
             slots=slots,
-            input_lengths=input_lengths,
+            input_lengths=input_lengths_,
             max_s=max_s,
             prefill_cache_indices=None,
             lm_head_indices=None,
         )
         torch.cuda.synchronize()

         with torch.cuda.graph(graph, pool=MEM_POOL):
+            input_lengths = Seqlen(input_lengths=input_lengths)
             logits, speculative_logits = self.model.forward(
                 input_ids=input_ids,
                 position_ids=position_ids,

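For context, the change binds the Seqlen wrapper used for the warmup pass to a separate name (input_lengths_) so the raw input_lengths tensor is not shadowed, and then rebuilds the wrapper inside the torch.cuda.graph capture block. Below is a minimal, self-contained sketch of that warmup-then-capture pattern; the toy forward function and the stand-in Seqlen class are assumptions for illustration, not the project's code, and the real method also passes a shared memory pool (pool=MEM_POOL) to torch.cuda.graph. Running it requires a CUDA-capable GPU.

import torch


class Seqlen:
    # Stand-in for text_generation_server's Seqlen wrapper (an assumption for
    # this sketch): it simply holds a reference to the lengths tensor.
    def __init__(self, input_lengths):
        self.input_lengths = input_lengths


def forward(x, input_lengths):
    # Toy computation standing in for self.model.forward.
    return x * input_lengths.input_lengths.unsqueeze(1).float()


device = "cuda"
x = torch.ones(4, 8, device=device)
input_lengths = torch.ones(4, dtype=torch.int32, device=device)

graph = torch.cuda.CUDAGraph()

# Warmup pass outside the graph, mirroring the diff: the wrapper gets a
# separate name so the raw `input_lengths` tensor stays available below.
input_lengths_ = Seqlen(input_lengths=input_lengths)
torch.cuda.synchronize()
forward(x, input_lengths_)
torch.cuda.synchronize()

with torch.cuda.graph(graph):
    # Rebuild the wrapper inside the capture so the recorded kernels read
    # from the persistent `input_lengths` tensor.
    input_lengths = Seqlen(input_lengths=input_lengths)
    out = forward(x, input_lengths)

# Replay with new data: copy fresh values into the captured tensor, replay,
# and `out` (allocated during capture) now reflects the new lengths.
input_lengths.input_lengths.copy_(
    torch.tensor([2, 3, 4, 5], dtype=torch.int32, device=device)
)
graph.replay()
print(out[:, 0])  # tensor([2., 3., 4., 5.], device='cuda:0')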