From ad0087787ef38f4919af0d6bda28aa2d7916a042 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sun, 26 Jan 2025 14:07:54 -0800 Subject: [PATCH 1/2] [V1] Avoid list creation in input preparation Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu_model_runner.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 6339f1f03f11d..be407ad6b46fa 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -358,8 +358,15 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): # Get batched arange. # E.g., [2, 5, 3] -> [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] - arange = np.concatenate( - [self.arange_np[:n] for n in num_scheduled_tokens]) + # Equivalent to but faster than: + # np.concatenate([np.arange(n) for n in num_scheduled_tokens]) + # Step 1. [2, 5, 3] -> [2, 7, 10] + cu_num_tokens = np.cumsum(num_scheduled_tokens) + # Step 2. [2, 7, 10] -> [0, 0, 2, 2, 2, 2, 2, 7, 7, 7] + cumsums_offsets = np.repeat(cu_num_tokens - num_scheduled_tokens, + num_scheduled_tokens) + # Step 3. [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] + arange = self.arange_np[:total_num_scheduled_tokens] - cumsums_offsets # Get positions. positions_np = self.positions_np[:total_num_scheduled_tokens] @@ -406,8 +413,7 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): # Prepare the attention metadata. self.query_start_loc_np[0] = 0 - np.cumsum(num_scheduled_tokens, - out=self.query_start_loc_np[1:num_reqs + 1]) + self.query_start_loc_np[1:num_reqs + 1] = cu_num_tokens self.seq_lens_np[:num_reqs] = ( self.input_batch.num_computed_tokens_cpu[:num_reqs] + From 31b1a8ad9560d6e1cc93dedc566fc6b0b9566791 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sun, 26 Jan 2025 16:03:13 -0800 Subject: [PATCH 2/2] Fix Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu_model_runner.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index be407ad6b46fa..9d7e30079dfbb 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -171,7 +171,8 @@ def __init__( # OPTIMIZATION: Cache the tensors rather than creating them every step. self.arange_np = np.arange(max(self.max_num_reqs + 1, - self.max_model_len), + self.max_model_len, + self.max_num_tokens), dtype=np.int32) # NOTE(woosuk): These tensors are "stateless", i.e., they are literally # a faster version of creating a new tensor every time. Thus, we should