From f374f3df445b808760fa3374743e517e094c353d Mon Sep 17 00:00:00 2001 From: Joe Runde Date: Mon, 14 Oct 2024 17:08:53 -0600 Subject: [PATCH 01/11] :alembic: debugging memory measurement Signed-off-by: Joe Runde --- vllm/model_executor/models/mllama.py | 6 ++++ vllm/multimodal/registry.py | 2 ++ vllm/worker/enc_dec_model_runner.py | 44 +++++++++++++++++++++++++--- vllm/worker/worker.py | 15 ++++++++-- 4 files changed, 61 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 66e9b2844620..40b2a26fcd57 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -1120,6 +1120,9 @@ def flat_encoder_result(self, cross_attention_states: torch.Tensor, cross_attention_states.shape[-1], device=cross_attention_states.device, dtype=cross_attention_states.dtype) + + print(f"\n\n\n CROSS ATTN STATES SHAPE: {cross_attention_states_flat.shape} \n\n\n") + start_pos = 0 for seq_len, vision_token_in_batch in zip(actual_encoder_seq_lens, cross_attention_states): @@ -1217,6 +1220,9 @@ def forward( attn_metadata: AttentionMetadata, **kwargs: object, ) -> Union[Tuple, CausalLMOutputWithPast]: + + print("\n\n\n MLLAMA FORWARD \n\n\n") + if attn_metadata.num_prefill_tokens > 0 and \ attn_metadata.num_decode_tokens > 0: raise ValueError("Chunk prefill not supported") diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 5e9b8bd518de..fee57c2ef2db 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -189,6 +189,8 @@ def get_max_multimodal_tokens(self, model_config: ModelConfig) -> int: """ limits_per_plugin = self._limits_by_model[model_config] + print(f"\n\n\n limits_per_plugin: {limits_per_plugin}\n\n]n") + return sum((limits_per_plugin[key] * plugin.get_max_multimodal_tokens(model_config)) for key, plugin in self._plugins.items()) diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index 6a00444f5098..12acffbb8c93 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -300,10 +300,32 @@ def profile_run(self) -> None: if max_mm_tokens > 0: logger.info("Starting profile run for multi-modal models.") + # # Find the maximum number of multimodal inputs (e.g. images) that will + # # fit into the `max_num_batched_tokens` budget. 
+ # max_multimodal_inputs = max_num_batched_tokens // max_mm_tokens + # # Use the leftover tokens for text for each sequence + # leftover_tokens = max_num_batched_tokens % max_mm_tokens + # if leftover_tokens < max_num_seqs: + # # If we don't have at least one token left over for each sequence, + # # then remove one multi-modal input + # max_multimodal_inputs -= 1 + # leftover_tokens += max_mm_tokens + # assert max_multimodal_inputs > 0, "No room to fit a single " \ + # "multi-modal input within the scheduling budget" + + # original logic: + leftover_tokens = max_num_batched_tokens + max_multimodal_inputs = max_num_seqs + + print(f"\n\n\nMAX IMAGES: {max_multimodal_inputs}, LEFTOVER_TOKENS: {leftover_tokens}") + + # start profile + torch.cuda.memory._record_memory_history(max_entries=100000) + batch_size = 0 for group_id in range(max_num_seqs): - seq_len = (max_num_batched_tokens // max_num_seqs + - (group_id < max_num_batched_tokens % max_num_seqs)) + seq_len = (leftover_tokens // max_num_seqs + + (group_id < leftover_tokens % max_num_seqs)) batch_size += seq_len decoder_seq_data, decoder_dummy_multi_modal_data \ @@ -329,6 +351,11 @@ def profile_run(self) -> None: "Multi-modal data can't be provided in both encoder and decoder" ) + if group_id < max_multimodal_inputs: + multi_modal_data = decoder_dummy_multi_modal_data or encoder_dummy_multi_modal_data + else: + multi_modal_data = None + seq = SequenceGroupMetadata( request_id=str(group_id), is_prompt=True, @@ -337,8 +364,7 @@ def profile_run(self) -> None: block_tables=None, encoder_seq_data=encoder_seq_data, cross_block_table=None, - multi_modal_data=decoder_dummy_multi_modal_data - or encoder_dummy_multi_modal_data, + multi_modal_data=multi_modal_data, ) seqs.append(seq) @@ -358,6 +384,16 @@ def profile_run(self) -> None: intermediate_tensors = None self.execute_model(model_input, kv_caches, intermediate_tensors) torch.cuda.synchronize() + + # Record memory usage + try: + torch.cuda.memory._dump_snapshot(f"/tmp/forward_profile.pickle") + except Exception as e: + logger.error(f"Failed to capture memory snapshot {e}") + + # Stop profiling + torch.cuda.memory._record_memory_history(enabled=None) + return def _prepare_encoder_model_input_tensors( diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index ab61e4377f90..2a076645c87a 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -217,9 +217,11 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: # Profile the memory usage of the model and get the maximum number of # cache blocks that can be allocated with the remaining free memory. torch.cuda.empty_cache() + torch.cuda.reset_peak_memory_stats() # Execute a forward pass with dummy inputs to profile the memory usage # of the model. + print("\n\n\n CALLING PROFILE_RUN \n\n\n") self.model_runner.profile_run() # Calculate the number of blocks that can be allocated with the @@ -228,14 +230,23 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info() # NOTE(woosuk): Here we assume that the other processes using the same # GPU did not change their memory usage during the profiling. - peak_memory = self.init_gpu_memory - free_gpu_memory - assert peak_memory > 0, ( + assert self.init_gpu_memory - free_gpu_memory > 0, ( "Error in memory profiling. " f"Initial free memory {self.init_gpu_memory}, current free memory" f" {free_gpu_memory}. 
This happens when the GPU memory was " "not properly cleaned up before initializing the vLLM instance.") + + # Get the peak memory allocation recorded by torch + peak_memory = torch.cuda.memory_stats()["allocated_bytes.all.peak"] + + print(f"\n\n\n TOTAL VRAM: {total_gpu_memory} | INITIAL VRAM: {self.init_gpu_memory} | FREE VRAM: {free_gpu_memory} | PEAK VRAM: {peak_memory}\n\n\n") cache_block_size = self.get_cache_block_size_bytes() + print(f"\n\n\n CACHE BLOCK SIZE BYTES: {cache_block_size}\n\n\n") + + stats = torch.cuda.memory_stats() + print(f"\n\n\n STATS: {stats}\n\n\n") + if cache_block_size == 0: num_gpu_blocks = 0 num_cpu_blocks = 0 From 6f12a0127c7edaa3b188ffa63f4c1eea17b60220 Mon Sep 17 00:00:00 2001 From: Joe Runde Date: Mon, 14 Oct 2024 17:12:01 -0600 Subject: [PATCH 02/11] :art: cleanup Signed-off-by: Joe Runde --- vllm/model_executor/models/mllama.py | 6 ---- vllm/multimodal/registry.py | 2 -- vllm/worker/enc_dec_model_runner.py | 44 +++------------------------- vllm/worker/worker.py | 10 +------ 4 files changed, 5 insertions(+), 57 deletions(-) diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 40b2a26fcd57..66e9b2844620 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -1120,9 +1120,6 @@ def flat_encoder_result(self, cross_attention_states: torch.Tensor, cross_attention_states.shape[-1], device=cross_attention_states.device, dtype=cross_attention_states.dtype) - - print(f"\n\n\n CROSS ATTN STATES SHAPE: {cross_attention_states_flat.shape} \n\n\n") - start_pos = 0 for seq_len, vision_token_in_batch in zip(actual_encoder_seq_lens, cross_attention_states): @@ -1220,9 +1217,6 @@ def forward( attn_metadata: AttentionMetadata, **kwargs: object, ) -> Union[Tuple, CausalLMOutputWithPast]: - - print("\n\n\n MLLAMA FORWARD \n\n\n") - if attn_metadata.num_prefill_tokens > 0 and \ attn_metadata.num_decode_tokens > 0: raise ValueError("Chunk prefill not supported") diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index fee57c2ef2db..5e9b8bd518de 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -189,8 +189,6 @@ def get_max_multimodal_tokens(self, model_config: ModelConfig) -> int: """ limits_per_plugin = self._limits_by_model[model_config] - print(f"\n\n\n limits_per_plugin: {limits_per_plugin}\n\n]n") - return sum((limits_per_plugin[key] * plugin.get_max_multimodal_tokens(model_config)) for key, plugin in self._plugins.items()) diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index 12acffbb8c93..6a00444f5098 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -300,32 +300,10 @@ def profile_run(self) -> None: if max_mm_tokens > 0: logger.info("Starting profile run for multi-modal models.") - # # Find the maximum number of multimodal inputs (e.g. images) that will - # # fit into the `max_num_batched_tokens` budget. 
- # max_multimodal_inputs = max_num_batched_tokens // max_mm_tokens - # # Use the leftover tokens for text for each sequence - # leftover_tokens = max_num_batched_tokens % max_mm_tokens - # if leftover_tokens < max_num_seqs: - # # If we don't have at least one token left over for each sequence, - # # then remove one multi-modal input - # max_multimodal_inputs -= 1 - # leftover_tokens += max_mm_tokens - # assert max_multimodal_inputs > 0, "No room to fit a single " \ - # "multi-modal input within the scheduling budget" - - # original logic: - leftover_tokens = max_num_batched_tokens - max_multimodal_inputs = max_num_seqs - - print(f"\n\n\nMAX IMAGES: {max_multimodal_inputs}, LEFTOVER_TOKENS: {leftover_tokens}") - - # start profile - torch.cuda.memory._record_memory_history(max_entries=100000) - batch_size = 0 for group_id in range(max_num_seqs): - seq_len = (leftover_tokens // max_num_seqs + - (group_id < leftover_tokens % max_num_seqs)) + seq_len = (max_num_batched_tokens // max_num_seqs + + (group_id < max_num_batched_tokens % max_num_seqs)) batch_size += seq_len decoder_seq_data, decoder_dummy_multi_modal_data \ @@ -351,11 +329,6 @@ def profile_run(self) -> None: "Multi-modal data can't be provided in both encoder and decoder" ) - if group_id < max_multimodal_inputs: - multi_modal_data = decoder_dummy_multi_modal_data or encoder_dummy_multi_modal_data - else: - multi_modal_data = None - seq = SequenceGroupMetadata( request_id=str(group_id), is_prompt=True, @@ -364,7 +337,8 @@ def profile_run(self) -> None: block_tables=None, encoder_seq_data=encoder_seq_data, cross_block_table=None, - multi_modal_data=multi_modal_data, + multi_modal_data=decoder_dummy_multi_modal_data + or encoder_dummy_multi_modal_data, ) seqs.append(seq) @@ -384,16 +358,6 @@ def profile_run(self) -> None: intermediate_tensors = None self.execute_model(model_input, kv_caches, intermediate_tensors) torch.cuda.synchronize() - - # Record memory usage - try: - torch.cuda.memory._dump_snapshot(f"/tmp/forward_profile.pickle") - except Exception as e: - logger.error(f"Failed to capture memory snapshot {e}") - - # Stop profiling - torch.cuda.memory._record_memory_history(enabled=None) - return def _prepare_encoder_model_input_tensors( diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 2a076645c87a..7ff1ad711e8e 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -221,7 +221,6 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: # Execute a forward pass with dummy inputs to profile the memory usage # of the model. - print("\n\n\n CALLING PROFILE_RUN \n\n\n") self.model_runner.profile_run() # Calculate the number of blocks that can be allocated with the @@ -235,18 +234,11 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: f"Initial free memory {self.init_gpu_memory}, current free memory" f" {free_gpu_memory}. 
This happens when the GPU memory was " "not properly cleaned up before initializing the vLLM instance.") - + # Get the peak memory allocation recorded by torch peak_memory = torch.cuda.memory_stats()["allocated_bytes.all.peak"] - print(f"\n\n\n TOTAL VRAM: {total_gpu_memory} | INITIAL VRAM: {self.init_gpu_memory} | FREE VRAM: {free_gpu_memory} | PEAK VRAM: {peak_memory}\n\n\n") - cache_block_size = self.get_cache_block_size_bytes() - print(f"\n\n\n CACHE BLOCK SIZE BYTES: {cache_block_size}\n\n\n") - - stats = torch.cuda.memory_stats() - print(f"\n\n\n STATS: {stats}\n\n\n") - if cache_block_size == 0: num_gpu_blocks = 0 num_cpu_blocks = 0 From 3eb8293b57b1c1b16ed2cc79dbd09fcb95d1918b Mon Sep 17 00:00:00 2001 From: Joe Runde Date: Tue, 15 Oct 2024 12:46:13 -0600 Subject: [PATCH 03/11] :bug: fixup broken test, add more logs Signed-off-by: Joe Runde --- tests/entrypoints/llm/test_lazy_outlines.py | 2 +- vllm/worker/worker.py | 16 +++++++++++++--- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/tests/entrypoints/llm/test_lazy_outlines.py b/tests/entrypoints/llm/test_lazy_outlines.py index 39480531f586..94b02b95b5dc 100644 --- a/tests/entrypoints/llm/test_lazy_outlines.py +++ b/tests/entrypoints/llm/test_lazy_outlines.py @@ -29,7 +29,7 @@ def test_lazy_outlines(sample_regex): llm = LLM(model="facebook/opt-125m", enforce_eager=True, guided_decoding_backend="lm-format-enforcer", - gpu_memory_utilization=0.3) + gpu_memory_utilization=0.6) sampling_params = SamplingParams(temperature=0.8, top_p=0.95) outputs = llm.generate( prompts=[ diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 7ff1ad711e8e..fcbf047e4f47 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -237,15 +237,25 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: # Get the peak memory allocation recorded by torch peak_memory = torch.cuda.memory_stats()["allocated_bytes.all.peak"] + available_kv_cache_memory = ( + total_gpu_memory * self.cache_config.gpu_memory_utilization - + peak_memory) + + logger.info("Initial memory usage before profile: %.2f GB", + (total_gpu_memory - self.init_gpu_memory) / (1024**3)) + logger.info("Peak memory usage during profile: %.2f GB", + peak_memory / (1024**3)) + logger.info( + "Available memory for KV cache with %.2f gpu utilization: %.2f GB", + self.cache_config.gpu_memory_utilization, + available_kv_cache_memory / (1024**3)) cache_block_size = self.get_cache_block_size_bytes() if cache_block_size == 0: num_gpu_blocks = 0 num_cpu_blocks = 0 else: - num_gpu_blocks = int( - (total_gpu_memory * self.cache_config.gpu_memory_utilization - - peak_memory) // cache_block_size) + num_gpu_blocks = int(available_kv_cache_memory // cache_block_size) num_cpu_blocks = int(self.cache_config.swap_space_bytes // cache_block_size) num_gpu_blocks = max(num_gpu_blocks, 0) From fbb5e8f5b85b5cd3f5dd0fa58dad6b61118bf92b Mon Sep 17 00:00:00 2001 From: Joe Runde Date: Tue, 15 Oct 2024 12:53:00 -0600 Subject: [PATCH 04/11] :goal_net: handle non-torch allocations Signed-off-by: Joe Runde --- vllm/worker/worker.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index fcbf047e4f47..20b0f0c52d3c 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -241,6 +241,18 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: total_gpu_memory * self.cache_config.gpu_memory_utilization - peak_memory) + # Edge case: Check for any memory left around that may have been + # allocated on the 
gpu outside of `torch` + torch.cuda.empty_cache() + leftover_allocations = torch.cuda.mem_get_info( + )[0] - self.init_gpu_memory + if leftover_allocations > 0: + logger.info( + "Found %.2f GB of allocated memory leftover after clearing " + "torch cache. Adding to peak memory usage.", + leftover_allocations / (1024**3)) + peak_memory += leftover_allocations + logger.info("Initial memory usage before profile: %.2f GB", (total_gpu_memory - self.init_gpu_memory) / (1024**3)) logger.info("Peak memory usage during profile: %.2f GB", @@ -263,7 +275,6 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: if self.model_runner.lora_manager: self.model_runner.remove_all_loras() gc.collect() - torch.cuda.empty_cache() return num_gpu_blocks, num_cpu_blocks def initialize_cache(self, num_gpu_blocks: int, From 7f9b77cba0c9b7c71a14278720731966f24160aa Mon Sep 17 00:00:00 2001 From: Joe Runde Date: Tue, 15 Oct 2024 12:56:37 -0600 Subject: [PATCH 05/11] :bug: move kv_cache size calculation Signed-off-by: Joe Runde --- vllm/worker/worker.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 20b0f0c52d3c..00c4a82eb567 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -237,9 +237,6 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: # Get the peak memory allocation recorded by torch peak_memory = torch.cuda.memory_stats()["allocated_bytes.all.peak"] - available_kv_cache_memory = ( - total_gpu_memory * self.cache_config.gpu_memory_utilization - - peak_memory) # Edge case: Check for any memory left around that may have been # allocated on the gpu outside of `torch` @@ -253,6 +250,10 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: leftover_allocations / (1024**3)) peak_memory += leftover_allocations + available_kv_cache_memory = ( + total_gpu_memory * self.cache_config.gpu_memory_utilization - + peak_memory) + logger.info("Initial memory usage before profile: %.2f GB", (total_gpu_memory - self.init_gpu_memory) / (1024**3)) logger.info("Peak memory usage during profile: %.2f GB", From 994b2a3f6b714ca088b4bb0cc1e32ba790c623c8 Mon Sep 17 00:00:00 2001 From: Joe Runde Date: Tue, 15 Oct 2024 14:25:08 -0600 Subject: [PATCH 06/11] :bug: fixup offline mode test Signed-off-by: Joe Runde --- tests/entrypoints/offline_mode/test_offline_mode.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/entrypoints/offline_mode/test_offline_mode.py b/tests/entrypoints/offline_mode/test_offline_mode.py index 0b6026a89c75..fe40af271c1c 100644 --- a/tests/entrypoints/offline_mode/test_offline_mode.py +++ b/tests/entrypoints/offline_mode/test_offline_mode.py @@ -44,7 +44,7 @@ def test_offline_mode(llm: LLM, monkeypatch): LLM(model=MODEL_NAME, max_num_batched_tokens=4096, tensor_parallel_size=1, - gpu_memory_utilization=0.10, + gpu_memory_utilization=0.20, enforce_eager=True) finally: # Reset the environment after the test From d0eee6351eee8e1870f240eeb6d9909ccee4d646 Mon Sep 17 00:00:00 2001 From: Joe Runde Date: Wed, 16 Oct 2024 18:47:13 -0600 Subject: [PATCH 07/11] :bug: fixup non-torch allocation detection Signed-off-by: Joe Runde --- vllm/worker/worker.py | 69 ++++++++++++++++++++++++------------------- 1 file changed, 39 insertions(+), 30 deletions(-) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 00c4a82eb567..5160248a4a79 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -219,50 +219,35 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: 
torch.cuda.empty_cache() torch.cuda.reset_peak_memory_stats() + free_memory_pre_profile, total_gpu_memory = torch.cuda.mem_get_info() + # Execute a forward pass with dummy inputs to profile the memory usage # of the model. self.model_runner.profile_run() - - # Calculate the number of blocks that can be allocated with the - # profiled peak memory. torch.cuda.synchronize() - free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info() - # NOTE(woosuk): Here we assume that the other processes using the same - # GPU did not change their memory usage during the profiling. - assert self.init_gpu_memory - free_gpu_memory > 0, ( - "Error in memory profiling. " - f"Initial free memory {self.init_gpu_memory}, current free memory" - f" {free_gpu_memory}. This happens when the GPU memory was " - "not properly cleaned up before initializing the vLLM instance.") + + self.assert_no_other_gpu_processes() # Get the peak memory allocation recorded by torch peak_memory = torch.cuda.memory_stats()["allocated_bytes.all.peak"] - # Edge case: Check for any memory left around that may have been - # allocated on the gpu outside of `torch` + # Check for any memory left around that may have been allocated on the + # gpu outside of `torch`. NCCL operations, for example, can use a few + # GB during a forward pass torch.cuda.empty_cache() - leftover_allocations = torch.cuda.mem_get_info( - )[0] - self.init_gpu_memory - if leftover_allocations > 0: - logger.info( - "Found %.2f GB of allocated memory leftover after clearing " - "torch cache. Adding to peak memory usage.", - leftover_allocations / (1024**3)) - peak_memory += leftover_allocations + # After emptying the torch cache, any other increase in gpu ram should + # be from non-torch allocations. + non_torch_allocations = free_memory_pre_profile - \ + torch.cuda.mem_get_info()[0] + if non_torch_allocations > 0: + peak_memory += non_torch_allocations available_kv_cache_memory = ( total_gpu_memory * self.cache_config.gpu_memory_utilization - peak_memory) - logger.info("Initial memory usage before profile: %.2f GB", - (total_gpu_memory - self.init_gpu_memory) / (1024**3)) - logger.info("Peak memory usage during profile: %.2f GB", - peak_memory / (1024**3)) - logger.info( - "Available memory for KV cache with %.2f gpu utilization: %.2f GB", - self.cache_config.gpu_memory_utilization, - available_kv_cache_memory / (1024**3)) - + # Calculate the number of blocks that can be allocated with the + # profiled peak memory. 
cache_block_size = self.get_cache_block_size_bytes() if cache_block_size == 0: num_gpu_blocks = 0 @@ -273,11 +258,35 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: cache_block_size) num_gpu_blocks = max(num_gpu_blocks, 0) num_cpu_blocks = max(num_cpu_blocks, 0) + + logger.info( + "Memory profiling results: total_gpu_memory=%.2fGiB" + " initial_memory_usage=%.2fGiB peak_torch_memory=%.2fGiB" + " non_torch_memory=%.2fGiB kv_cache_size=%.2fGiB" + " gpu_memory_utilization=%.2f", total_gpu_memory / (1024**3), + (total_gpu_memory - free_memory_pre_profile) / (1024**3), + (peak_memory - non_torch_allocations) / (1024**3), + non_torch_allocations / (1024**3), + available_kv_cache_memory / (1024**3), + self.cache_config.gpu_memory_utilization) + + # Final cleanup if self.model_runner.lora_manager: self.model_runner.remove_all_loras() gc.collect() + return num_gpu_blocks, num_cpu_blocks + def assert_no_other_gpu_processes(self): + # NOTE(woosuk): Here we assume that the other processes using the same + # GPU did not change their memory usage during the profiling. + free_gpu_memory, _ = torch.cuda.mem_get_info() + assert self.init_gpu_memory - free_gpu_memory > 0, ( + "Error in memory profiling. " + f"Initial free memory {self.init_gpu_memory}, current free memory" + f" {free_gpu_memory}. This happens when the GPU memory was " + "not properly cleaned up before initializing the vLLM instance.") + def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: """Allocate GPU and CPU KV cache with the specified number of blocks. From 4255af2ccc993911b19f3042896102a151b7bd8e Mon Sep 17 00:00:00 2001 From: Joe Runde Date: Thu, 17 Oct 2024 10:15:23 -0600 Subject: [PATCH 08/11] :white_check_mark: Test gpu profiling Signed-off-by: Joe Runde --- tests/worker/test_profile.py | 59 ++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 tests/worker/test_profile.py diff --git a/tests/worker/test_profile.py b/tests/worker/test_profile.py new file mode 100644 index 000000000000..84e77be0467f --- /dev/null +++ b/tests/worker/test_profile.py @@ -0,0 +1,59 @@ +import torch + +from vllm.engine.arg_utils import EngineArgs +from vllm.utils import get_distributed_init_method, get_ip, get_open_port +from vllm.worker.cache_engine import CacheEngine +from vllm.worker.worker import Worker + + +def test_profile(): + # Set up engine args to build a worker. + engine_args = EngineArgs(model="facebook/opt-125m", + dtype="half", + load_format="dummy") + engine_config = engine_args.create_engine_config() + engine_config.cache_config.num_gpu_blocks = 1000 + engine_config.cache_config.num_cpu_blocks = 1000 + + # Create the worker. 
+ distributed_init_method = get_distributed_init_method( + get_ip(), get_open_port()) + worker = Worker( + model_config=engine_config.model_config, + parallel_config=engine_config.parallel_config, + scheduler_config=engine_config.scheduler_config, + device_config=engine_config.device_config, + cache_config=engine_config.cache_config, + load_config=engine_config.load_config, + local_rank=0, + rank=0, + distributed_init_method=distributed_init_method, + is_driver_worker=True, + ) + + # Load the model so we can profile it + worker.init_device() + worker.load_model() + + # Set 10GiB as the mock total gpu ram so this test can be device-agnostic + def mock_mem_info(): + current_usage = torch.cuda.memory_stats( + )["allocated_bytes.all.current"] + mock_total_bytes = 10 * 1024**3 + free = mock_total_bytes - current_usage + + return (free, mock_total_bytes) + + from unittest.mock import patch + with patch("torch.cuda.mem_get_info", side_effect=mock_mem_info): + gpu_blocks, _ = worker.determine_num_available_blocks() + + # Peak vram usage by torch should be 0.7077 GiB + # Non-torch allocations should be 0.0079 GiB + # 9.0 GiB should be the utilization target + # 8.2843 GiB should be available for the KV cache + block_size = CacheEngine.get_cache_block_size( + engine_config.cache_config, engine_config.model_config, + engine_config.parallel_config) + + assert gpu_blocks == (8.2843 * 1024**3) // block_size From 4b17056fea9b5918ee5827e217435f3d6abb4f30 Mon Sep 17 00:00:00 2001 From: Joe Runde Date: Thu, 17 Oct 2024 11:32:49 -0600 Subject: [PATCH 09/11] :memo: update test description Signed-off-by: Joe Runde --- tests/worker/test_profile.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/worker/test_profile.py b/tests/worker/test_profile.py index 84e77be0467f..56e63f67f55a 100644 --- a/tests/worker/test_profile.py +++ b/tests/worker/test_profile.py @@ -6,7 +6,12 @@ from vllm.worker.worker import Worker -def test_profile(): +def test_gpu_memory_profiling(): + # Tests the gpu profiling that happens in order to determine the number of + # KV cache blocks that we can allocate on the GPU. + # This test mocks the maximum available gpu memory so that it can run on + # any gpu setup. + # Set up engine args to build a worker. 
engine_args = EngineArgs(model="facebook/opt-125m", dtype="half", @@ -35,7 +40,7 @@ def test_profile(): worker.init_device() worker.load_model() - # Set 10GiB as the mock total gpu ram so this test can be device-agnostic + # Set 10GiB as the total gpu ram to be device-agnostic def mock_mem_info(): current_usage = torch.cuda.memory_stats( )["allocated_bytes.all.current"] From b9e279bb198432e6014b45221f8d6c3d22e33bed Mon Sep 17 00:00:00 2001 From: Joe Runde Date: Thu, 17 Oct 2024 12:47:09 -0600 Subject: [PATCH 10/11] :recycle: rename internal fn Signed-off-by: Joe Runde --- vllm/worker/worker.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 5160248a4a79..9c46bb425860 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -226,7 +226,7 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: self.model_runner.profile_run() torch.cuda.synchronize() - self.assert_no_other_gpu_processes() + self._assert_memory_footprint_increased_during_profiling() # Get the peak memory allocation recorded by torch peak_memory = torch.cuda.memory_stats()["allocated_bytes.all.peak"] @@ -277,7 +277,7 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: return num_gpu_blocks, num_cpu_blocks - def assert_no_other_gpu_processes(self): + def _assert_memory_footprint_increased_during_profiling(self): # NOTE(woosuk): Here we assume that the other processes using the same # GPU did not change their memory usage during the profiling. free_gpu_memory, _ = torch.cuda.mem_get_info() From f48b6b9c48093f11cefa7429cd003cc37b42b6ca Mon Sep 17 00:00:00 2001 From: Joe Runde Date: Thu, 17 Oct 2024 12:57:54 -0600 Subject: [PATCH 11/11] :white_check_mark: add small tolerance in profile test Signed-off-by: Joe Runde --- tests/entrypoints/llm/test_lazy_outlines.py | 2 ++ tests/worker/test_profile.py | 7 ++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/entrypoints/llm/test_lazy_outlines.py b/tests/entrypoints/llm/test_lazy_outlines.py index 94b02b95b5dc..010969ad4750 100644 --- a/tests/entrypoints/llm/test_lazy_outlines.py +++ b/tests/entrypoints/llm/test_lazy_outlines.py @@ -26,6 +26,8 @@ def test_lazy_outlines(sample_regex): # make sure outlines is not imported assert 'outlines' not in sys.modules + # The second LLM needs to request a higher gpu_memory_utilization because + # the first LLM has already allocated a full 30% of the gpu memory. llm = LLM(model="facebook/opt-125m", enforce_eager=True, guided_decoding_backend="lm-format-enforcer", diff --git a/tests/worker/test_profile.py b/tests/worker/test_profile.py index 56e63f67f55a..7e9138dc8d77 100644 --- a/tests/worker/test_profile.py +++ b/tests/worker/test_profile.py @@ -61,4 +61,9 @@ def mock_mem_info(): engine_config.cache_config, engine_config.model_config, engine_config.parallel_config) - assert gpu_blocks == (8.2843 * 1024**3) // block_size + expected_blocks = (8.2843 * 1024**3) // block_size + + # Check within a small tolerance for portability + # Hardware, kernel, or dependency changes could all affect memory + # utilization + assert abs(gpu_blocks - expected_blocks) < 5
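
Note: the net effect of this series on Worker.determine_num_available_blocks() can be summarized in a standalone sketch (not part of the patches themselves). Peak torch allocations are read from torch.cuda.memory_stats(), any remaining non-torch allocations (NCCL buffers, for example) are inferred from torch.cuda.mem_get_info() after emptying the torch cache, and the budget left under gpu_memory_utilization is divided into KV cache blocks. The helper name estimate_num_gpu_blocks and the run_profile callback below are illustrative stand-ins, not vLLM APIs; in vLLM the profiling is done by self.model_runner.profile_run().

import torch


def estimate_num_gpu_blocks(run_profile, gpu_memory_utilization: float,
                            cache_block_size: int) -> int:
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    free_before_profile, total_gpu_memory = torch.cuda.mem_get_info()

    # Dummy forward pass at the maximum batch size.
    run_profile()
    torch.cuda.synchronize()

    # Peak memory that torch itself allocated during the forward pass.
    peak_memory = torch.cuda.memory_stats()["allocated_bytes.all.peak"]

    # Any free memory still missing after emptying the torch cache was
    # allocated outside of torch and is added to the peak.
    torch.cuda.empty_cache()
    non_torch_allocations = free_before_profile - torch.cuda.mem_get_info()[0]
    if non_torch_allocations > 0:
        peak_memory += non_torch_allocations

    available_kv_cache_memory = (
        total_gpu_memory * gpu_memory_utilization - peak_memory)
    if cache_block_size == 0:
        return 0
    return max(int(available_kv_cache_memory // cache_block_size), 0)

This is also the arithmetic the new test checks: with a mocked 10 GiB device, a 0.9 utilization target, 0.7077 GiB of peak torch usage and 0.0079 GiB of non-torch usage, roughly 8.2843 GiB remains for the KV cache.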