From f374f3df445b808760fa3374743e517e094c353d Mon Sep 17 00:00:00 2001 From: Joe Runde Date: Mon, 14 Oct 2024 17:08:53 -0600 Subject: [PATCH 01/11] :alembic: debugging memory measurement Signed-off-by: Joe Runde --- vllm/model_executor/models/mllama.py | 6 ++++ vllm/multimodal/registry.py | 2 ++ vllm/worker/enc_dec_model_runner.py | 44 +++++++++++++++++++++++++--- vllm/worker/worker.py | 15 ++++++++-- 4 files changed, 61 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 66e9b2844620..40b2a26fcd57 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -1120,6 +1120,9 @@ def flat_encoder_result(self, cross_attention_states: torch.Tensor, cross_attention_states.shape[-1], device=cross_attention_states.device, dtype=cross_attention_states.dtype) + + print(f"\n\n\n CROSS ATTN STATES SHAPE: {cross_attention_states_flat.shape} \n\n\n") + start_pos = 0 for seq_len, vision_token_in_batch in zip(actual_encoder_seq_lens, cross_attention_states): @@ -1217,6 +1220,9 @@ def forward( attn_metadata: AttentionMetadata, **kwargs: object, ) -> Union[Tuple, CausalLMOutputWithPast]: + + print("\n\n\n MLLAMA FORWARD \n\n\n") + if attn_metadata.num_prefill_tokens > 0 and \ attn_metadata.num_decode_tokens > 0: raise ValueError("Chunk prefill not supported") diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 5e9b8bd518de..fee57c2ef2db 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -189,6 +189,8 @@ def get_max_multimodal_tokens(self, model_config: ModelConfig) -> int: """ limits_per_plugin = self._limits_by_model[model_config] + print(f"\n\n\n limits_per_plugin: {limits_per_plugin}\n\n]n") + return sum((limits_per_plugin[key] * plugin.get_max_multimodal_tokens(model_config)) for key, plugin in self._plugins.items()) diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index 6a00444f5098..12acffbb8c93 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -300,10 +300,32 @@ def profile_run(self) -> None: if max_mm_tokens > 0: logger.info("Starting profile run for multi-modal models.") + # # Find the maximum number of multimodal inputs (e.g. images) that will + # # fit into the `max_num_batched_tokens` budget. 
+ # max_multimodal_inputs = max_num_batched_tokens // max_mm_tokens + # # Use the leftover tokens for text for each sequence + # leftover_tokens = max_num_batched_tokens % max_mm_tokens + # if leftover_tokens < max_num_seqs: + # # If we don't have at least one token left over for each sequence, + # # then remove one multi-modal input + # max_multimodal_inputs -= 1 + # leftover_tokens += max_mm_tokens + # assert max_multimodal_inputs > 0, "No room to fit a single " \ + # "multi-modal input within the scheduling budget" + + # original logic: + leftover_tokens = max_num_batched_tokens + max_multimodal_inputs = max_num_seqs + + print(f"\n\n\nMAX IMAGES: {max_multimodal_inputs}, LEFTOVER_TOKENS: {leftover_tokens}") + + # start profile + torch.cuda.memory._record_memory_history(max_entries=100000) + batch_size = 0 for group_id in range(max_num_seqs): - seq_len = (max_num_batched_tokens // max_num_seqs + - (group_id < max_num_batched_tokens % max_num_seqs)) + seq_len = (leftover_tokens // max_num_seqs + + (group_id < leftover_tokens % max_num_seqs)) batch_size += seq_len decoder_seq_data, decoder_dummy_multi_modal_data \ @@ -329,6 +351,11 @@ def profile_run(self) -> None: "Multi-modal data can't be provided in both encoder and decoder" ) + if group_id < max_multimodal_inputs: + multi_modal_data = decoder_dummy_multi_modal_data or encoder_dummy_multi_modal_data + else: + multi_modal_data = None + seq = SequenceGroupMetadata( request_id=str(group_id), is_prompt=True, @@ -337,8 +364,7 @@ def profile_run(self) -> None: block_tables=None, encoder_seq_data=encoder_seq_data, cross_block_table=None, - multi_modal_data=decoder_dummy_multi_modal_data - or encoder_dummy_multi_modal_data, + multi_modal_data=multi_modal_data, ) seqs.append(seq) @@ -358,6 +384,16 @@ def profile_run(self) -> None: intermediate_tensors = None self.execute_model(model_input, kv_caches, intermediate_tensors) torch.cuda.synchronize() + + # Record memory usage + try: + torch.cuda.memory._dump_snapshot(f"/tmp/forward_profile.pickle") + except Exception as e: + logger.error(f"Failed to capture memory snapshot {e}") + + # Stop profiling + torch.cuda.memory._record_memory_history(enabled=None) + return def _prepare_encoder_model_input_tensors( diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index ab61e4377f90..2a076645c87a 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -217,9 +217,11 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: # Profile the memory usage of the model and get the maximum number of # cache blocks that can be allocated with the remaining free memory. torch.cuda.empty_cache() + torch.cuda.reset_peak_memory_stats() # Execute a forward pass with dummy inputs to profile the memory usage # of the model. + print("\n\n\n CALLING PROFILE_RUN \n\n\n") self.model_runner.profile_run() # Calculate the number of blocks that can be allocated with the @@ -228,14 +230,23 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info() # NOTE(woosuk): Here we assume that the other processes using the same # GPU did not change their memory usage during the profiling. - peak_memory = self.init_gpu_memory - free_gpu_memory - assert peak_memory > 0, ( + assert self.init_gpu_memory - free_gpu_memory > 0, ( "Error in memory profiling. " f"Initial free memory {self.init_gpu_memory}, current free memory" f" {free_gpu_memory}. 
This happens when the GPU memory was " "not properly cleaned up before initializing the vLLM instance.") + + # Get the peak memory allocation recorded by torch + peak_memory = torch.cuda.memory_stats()["allocated_bytes.all.peak"] + + print(f"\n\n\n TOTAL VRAM: {total_gpu_memory} | INITIAL VRAM: {self.init_gpu_memory} | FREE VRAM: {free_gpu_memory} | PEAK VRAM: {peak_memory}\n\n\n") cache_block_size = self.get_cache_block_size_bytes() + print(f"\n\n\n CACHE BLOCK SIZE BYTES: {cache_block_size}\n\n\n") + + stats = torch.cuda.memory_stats() + print(f"\n\n\n STATS: {stats}\n\n\n") + if cache_block_size == 0: num_gpu_blocks = 0 num_cpu_blocks = 0 From 6f12a0127c7edaa3b188ffa63f4c1eea17b60220 Mon Sep 17 00:00:00 2001 From: Joe Runde Date: Mon, 14 Oct 2024 17:12:01 -0600 Subject: [PATCH 02/11] :art: cleanup Signed-off-by: Joe Runde --- vllm/model_executor/models/mllama.py | 6 ---- vllm/multimodal/registry.py | 2 -- vllm/worker/enc_dec_model_runner.py | 44 +++------------------------- vllm/worker/worker.py | 10 +------ 4 files changed, 5 insertions(+), 57 deletions(-) diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 40b2a26fcd57..66e9b2844620 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -1120,9 +1120,6 @@ def flat_encoder_result(self, cross_attention_states: torch.Tensor, cross_attention_states.shape[-1], device=cross_attention_states.device, dtype=cross_attention_states.dtype) - - print(f"\n\n\n CROSS ATTN STATES SHAPE: {cross_attention_states_flat.shape} \n\n\n") - start_pos = 0 for seq_len, vision_token_in_batch in zip(actual_encoder_seq_lens, cross_attention_states): @@ -1220,9 +1217,6 @@ def forward( attn_metadata: AttentionMetadata, **kwargs: object, ) -> Union[Tuple, CausalLMOutputWithPast]: - - print("\n\n\n MLLAMA FORWARD \n\n\n") - if attn_metadata.num_prefill_tokens > 0 and \ attn_metadata.num_decode_tokens > 0: raise ValueError("Chunk prefill not supported") diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index fee57c2ef2db..5e9b8bd518de 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -189,8 +189,6 @@ def get_max_multimodal_tokens(self, model_config: ModelConfig) -> int: """ limits_per_plugin = self._limits_by_model[model_config] - print(f"\n\n\n limits_per_plugin: {limits_per_plugin}\n\n]n") - return sum((limits_per_plugin[key] * plugin.get_max_multimodal_tokens(model_config)) for key, plugin in self._plugins.items()) diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index 12acffbb8c93..6a00444f5098 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -300,32 +300,10 @@ def profile_run(self) -> None: if max_mm_tokens > 0: logger.info("Starting profile run for multi-modal models.") - # # Find the maximum number of multimodal inputs (e.g. images) that will - # # fit into the `max_num_batched_tokens` budget. 
- # max_multimodal_inputs = max_num_batched_tokens // max_mm_tokens - # # Use the leftover tokens for text for each sequence - # leftover_tokens = max_num_batched_tokens % max_mm_tokens - # if leftover_tokens < max_num_seqs: - # # If we don't have at least one token left over for each sequence, - # # then remove one multi-modal input - # max_multimodal_inputs -= 1 - # leftover_tokens += max_mm_tokens - # assert max_multimodal_inputs > 0, "No room to fit a single " \ - # "multi-modal input within the scheduling budget" - - # original logic: - leftover_tokens = max_num_batched_tokens - max_multimodal_inputs = max_num_seqs - - print(f"\n\n\nMAX IMAGES: {max_multimodal_inputs}, LEFTOVER_TOKENS: {leftover_tokens}") - - # start profile - torch.cuda.memory._record_memory_history(max_entries=100000) - batch_size = 0 for group_id in range(max_num_seqs): - seq_len = (leftover_tokens // max_num_seqs + - (group_id < leftover_tokens % max_num_seqs)) + seq_len = (max_num_batched_tokens // max_num_seqs + + (group_id < max_num_batched_tokens % max_num_seqs)) batch_size += seq_len decoder_seq_data, decoder_dummy_multi_modal_data \ @@ -351,11 +329,6 @@ def profile_run(self) -> None: "Multi-modal data can't be provided in both encoder and decoder" ) - if group_id < max_multimodal_inputs: - multi_modal_data = decoder_dummy_multi_modal_data or encoder_dummy_multi_modal_data - else: - multi_modal_data = None - seq = SequenceGroupMetadata( request_id=str(group_id), is_prompt=True, @@ -364,7 +337,8 @@ def profile_run(self) -> None: block_tables=None, encoder_seq_data=encoder_seq_data, cross_block_table=None, - multi_modal_data=multi_modal_data, + multi_modal_data=decoder_dummy_multi_modal_data + or encoder_dummy_multi_modal_data, ) seqs.append(seq) @@ -384,16 +358,6 @@ def profile_run(self) -> None: intermediate_tensors = None self.execute_model(model_input, kv_caches, intermediate_tensors) torch.cuda.synchronize() - - # Record memory usage - try: - torch.cuda.memory._dump_snapshot(f"/tmp/forward_profile.pickle") - except Exception as e: - logger.error(f"Failed to capture memory snapshot {e}") - - # Stop profiling - torch.cuda.memory._record_memory_history(enabled=None) - return def _prepare_encoder_model_input_tensors( diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 2a076645c87a..7ff1ad711e8e 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -221,7 +221,6 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: # Execute a forward pass with dummy inputs to profile the memory usage # of the model. - print("\n\n\n CALLING PROFILE_RUN \n\n\n") self.model_runner.profile_run() # Calculate the number of blocks that can be allocated with the @@ -235,18 +234,11 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: f"Initial free memory {self.init_gpu_memory}, current free memory" f" {free_gpu_memory}. 
This happens when the GPU memory was " "not properly cleaned up before initializing the vLLM instance.") - + # Get the peak memory allocation recorded by torch peak_memory = torch.cuda.memory_stats()["allocated_bytes.all.peak"] - print(f"\n\n\n TOTAL VRAM: {total_gpu_memory} | INITIAL VRAM: {self.init_gpu_memory} | FREE VRAM: {free_gpu_memory} | PEAK VRAM: {peak_memory}\n\n\n") - cache_block_size = self.get_cache_block_size_bytes() - print(f"\n\n\n CACHE BLOCK SIZE BYTES: {cache_block_size}\n\n\n") - - stats = torch.cuda.memory_stats() - print(f"\n\n\n STATS: {stats}\n\n\n") - if cache_block_size == 0: num_gpu_blocks = 0 num_cpu_blocks = 0 From 3eb8293b57b1c1b16ed2cc79dbd09fcb95d1918b Mon Sep 17 00:00:00 2001 From: Joe Runde Date: Tue, 15 Oct 2024 12:46:13 -0600 Subject: [PATCH 03/11] :bug: fixup broken test, add more logs Signed-off-by: Joe Runde --- tests/entrypoints/llm/test_lazy_outlines.py | 2 +- vllm/worker/worker.py | 16 +++++++++++++--- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/tests/entrypoints/llm/test_lazy_outlines.py b/tests/entrypoints/llm/test_lazy_outlines.py index 39480531f586..94b02b95b5dc 100644 --- a/tests/entrypoints/llm/test_lazy_outlines.py +++ b/tests/entrypoints/llm/test_lazy_outlines.py @@ -29,7 +29,7 @@ def test_lazy_outlines(sample_regex): llm = LLM(model="facebook/opt-125m", enforce_eager=True, guided_decoding_backend="lm-format-enforcer", - gpu_memory_utilization=0.3) + gpu_memory_utilization=0.6) sampling_params = SamplingParams(temperature=0.8, top_p=0.95) outputs = llm.generate( prompts=[ diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 7ff1ad711e8e..fcbf047e4f47 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -237,15 +237,25 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: # Get the peak memory allocation recorded by torch peak_memory = torch.cuda.memory_stats()["allocated_bytes.all.peak"] + available_kv_cache_memory = ( + total_gpu_memory * self.cache_config.gpu_memory_utilization - + peak_memory) + + logger.info("Initial memory usage before profile: %.2f GB", + (total_gpu_memory - self.init_gpu_memory) / (1024**3)) + logger.info("Peak memory usage during profile: %.2f GB", + peak_memory / (1024**3)) + logger.info( + "Available memory for KV cache with %.2f gpu utilization: %.2f GB", + self.cache_config.gpu_memory_utilization, + available_kv_cache_memory / (1024**3)) cache_block_size = self.get_cache_block_size_bytes() if cache_block_size == 0: num_gpu_blocks = 0 num_cpu_blocks = 0 else: - num_gpu_blocks = int( - (total_gpu_memory * self.cache_config.gpu_memory_utilization - - peak_memory) // cache_block_size) + num_gpu_blocks = int(available_kv_cache_memory // cache_block_size) num_cpu_blocks = int(self.cache_config.swap_space_bytes // cache_block_size) num_gpu_blocks = max(num_gpu_blocks, 0) From fbb5e8f5b85b5cd3f5dd0fa58dad6b61118bf92b Mon Sep 17 00:00:00 2001 From: Joe Runde Date: Tue, 15 Oct 2024 12:53:00 -0600 Subject: [PATCH 04/11] :goal_net: handle non-torch allocations Signed-off-by: Joe Runde --- vllm/worker/worker.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index fcbf047e4f47..20b0f0c52d3c 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -241,6 +241,18 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: total_gpu_memory * self.cache_config.gpu_memory_utilization - peak_memory) + # Edge case: Check for any memory left around that may have been + # allocated on the 
gpu outside of `torch` + torch.cuda.empty_cache() + leftover_allocations = torch.cuda.mem_get_info( + )[0] - self.init_gpu_memory + if leftover_allocations > 0: + logger.info( + "Found %.2f GB of allocated memory leftover after clearing " + "torch cache. Adding to peak memory usage.", + leftover_allocations / (1024**3)) + peak_memory += leftover_allocations + logger.info("Initial memory usage before profile: %.2f GB", (total_gpu_memory - self.init_gpu_memory) / (1024**3)) logger.info("Peak memory usage during profile: %.2f GB", @@ -263,7 +275,6 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: if self.model_runner.lora_manager: self.model_runner.remove_all_loras() gc.collect() - torch.cuda.empty_cache() return num_gpu_blocks, num_cpu_blocks def initialize_cache(self, num_gpu_blocks: int, From 7f9b77cba0c9b7c71a14278720731966f24160aa Mon Sep 17 00:00:00 2001 From: Joe Runde Date: Tue, 15 Oct 2024 12:56:37 -0600 Subject: [PATCH 05/11] :bug: move kv_cache size calculation Signed-off-by: Joe Runde --- vllm/worker/worker.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 20b0f0c52d3c..00c4a82eb567 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -237,9 +237,6 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: # Get the peak memory allocation recorded by torch peak_memory = torch.cuda.memory_stats()["allocated_bytes.all.peak"] - available_kv_cache_memory = ( - total_gpu_memory * self.cache_config.gpu_memory_utilization - - peak_memory) # Edge case: Check for any memory left around that may have been # allocated on the gpu outside of `torch` @@ -253,6 +250,10 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: leftover_allocations / (1024**3)) peak_memory += leftover_allocations + available_kv_cache_memory = ( + total_gpu_memory * self.cache_config.gpu_memory_utilization - + peak_memory) + logger.info("Initial memory usage before profile: %.2f GB", (total_gpu_memory - self.init_gpu_memory) / (1024**3)) logger.info("Peak memory usage during profile: %.2f GB", From 994b2a3f6b714ca088b4bb0cc1e32ba790c623c8 Mon Sep 17 00:00:00 2001 From: Joe Runde Date: Tue, 15 Oct 2024 14:25:08 -0600 Subject: [PATCH 06/11] :bug: fixup offline mode test Signed-off-by: Joe Runde --- tests/entrypoints/offline_mode/test_offline_mode.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/entrypoints/offline_mode/test_offline_mode.py b/tests/entrypoints/offline_mode/test_offline_mode.py index 0b6026a89c75..fe40af271c1c 100644 --- a/tests/entrypoints/offline_mode/test_offline_mode.py +++ b/tests/entrypoints/offline_mode/test_offline_mode.py @@ -44,7 +44,7 @@ def test_offline_mode(llm: LLM, monkeypatch): LLM(model=MODEL_NAME, max_num_batched_tokens=4096, tensor_parallel_size=1, - gpu_memory_utilization=0.10, + gpu_memory_utilization=0.20, enforce_eager=True) finally: # Reset the environment after the test From d0eee6351eee8e1870f240eeb6d9909ccee4d646 Mon Sep 17 00:00:00 2001 From: Joe Runde Date: Wed, 16 Oct 2024 18:47:13 -0600 Subject: [PATCH 07/11] :bug: fixup non-torch allocation detection Signed-off-by: Joe Runde --- vllm/worker/worker.py | 69 ++++++++++++++++++++++++------------------- 1 file changed, 39 insertions(+), 30 deletions(-) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 00c4a82eb567..5160248a4a79 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -219,50 +219,35 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: 
torch.cuda.empty_cache() torch.cuda.reset_peak_memory_stats() + free_memory_pre_profile, total_gpu_memory = torch.cuda.mem_get_info() + # Execute a forward pass with dummy inputs to profile the memory usage # of the model. self.model_runner.profile_run() - - # Calculate the number of blocks that can be allocated with the - # profiled peak memory. torch.cuda.synchronize() - free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info() - # NOTE(woosuk): Here we assume that the other processes using the same - # GPU did not change their memory usage during the profiling. - assert self.init_gpu_memory - free_gpu_memory > 0, ( - "Error in memory profiling. " - f"Initial free memory {self.init_gpu_memory}, current free memory" - f" {free_gpu_memory}. This happens when the GPU memory was " - "not properly cleaned up before initializing the vLLM instance.") + + self.assert_no_other_gpu_processes() # Get the peak memory allocation recorded by torch peak_memory = torch.cuda.memory_stats()["allocated_bytes.all.peak"] - # Edge case: Check for any memory left around that may have been - # allocated on the gpu outside of `torch` + # Check for any memory left around that may have been allocated on the + # gpu outside of `torch`. NCCL operations, for example, can use a few + # GB during a forward pass torch.cuda.empty_cache() - leftover_allocations = torch.cuda.mem_get_info( - )[0] - self.init_gpu_memory - if leftover_allocations > 0: - logger.info( - "Found %.2f GB of allocated memory leftover after clearing " - "torch cache. Adding to peak memory usage.", - leftover_allocations / (1024**3)) - peak_memory += leftover_allocations + # After emptying the torch cache, any other increase in gpu ram should + # be from non-torch allocations. + non_torch_allocations = free_memory_pre_profile - \ + torch.cuda.mem_get_info()[0] + if non_torch_allocations > 0: + peak_memory += non_torch_allocations available_kv_cache_memory = ( total_gpu_memory * self.cache_config.gpu_memory_utilization - peak_memory) - logger.info("Initial memory usage before profile: %.2f GB", - (total_gpu_memory - self.init_gpu_memory) / (1024**3)) - logger.info("Peak memory usage during profile: %.2f GB", - peak_memory / (1024**3)) - logger.info( - "Available memory for KV cache with %.2f gpu utilization: %.2f GB", - self.cache_config.gpu_memory_utilization, - available_kv_cache_memory / (1024**3)) - + # Calculate the number of blocks that can be allocated with the + # profiled peak memory. 
cache_block_size = self.get_cache_block_size_bytes() if cache_block_size == 0: num_gpu_blocks = 0 @@ -273,11 +258,35 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: cache_block_size) num_gpu_blocks = max(num_gpu_blocks, 0) num_cpu_blocks = max(num_cpu_blocks, 0) + + logger.info( + "Memory profiling results: total_gpu_memory=%.2fGiB" + " initial_memory_usage=%.2fGiB peak_torch_memory=%.2fGiB" + " non_torch_memory=%.2fGiB kv_cache_size=%.2fGiB" + " gpu_memory_utilization=%.2f", total_gpu_memory / (1024**3), + (total_gpu_memory - free_memory_pre_profile) / (1024**3), + (peak_memory - non_torch_allocations) / (1024**3), + non_torch_allocations / (1024**3), + available_kv_cache_memory / (1024**3), + self.cache_config.gpu_memory_utilization) + + # Final cleanup if self.model_runner.lora_manager: self.model_runner.remove_all_loras() gc.collect() + return num_gpu_blocks, num_cpu_blocks + def assert_no_other_gpu_processes(self): + # NOTE(woosuk): Here we assume that the other processes using the same + # GPU did not change their memory usage during the profiling. + free_gpu_memory, _ = torch.cuda.mem_get_info() + assert self.init_gpu_memory - free_gpu_memory > 0, ( + "Error in memory profiling. " + f"Initial free memory {self.init_gpu_memory}, current free memory" + f" {free_gpu_memory}. This happens when the GPU memory was " + "not properly cleaned up before initializing the vLLM instance.") + def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: """Allocate GPU and CPU KV cache with the specified number of blocks. From 4255af2ccc993911b19f3042896102a151b7bd8e Mon Sep 17 00:00:00 2001 From: Joe Runde Date: Thu, 17 Oct 2024 10:15:23 -0600 Subject: [PATCH 08/11] :white_check_mark: Test gpu profiling Signed-off-by: Joe Runde --- tests/worker/test_profile.py | 59 ++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 tests/worker/test_profile.py diff --git a/tests/worker/test_profile.py b/tests/worker/test_profile.py new file mode 100644 index 000000000000..84e77be0467f --- /dev/null +++ b/tests/worker/test_profile.py @@ -0,0 +1,59 @@ +import torch + +from vllm.engine.arg_utils import EngineArgs +from vllm.utils import get_distributed_init_method, get_ip, get_open_port +from vllm.worker.cache_engine import CacheEngine +from vllm.worker.worker import Worker + + +def test_profile(): + # Set up engine args to build a worker. + engine_args = EngineArgs(model="facebook/opt-125m", + dtype="half", + load_format="dummy") + engine_config = engine_args.create_engine_config() + engine_config.cache_config.num_gpu_blocks = 1000 + engine_config.cache_config.num_cpu_blocks = 1000 + + # Create the worker. 
+ distributed_init_method = get_distributed_init_method( + get_ip(), get_open_port()) + worker = Worker( + model_config=engine_config.model_config, + parallel_config=engine_config.parallel_config, + scheduler_config=engine_config.scheduler_config, + device_config=engine_config.device_config, + cache_config=engine_config.cache_config, + load_config=engine_config.load_config, + local_rank=0, + rank=0, + distributed_init_method=distributed_init_method, + is_driver_worker=True, + ) + + # Load the model so we can profile it + worker.init_device() + worker.load_model() + + # Set 10GiB as the mock total gpu ram so this test can be device-agnostic + def mock_mem_info(): + current_usage = torch.cuda.memory_stats( + )["allocated_bytes.all.current"] + mock_total_bytes = 10 * 1024**3 + free = mock_total_bytes - current_usage + + return (free, mock_total_bytes) + + from unittest.mock import patch + with patch("torch.cuda.mem_get_info", side_effect=mock_mem_info): + gpu_blocks, _ = worker.determine_num_available_blocks() + + # Peak vram usage by torch should be 0.7077 GiB + # Non-torch allocations should be 0.0079 GiB + # 9.0 GiB should be the utilization target + # 8.2843 GiB should be available for the KV cache + block_size = CacheEngine.get_cache_block_size( + engine_config.cache_config, engine_config.model_config, + engine_config.parallel_config) + + assert gpu_blocks == (8.2843 * 1024**3) // block_size From 4b17056fea9b5918ee5827e217435f3d6abb4f30 Mon Sep 17 00:00:00 2001 From: Joe Runde Date: Thu, 17 Oct 2024 11:32:49 -0600 Subject: [PATCH 09/11] :memo: update test description Signed-off-by: Joe Runde --- tests/worker/test_profile.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/worker/test_profile.py b/tests/worker/test_profile.py index 84e77be0467f..56e63f67f55a 100644 --- a/tests/worker/test_profile.py +++ b/tests/worker/test_profile.py @@ -6,7 +6,12 @@ from vllm.worker.worker import Worker -def test_profile(): +def test_gpu_memory_profiling(): + # Tests the gpu profiling that happens in order to determine the number of + # KV cache blocks that we can allocate on the GPU. + # This test mocks the maximum available gpu memory so that it can run on + # any gpu setup. + # Set up engine args to build a worker. 
engine_args = EngineArgs(model="facebook/opt-125m", dtype="half", @@ -35,7 +40,7 @@ def test_profile(): worker.init_device() worker.load_model() - # Set 10GiB as the mock total gpu ram so this test can be device-agnostic + # Set 10GiB as the total gpu ram to be device-agnostic def mock_mem_info(): current_usage = torch.cuda.memory_stats( )["allocated_bytes.all.current"] From b9e279bb198432e6014b45221f8d6c3d22e33bed Mon Sep 17 00:00:00 2001 From: Joe Runde Date: Thu, 17 Oct 2024 12:47:09 -0600 Subject: [PATCH 10/11] :recycle: rename internal fn Signed-off-by: Joe Runde --- vllm/worker/worker.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 5160248a4a79..9c46bb425860 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -226,7 +226,7 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: self.model_runner.profile_run() torch.cuda.synchronize() - self.assert_no_other_gpu_processes() + self._assert_memory_footprint_increased_during_profiling() # Get the peak memory allocation recorded by torch peak_memory = torch.cuda.memory_stats()["allocated_bytes.all.peak"] @@ -277,7 +277,7 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: return num_gpu_blocks, num_cpu_blocks - def assert_no_other_gpu_processes(self): + def _assert_memory_footprint_increased_during_profiling(self): # NOTE(woosuk): Here we assume that the other processes using the same # GPU did not change their memory usage during the profiling. free_gpu_memory, _ = torch.cuda.mem_get_info() From f48b6b9c48093f11cefa7429cd003cc37b42b6ca Mon Sep 17 00:00:00 2001 From: Joe Runde Date: Thu, 17 Oct 2024 12:57:54 -0600 Subject: [PATCH 11/11] :white_check_mark: add small tolerance in profile test Signed-off-by: Joe Runde --- tests/entrypoints/llm/test_lazy_outlines.py | 2 ++ tests/worker/test_profile.py | 7 ++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/entrypoints/llm/test_lazy_outlines.py b/tests/entrypoints/llm/test_lazy_outlines.py index 94b02b95b5dc..010969ad4750 100644 --- a/tests/entrypoints/llm/test_lazy_outlines.py +++ b/tests/entrypoints/llm/test_lazy_outlines.py @@ -26,6 +26,8 @@ def test_lazy_outlines(sample_regex): # make sure outlines is not imported assert 'outlines' not in sys.modules + # The second LLM needs to request a higher gpu_memory_utilization because + # the first LLM has already allocated a full 30% of the gpu memory. llm = LLM(model="facebook/opt-125m", enforce_eager=True, guided_decoding_backend="lm-format-enforcer", diff --git a/tests/worker/test_profile.py b/tests/worker/test_profile.py index 56e63f67f55a..7e9138dc8d77 100644 --- a/tests/worker/test_profile.py +++ b/tests/worker/test_profile.py @@ -61,4 +61,9 @@ def mock_mem_info(): engine_config.cache_config, engine_config.model_config, engine_config.parallel_config) - assert gpu_blocks == (8.2843 * 1024**3) // block_size + expected_blocks = (8.2843 * 1024**3) // block_size + + # Check within a small tolerance for portability + # Hardware, kernel, or dependency changes could all affect memory + # utilization + assert abs(gpu_blocks - expected_blocks) < 5
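
Note: the net effect of this series on Worker.determine_num_available_blocks() can be summarized in a standalone sketch (not part of the patches themselves). Peak torch allocations are read from torch.cuda.memory_stats(), any remaining non-torch allocations (NCCL buffers, for example) are inferred from torch.cuda.mem_get_info() after emptying the torch cache, and the budget left under gpu_memory_utilization is divided into KV cache blocks. The helper name estimate_num_gpu_blocks and the run_profile callback below are illustrative stand-ins, not vLLM APIs; in vLLM the profiling is done by self.model_runner.profile_run().

import torch


def estimate_num_gpu_blocks(run_profile, gpu_memory_utilization: float,
                            cache_block_size: int) -> int:
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    free_before_profile, total_gpu_memory = torch.cuda.mem_get_info()

    # Dummy forward pass at the maximum batch size.
    run_profile()
    torch.cuda.synchronize()

    # Peak memory that torch itself allocated during the forward pass.
    peak_memory = torch.cuda.memory_stats()["allocated_bytes.all.peak"]

    # Any free memory still missing after emptying the torch cache was
    # allocated outside of torch and is added to the peak.
    torch.cuda.empty_cache()
    non_torch_allocations = free_before_profile - torch.cuda.mem_get_info()[0]
    if non_torch_allocations > 0:
        peak_memory += non_torch_allocations

    available_kv_cache_memory = (
        total_gpu_memory * gpu_memory_utilization - peak_memory)
    if cache_block_size == 0:
        return 0
    return max(int(available_kv_cache_memory // cache_block_size), 0)

This is also the arithmetic the new test checks: with a mocked 10 GiB device, a 0.9 utilization target, 0.7077 GiB of peak torch usage and 0.0079 GiB of non-torch usage, roughly 8.2843 GiB remains for the KV cache.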