[Bugfix][Core] Use torch.cuda.memory_stats() to profile peak memory usage #9352

Merged
11 commits merged on Oct 18, 2024
2 changes: 1 addition & 1 deletion tests/entrypoints/llm/test_lazy_outlines.py
@@ -29,7 +29,7 @@ def test_lazy_outlines(sample_regex):
llm = LLM(model="facebook/opt-125m",
enforce_eager=True,
guided_decoding_backend="lm-format-enforcer",
gpu_memory_utilization=0.3)
gpu_memory_utilization=0.6)
@joerunde (Contributor, Author), Oct 15, 2024:
I think this test only worked before because the model's peak memory usage was over-estimated, which caused a smaller KV cache to be allocated. Both LLMs set gpu_memory_utilization=0.3, but once the first LLM uses the full 30% of the GPU, there is no room left to allocate the second one.

This setting is a bit confusing. As coded, it means "the total GPU allocation may not exceed x% of the GPU memory when loading this model", but the test assumed it meant "you may not allocate more than x% of the GPU memory for this model, regardless of how much of the GPU memory ends up being allocated overall." In other words, it assumed this was a per-model limit and not a global limit on GPU memory allocation.

Maybe that should be made clearer in the docs?

(Just a note for readers: I don't intend to make more docs changes within the scope of this PR.)
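For readers, a minimal sketch of the distinction (the values mirror the test above; this snippet is illustrative and not part of the PR):

    from vllm import LLM

    # gpu_memory_utilization caps the *total* GPU allocation; it is not a
    # per-model budget. The first engine may grow to 30% of the device.
    llm_a = LLM(model="facebook/opt-125m",
                enforce_eager=True,
                gpu_memory_utilization=0.3)

    # A second engine on the same GPU cannot also ask for 0.3: roughly 30% is
    # already allocated, so its cap has to be raised to cover both engines.
    llm_b = LLM(model="facebook/opt-125m",
                enforce_eager=True,
                gpu_memory_utilization=0.6)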

Collaborator:

Thanks for the explanation. Could you add a short comment, since the reader may find it odd that the second call sets gpu_memory_utilization differently from the first?

Alternatively, it looks like the first LLM doesn't need to be live when the second one is created, so we could try to force it to be garbage collected, but I don't think it's worth jumping through hoops for this.

Contributor Author:

> I don't think it's worth jumping through hoops for this

I agree :D

I added a small comment here for future readers.

sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
outputs = llm.generate(
prompts=[
37 changes: 31 additions & 6 deletions vllm/worker/worker.py
@@ -217,6 +217,7 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
# Profile the memory usage of the model and get the maximum number of
# cache blocks that can be allocated with the remaining free memory.
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()

# Execute a forward pass with dummy inputs to profile the memory usage
# of the model.
@@ -228,29 +229,53 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info()
# NOTE(woosuk): Here we assume that the other processes using the same
# GPU did not change their memory usage during the profiling.
peak_memory = self.init_gpu_memory - free_gpu_memory
assert peak_memory > 0, (
assert self.init_gpu_memory - free_gpu_memory > 0, (
"Error in memory profiling. "
f"Initial free memory {self.init_gpu_memory}, current free memory"
f" {free_gpu_memory}. This happens when the GPU memory was "
"not properly cleaned up before initializing the vLLM instance.")

# Get the peak memory allocation recorded by torch
peak_memory = torch.cuda.memory_stats()["allocated_bytes.all.peak"]

# Edge case: Check for any memory left around that may have been
# allocated on the gpu outside of `torch`
torch.cuda.empty_cache()
leftover_allocations = torch.cuda.mem_get_info(
)[0] - self.init_gpu_memory
Contributor:

self.init_gpu_memory is the free memory on the device before the model is even loaded, so the value of leftover_allocations will always be negative.
I think what you want here is to record the free memory right before running profile_run() and then subtract the current free memory from that.
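A minimal standalone sketch of the correction being suggested here (the helper name is illustrative, not code from this PR): snapshot the free memory right before the profile run, then compare against the free memory left after clearing the torch cache.

    import torch

    def measure_non_torch_allocations(run_profile) -> int:
        # Free memory just before profiling, i.e. after the model weights are loaded.
        free_before, _ = torch.cuda.mem_get_info()
        run_profile()
        torch.cuda.synchronize()
        # Release blocks cached by the torch allocator; anything still missing
        # afterwards was allocated outside torch (e.g. NCCL buffers).
        torch.cuda.empty_cache()
        free_after, _ = torch.cuda.mem_get_info()
        return max(free_before - free_after, 0)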

if leftover_allocations > 0:
logger.info(
"Found %.2f GB of allocated memory leftover after clearing "
"torch cache. Adding to peak memory usage.",
leftover_allocations / (1024**3))
peak_memory += leftover_allocations

available_kv_cache_memory = (
total_gpu_memory * self.cache_config.gpu_memory_utilization -
peak_memory)

logger.info("Initial memory usage before profile: %.2f GB",
(total_gpu_memory - self.init_gpu_memory) / (1024**3))
logger.info("Peak memory usage during profile: %.2f GB",
peak_memory / (1024**3))
logger.info(
"Available memory for KV cache with %.2f gpu utilization: %.2f GB",
self.cache_config.gpu_memory_utilization,
available_kv_cache_memory / (1024**3))
Contributor:

These messages are a little spammy when running with TP; maybe a one-message summary is good enough.

Something like this (with the right values populated):

        logger.info("Memory profiling results: initial_memory=%.2fGiB"
                    " peak_torch_memory=%.2fGiB non_torch_memory=%.2fGiB"
                    " kv_cache_size=%.2fGiB gpu_memory_utilization=%.2f"


cache_block_size = self.get_cache_block_size_bytes()
if cache_block_size == 0:
num_gpu_blocks = 0
num_cpu_blocks = 0
else:
num_gpu_blocks = int(
(total_gpu_memory * self.cache_config.gpu_memory_utilization -
peak_memory) // cache_block_size)
num_gpu_blocks = int(available_kv_cache_memory // cache_block_size)
num_cpu_blocks = int(self.cache_config.swap_space_bytes //
cache_block_size)
num_gpu_blocks = max(num_gpu_blocks, 0)
num_cpu_blocks = max(num_cpu_blocks, 0)
if self.model_runner.lora_manager:
self.model_runner.remove_all_loras()
gc.collect()
Comment on lines 274 to 276
Member:

Probably this gc.collect() should also be moved up above torch.cuda.empty_cache()?

Also, I'm not sure what the reason for the remove_all_loras() is here; perhaps that should also be moved up to before the gc is done?

Contributor Author:

Yeah, I'm not sure about that either; I'm guessing we're just trying to clean up everything we did during profiling before the KV cache is allocated?

Re: moving the gc.collect(), I was trying to leave it later here in case something allocated outside torch hadn't been GC'ed yet and needed to be accounted for in the peak memory usage. If we run all the cleanup and then check the free memory, then the only reason it would be lower is a memory leak, right?

idk, I could go either way. I'm not 100% sold that we need the extra check for non-torch allocated memory, since it's pretty flaky to check for. Think we should just back that out and leave the torch.cuda.empty_cache() down here?

Contributor Author:

Update: @tjohnson31415 will give this a go to see if he can reproduce the NCCL allocations he was seeing that were blowing up VRAM usage. If this code catches it, we'll keep it in; if not, I'll back it out.

@tjohnson31415 (Contributor), Oct 16, 2024:

It needs a small fix, but the leftover_allocations check does pick up the extra memory allocated by NCCL during the profile run. When it is not accounted for, I got an OOM when allocating the KV cache for 405B w/ TP=8...

The call to remove_all_loras() seems like it would make more sense to have in profile_run(). Any tensors allocated for LoRA would be included in the torch allocated peak.

In my test with Llama-3.1-8B-Instruct w/ TP=8, moving gc.collect() and remove_all_loras() above the leftover_allocations check made no difference in the printed messages, so they clean up only a small amount of memory, if anything.

Contributor Author:

🌶️🌶️🌶️!

> The call to remove_all_loras() seems like it would make more sense to have in profile_run(). Any tensors allocated for LoRA would be included in the torch allocated peak.

I agree the cleanup in general could better live in the profile run executions, but I do want to limit the blast radius here to this file. I'll leave it as-is unless anybody feels strongly about refactoring into the individual model runners.

torch.cuda.empty_cache()
return num_gpu_blocks, num_cpu_blocks

def initialize_cache(self, num_gpu_blocks: int,