Commit bf5f121
Reduce GPU memory utilization to make sure OOM doesn't happen (#153)
zhuohan123 authored Jun 18, 2023
1 parent bec7b2d commit bf5f121
Showing 1 changed file with 1 addition and 1 deletion.
vllm/engine/arg_utils.py (1 addition, 1 deletion)

@@ -21,7 +21,7 @@ class EngineArgs:
     tensor_parallel_size: int = 1
     block_size: int = 16
     swap_space: int = 4  # GiB
-    gpu_memory_utilization: float = 0.95
+    gpu_memory_utilization: float = 0.90
     max_num_batched_tokens: int = 2560
     max_num_seqs: int = 256
     disable_log_stats: bool = False
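The change lowers the default fraction of GPU memory that the engine reserves for model weights and the KV cache, leaving more headroom so allocation spikes do not trigger out-of-memory errors. A minimal self-contained sketch of the affected defaults, mirroring only the `EngineArgs` fields visible in the diff (the real class in `vllm/engine/arg_utils.py` has many more fields and argument-parsing methods):

```python
from dataclasses import dataclass


# Sketch of the EngineArgs fields shown in the diff; not the full vLLM class.
@dataclass
class EngineArgs:
    tensor_parallel_size: int = 1
    block_size: int = 16
    swap_space: int = 4  # GiB of CPU swap space
    gpu_memory_utilization: float = 0.90  # lowered from 0.95 by this commit
    max_num_batched_tokens: int = 2560
    max_num_seqs: int = 256
    disable_log_stats: bool = False


args = EngineArgs()
print(args.gpu_memory_utilization)  # 0.9
```

Users who know their workload fits can still pass a higher value explicitly (e.g. `gpu_memory_utilization=0.95`) rather than relying on the default; the commit only changes what happens when the argument is left unset.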
