Skip to content

Commit

Permalink
remove the H200 check and add range by default
Browse files Browse the repository at this point in the history
  • Loading branch information
kushanam committed Aug 27, 2024
1 parent b645bbe commit 503e657
Showing 1 changed file with 1 addition and 12 deletions.
13 changes: 1 addition & 12 deletions vllm/worker/model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,19 +62,8 @@
# Capture graphs for token size 1, 2, 4, 8, 16, 24, 32, 40, ..., 256.
# NOTE: _get_graph_batch_size needs to be updated if this list is changed.
_BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [
_BATCH_SIZE_ALIGNMENT * i for i in range(1, 33)
_BATCH_SIZE_ALIGNMENT * i for i in range(1, 129)
]

# Get the current GPU properties
gpu_properties = torch.cuda.get_device_properties(0)
# Retrieve the total memory in GB
mem = gpu_properties.total_memory / 1024**3
# Retrieve the SM version
gpu_sm_version = gpu_properties.major + gpu_properties.minor / 10.0
# extend cuda graph for H200 GPUs
if mem > 120.0 and gpu_sm_version >= 9.0:
_BATCH_SIZES_TO_CAPTURE.extend([512, 768])

_NUM_WARMUP_ITERS = 2

TModelInputForGPU = TypeVar('TModelInputForGPU', bound="ModelInputForGPU")
Expand Down

0 comments on commit 503e657

Please sign in to comment.