Skip to content

Commit

Permalink
remove the H200 check and add range by default
Browse files Browse the repository at this point in the history
  • Loading branch information
kushanam committed Aug 27, 2024
1 parent b645bbe commit 503e657
Showing 1 changed file with 1 addition and 12 deletions.
13 changes: 1 addition & 12 deletions vllm/worker/model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,19 +62,8 @@
# Capture graphs for token size 1, 2, 4, 8, 16, 24, 32, 40, ..., 256.
# NOTE: _get_graph_batch_size needs to be updated if this list is changed.
_BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [
_BATCH_SIZE_ALIGNMENT * i for i in range(1, 33)
_BATCH_SIZE_ALIGNMENT * i for i in range(1, 129)
]

# Get the current GPU properties
gpu_properties = torch.cuda.get_device_properties(0)
# Retrieve the total memory in GB
mem = gpu_properties.total_memory / 1024**3
# Retrieve the SM version
gpu_sm_version = gpu_properties.major + gpu_properties.minor / 10.0
# extend cuda graph for H200 GPUs
if mem > 120.0 and gpu_sm_version >= 9.0:
_BATCH_SIZES_TO_CAPTURE.extend([512, 768])

_NUM_WARMUP_ITERS = 2

TModelInputForGPU = TypeVar('TModelInputForGPU', bound="ModelInputForGPU")
Expand Down

0 comments on commit 503e657

Please sign in to comment.