Disable cuda version check in vllm-openai image (#4530)
zhaoyang-star authored and joerunde committed May 6, 2024
1 parent c7426c1 commit 352ef7c
Showing 2 changed files with 2 additions and 33 deletions.
vllm/config.py: 1 addition & 10 deletions
@@ -4,15 +4,13 @@
 from typing import TYPE_CHECKING, ClassVar, List, Optional, Union
 
 import torch
-from packaging.version import Version
 from transformers import PretrainedConfig
 
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import (QUANTIZATION_METHODS,
                                                      get_quantization_config)
 from vllm.transformers_utils.config import get_config, get_hf_text_config
-from vllm.utils import (get_cpu_memory, get_nvcc_cuda_version, is_cpu, is_hip,
-                        is_neuron)
+from vllm.utils import get_cpu_memory, is_cpu, is_hip, is_neuron
 
 GPTQMarlinConfig = get_quantization_config("gptq_marlin")

@@ -369,13 +367,6 @@ def _verify_cache_dtype(self) -> None:
         if self.cache_dtype == "auto":
             pass
         elif self.cache_dtype == "fp8":
-            if not is_hip():
-                nvcc_cuda_version = get_nvcc_cuda_version()
-                if nvcc_cuda_version is not None \
-                        and nvcc_cuda_version < Version("11.8"):
-                    raise ValueError(
-                        "FP8 is not supported when cuda version is"
-                        "lower than 11.8.")
             logger.info(
                 "Using fp8 data type to store kv cache. It reduces the GPU "
                 "memory footprint and boosts the performance. "
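The guard removed above assumed nvcc is available in the running container, which the runtime-only vllm-openai image does not guarantee. For comparison only (a sketch, not part of this commit), a similar gate could read the CUDA version that PyTorch was built against, which needs no CUDA toolkit at runtime:

import torch
from packaging.version import Version

# torch.version.cuda is a string such as "12.1", or None on CPU-only
# and ROCm builds, so this check degrades gracefully without nvcc.
cuda_str = torch.version.cuda
if cuda_str is not None and Version(cuda_str) < Version("11.8"):
    raise ValueError("FP8 KV cache requires CUDA 11.8 or newer.")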
vllm/utils.py: 1 addition & 23 deletions
@@ -19,7 +19,6 @@

 import psutil
 import torch
-from packaging.version import Version, parse
 
 import vllm.envs as envs
 from vllm.logger import enable_trace_function_call, init_logger
@@ -314,27 +313,6 @@ def cdiv(a: int, b: int) -> int:
     return -(a // -b)
 
 
-@lru_cache(maxsize=None)
-def get_nvcc_cuda_version() -> Optional[Version]:
-    cuda_home = envs.CUDA_HOME
-    if not cuda_home:
-        cuda_home = '/usr/local/cuda'
-        if os.path.isfile(cuda_home + '/bin/nvcc'):
-            logger.info(
-                'CUDA_HOME is not found in the environment. '
-                'Using %s as CUDA_HOME.', cuda_home)
-        else:
-            logger.warning('Not found nvcc in %s. Skip cuda version check!',
-                           cuda_home)
-            return None
-    nvcc_output = subprocess.check_output([cuda_home + "/bin/nvcc", "-V"],
-                                          universal_newlines=True)
-    output = nvcc_output.split()
-    release_idx = output.index("release") + 1
-    nvcc_cuda_version = parse(output[release_idx].split(",")[0])
-    return nvcc_cuda_version
-
-
 def _generate_random_fp8(
     tensor: torch.tensor,
     low: float,
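For reference, the helper deleted above parsed the banner printed by nvcc -V: it finds the token after the word "release" and strips the trailing comma. A standalone illustration of that parsing step, assuming a typical nvcc banner (the sample output below is illustrative):

from packaging.version import parse

# Typical `nvcc -V` banner; the version token follows "release".
nvcc_output = ("nvcc: NVIDIA (R) Cuda compiler driver\n"
               "Cuda compilation tools, release 11.8, V11.8.89\n")
output = nvcc_output.split()
release_idx = output.index("release") + 1
print(parse(output[release_idx].split(",")[0]))  # 11.8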
@@ -560,7 +538,7 @@ def maybe_expand_dim(tensor: torch.Tensor,
 def merge_dicts(dict1: Dict[Any, List[Any]],
                 dict2: Dict[Any, List[Any]]) -> Dict[Any, List[Any]]:
     """Merge 2 dicts that have key -> List of items.
     When a key conflicts, the values in dict1 is prioritized.
     """
     merged_dict = defaultdict(list)
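The docstring says that on a key conflict dict1 takes precedence. A hypothetical call, assuming the documented list-merging semantics (the function body is not shown in this hunk):

# Assumed behavior per the docstring: per-key lists are concatenated,
# with dict1's items first for conflicting keys.
merged = merge_dicts({"a": [1]}, {"a": [2], "b": [3]})
print(merged)  # {"a": [1, 2], "b": [3]}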
