diff --git a/vllm/config.py b/vllm/config.py
index 13bb29459..5c3a8615e 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -4,15 +4,13 @@
 from typing import TYPE_CHECKING, ClassVar, List, Optional, Union
 
 import torch
-from packaging.version import Version
 from transformers import PretrainedConfig
 
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import (QUANTIZATION_METHODS,
                                                      get_quantization_config)
 from vllm.transformers_utils.config import get_config, get_hf_text_config
-from vllm.utils import (get_cpu_memory, get_nvcc_cuda_version, is_cpu, is_hip,
-                        is_neuron)
+from vllm.utils import get_cpu_memory, is_cpu, is_hip, is_neuron
 
 GPTQMarlinConfig = get_quantization_config("gptq_marlin")
 
@@ -369,13 +367,6 @@ def _verify_cache_dtype(self) -> None:
         if self.cache_dtype == "auto":
             pass
         elif self.cache_dtype == "fp8":
-            if not is_hip():
-                nvcc_cuda_version = get_nvcc_cuda_version()
-                if nvcc_cuda_version is not None \
-                        and nvcc_cuda_version < Version("11.8"):
-                    raise ValueError(
-                        "FP8 is not supported when cuda version is"
-                        "lower than 11.8.")
             logger.info(
                 "Using fp8 data type to store kv cache. It reduces the GPU "
                 "memory footprint and boosts the performance. "
diff --git a/vllm/utils.py b/vllm/utils.py
index b06c85087..6479a8dab 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -19,7 +19,6 @@
 
 import psutil
 import torch
-from packaging.version import Version, parse
 
 import vllm.envs as envs
 from vllm.logger import enable_trace_function_call, init_logger
@@ -314,27 +313,6 @@ def cdiv(a: int, b: int) -> int:
     return -(a // -b)
 
 
-@lru_cache(maxsize=None)
-def get_nvcc_cuda_version() -> Optional[Version]:
-    cuda_home = envs.CUDA_HOME
-    if not cuda_home:
-        cuda_home = '/usr/local/cuda'
-        if os.path.isfile(cuda_home + '/bin/nvcc'):
-            logger.info(
-                'CUDA_HOME is not found in the environment. '
-                'Using %s as CUDA_HOME.', cuda_home)
-        else:
-            logger.warning('Not found nvcc in %s. Skip cuda version check!',
-                           cuda_home)
-            return None
-    nvcc_output = subprocess.check_output([cuda_home + "/bin/nvcc", "-V"],
-                                          universal_newlines=True)
-    output = nvcc_output.split()
-    release_idx = output.index("release") + 1
-    nvcc_cuda_version = parse(output[release_idx].split(",")[0])
-    return nvcc_cuda_version
-
-
 def _generate_random_fp8(
     tensor: torch.tensor,
     low: float,
@@ -560,7 +538,7 @@ def maybe_expand_dim(tensor: torch.Tensor,
 def merge_dicts(dict1: Dict[Any, List[Any]],
                 dict2: Dict[Any, List[Any]]) -> Dict[Any, List[Any]]:
     """Merge 2 dicts that have key -> List of items.
-    
+
     When a key conflicts, the values in dict1 is prioritized.
     """
     merged_dict = defaultdict(list)