From 312fd843e568bdea5e52e26aa42d6f247c6fc4f0 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 10 Oct 2024 07:42:21 +0000 Subject: [PATCH 1/6] Collect interface info in a single process per model --- vllm/model_executor/models/registry.py | 165 +++++++++++++------------ 1 file changed, 85 insertions(+), 80 deletions(-) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index f1d484521acb..2de1976bc015 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -3,8 +3,9 @@ import subprocess import sys import tempfile +from dataclasses import dataclass from functools import lru_cache, partial -from typing import Callable, Dict, List, Optional, Tuple, Type, Union +from typing import Callable, Dict, List, Optional, Tuple, Type, TypeVar, Union import cloudpickle import torch.nn as nn @@ -154,6 +155,29 @@ } +@dataclass +class _ModelInterfaces: + is_text_generation_model: bool + is_embedding_model: bool + supports_multimodal: bool + supports_pp: bool + + +def _inspect_model(model: object) -> _ModelInterfaces: + return _ModelInterfaces( + is_text_generation_model=is_text_generation_model(model), + is_embedding_model=is_embedding_model(model), + supports_multimodal=supports_multimodal(model), + supports_pp=supports_pp(model), + ) + + +def _inspect_model_lazy(mod_name: str, cls_name: str) -> _ModelInterfaces: + mod = importlib.import_module(mod_name) + klass = getattr(mod, cls_name) + return _inspect_model(klass) + + class ModelRegistry: @staticmethod @@ -255,115 +279,96 @@ def register_model(model_arch: str, model_cls: Union[Type[nn.Module], else: _OOT_MODELS[model_arch] = model_cls + @staticmethod + def _normalize_archs(architectures: Union[str, List[str]]) -> List[str]: + if isinstance(architectures, str): + architectures = [architectures] + if not architectures: + logger.warning("No model architectures are specified") + + return architectures + @staticmethod @lru_cache(maxsize=128) - def _check_stateless( - func: Callable[[Type[nn.Module]], bool], - model_arch: str, - *, - default: Optional[bool] = None, - ) -> bool: + def _inspect_stateless(model_arch: str) -> _ModelInterfaces: """ - Run a boolean function against a model and return the result. + Inspect the interfaces that are implemented by a model. - If the model is not found, returns the provided default value. - - If the model is not already imported, the function is run inside a + If the model is not already imported, the inspection is done inside a subprocess to avoid initializing CUDA for the main program. 
""" model = ModelRegistry._try_get_model_stateless(model_arch) if model is not None: - return func(model) + return _inspect_model(model) try: mod_name, cls_name = ModelRegistry._get_module_cls_name(model_arch) except KeyError: - if default is not None: - return default - raise - with tempfile.NamedTemporaryFile() as output_file: - # `cloudpickle` allows pickling lambda functions directly - input_bytes = cloudpickle.dumps( - (mod_name, cls_name, func, output_file.name)) - # cannot use `sys.executable __file__` here because the script - # contains relative imports - returned = subprocess.run( - [sys.executable, "-m", "vllm.model_executor.models.registry"], - input=input_bytes, - capture_output=True) - - # check if the subprocess is successful - try: - returned.check_returncode() - except Exception as e: - # wrap raised exception to provide more information - raise RuntimeError(f"Error happened when testing " - f"model support for{mod_name}.{cls_name}:\n" - f"{returned.stderr.decode()}") from e - with open(output_file.name, "rb") as f: - result = pickle.load(f) - return result + inspect_fn = partial(_inspect_model_lazy, mod_name, cls_name) + return _run_in_subprocess(inspect_fn) @staticmethod def is_text_generation_model(architectures: Union[str, List[str]]) -> bool: - if isinstance(architectures, str): - architectures = [architectures] - if not architectures: - logger.warning("No model architectures are specified") + return any( + ModelRegistry._inspect_stateless(arch).is_text_generation_model + for arch in ModelRegistry._normalize_archs(architectures)) - is_txt_gen = partial(ModelRegistry._check_stateless, - is_text_generation_model, - default=False) + @staticmethod + def is_embedding_model(architectures: Union[str, List[str]]) -> bool: + return any( + ModelRegistry._inspect_stateless(arch).is_embedding_model + for arch in ModelRegistry._normalize_archs(architectures)) - return any(is_txt_gen(arch) for arch in architectures) + @staticmethod + def is_multimodal_model(architectures: Union[str, List[str]]) -> bool: + return any( + ModelRegistry._inspect_stateless(arch).supports_multimodal + for arch in ModelRegistry._normalize_archs(architectures)) @staticmethod - def is_embedding_model(architectures: Union[str, List[str]]) -> bool: - if isinstance(architectures, str): - architectures = [architectures] - if not architectures: - logger.warning("No model architectures are specified") + def is_pp_supported_model(architectures: Union[str, List[str]]) -> bool: + return any( + ModelRegistry._inspect_stateless(arch).supports_pp + for arch in ModelRegistry._normalize_archs(architectures)) - is_emb = partial(ModelRegistry._check_stateless, - is_embedding_model, - default=False) - return any(is_emb(arch) for arch in architectures) +_T = TypeVar("_T") - @staticmethod - def is_multimodal_model(architectures: Union[str, List[str]]) -> bool: - if isinstance(architectures, str): - architectures = [architectures] - if not architectures: - logger.warning("No model architectures are specified") - is_mm = partial(ModelRegistry._check_stateless, - supports_multimodal, - default=False) +def _run_in_subprocess(fn: Callable[[], _T]) -> _T: + with tempfile.NamedTemporaryFile() as output_file: + # `cloudpickle` allows pickling lambda functions directly + input_bytes = cloudpickle.dumps((fn, output_file.name)) - return any(is_mm(arch) for arch in architectures) + # cannot use `sys.executable __file__` here because the script + # contains relative imports + returned = subprocess.run( + [sys.executable, "-m", 
"vllm.model_executor.models.registry"], + input=input_bytes, + capture_output=True) - @staticmethod - def is_pp_supported_model(architectures: Union[str, List[str]]) -> bool: - if isinstance(architectures, str): - architectures = [architectures] - if not architectures: - logger.warning("No model architectures are specified") + # check if the subprocess is successful + try: + returned.check_returncode() + except Exception as e: + # wrap raised exception to provide more information + raise RuntimeError(f"Error raised in subprocess:\n" + f"{returned.stderr.decode()}") from e - is_pp = partial(ModelRegistry._check_stateless, - supports_pp, - default=False) + with open(output_file.name, "rb") as f: + return pickle.load(f) - return any(is_pp(arch) for arch in architectures) +def _run() -> None: + fn, output_file = pickle.loads(sys.stdin.buffer.read()) + + result = fn() -if __name__ == "__main__": - (mod_name, cls_name, func, - output_file) = pickle.loads(sys.stdin.buffer.read()) - mod = importlib.import_module(mod_name) - klass = getattr(mod, cls_name) - result = func(klass) with open(output_file, "wb") as f: f.write(pickle.dumps(result)) + + +if __name__ == "__main__": + _run() From 56656615b90d4f474867a20717be7f1d36da414a Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 10 Oct 2024 15:51:52 +0000 Subject: [PATCH 2/6] Rework --- docs/source/models/adding_model.rst | 2 +- vllm/model_executor/models/registry.py | 309 +++++++++++++++---------- 2 files changed, 183 insertions(+), 128 deletions(-) diff --git a/docs/source/models/adding_model.rst b/docs/source/models/adding_model.rst index fa1003874033..ae09259c0756 100644 --- a/docs/source/models/adding_model.rst +++ b/docs/source/models/adding_model.rst @@ -99,7 +99,7 @@ This method should load the weights from the HuggingFace's checkpoint file and a 5. Register your model ---------------------- -Finally, register your :code:`*ForCausalLM` class to the :code:`_MODELS` in `vllm/model_executor/models/registry.py `_. +Finally, register your :code:`*ForCausalLM` class to the :code:`_VLLM_MODELS` in `vllm/model_executor/models/registry.py `_. 6. Out-of-Tree Model Integration -------------------------------------------- diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 2de1976bc015..2ef7f5d51ead 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -3,8 +3,9 @@ import subprocess import sys import tempfile -from dataclasses import dataclass -from functools import lru_cache, partial +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from functools import lru_cache from typing import Callable, Dict, List, Optional, Tuple, Type, TypeVar, Union import cloudpickle @@ -117,18 +118,13 @@ } # yapf: enable -_MODELS = { +_VLLM_MODELS = { **_TEXT_GENERATION_MODELS, **_EMBEDDING_MODELS, **_MULTIMODAL_MODELS, **_SPECULATIVE_DECODING_MODELS, } -# Architecture -> type or (module, class). -# out of tree models -_OOT_MODELS: Dict[str, Type[nn.Module]] = {} -_OOT_MODELS_LAZY: Dict[str, Tuple[str, str]] = {} - # Models not supported by ROCm. 
_ROCM_UNSUPPORTED_MODELS: List[str] = [] @@ -156,101 +152,124 @@ @dataclass -class _ModelInterfaces: +class _ModelInfo: is_text_generation_model: bool is_embedding_model: bool supports_multimodal: bool supports_pp: bool - -def _inspect_model(model: object) -> _ModelInterfaces: - return _ModelInterfaces( - is_text_generation_model=is_text_generation_model(model), - is_embedding_model=is_embedding_model(model), - supports_multimodal=supports_multimodal(model), - supports_pp=supports_pp(model), - ) - - -def _inspect_model_lazy(mod_name: str, cls_name: str) -> _ModelInterfaces: - mod = importlib.import_module(mod_name) - klass = getattr(mod, cls_name) - return _inspect_model(klass) + @staticmethod + def from_model_cls(model: Type[nn.Module]) -> "_ModelInfo": + return _ModelInfo( + is_text_generation_model=is_text_generation_model(model), + is_embedding_model=is_embedding_model(model), + supports_multimodal=supports_multimodal(model), + supports_pp=supports_pp(model), + ) -class ModelRegistry: +class _BaseRegisteredModel(ABC): - @staticmethod - def _get_module_cls_name(model_arch: str) -> Tuple[str, str]: - if model_arch in _MODELS: - module_relname, cls_name = _MODELS[model_arch] - return f"vllm.model_executor.models.{module_relname}", cls_name + @abstractmethod + def inspect_model_cls(self) -> _ModelInfo: + raise NotImplementedError - if model_arch in _OOT_MODELS_LAZY: - return _OOT_MODELS_LAZY[model_arch] + @abstractmethod + def load_model_cls(self) -> Type[nn.Module]: + raise NotImplementedError - raise KeyError(model_arch) - @staticmethod - @lru_cache(maxsize=128) - def _try_get_model_stateful(model_arch: str) -> Optional[Type[nn.Module]]: - try: - mod_name, cls_name = ModelRegistry._get_module_cls_name(model_arch) - except KeyError: - return None +@dataclass +class _RegisteredModel(_BaseRegisteredModel): + """ + Represents a model that has already been imported in the main process. + """ - module = importlib.import_module(mod_name) - return getattr(module, cls_name, None) + interfaces: _ModelInfo + model_cls: Type[nn.Module] @staticmethod - def _try_get_model_stateless(model_arch: str) -> Optional[Type[nn.Module]]: - if model_arch in _OOT_MODELS: - return _OOT_MODELS[model_arch] - - if is_hip(): - if model_arch in _ROCM_UNSUPPORTED_MODELS: - raise ValueError( - f"Model architecture {model_arch} is not supported by " - "ROCm for now.") - if model_arch in _ROCM_PARTIALLY_SUPPORTED_MODELS: - logger.warning( - "Model architecture %s is partially supported by ROCm: %s", - model_arch, _ROCM_PARTIALLY_SUPPORTED_MODELS[model_arch]) + def from_model_cls(model_cls: Type[nn.Module]): + return _RegisteredModel( + interfaces=_ModelInfo.from_model_cls(model_cls), + model_cls=model_cls, + ) - return None + def inspect_model_cls(self) -> _ModelInfo: + return self.interfaces - @staticmethod - def _try_load_model_cls(model_arch: str) -> Optional[Type[nn.Module]]: - model = ModelRegistry._try_get_model_stateless(model_arch) - if model is not None: - return model + def load_model_cls(self) -> Type[nn.Module]: + return self.model_cls - return ModelRegistry._try_get_model_stateful(model_arch) - @staticmethod - def resolve_model_cls( - architectures: Union[str, List[str]], ) -> Tuple[Type[nn.Module], str]: - if isinstance(architectures, str): - architectures = [architectures] - if not architectures: - logger.warning("No model architectures are specified") +@dataclass +class _LazyRegisteredModel(_BaseRegisteredModel): + """ + Represents a model that has not been imported in the main process. 
+ """ + module_name: str + class_name: str + + # Performed in another process to avoid initializing CUDA + def inspect_model_cls(self) -> _ModelInfo: + return _run_in_subprocess( + lambda: _ModelInfo.from_model_cls(self.load_model_cls())) + + def load_model_cls(self) -> Type[nn.Module]: + mod = importlib.import_module(self.module_name) + return getattr(mod, self.class_name) + + +@lru_cache(maxsize=128) +def _try_load_model_cls( + model_arch: str, + model: _BaseRegisteredModel, +) -> Optional[Type[nn.Module]]: + if is_hip(): + if model_arch in _ROCM_UNSUPPORTED_MODELS: + raise ValueError(f"Model architecture '{model_arch}' is not " + "supported by ROCm for now.") + + if model_arch in _ROCM_PARTIALLY_SUPPORTED_MODELS: + msg = _ROCM_PARTIALLY_SUPPORTED_MODELS[model_arch] + logger.warning( + "Model architecture '%s' is partially " + "supported by ROCm: %s", model_arch, msg) + + try: + return model.load_model_cls() + except Exception: + logger.exception("Error in loading model architecture '%s'", + model_arch) + return None - for arch in architectures: - model_cls = ModelRegistry._try_load_model_cls(arch) - if model_cls is not None: - return (model_cls, arch) - raise ValueError( - f"Model architectures {architectures} are not supported for now. " - f"Supported architectures: {ModelRegistry.get_supported_archs()}") +@lru_cache(maxsize=128) +def _try_inspect_model_cls( + model_arch: str, + model: _BaseRegisteredModel, +) -> Optional[_ModelInfo]: + try: + return model.inspect_model_cls() + except Exception: + logger.exception("Error in inspecting model architecture '%s'", + model_arch) + return None - @staticmethod - def get_supported_archs() -> List[str]: - return list(_MODELS.keys()) + list(_OOT_MODELS.keys()) - @staticmethod - def register_model(model_arch: str, model_cls: Union[Type[nn.Module], - str]): +@dataclass +class _ModelRegistry: + # Keyed by model_arch + models: Dict[str, _BaseRegisteredModel] = field(default_factory=dict) + + def get_supported_archs(self) -> List[str]: + return list(self.models.keys()) + + def register_model( + self, + model_arch: str, + model_cls: Union[Type[nn.Module], str], + ) -> None: """ Register an external model to be used in vLLM. @@ -262,7 +281,7 @@ def register_model(model_arch: str, model_cls: Union[Type[nn.Module], when importing the model and thus the related error :code:`RuntimeError: Cannot re-initialize CUDA in forked subprocess`. """ - if model_arch in _MODELS: + if model_arch in self.models: logger.warning( "Model architecture %s is already registered, and will be " "overwritten by the new model class %s.", model_arch, @@ -274,13 +293,36 @@ def register_model(model_arch: str, model_cls: Union[Type[nn.Module], msg = "Expected a string in the format `:`" raise ValueError(msg) - module_name, cls_name = split_str - _OOT_MODELS_LAZY[model_arch] = module_name, cls_name + model = _LazyRegisteredModel(*split_str) else: - _OOT_MODELS[model_arch] = model_cls + model = _RegisteredModel.from_model_cls(model_cls) - @staticmethod - def _normalize_archs(architectures: Union[str, List[str]]) -> List[str]: + self.models[model_arch] = model + + def _raise_for_unsupported(self, architectures: List[str]): + all_supported_archs = self.get_supported_archs() + + raise ValueError( + f"Model architectures {architectures} are not supported for now. 
" + f"Supported architectures: {all_supported_archs}") + + def _try_load_model_cls(self, + model_arch: str) -> Optional[Type[nn.Module]]: + if model_arch not in self.models: + return None + + return _try_load_model_cls(model_arch, self.models[model_arch]) + + def _try_inspect_model_cls(self, model_arch: str) -> Optional[_ModelInfo]: + if model_arch not in self.models: + return None + + return _try_inspect_model_cls(model_arch, self.models[model_arch]) + + def _normalize_archs( + self, + architectures: Union[str, List[str]], + ) -> List[str]: if isinstance(architectures, str): architectures = [architectures] if not architectures: @@ -288,51 +330,64 @@ def _normalize_archs(architectures: Union[str, List[str]]) -> List[str]: return architectures - @staticmethod - @lru_cache(maxsize=128) - def _inspect_stateless(model_arch: str) -> _ModelInterfaces: - """ - Inspect the interfaces that are implemented by a model. - - If the model is not already imported, the inspection is done inside a - subprocess to avoid initializing CUDA for the main program. - """ - model = ModelRegistry._try_get_model_stateless(model_arch) - if model is not None: - return _inspect_model(model) - - try: - mod_name, cls_name = ModelRegistry._get_module_cls_name(model_arch) - except KeyError: - raise - - inspect_fn = partial(_inspect_model_lazy, mod_name, cls_name) - return _run_in_subprocess(inspect_fn) + def inspect_model_cls( + self, + architectures: Union[str, List[str]], + ) -> _ModelInfo: + architectures = self._normalize_archs(architectures) - @staticmethod - def is_text_generation_model(architectures: Union[str, List[str]]) -> bool: - return any( - ModelRegistry._inspect_stateless(arch).is_text_generation_model - for arch in ModelRegistry._normalize_archs(architectures)) + for arch in architectures: + model_info = self._try_inspect_model_cls(arch) + if model_info is not None: + return model_info - @staticmethod - def is_embedding_model(architectures: Union[str, List[str]]) -> bool: - return any( - ModelRegistry._inspect_stateless(arch).is_embedding_model - for arch in ModelRegistry._normalize_archs(architectures)) + return self._raise_for_unsupported(architectures) - @staticmethod - def is_multimodal_model(architectures: Union[str, List[str]]) -> bool: - return any( - ModelRegistry._inspect_stateless(arch).supports_multimodal - for arch in ModelRegistry._normalize_archs(architectures)) + def resolve_model_cls( + self, + architectures: Union[str, List[str]], + ) -> Tuple[Type[nn.Module], str]: + architectures = self._normalize_archs(architectures) - @staticmethod - def is_pp_supported_model(architectures: Union[str, List[str]]) -> bool: - return any( - ModelRegistry._inspect_stateless(arch).supports_pp - for arch in ModelRegistry._normalize_archs(architectures)) + for arch in architectures: + model_cls = self._try_load_model_cls(arch) + if model_cls is not None: + return (model_cls, arch) + return self._raise_for_unsupported(architectures) + + def is_text_generation_model( + self, + architectures: Union[str, List[str]], + ) -> bool: + return self.inspect_model_cls(architectures).is_text_generation_model + + def is_embedding_model( + self, + architectures: Union[str, List[str]], + ) -> bool: + return self.inspect_model_cls(architectures).is_embedding_model + + def is_multimodal_model( + self, + architectures: Union[str, List[str]], + ) -> bool: + return self.inspect_model_cls(architectures).supports_multimodal + + def is_pp_supported_model( + self, + architectures: Union[str, List[str]], + ) -> bool: + return 
self.inspect_model_cls(architectures).supports_pp + + +ModelRegistry = _ModelRegistry({ + model_arch: _LazyRegisteredModel( + module_name=f"vllm.model_executor.models.{mod_relname}", + class_name=cls_name, + ) + for model_arch, (mod_relname, cls_name) in _VLLM_MODELS.items() +}) _T = TypeVar("_T") From a12408ff334fed0c4ebc65e422207a4bb15fae20 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 10 Oct 2024 15:59:11 +0000 Subject: [PATCH 3/6] Fix unhashable --- vllm/model_executor/models/registry.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 2ef7f5d51ead..8b30b6138509 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -151,7 +151,7 @@ } -@dataclass +@dataclass(frozen=True) class _ModelInfo: is_text_generation_model: bool is_embedding_model: bool @@ -179,7 +179,7 @@ def load_model_cls(self) -> Type[nn.Module]: raise NotImplementedError -@dataclass +@dataclass(frozen=True) class _RegisteredModel(_BaseRegisteredModel): """ Represents a model that has already been imported in the main process. @@ -202,7 +202,7 @@ def load_model_cls(self) -> Type[nn.Module]: return self.model_cls -@dataclass +@dataclass(frozen=True) class _LazyRegisteredModel(_BaseRegisteredModel): """ Represents a model that has not been imported in the main process. From 6ee76f7902419b77587a2004566567019e77bba0 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 11 Oct 2024 04:37:22 +0000 Subject: [PATCH 4/6] Avoid failing in multiprocessing --- vllm/engine/arg_utils.py | 2 ++ vllm/engine/async_llm_engine.py | 4 ++++ vllm/engine/llm_engine.py | 4 ++++ vllm/engine/multiprocessing/engine.py | 3 +++ 4 files changed, 13 insertions(+) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index cae95d20ca23..efdcec4ab797 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -183,6 +183,8 @@ class EngineArgs: def __post_init__(self): if self.tokenizer is None: self.tokenizer = self.model + + # Setup plugins from vllm.plugins import load_general_plugins load_general_plugins() diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 30e1a09981c5..bc4f98df25b9 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -661,6 +661,10 @@ def from_engine_args( stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, ) -> "AsyncLLMEngine": """Creates an async LLM engine from the engine arguments.""" + # Setup plugins + from vllm.plugins import load_general_plugins + load_general_plugins() + # Create the engine configs. if engine_config is None: engine_config = engine_args.create_engine_config() diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 510ffac6f689..e4cdf50f0fd4 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -567,6 +567,10 @@ def from_engine_args( stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, ) -> "LLMEngine": """Creates an LLM engine from the engine arguments.""" + # Setup plugins + from vllm.plugins import load_general_plugins + load_general_plugins() + # Create the engine configs. 
engine_config = engine_args.create_engine_config() executor_class = cls._get_executor_cls(engine_config) diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index eecca82cd2f7..ebe926f981ab 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -130,6 +130,9 @@ def dead_error(self) -> BaseException: def from_engine_args(cls, engine_args: AsyncEngineArgs, usage_context: UsageContext, ipc_path: str): """Creates an MQLLMEngine from the engine arguments.""" + # Setup plugins + from vllm.plugins import load_general_plugins + load_general_plugins() engine_config = engine_args.create_engine_config() From 7722699d52c6348b8ef14bd6f5cd14f61e085cad Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 11 Oct 2024 04:39:45 +0000 Subject: [PATCH 5/6] Remove unnecessary calls --- vllm/engine/async_llm_engine.py | 4 ---- vllm/engine/llm_engine.py | 4 ---- vllm/engine/multiprocessing/engine.py | 2 +- 3 files changed, 1 insertion(+), 9 deletions(-) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index bc4f98df25b9..30e1a09981c5 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -661,10 +661,6 @@ def from_engine_args( stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, ) -> "AsyncLLMEngine": """Creates an async LLM engine from the engine arguments.""" - # Setup plugins - from vllm.plugins import load_general_plugins - load_general_plugins() - # Create the engine configs. if engine_config is None: engine_config = engine_args.create_engine_config() diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index e4cdf50f0fd4..510ffac6f689 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -567,10 +567,6 @@ def from_engine_args( stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, ) -> "LLMEngine": """Creates an LLM engine from the engine arguments.""" - # Setup plugins - from vllm.plugins import load_general_plugins - load_general_plugins() - # Create the engine configs. engine_config = engine_args.create_engine_config() executor_class = cls._get_executor_cls(engine_config) diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index ebe926f981ab..d68970e1da24 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -130,7 +130,7 @@ def dead_error(self) -> BaseException: def from_engine_args(cls, engine_args: AsyncEngineArgs, usage_context: UsageContext, ipc_path: str): """Creates an MQLLMEngine from the engine arguments.""" - # Setup plugins + # Setup plugins for each process from vllm.plugins import load_general_plugins load_general_plugins() From 44ae63220d3abad13bca2b5198db562d7878fd6e Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 11 Oct 2024 05:04:25 +0000 Subject: [PATCH 6/6] Setup plugins in subprocess --- vllm/model_executor/models/registry.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 8b30b6138509..b37452877cf0 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -417,6 +417,10 @@ def _run_in_subprocess(fn: Callable[[], _T]) -> _T: def _run() -> None: + # Setup plugins + from vllm.plugins import load_general_plugins + load_general_plugins() + fn, output_file = pickle.loads(sys.stdin.buffer.read()) result = fn()
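A minimal usage sketch of the reworked registry introduced in this series, using `LlamaForCausalLM` (an in-tree architecture) as the example. Interface queries now resolve through a single cached `_ModelInfo` per architecture; because in-tree models are registered lazily, that inspection runs in a subprocess, so the main process neither imports the model class nor initializes CUDA:

    from vllm import ModelRegistry

    arch = "LlamaForCausalLM"

    # Each architecture is inspected at most once; these calls share the same
    # cached _ModelInfo produced by the subprocess inspection.
    print(ModelRegistry.is_text_generation_model(arch))
    print(ModelRegistry.is_embedding_model(arch))
    print(ModelRegistry.is_multimodal_model(arch))
    print(ModelRegistry.is_pp_supported_model(arch))

    # Resolving the class (as the worker does) imports it in this process.
    model_cls, resolved_arch = ModelRegistry.resolve_model_cls([arch])
    print(model_cls.__name__, resolved_arch)

Note that, unlike the old `_check_stateless(..., default=False)` path, querying an unknown architecture now raises `ValueError` from `inspect_model_cls` instead of silently returning `False`.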
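The one-line `frozen=True` changes in the third patch are what make the new caching work: `_try_load_model_cls` and `_try_inspect_model_cls` are decorated with `functools.lru_cache` and take the registered-model dataclass as part of the cache key, and a non-frozen dataclass with the default `eq=True` has `__hash__` set to `None`. A self-contained reproduction of that failure mode (the names below are illustrative, not vLLM's):

    from dataclasses import dataclass
    from functools import lru_cache


    @dataclass
    class MutableEntry:
        name: str


    @dataclass(frozen=True)
    class FrozenEntry:
        name: str


    @lru_cache(maxsize=8)
    def lookup(entry: object) -> str:
        return f"resolved {entry}"


    print(lookup(FrozenEntry(name="OPTForCausalLM")))  # frozen => hashable, cacheable

    try:
        lookup(MutableEntry(name="OPTForCausalLM"))
    except TypeError as exc:
        # lru_cache hashes its arguments, so the mutable dataclass is rejected.
        print(f"rejected by lru_cache: {exc}")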
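The last three patches move plugin loading into every process that needs it. The inspection helper runs the registry module in a fresh interpreter via `subprocess.run`, so models registered by a general plugin in the parent process are not automatically visible there; calling `load_general_plugins()` inside `_run()` (and in `MQLLMEngine.from_engine_args`) re-applies those registrations. A sketch of the kind of plugin this protects, assuming the entry-point group name `vllm.general_plugins` and a hypothetical package `my_plugin`:

    # my_plugin/__init__.py
    def register() -> None:
        from vllm import ModelRegistry

        # Re-registering only logs a warning, so this is safe to run in every
        # process that loads plugins (engine, workers, registry subprocess).
        ModelRegistry.register_model(
            "MyForCausalLM", "my_plugin.modeling:MyForCausalLM")

    # pyproject.toml (assumed wiring):
    # [project.entry-points."vllm.general_plugins"]
    # my_plugin = "my_plugin:register"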