From 36c771a710a25f201f63dff6d565bce4556ec774 Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Tue, 26 Dec 2023 16:32:54 +0800 Subject: [PATCH 01/20] dev --- qwen_vl_demo.py | 0 xinference/model/multimodal/__init__.py | 0 xinference/model/multimodal/core.py | 244 ++++++++++++++++++ xinference/model/multimodal/model_spec.json | 0 xinference/model/multimodal/tests/__init__.py | 0 .../model/multimodal/tests/test_multimodal.py | 0 6 files changed, 244 insertions(+) create mode 100644 qwen_vl_demo.py create mode 100644 xinference/model/multimodal/__init__.py create mode 100644 xinference/model/multimodal/core.py create mode 100644 xinference/model/multimodal/model_spec.json create mode 100644 xinference/model/multimodal/tests/__init__.py create mode 100644 xinference/model/multimodal/tests/test_multimodal.py diff --git a/qwen_vl_demo.py b/qwen_vl_demo.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/xinference/model/multimodal/__init__.py b/xinference/model/multimodal/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/xinference/model/multimodal/core.py b/xinference/model/multimodal/core.py new file mode 100644 index 0000000000..884f986a2b --- /dev/null +++ b/xinference/model/multimodal/core.py @@ -0,0 +1,244 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import abc +import logging +import os +import platform +from abc import abstractmethod +from typing import TYPE_CHECKING, List, Optional, Tuple, Union + +from ...core.utils import parse_replica_model_uid +from ..core import ModelDescription + +if TYPE_CHECKING: + from .llm_family import LLMFamilyV1, LLMSpecV1 + +logger = logging.getLogger(__name__) + + +class LLM(abc.ABC): + def __init__( + self, + replica_model_uid: str, + model_family: "LLMFamilyV1", + model_spec: "LLMSpecV1", + quantization: str, + model_path: str, + *args, + **kwargs, + ): + self.model_uid, self.replica, self.rep_id = parse_replica_model_uid( + replica_model_uid + ) + self.model_family = model_family + self.model_spec = model_spec + self.quantization = quantization + self.model_path = model_path + if args: + raise ValueError(f"Unrecognized positional arguments: {args}") + if kwargs: + raise ValueError(f"Unrecognized keyword arguments: {kwargs}") + + @staticmethod + def handle_model_size(model_size_in_billions: Union[str, int]) -> Union[int, float]: + if isinstance(model_size_in_billions, str): + if "_" in model_size_in_billions: + ms = model_size_in_billions.replace("_", ".") + return float(ms) + else: + raise ValueError("Invalid format for `model_size_in_billions`") + return model_size_in_billions + + @staticmethod + def _is_darwin_and_apple_silicon(): + return platform.system() == "Darwin" and platform.processor() == "arm" + + @staticmethod + def _is_linux(): + return platform.system() == "Linux" + + @staticmethod + def _has_cuda_device(): + from ...utils import cuda_count + + return cuda_count() > 0 + + @staticmethod + def _get_cuda_count(): + from ...utils import cuda_count + + cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES", None) + if cuda_visible_devices is None: + return cuda_count() + + if cuda_visible_devices == "-1": + return 0 + else: + return len(cuda_visible_devices.split(",")) + + @abstractmethod + def load(self): + raise NotImplementedError + + @classmethod + def match( + cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str + ) -> bool: + raise NotImplementedError + + +class LLMDescription(ModelDescription): + def __init__( + self, + address: Optional[str], + devices: Optional[List[str]], + llm_family: "LLMFamilyV1", + llm_spec: "LLMSpecV1", + quantization: Optional[str], + ): + super().__init__(address, devices) + self._llm_family = llm_family + self._llm_spec = llm_spec + self._quantization = quantization + + def to_dict(self): + return { + "model_type": "LLM", + "address": self.address, + "accelerators": self.devices, + "model_name": self._llm_family.model_name, + "model_lang": self._llm_family.model_lang, + "model_ability": self._llm_family.model_ability, + "model_description": self._llm_family.model_description, + "model_format": self._llm_spec.model_format, + "model_size_in_billions": self._llm_spec.model_size_in_billions, + "quantization": self._quantization, + "model_hub": self._llm_spec.model_hub, + "revision": self._llm_spec.model_revision, + "context_length": self._llm_family.context_length, + } + + +def create_llm_model_instance( + subpool_addr: str, + devices: List[str], + model_uid: str, + model_name: str, + model_format: Optional[str] = None, + model_size_in_billions: Optional[int] = None, + quantization: Optional[str] = None, + is_local_deployment: bool = False, + **kwargs, +) -> Tuple[LLM, LLMDescription]: + from . 
import match_llm, match_llm_cls + from .llm_family import cache + + match_result = match_llm( + model_name, + model_format, + model_size_in_billions, + quantization, + is_local_deployment, + ) + if not match_result: + raise ValueError( + f"Model not found, name: {model_name}, format: {model_format}," + f" size: {model_size_in_billions}, quantization: {quantization}" + ) + llm_family, llm_spec, quantization = match_result + + assert quantization is not None + save_path = cache(llm_family, llm_spec, quantization) + + llm_cls = match_llm_cls(llm_family, llm_spec, quantization) + if not llm_cls: + raise ValueError( + f"Model not supported, name: {model_name}, format: {model_format}," + f" size: {model_size_in_billions}, quantization: {quantization}" + ) + logger.debug(f"Launching {model_uid} with {llm_cls.__name__}") + + model = llm_cls(model_uid, llm_family, llm_spec, quantization, save_path, kwargs) + return model, LLMDescription( + subpool_addr, devices, llm_family, llm_spec, quantization + ) + + +def create_speculative_llm_model_instance( + subpool_addr: str, + devices: List[str], + model_uid: str, + model_name: str, + model_size_in_billions: Optional[int], + quantization: Optional[str], + draft_model_name: str, + draft_model_size_in_billions: Optional[int], + draft_quantization: Optional[str], + is_local_deployment: bool = False, +) -> Tuple[LLM, LLMDescription]: + from . import match_llm + from .llm_family import cache + + match_result = match_llm( + model_name, + "pytorch", + model_size_in_billions, + quantization, + is_local_deployment, + ) + + if not match_result: + raise ValueError( + f"Model not found, name: {model_name}, format: pytorch," + f" size: {model_size_in_billions}, quantization: {quantization}" + ) + llm_family, llm_spec, quantization = match_result + assert quantization is not None + save_path = cache(llm_family, llm_spec, quantization) + + draft_match_result = match_llm( + draft_model_name, + "pytorch", + draft_model_size_in_billions, + draft_quantization, + is_local_deployment, + ) + + if not draft_match_result: + raise ValueError( + f"Model not found, name: {draft_model_name}, format: pytorch," + f" size: {draft_model_size_in_billions}, quantization: {draft_quantization}" + ) + draft_llm_family, draft_llm_spec, draft_quantization = draft_match_result + assert draft_quantization is not None + draft_save_path = cache(draft_llm_family, draft_llm_spec, draft_quantization) + + from .pytorch.spec_model import SpeculativeModel + + model = SpeculativeModel( + model_uid, + model_family=llm_family, + model_spec=llm_spec, + quantization=quantization, + model_path=save_path, + draft_model_family=draft_llm_family, + draft_model_spec=draft_llm_spec, + draft_quantization=draft_quantization, + draft_model_path=draft_save_path, + ) + + return model, LLMDescription( + subpool_addr, devices, llm_family, llm_spec, quantization + ) diff --git a/xinference/model/multimodal/model_spec.json b/xinference/model/multimodal/model_spec.json new file mode 100644 index 0000000000..e69de29bb2 diff --git a/xinference/model/multimodal/tests/__init__.py b/xinference/model/multimodal/tests/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/xinference/model/multimodal/tests/test_multimodal.py b/xinference/model/multimodal/tests/test_multimodal.py new file mode 100644 index 0000000000..e69de29bb2 From d1b9a27cbf197d34428632b0da17fda15e408a85 Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Tue, 26 Dec 2023 16:33:40 +0800 Subject: [PATCH 02/20] dev --- xinference/model/core.py | 6 + 
xinference/model/multimodal/__init__.py | 37 +++ xinference/model/multimodal/core.py | 276 +++++++++++--------- xinference/model/multimodal/model_spec.json | 37 +++ 4 files changed, 232 insertions(+), 124 deletions(-) diff --git a/xinference/model/core.py b/xinference/model/core.py index 9414c504e4..3494c8dc51 100644 --- a/xinference/model/core.py +++ b/xinference/model/core.py @@ -45,6 +45,7 @@ def create_model_instance( from .image.core import create_image_model_instance from .llm.core import create_llm_model_instance from .rerank.core import create_rerank_model_instance + from .multimodal.core import create_multimodal_model_instance if model_type == "LLM": return create_llm_model_instance( @@ -74,5 +75,10 @@ def create_model_instance( return create_rerank_model_instance( subpool_addr, devices, model_uid, model_name, **kwargs ) + elif model_type == "multimodal": + kwargs.pop("trust_remote_code", None) + return create_multimodal_model_instance( + subpool_addr, devices, model_uid, model_name, **kwargs + ) else: raise ValueError(f"Unsupported model type: {model_type}.") diff --git a/xinference/model/multimodal/__init__.py b/xinference/model/multimodal/__init__.py index e69de29bb2..1f9c3acee0 100644 --- a/xinference/model/multimodal/__init__.py +++ b/xinference/model/multimodal/__init__.py @@ -0,0 +1,37 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
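With the new "multimodal" branch in create_model_instance above, a standard client launch that passes model_type="multimodal" is routed to create_multimodal_model_instance. A minimal usage sketch of that flow, assuming a locally running Xinference endpoint (the URL below is illustrative, not part of this patch):

from xinference.client import Client

client = Client("http://127.0.0.1:9997")
# model_type="multimodal" goes through the dispatcher branch added above.
model_uid = client.launch_model(model_name="qwen-vl-chat", model_type="multimodal")
model = client.get_model(model_uid)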
+ +import codecs +import json +import os + +from .core import ( + BUILTIN_LVLM_FAMILIES, + BUILTIN_MODELSCOPE_LVLM_FAMILIES, + LVLM, + LVLMFamilyV1, + LVLMPromptStyleV1, +) + + +def _install(): + json_path = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "llm_family.json" + ) + for json_obj in json.load(codecs.open(json_path, "r", encoding="utf-8")): + model_spec = LVLMFamilyV1.parse_obj(json_obj) + BUILTIN_LVLM_FAMILIES.append(model_spec) + + +_install() diff --git a/xinference/model/multimodal/core.py b/xinference/model/multimodal/core.py index 884f986a2b..1d6891ff48 100644 --- a/xinference/model/multimodal/core.py +++ b/xinference/model/multimodal/core.py @@ -17,23 +17,99 @@ import os import platform from abc import abstractmethod -from typing import TYPE_CHECKING, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, List, Literal, Optional, Tuple, Union + +from pydantic import BaseModel, validator from ...core.utils import parse_replica_model_uid from ..core import ModelDescription - -if TYPE_CHECKING: - from .llm_family import LLMFamilyV1, LLMSpecV1 +from ..utils import download_from_modelscope logger = logging.getLogger(__name__) +DEFAULT_CONTEXT_LENGTH = 2048 + + +class LVLMSpecV1(BaseModel): + model_format: Literal["pytorch", "gptq"] + # Must in order that `str` first, then `int` + model_size_in_billions: Union[str, int] + quantizations: List[str] + model_id: str + model_hub: str = "huggingface" + model_uri: Optional[str] + model_revision: Optional[str] + + @validator("model_size_in_billions", pre=False) + def validate_model_size_with_radix(cls, v: object) -> object: + if isinstance(v, str): + if ( + "_" in v + ): # for example, "1_8" just returns "1_8", otherwise int("1_8") returns 18 + return v + else: + return int(v) + return v + + +class LVLMPromptStyleV1(BaseModel): + style_name: str + system_prompt: str = "" + roles: List[str] + image_formatter: str = "" + text_formatter: str = "" + sep: str = "" + + +class LVLMFamilyV1(BaseModel): + version: Literal[1] + context_length: Optional[int] = DEFAULT_CONTEXT_LENGTH + model_name: str + model_lang: List[str] + model_ability: List[Literal["chat"]] + model_description: Optional[str] + model_specs: List["LVLMSpecV1"] + prompt_style: Optional["LVLMPromptStyleV1"] + + +class LVLMDescription(ModelDescription): + def __init__( + self, + address: Optional[str], + devices: Optional[List[str]], + llm_family: "LVLMFamilyV1", + llm_spec: "LVLMSpecV1", + quantization: Optional[str], + ): + super().__init__(address, devices) + self._llm_family = llm_family + self._llm_spec = llm_spec + self._quantization = quantization + + def to_dict(self): + return { + "model_type": "LLM", + "address": self.address, + "accelerators": self.devices, + "model_name": self._llm_family.model_name, + "model_lang": self._llm_family.model_lang, + "model_ability": self._llm_family.model_ability, + "model_description": self._llm_family.model_description, + "model_format": self._llm_spec.model_format, + "model_size_in_billions": self._llm_spec.model_size_in_billions, + "quantization": self._quantization, + "model_hub": self._llm_spec.model_hub, + "revision": self._llm_spec.model_revision, + "context_length": self._llm_family.context_length, + } + -class LLM(abc.ABC): +class LVLM(abc.ABC): def __init__( self, replica_model_uid: str, - model_family: "LLMFamilyV1", - model_spec: "LLMSpecV1", + model_family: "LVLMFamilyV1", + model_spec: "LVLMSpecV1", quantization: str, model_path: str, *args, @@ -94,44 +170,73 @@ def load(self): @classmethod def match( 
- cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str + cls, llm_family: "LVLMFamilyV1", llm_spec: "LVLMSpecV1", quantization: str ) -> bool: raise NotImplementedError -class LLMDescription(ModelDescription): - def __init__( - self, - address: Optional[str], - devices: Optional[List[str]], - llm_family: "LLMFamilyV1", - llm_spec: "LLMSpecV1", - quantization: Optional[str], - ): - super().__init__(address, devices) - self._llm_family = llm_family - self._llm_spec = llm_spec - self._quantization = quantization +BUILTIN_LVLM_FAMILIES: List["LVLMFamilyV1"] = [] +BUILTIN_MODELSCOPE_LVLM_FAMILIES: List["LVLMFamilyV1"] = [] - def to_dict(self): - return { - "model_type": "LLM", - "address": self.address, - "accelerators": self.devices, - "model_name": self._llm_family.model_name, - "model_lang": self._llm_family.model_lang, - "model_ability": self._llm_family.model_ability, - "model_description": self._llm_family.model_description, - "model_format": self._llm_spec.model_format, - "model_size_in_billions": self._llm_spec.model_size_in_billions, - "quantization": self._quantization, - "model_hub": self._llm_spec.model_hub, - "revision": self._llm_spec.model_revision, - "context_length": self._llm_family.context_length, - } + +def match_multimodal( + model_name: str, + model_format: Optional[str] = None, + model_size_in_billions: Optional[int] = None, + quantization: Optional[str] = None, +) -> Optional[Tuple[LVLMFamilyV1, LVLMSpecV1, str]]: + """ + Find an LLM family, spec, and quantization that satisfy given criteria. + """ + + def _match_quantization(q: Union[str, None], quantizations: List[str]): + # Currently, the quantization name could include both uppercase and lowercase letters, + # so it is necessary to ensure that the case sensitivity does not + # affect the matching results. + if q is None: + return q + for quant in quantizations: + if q.lower() == quant.lower(): + return quant + + def _apply_format_to_model_id(spec: LVLMSpecV1, q: str) -> LVLMSpecV1: + # Different quantized versions of some models use different model ids, + # Here we check the `{}` in the model id to format the id. + if "{" in spec.model_id: + spec.model_id = spec.model_id.format(quantization=q) + return spec + + if download_from_modelscope(): + all_families = BUILTIN_MODELSCOPE_LVLM_FAMILIES + else: + all_families = BUILTIN_LVLM_FAMILIES + + for family in all_families: + if model_name != family.model_name: + continue + for spec in family.model_specs: + matched_quantization = _match_quantization(quantization, spec.quantizations) + if ( + model_format + and model_format != spec.model_format + or model_size_in_billions + and model_size_in_billions != spec.model_size_in_billions + or quantization + and matched_quantization is None + ): + continue + if quantization: + return ( + family, + _apply_format_to_model_id(spec, matched_quantization), + matched_quantization, + ) + else: + return family, _apply_format_to_model_id(spec, "none"), "none" + return None -def create_llm_model_instance( +def create_multimodal_model_instance( subpool_addr: str, devices: List[str], model_uid: str, @@ -139,106 +244,29 @@ def create_llm_model_instance( model_format: Optional[str] = None, model_size_in_billions: Optional[int] = None, quantization: Optional[str] = None, - is_local_deployment: bool = False, **kwargs, -) -> Tuple[LLM, LLMDescription]: - from . 
import match_llm, match_llm_cls - from .llm_family import cache +) -> Tuple[LVLM, LVLMDescription]: + from ..llm.llm_family import cache - match_result = match_llm( + match_result = match_multimodal( model_name, model_format, model_size_in_billions, quantization, - is_local_deployment, ) if not match_result: raise ValueError( f"Model not found, name: {model_name}, format: {model_format}," f" size: {model_size_in_billions}, quantization: {quantization}" ) - llm_family, llm_spec, quantization = match_result + model_family, model_spec, quantization = match_result assert quantization is not None - save_path = cache(llm_family, llm_spec, quantization) + save_path = cache(model_family, model_spec, quantization) - llm_cls = match_llm_cls(llm_family, llm_spec, quantization) - if not llm_cls: - raise ValueError( - f"Model not supported, name: {model_name}, format: {model_format}," - f" size: {model_size_in_billions}, quantization: {quantization}" - ) - logger.debug(f"Launching {model_uid} with {llm_cls.__name__}") - - model = llm_cls(model_uid, llm_family, llm_spec, quantization, save_path, kwargs) - return model, LLMDescription( - subpool_addr, devices, llm_family, llm_spec, quantization - ) - - -def create_speculative_llm_model_instance( - subpool_addr: str, - devices: List[str], - model_uid: str, - model_name: str, - model_size_in_billions: Optional[int], - quantization: Optional[str], - draft_model_name: str, - draft_model_size_in_billions: Optional[int], - draft_quantization: Optional[str], - is_local_deployment: bool = False, -) -> Tuple[LLM, LLMDescription]: - from . import match_llm - from .llm_family import cache - - match_result = match_llm( - model_name, - "pytorch", - model_size_in_billions, - quantization, - is_local_deployment, - ) - - if not match_result: - raise ValueError( - f"Model not found, name: {model_name}, format: pytorch," - f" size: {model_size_in_billions}, quantization: {quantization}" - ) - llm_family, llm_spec, quantization = match_result - assert quantization is not None - save_path = cache(llm_family, llm_spec, quantization) - - draft_match_result = match_llm( - draft_model_name, - "pytorch", - draft_model_size_in_billions, - draft_quantization, - is_local_deployment, - ) - - if not draft_match_result: - raise ValueError( - f"Model not found, name: {draft_model_name}, format: pytorch," - f" size: {draft_model_size_in_billions}, quantization: {draft_quantization}" - ) - draft_llm_family, draft_llm_spec, draft_quantization = draft_match_result - assert draft_quantization is not None - draft_save_path = cache(draft_llm_family, draft_llm_spec, draft_quantization) - - from .pytorch.spec_model import SpeculativeModel - - model = SpeculativeModel( - model_uid, - model_family=llm_family, - model_spec=llm_spec, - quantization=quantization, - model_path=save_path, - draft_model_family=draft_llm_family, - draft_model_spec=draft_llm_spec, - draft_quantization=draft_quantization, - draft_model_path=draft_save_path, - ) + logger.debug(f"Launching {model_uid} with {LVLM.__name__}") - return model, LLMDescription( - subpool_addr, devices, llm_family, llm_spec, quantization + model = LVLM(model_uid, model_family, model_spec, quantization, save_path, kwargs) + return model, LVLMDescription( + subpool_addr, devices, model_family, model_spec, quantization ) diff --git a/xinference/model/multimodal/model_spec.json b/xinference/model/multimodal/model_spec.json index e69de29bb2..ef160ee6f4 100644 --- a/xinference/model/multimodal/model_spec.json +++ 
b/xinference/model/multimodal/model_spec.json @@ -0,0 +1,37 @@ +[ + { + "version": 1, + "context_length": 4096, + "model_name": "qwen-vl-chat", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat" + ], + "model_description": "Qwen-VL-Chat supports more flexible interaction, such as multiple image inputs, multi-round question answering, and creative capabilities.", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 7, + "quantizations": [ + "none" + ], + "model_id": "Qwen/Qwen-VL-Chat", + "model_revision": "989c61aac20be61660684ab7400e2e383e67b3ef" + } + ], + "prompt_style": { + "style_name": "QWEN", + "system_prompt": "You are a helpful assistant.", + "roles": [ + "user", + "assistant" + ], + "image_formatter": "Picture {idx}: {image}", + "text_formatter": "{text}", + "sep": "\n" + } + } +] From a6fb3feeddae3b7ec88a5bd420894b8007e8baab Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Tue, 26 Dec 2023 17:05:47 +0800 Subject: [PATCH 03/20] dev --- xinference/core/supervisor.py | 32 +++++++++++++++++++++++++ xinference/model/core.py | 2 +- xinference/model/multimodal/__init__.py | 11 ++++++--- xinference/model/multimodal/core.py | 17 ++++++++++--- 4 files changed, 55 insertions(+), 7 deletions(-) diff --git a/xinference/core/supervisor.py b/xinference/core/supervisor.py index 1d531367ed..480e879627 100644 --- a/xinference/core/supervisor.py +++ b/xinference/core/supervisor.py @@ -36,6 +36,7 @@ from ..model.embedding import EmbeddingModelSpec from ..model.image import ImageModelFamilyV1 from ..model.llm import LLMFamilyV1 + from ..model.multimodal import LVLMFamilyV1 from ..model.rerank import RerankModelSpec from .worker import WorkerActor @@ -215,6 +216,25 @@ def _to_image_model_reg( "is_builtin": is_builtin, } + def _to_multimodal_reg( + self, model_family: "LVLMFamilyV1", is_builtin: bool + ) -> Dict[str, Any]: + from ..model.llm import get_cache_status + + if self.is_local_deployment(): + specs = [] + # TODO: does not work when the supervisor and worker are running on separate nodes. 
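With the qwen-vl-chat entry above registered at import time, the matcher introduced in this PR resolves a (family, spec, quantization) triple, falling back to "none" when the caller does not request a quantization. A hedged sketch, assuming the module path and builtin registration land as in these patches:

from xinference.model.multimodal.core import match_multimodal

# Importing the package runs _install(), which registers the builtin families.
result = match_multimodal("qwen-vl-chat", model_format="pytorch", model_size_in_billions=7)
assert result is not None
family, spec, quantization = result
print(family.model_name, spec.model_id, quantization)  # qwen-vl-chat Qwen/Qwen-VL-Chat none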
+ for spec in model_family.model_specs: + cache_status = get_cache_status(model_family, spec) + specs.append({**spec.dict(), "cache_status": cache_status}) + return { + **model_family.dict(), + "is_builtin": is_builtin, + "model_specs": specs, + } + else: + return {**model_family.dict(), "is_builtin": is_builtin} + @log_sync(logger=logger) def list_model_registrations( self, model_type: str, detailed: bool = False @@ -286,6 +306,18 @@ def sort_helper(item): else: ret.append({"model_name": model_name, "is_builtin": True}) + ret.sort(key=sort_helper) + return ret + elif model_type == "multimodal": + from ..model.multimodal import BUILTIN_LVLM_FAMILIES + + ret = [] + for family in BUILTIN_LVLM_FAMILIES: + if detailed: + ret.append(self._to_multimodal_reg(family, True)) + else: + ret.append({"model_name": family.model_name, "is_builtin": True}) + ret.sort(key=sort_helper) return ret else: diff --git a/xinference/model/core.py b/xinference/model/core.py index 3494c8dc51..bcc465247c 100644 --- a/xinference/model/core.py +++ b/xinference/model/core.py @@ -44,8 +44,8 @@ def create_model_instance( from .embedding.core import create_embedding_model_instance from .image.core import create_image_model_instance from .llm.core import create_llm_model_instance - from .rerank.core import create_rerank_model_instance from .multimodal.core import create_multimodal_model_instance + from .rerank.core import create_rerank_model_instance if model_type == "LLM": return create_llm_model_instance( diff --git a/xinference/model/multimodal/__init__.py b/xinference/model/multimodal/__init__.py index 1f9c3acee0..5f0269253a 100644 --- a/xinference/model/multimodal/__init__.py +++ b/xinference/model/multimodal/__init__.py @@ -20,6 +20,7 @@ BUILTIN_LVLM_FAMILIES, BUILTIN_MODELSCOPE_LVLM_FAMILIES, LVLM, + MODEL_NAME_TO_REVISION, LVLMFamilyV1, LVLMPromptStyleV1, ) @@ -27,11 +28,15 @@ def _install(): json_path = os.path.join( - os.path.dirname(os.path.abspath(__file__)), "llm_family.json" + os.path.dirname(os.path.abspath(__file__)), "model_spec.json" ) for json_obj in json.load(codecs.open(json_path, "r", encoding="utf-8")): - model_spec = LVLMFamilyV1.parse_obj(json_obj) - BUILTIN_LVLM_FAMILIES.append(model_spec) + model_family = LVLMFamilyV1.parse_obj(json_obj) + BUILTIN_LVLM_FAMILIES.append(model_family) + for model_spec in model_family.model_specs: + MODEL_NAME_TO_REVISION[model_family.model_name].append( + model_spec.model_revision + ) _install() diff --git a/xinference/model/multimodal/core.py b/xinference/model/multimodal/core.py index 1d6891ff48..5329d96ad3 100644 --- a/xinference/model/multimodal/core.py +++ b/xinference/model/multimodal/core.py @@ -17,17 +17,22 @@ import os import platform from abc import abstractmethod -from typing import TYPE_CHECKING, List, Literal, Optional, Tuple, Union +from collections import defaultdict +from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union from pydantic import BaseModel, validator +from ...constants import XINFERENCE_CACHE_DIR from ...core.utils import parse_replica_model_uid from ..core import ModelDescription -from ..utils import download_from_modelscope +from ..utils import download_from_modelscope, is_model_cached, valid_model_revision logger = logging.getLogger(__name__) DEFAULT_CONTEXT_LENGTH = 2048 +# Used for check whether the model is cached. +# Init when registering all the builtin models. 
+MODEL_NAME_TO_REVISION: Dict[str, List[str]] = defaultdict(list) class LVLMSpecV1(BaseModel): @@ -207,7 +212,7 @@ def _apply_format_to_model_id(spec: LVLMSpecV1, q: str) -> LVLMSpecV1: return spec if download_from_modelscope(): - all_families = BUILTIN_MODELSCOPE_LVLM_FAMILIES + all_families = BUILTIN_MODELSCOPE_LVLM_FAMILIES + BUILTIN_LVLM_FAMILIES else: all_families = BUILTIN_LVLM_FAMILIES @@ -270,3 +275,9 @@ def create_multimodal_model_instance( return model, LVLMDescription( subpool_addr, devices, model_family, model_spec, quantization ) + + +def get_cache_status( + model_spec: LVLMSpecV1, +) -> bool: + return is_model_cached(model_spec, MODEL_NAME_TO_REVISION) From 0fbb70c61105a3ba4ac3a2c1e7eb4283dbba8ef8 Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Tue, 26 Dec 2023 17:27:24 +0800 Subject: [PATCH 04/20] dev --- xinference/model/multimodal/core.py | 226 ++++++++++++++++++++++++++-- 1 file changed, 210 insertions(+), 16 deletions(-) diff --git a/xinference/model/multimodal/core.py b/xinference/model/multimodal/core.py index 5329d96ad3..b3111a804d 100644 --- a/xinference/model/multimodal/core.py +++ b/xinference/model/multimodal/core.py @@ -25,7 +25,13 @@ from ...constants import XINFERENCE_CACHE_DIR from ...core.utils import parse_replica_model_uid from ..core import ModelDescription -from ..utils import download_from_modelscope, is_model_cached, valid_model_revision +from ..utils import ( + download_from_modelscope, + is_model_cached, + retry_download, + symlink_local_file, + valid_model_revision, +) logger = logging.getLogger(__name__) @@ -82,13 +88,13 @@ def __init__( self, address: Optional[str], devices: Optional[List[str]], - llm_family: "LVLMFamilyV1", - llm_spec: "LVLMSpecV1", + model_family: "LVLMFamilyV1", + model_spec: "LVLMSpecV1", quantization: Optional[str], ): super().__init__(address, devices) - self._llm_family = llm_family - self._llm_spec = llm_spec + self._model_family = model_family + self._model_spec = model_spec self._quantization = quantization def to_dict(self): @@ -96,16 +102,16 @@ def to_dict(self): "model_type": "LLM", "address": self.address, "accelerators": self.devices, - "model_name": self._llm_family.model_name, - "model_lang": self._llm_family.model_lang, - "model_ability": self._llm_family.model_ability, - "model_description": self._llm_family.model_description, - "model_format": self._llm_spec.model_format, - "model_size_in_billions": self._llm_spec.model_size_in_billions, + "model_name": self._model_family.model_name, + "model_lang": self._model_family.model_lang, + "model_ability": self._model_family.model_ability, + "model_description": self._model_family.model_description, + "model_format": self._model_spec.model_format, + "model_size_in_billions": self._model_spec.model_size_in_billions, "quantization": self._quantization, - "model_hub": self._llm_spec.model_hub, - "revision": self._llm_spec.model_revision, - "context_length": self._llm_family.context_length, + "model_hub": self._model_spec.model_hub, + "revision": self._model_spec.model_revision, + "context_length": self._model_family.context_length, } @@ -251,8 +257,6 @@ def create_multimodal_model_instance( quantization: Optional[str] = None, **kwargs, ) -> Tuple[LVLM, LVLMDescription]: - from ..llm.llm_family import cache - match_result = match_multimodal( model_name, model_format, @@ -277,6 +281,196 @@ def create_multimodal_model_instance( ) +def _get_cache_dir( + model_family: LVLMFamilyV1, + model_spec: "LVLMSpecV1", + create_if_not_exist=True, +): + cache_dir_name = ( + 
f"{model_family.model_name}-{model_spec.model_format}" + f"-{model_spec.model_size_in_billions}b" + ) + cache_dir = os.path.realpath(os.path.join(XINFERENCE_CACHE_DIR, cache_dir_name)) + if create_if_not_exist and not os.path.exists(cache_dir): + os.makedirs(cache_dir, exist_ok=True) + return cache_dir + + +def _get_meta_path( + cache_dir: str, + model_format: str, + model_hub: str, + quantization: Optional[str] = None, +): + if model_format == "pytorch": + if model_hub == "huggingface": + return os.path.join(cache_dir, "__valid_download") + else: + return os.path.join(cache_dir, f"__valid_download_{model_hub}") + elif model_format in ["ggmlv3", "ggufv2", "gptq"]: + assert quantization is not None + if model_hub == "huggingface": + return os.path.join(cache_dir, f"__valid_download_{quantization}") + else: + return os.path.join( + cache_dir, f"__valid_download_{model_hub}_{quantization}" + ) + else: + raise ValueError(f"Unsupported format: {model_format}") + + +def _skip_download( + cache_dir: str, + model_format: str, + model_hub: str, + model_revision: Optional[str], + quantization: Optional[str] = None, +) -> bool: + if model_format == "pytorch": + model_hub_to_meta_path = { + "huggingface": _get_meta_path( + cache_dir, model_format, "huggingface", quantization + ), + "modelscope": _get_meta_path( + cache_dir, model_format, "modelscope", quantization + ), + } + if valid_model_revision(model_hub_to_meta_path[model_hub], model_revision): + logger.info(f"Cache {cache_dir} exists") + return True + else: + for hub, meta_path in model_hub_to_meta_path.items(): + if hub != model_hub and os.path.exists(meta_path): + # PyTorch models from modelscope can also be loaded by transformers. + logger.warning(f"Cache {cache_dir} exists, but it was from {hub}") + return True + return False + else: + raise ValueError(f"Unsupported format: {model_format}") + + +def _generate_meta_file( + meta_path: str, + model_family: "LVLMFamilyV1", + model_spec: "LVLMSpecV1", + quantization: Optional[str] = None, +): + assert not valid_model_revision( + meta_path, model_spec.model_revision + ), f"meta file {meta_path} should not be valid" + with open(meta_path, "w") as f: + import json + + desc = LVLMDescription(None, None, model_family, model_spec, quantization) + json.dump(desc.to_dict(), f) + + +def cache_from_modelscope( + model_family: LVLMFamilyV1, + model_spec: "LVLMSpecV1", + quantization: Optional[str] = None, +) -> str: + """ + Cache model from Modelscope. Return the cache directory. 
+ """ + from modelscope.hub.snapshot_download import snapshot_download + + cache_dir = _get_cache_dir(model_family, model_spec) + if _skip_download( + cache_dir, + model_spec.model_format, + model_spec.model_hub, + model_spec.model_revision, + quantization, + ): + return cache_dir + + if model_spec.model_format in ["pytorch", "gptq"]: + download_dir = retry_download( + snapshot_download, + model_family.model_name, + { + "model_size": model_spec.model_size_in_billions, + "model_format": model_spec.model_format, + }, + model_spec.model_id, + revision=model_spec.model_revision, + ) + for subdir, dirs, files in os.walk(download_dir): + for file in files: + relpath = os.path.relpath(os.path.join(subdir, file), download_dir) + symlink_local_file(os.path.join(subdir, file), cache_dir, relpath) + else: + raise ValueError(f"Unsupported format: {model_spec.model_format}") + + meta_path = _get_meta_path( + cache_dir, model_spec.model_format, model_spec.model_hub, quantization + ) + _generate_meta_file(meta_path, model_family, model_spec, quantization) + + return cache_dir + + +def cache_from_huggingface( + model_family: LVLMFamilyV1, + model_spec: "LVLMSpecV1", + quantization: Optional[str] = None, +) -> str: + """ + Cache model from Hugging Face. Return the cache directory. + """ + import huggingface_hub + + cache_dir = _get_cache_dir(model_family, model_spec) + if _skip_download( + cache_dir, + model_spec.model_format, + model_spec.model_hub, + model_spec.model_revision, + quantization, + ): + return cache_dir + + if model_spec.model_format in ["pytorch"]: + assert isinstance(model_spec, LVLMSpecV1) + retry_download( + huggingface_hub.snapshot_download, + model_family.model_name, + { + "model_size": model_spec.model_size_in_billions, + "model_format": model_spec.model_format, + }, + model_spec.model_id, + revision=model_spec.model_revision, + local_dir=cache_dir, + local_dir_use_symlinks=True, + ) + else: + raise ValueError(f"Unsupported model format: {model_spec.model_format}") + + meta_path = _get_meta_path( + cache_dir, model_spec.model_format, model_spec.model_hub, quantization + ) + _generate_meta_file(meta_path, model_family, model_spec, quantization) + + return cache_dir + + +def cache( + llm_family: LVLMFamilyV1, + llm_spec: "LVLMSpecV1", + quantization: Optional[str] = None, +) -> str: + if llm_spec.model_hub == "huggingface": + logger.info(f"Caching from Hugging Face: {llm_spec.model_id}") + return cache_from_huggingface(llm_family, llm_spec, quantization) + elif llm_spec.model_hub == "modelscope": + logger.info(f"Caching from Modelscope: {llm_spec.model_id}") + return cache_from_modelscope(llm_family, llm_spec, quantization) + else: + raise ValueError(f"Unknown model hub: {llm_spec.model_hub}") + + def get_cache_status( model_spec: LVLMSpecV1, ) -> bool: From 7253c658bbf2e69a1f43b2e5f71ac5498949906d Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Tue, 26 Dec 2023 18:24:44 +0800 Subject: [PATCH 05/20] dev --- xinference/model/multimodal/__init__.py | 5 ++- xinference/model/multimodal/core.py | 35 ++++++++++++++++--- xinference/model/multimodal/qwen_vl.py | 26 ++++++++++++++ .../model/multimodal/tests/test_multimodal.py | 28 +++++++++++++++ 4 files changed, 89 insertions(+), 5 deletions(-) create mode 100644 xinference/model/multimodal/qwen_vl.py diff --git a/xinference/model/multimodal/__init__.py b/xinference/model/multimodal/__init__.py index 5f0269253a..bae4627739 100644 --- a/xinference/model/multimodal/__init__.py +++ b/xinference/model/multimodal/__init__.py @@ -19,11 +19,14 @@ from 
.core import ( BUILTIN_LVLM_FAMILIES, BUILTIN_MODELSCOPE_LVLM_FAMILIES, - LVLM, + MODEL_CLASSES, MODEL_NAME_TO_REVISION, LVLMFamilyV1, LVLMPromptStyleV1, ) +from .qwen_vl import QwenVLChat + +MODEL_CLASSES.append(QwenVLChat) def _install(): diff --git a/xinference/model/multimodal/core.py b/xinference/model/multimodal/core.py index b3111a804d..02751ae441 100644 --- a/xinference/model/multimodal/core.py +++ b/xinference/model/multimodal/core.py @@ -18,12 +18,13 @@ import platform from abc import abstractmethod from collections import defaultdict -from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union +from typing import Dict, Iterator, List, Literal, Optional, Tuple, Type, Union from pydantic import BaseModel, validator from ...constants import XINFERENCE_CACHE_DIR from ...core.utils import parse_replica_model_uid +from ...types import ChatCompletion, ChatCompletionChunk from ..core import ModelDescription from ..utils import ( download_from_modelscope, @@ -179,9 +180,19 @@ def _get_cuda_count(): def load(self): raise NotImplementedError + @abstractmethod + def chat( + self, + prompt: str, + system_prompt: Optional[str] = None, + chat_history: Optional[List[Dict]] = None, + generate_config: Optional[Dict] = None, + ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: + raise NotImplementedError + @classmethod def match( - cls, llm_family: "LVLMFamilyV1", llm_spec: "LVLMSpecV1", quantization: str + cls, model_family: "LVLMFamilyV1", model_spec: "LVLMSpecV1", quantization: str ) -> bool: raise NotImplementedError @@ -273,14 +284,30 @@ def create_multimodal_model_instance( assert quantization is not None save_path = cache(model_family, model_spec, quantization) - logger.debug(f"Launching {model_uid} with {LVLM.__name__}") + cls = match_cls(model_family, model_spec, quantization) + logger.debug(f"Launching {model_uid} with {cls.__name__}") - model = LVLM(model_uid, model_family, model_spec, quantization, save_path, kwargs) + model = cls(model_uid, model_family, model_spec, quantization, save_path, kwargs) return model, LVLMDescription( subpool_addr, devices, model_family, model_spec, quantization ) +MODEL_CLASSES: List[Type[LVLM]] = [] + + +def match_cls( + model_family: LVLMFamilyV1, model_spec: "LVLMSpecV1", quantization: str +) -> Optional[Type[LVLM]]: + """ + Find an LLM implementation for given LLM family and spec. 
+ """ + for cls in MODEL_CLASSES: + if cls.match(model_family, model_spec, quantization): + return cls + return None + + def _get_cache_dir( model_family: LVLMFamilyV1, model_spec: "LVLMSpecV1", diff --git a/xinference/model/multimodal/qwen_vl.py b/xinference/model/multimodal/qwen_vl.py new file mode 100644 index 0000000000..086040a8c2 --- /dev/null +++ b/xinference/model/multimodal/qwen_vl.py @@ -0,0 +1,26 @@ +from typing import Dict, Iterator, List, Optional, Union + +from ...types import ChatCompletion, ChatCompletionChunk +from .core import LVLM, LVLMFamilyV1, LVLMSpecV1 + + +class QwenVLChat(LVLM): + @classmethod + def match( + cls, model_family: "LVLMFamilyV1", model_spec: "LVLMSpecV1", quantization: str + ) -> bool: + if "qwen" in model_family.model_name: + return True + return False + + def load(self): + raise NotImplementedError + + def chat( + self, + prompt: str, + system_prompt: Optional[str] = None, + chat_history: Optional[List[Dict]] = None, + generate_config: Optional[Dict] = None, + ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: + raise NotImplementedError diff --git a/xinference/model/multimodal/tests/test_multimodal.py b/xinference/model/multimodal/tests/test_multimodal.py index e69de29bb2..ff6a8b0987 100644 --- a/xinference/model/multimodal/tests/test_multimodal.py +++ b/xinference/model/multimodal/tests/test_multimodal.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +def test_restful_api_for_multimodal(setup): + endpoint, _ = setup + from ....client import Client + + client = Client(endpoint) + + model_uid = client.launch_model( + model_uid="my_controlnet", + model_name="qwen-vl-chat", + model_type="multimodal", + ) + model = client.get_model(model_uid) + print(model) From b5eb073340695f3f3b3ed84c04c674207833315c Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Wed, 27 Dec 2023 12:22:47 +0800 Subject: [PATCH 06/20] Implement qwen vl chat --- xinference/model/llm/pytorch/core.py | 30 +------ xinference/model/llm/pytorch/spec_model.py | 3 +- xinference/model/multimodal/core.py | 9 +-- xinference/model/multimodal/model_spec.json | 2 +- xinference/model/multimodal/qwen_vl.py | 80 ++++++++++++++++++- .../model/multimodal/tests/test_multimodal.py | 24 ++++++ xinference/model/utils.py | 28 +++++++ 7 files changed, 136 insertions(+), 40 deletions(-) diff --git a/xinference/model/llm/pytorch/core.py b/xinference/model/llm/pytorch/core.py index f57e23d5a0..cc794f8ce3 100644 --- a/xinference/model/llm/pytorch/core.py +++ b/xinference/model/llm/pytorch/core.py @@ -29,6 +29,7 @@ PytorchGenerateConfig, PytorchModelConfig, ) +from ...utils import select_device from ..core import LLM from ..llm_family import LLMFamilyV1, LLMSpecV1 from ..utils import ChatModelMixin @@ -122,7 +123,7 @@ def load(self): quantization = self.quantization num_gpus = len(cuda_visible_devices) if cuda_visible_devices_env != "-1" else 0 device = self._pytorch_model_config.get("device", "auto") - self._pytorch_model_config["device"] = self._select_device(device) + self._pytorch_model_config["device"] = select_device(device) self._device = self._pytorch_model_config["device"] if self._device == "cpu": @@ -185,33 +186,6 @@ def load(self): self._model.to(self._device) logger.debug(f"Model Memory: {self._model.get_memory_footprint()}") - def _select_device(self, device: str) -> str: - try: - import torch - except ImportError: - raise ImportError( - f"Failed to import module 'torch'. Please make sure 'torch' is installed.\n\n" - ) - - if device == "auto": - # When env CUDA_VISIBLE_DEVICES=-1, torch.cuda.is_available() return False - if torch.cuda.is_available(): - return "cuda" - elif torch.backends.mps.is_available(): - return "mps" - return "cpu" - elif device == "cuda": - if not torch.cuda.is_available(): - raise ValueError("cuda is unavailable in your environment") - elif device == "mps": - if not torch.backends.mps.is_available(): - raise ValueError("mps is unavailable in your environment") - elif device == "cpu": - pass - else: - raise ValueError(f"Device {device} is not supported in temporary") - return device - @classmethod def match( cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str diff --git a/xinference/model/llm/pytorch/spec_model.py b/xinference/model/llm/pytorch/spec_model.py index e438bbb264..a66f6fbfc1 100644 --- a/xinference/model/llm/pytorch/spec_model.py +++ b/xinference/model/llm/pytorch/spec_model.py @@ -17,6 +17,7 @@ from typing import Iterator, List, Optional, Union from ....types import Completion, CompletionChunk, Embedding +from ...utils import select_device from .. 
import LLMFamilyV1, LLMSpecV1 from .core import PytorchChatModel, PytorchGenerateConfig, PytorchModelConfig @@ -85,7 +86,7 @@ def load(self): num_gpus = len(cuda_visible_devices) if cuda_visible_devices_env != "-1" else 0 device = self._pytorch_model_config.get("device", "auto") - self._pytorch_model_config["device"] = self._select_device(device) + self._pytorch_model_config["device"] = select_device(device) self._device = self._pytorch_model_config["device"] if self._device == "cpu": diff --git a/xinference/model/multimodal/core.py b/xinference/model/multimodal/core.py index 02751ae441..c153ff9070 100644 --- a/xinference/model/multimodal/core.py +++ b/xinference/model/multimodal/core.py @@ -124,8 +124,7 @@ def __init__( model_spec: "LVLMSpecV1", quantization: str, model_path: str, - *args, - **kwargs, + kwargs: Dict, ): self.model_uid, self.replica, self.rep_id = parse_replica_model_uid( replica_model_uid @@ -134,10 +133,8 @@ def __init__( self.model_spec = model_spec self.quantization = quantization self.model_path = model_path - if args: - raise ValueError(f"Unrecognized positional arguments: {args}") - if kwargs: - raise ValueError(f"Unrecognized keyword arguments: {kwargs}") + self.kwargs = kwargs + logger.info("Init model %s with kwargs: %s", self.model_uid, kwargs) @staticmethod def handle_model_size(model_size_in_billions: Union[str, int]) -> Union[int, float]: diff --git a/xinference/model/multimodal/model_spec.json b/xinference/model/multimodal/model_spec.json index ef160ee6f4..e279441b3c 100644 --- a/xinference/model/multimodal/model_spec.json +++ b/xinference/model/multimodal/model_spec.json @@ -19,7 +19,7 @@ "none" ], "model_id": "Qwen/Qwen-VL-Chat", - "model_revision": "989c61aac20be61660684ab7400e2e383e67b3ef" + "model_revision": "6665c780ade5ff3f08853b4262dcb9c8f9598d42" } ], "prompt_style": { diff --git a/xinference/model/multimodal/qwen_vl.py b/xinference/model/multimodal/qwen_vl.py index 086040a8c2..38cbb6a75f 100644 --- a/xinference/model/multimodal/qwen_vl.py +++ b/xinference/model/multimodal/qwen_vl.py @@ -1,10 +1,25 @@ +import operator +import tempfile +import time +import uuid from typing import Dict, Iterator, List, Optional, Union -from ...types import ChatCompletion, ChatCompletionChunk +from ...types import ( + ChatCompletion, + ChatCompletionChoice, + ChatCompletionChunk, + CompletionUsage, +) +from ..utils import select_device from .core import LVLM, LVLMFamilyV1, LVLMSpecV1 class QwenVLChat(LVLM): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._tokenizer = None + self._model = None + @classmethod def match( cls, model_family: "LVLMFamilyV1", model_spec: "LVLMSpecV1", quantization: str @@ -14,13 +29,70 @@ def match( return False def load(self): - raise NotImplementedError + from transformers import AutoModelForCausalLM, AutoTokenizer + from transformers.generation import GenerationConfig + + device = self.kwargs.get("device", "auto") + device = select_device(device) + + self._tokenizer = AutoTokenizer.from_pretrained( + self.model_path, + trust_remote_code=True, + code_revision=self.model_spec.model_revision, + ) + self._model = AutoModelForCausalLM.from_pretrained( + self.model_path, + device_map=device, + trust_remote_code=True, + code_revision=self.model_spec.model_revision, + ).eval() + # Specify hyperparameters for generation + self._model.generation_config = GenerationConfig.from_pretrained( + self.model_path, + trust_remote_code=True, + code_revision=self.model_spec.model_revision, + ) def chat( self, - prompt: str, + 
prompt: Union[str, List[Dict]], system_prompt: Optional[str] = None, chat_history: Optional[List[Dict]] = None, generate_config: Optional[Dict] = None, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: - raise NotImplementedError + if not isinstance(prompt, str): + prompt = [ + {"image": p["image_url"]["url"], "type": "image"} + if p.get("type") == "image_url" + else p + for p in prompt + ] + prompt = sorted(prompt, key=operator.itemgetter("type")) + prompt = self._tokenizer.from_list_format(prompt) + response, history = self._model.chat( + self._tokenizer, query=prompt, chat_history=chat_history + ) + if "" in response: + image = self._tokenizer.draw_bbox_on_latest_picture(response, history) + if image: + with tempfile.NamedTemporaryFile( + suffix=".jpg", delete_on_close=False + ) as output: + image.save(output) + response = output.name + return ChatCompletion( + id="chat" + str(uuid.uuid1()), + object="chat.completion", + created=int(time.time()), + model=self.model_uid, + choices=[ + ChatCompletionChoice( + index=0, + message={"role": "assistant", "content": response}, + finish_reason="stop", + ) + ], + usage=CompletionUsage( + prompt_tokens=-1, completion_tokens=-1, total_tokens=-1 + ), + ) diff --git a/xinference/model/multimodal/tests/test_multimodal.py b/xinference/model/multimodal/tests/test_multimodal.py index ff6a8b0987..5caf58778a 100644 --- a/xinference/model/multimodal/tests/test_multimodal.py +++ b/xinference/model/multimodal/tests/test_multimodal.py @@ -23,6 +23,30 @@ def test_restful_api_for_multimodal(setup): model_uid="my_controlnet", model_name="qwen-vl-chat", model_type="multimodal", + device="cpu", ) model = client.get_model(model_uid) print(model) + + # openai client + import openai + + client = openai.Client(api_key="not empty", base_url=f"{endpoint}/v1") + completion = client.chat.completions.create( + model=model_uid, + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "What’s in this image?"}, + { + "type": "image_url", + "image_url": { + "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + }, + }, + ], + } + ], + ) + print(completion) diff --git a/xinference/model/utils.py b/xinference/model/utils.py index 9ec82c6c4f..a3f67a8d8d 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -256,3 +256,31 @@ def _patched_resolve_trust_remote_code(*args, **kwargs): resolve_trust_remote_code.__code__ = ( _patched_resolve_trust_remote_code.__code__ ) + + +def select_device(device): + try: + import torch + except ImportError: + raise ImportError( + f"Failed to import module 'torch'. 
Please make sure 'torch' is installed.\n\n" + ) + + if device == "auto": + # When env CUDA_VISIBLE_DEVICES=-1, torch.cuda.is_available() return False + if torch.cuda.is_available(): + return "cuda" + elif torch.backends.mps.is_available(): + return "mps" + return "cpu" + elif device == "cuda": + if not torch.cuda.is_available(): + raise ValueError("cuda is unavailable in your environment") + elif device == "mps": + if not torch.backends.mps.is_available(): + raise ValueError("mps is unavailable in your environment") + elif device == "cpu": + pass + else: + raise ValueError(f"Device {device} is not supported in temporary") + return device From b7b818635b73d072aa2f9a4f1974cddaee255953 Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Wed, 27 Dec 2023 12:26:05 +0800 Subject: [PATCH 07/20] Fix --- xinference/model/multimodal/qwen_vl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xinference/model/multimodal/qwen_vl.py b/xinference/model/multimodal/qwen_vl.py index 38cbb6a75f..77adb88d5a 100644 --- a/xinference/model/multimodal/qwen_vl.py +++ b/xinference/model/multimodal/qwen_vl.py @@ -70,7 +70,7 @@ def chat( prompt = sorted(prompt, key=operator.itemgetter("type")) prompt = self._tokenizer.from_list_format(prompt) response, history = self._model.chat( - self._tokenizer, query=prompt, chat_history=chat_history + self._tokenizer, query=prompt, history=chat_history ) if "" in response: image = self._tokenizer.draw_bbox_on_latest_picture(response, history) From d663b3cf6fe96cab9db8d31dfc6b8376070ced6d Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Wed, 27 Dec 2023 14:18:32 +0800 Subject: [PATCH 08/20] Add ut --- .../model/multimodal/tests/test_multimodal.py | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/xinference/model/multimodal/tests/test_multimodal.py b/xinference/model/multimodal/tests/test_multimodal.py index 5caf58778a..859dcba56c 100644 --- a/xinference/model/multimodal/tests/test_multimodal.py +++ b/xinference/model/multimodal/tests/test_multimodal.py @@ -50,3 +50,24 @@ def test_restful_api_for_multimodal(setup): ], ) print(completion) + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "这是什么?"}, + { + "type": "image_url", + "image_url": { + "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg", + }, + }, + ], + } + ] + completion = client.chat.completions.create(model=model_uid, messages=messages) + print(completion) + messages.append(completion.choices[0].message.model_dump()) + messages.append({"role": "user", "content": "框出图中击掌的位置"}) + print(messages) + completion = client.chat.completions.create(model=model_uid, messages=messages) + print(completion) From 8c246a4ddf9fabfdee5daa6308819941d9cafd07 Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Wed, 27 Dec 2023 14:27:01 +0800 Subject: [PATCH 09/20] Fix history --- xinference/model/multimodal/qwen_vl.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/xinference/model/multimodal/qwen_vl.py b/xinference/model/multimodal/qwen_vl.py index 77adb88d5a..88b077ef99 100644 --- a/xinference/model/multimodal/qwen_vl.py +++ b/xinference/model/multimodal/qwen_vl.py @@ -53,6 +53,18 @@ def load(self): code_revision=self.model_spec.model_revision, ) + def _message_content_to_qwen(self, content): + if not isinstance(content, str): + content = [ + {"image": c["image_url"]["url"], "type": "image"} + if c.get("type") == "image_url" + else c + for c in content + ] + content = sorted(content, 
key=operator.itemgetter("type")) + return self._tokenizer.from_list_format(content) + return content + def chat( self, prompt: Union[str, List[Dict]], @@ -60,15 +72,9 @@ def chat( chat_history: Optional[List[Dict]] = None, generate_config: Optional[Dict] = None, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: - if not isinstance(prompt, str): - prompt = [ - {"image": p["image_url"]["url"], "type": "image"} - if p.get("type") == "image_url" - else p - for p in prompt - ] - prompt = sorted(prompt, key=operator.itemgetter("type")) - prompt = self._tokenizer.from_list_format(prompt) + prompt = self._message_content_to_qwen(prompt) + for h in chat_history: + h["content"] = self._message_content_to_qwen(h) response, history = self._model.chat( self._tokenizer, query=prompt, history=chat_history ) From fa3a11008089bff04c18900b93568ad35561bf2f Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Wed, 27 Dec 2023 14:34:27 +0800 Subject: [PATCH 10/20] Fix --- xinference/model/multimodal/qwen_vl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xinference/model/multimodal/qwen_vl.py b/xinference/model/multimodal/qwen_vl.py index 88b077ef99..3f7be7240e 100644 --- a/xinference/model/multimodal/qwen_vl.py +++ b/xinference/model/multimodal/qwen_vl.py @@ -74,7 +74,7 @@ def chat( ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: prompt = self._message_content_to_qwen(prompt) for h in chat_history: - h["content"] = self._message_content_to_qwen(h) + h["content"] = self._message_content_to_qwen(h["content"]) response, history = self._model.chat( self._tokenizer, query=prompt, history=chat_history ) From 411a696f0154d2c362ef6aecd3e7eacaf6203f1f Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Wed, 27 Dec 2023 14:44:10 +0800 Subject: [PATCH 11/20] Fix --- xinference/model/multimodal/qwen_vl.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/xinference/model/multimodal/qwen_vl.py b/xinference/model/multimodal/qwen_vl.py index 3f7be7240e..8466bfd815 100644 --- a/xinference/model/multimodal/qwen_vl.py +++ b/xinference/model/multimodal/qwen_vl.py @@ -73,10 +73,21 @@ def chat( generate_config: Optional[Dict] = None, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: prompt = self._message_content_to_qwen(prompt) + # Convert openai history to qwen vl history + qwen_history = [] + query_to_response = [] for h in chat_history: - h["content"] = self._message_content_to_qwen(h["content"]) + role = h["role"] + content = self._message_content_to_qwen(h["content"]) + if len(query_to_response) == 0 and role == "user": + query_to_response.append(content) + if len(query_to_response) == 1 and role == "assistant": + query_to_response.append(content) + if len(query_to_response) == 2: + qwen_history.append(query_to_response) + query_to_response = [] response, history = self._model.chat( - self._tokenizer, query=prompt, history=chat_history + self._tokenizer, query=prompt, history=qwen_history ) if "" in response: image = self._tokenizer.draw_bbox_on_latest_picture(response, history) From a293d77bb3a974ecc5c3ce1a6a01d4cf10404a06 Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Wed, 27 Dec 2023 15:02:24 +0800 Subject: [PATCH 12/20] Remove render bounding box --- xinference/model/multimodal/qwen_vl.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/xinference/model/multimodal/qwen_vl.py b/xinference/model/multimodal/qwen_vl.py index 8466bfd815..92a97fbd5b 100644 --- a/xinference/model/multimodal/qwen_vl.py +++ 
b/xinference/model/multimodal/qwen_vl.py @@ -1,5 +1,4 @@ import operator -import tempfile import time import uuid from typing import Dict, Iterator, List, Optional, Union @@ -89,14 +88,6 @@ def chat( response, history = self._model.chat( self._tokenizer, query=prompt, history=qwen_history ) - if "" in response: - image = self._tokenizer.draw_bbox_on_latest_picture(response, history) - if image: - with tempfile.NamedTemporaryFile( - suffix=".jpg", delete_on_close=False - ) as output: - image.save(output) - response = output.name return ChatCompletion( id="chat" + str(uuid.uuid1()), object="chat.completion", From 4cd65eaeb95bdc472adb0d38bcf1f7498d979f18 Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Wed, 27 Dec 2023 15:12:57 +0800 Subject: [PATCH 13/20] Fix --- xinference/model/multimodal/core.py | 4 ++-- xinference/model/multimodal/qwen_vl.py | 6 +++--- .../model/multimodal/tests/test_multimodal.py | 16 ++++++++++------ 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/xinference/model/multimodal/core.py b/xinference/model/multimodal/core.py index c153ff9070..30fdf31c57 100644 --- a/xinference/model/multimodal/core.py +++ b/xinference/model/multimodal/core.py @@ -295,14 +295,14 @@ def create_multimodal_model_instance( def match_cls( model_family: LVLMFamilyV1, model_spec: "LVLMSpecV1", quantization: str -) -> Optional[Type[LVLM]]: +) -> Type[LVLM]: """ Find an LLM implementation for given LLM family and spec. """ for cls in MODEL_CLASSES: if cls.match(model_family, model_spec, quantization): return cls - return None + raise Exception(f"Model {model_family.model_name} is not supported") def _get_cache_dir( diff --git a/xinference/model/multimodal/qwen_vl.py b/xinference/model/multimodal/qwen_vl.py index 92a97fbd5b..69b7d14862 100644 --- a/xinference/model/multimodal/qwen_vl.py +++ b/xinference/model/multimodal/qwen_vl.py @@ -52,7 +52,7 @@ def load(self): code_revision=self.model_spec.model_revision, ) - def _message_content_to_qwen(self, content): + def _message_content_to_qwen(self, content) -> str: if not isinstance(content, str): content = [ {"image": c["image_url"]["url"], "type": "image"} @@ -74,8 +74,8 @@ def chat( prompt = self._message_content_to_qwen(prompt) # Convert openai history to qwen vl history qwen_history = [] - query_to_response = [] - for h in chat_history: + query_to_response: List = [] + for h in chat_history or []: role = h["role"] content = self._message_content_to_qwen(h["content"]) if len(query_to_response) == 0 and role == "user": diff --git a/xinference/model/multimodal/tests/test_multimodal.py b/xinference/model/multimodal/tests/test_multimodal.py index 859dcba56c..5176131019 100644 --- a/xinference/model/multimodal/tests/test_multimodal.py +++ b/xinference/model/multimodal/tests/test_multimodal.py @@ -13,7 +13,7 @@ # limitations under the License. 
-def test_restful_api_for_multimodal(setup): +def test_restful_api_for_qwen_vl(setup): endpoint, _ = setup from ....client import Client @@ -26,7 +26,6 @@ def test_restful_api_for_multimodal(setup): device="cpu", ) model = client.get_model(model_uid) - print(model) # openai client import openai @@ -49,7 +48,9 @@ def test_restful_api_for_multimodal(setup): } ], ) - print(completion) + assert "grass" in completion.choices[0].message.content + assert "tree" in completion.choices[0].message.content + assert "sky" in completion.choices[0].message.content messages = [ { "role": "user", @@ -65,9 +66,12 @@ def test_restful_api_for_multimodal(setup): } ] completion = client.chat.completions.create(model=model_uid, messages=messages) - print(completion) + assert "女" in completion.choices[0].message.content + assert "狗" in completion.choices[0].message.content + assert "沙滩" in completion.choices[0].message.content messages.append(completion.choices[0].message.model_dump()) messages.append({"role": "user", "content": "框出图中击掌的位置"}) - print(messages) completion = client.chat.completions.create(model=model_uid, messages=messages) - print(completion) + assert "击掌" in completion.choices[0].message.content + assert "" in completion.choices[0].message.content + assert "" in completion.choices[0].message.content From 92fde203decf33feecb315e7a61dbd0c9ea965cf Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Wed, 27 Dec 2023 15:16:09 +0800 Subject: [PATCH 14/20] Clean code --- xinference/model/multimodal/core.py | 39 +---------------------------- 1 file changed, 1 insertion(+), 38 deletions(-) diff --git a/xinference/model/multimodal/core.py b/xinference/model/multimodal/core.py index 30fdf31c57..26d8a8d9da 100644 --- a/xinference/model/multimodal/core.py +++ b/xinference/model/multimodal/core.py @@ -100,7 +100,7 @@ def __init__( def to_dict(self): return { - "model_type": "LLM", + "model_type": "LVLM", "address": self.address, "accelerators": self.devices, "model_name": self._model_family.model_name, @@ -136,43 +136,6 @@ def __init__( self.kwargs = kwargs logger.info("Init model %s with kwargs: %s", self.model_uid, kwargs) - @staticmethod - def handle_model_size(model_size_in_billions: Union[str, int]) -> Union[int, float]: - if isinstance(model_size_in_billions, str): - if "_" in model_size_in_billions: - ms = model_size_in_billions.replace("_", ".") - return float(ms) - else: - raise ValueError("Invalid format for `model_size_in_billions`") - return model_size_in_billions - - @staticmethod - def _is_darwin_and_apple_silicon(): - return platform.system() == "Darwin" and platform.processor() == "arm" - - @staticmethod - def _is_linux(): - return platform.system() == "Linux" - - @staticmethod - def _has_cuda_device(): - from ...utils import cuda_count - - return cuda_count() > 0 - - @staticmethod - def _get_cuda_count(): - from ...utils import cuda_count - - cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES", None) - if cuda_visible_devices is None: - return cuda_count() - - if cuda_visible_devices == "-1": - return 0 - else: - return len(cuda_visible_devices.split(",")) - @abstractmethod def load(self): raise NotImplementedError From 4b262a1a75153a50ee3fe6c2cd8bceba3f442fb8 Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Wed, 27 Dec 2023 15:24:28 +0800 Subject: [PATCH 15/20] Fix --- xinference/model/multimodal/core.py | 4 ---- xinference/model/multimodal/model_spec.json | 5 +---- xinference/model/multimodal/tests/test_multimodal.py | 1 + 3 files changed, 2 insertions(+), 8 deletions(-) diff --git 
a/xinference/model/multimodal/core.py b/xinference/model/multimodal/core.py index 26d8a8d9da..88c7eb3331 100644 --- a/xinference/model/multimodal/core.py +++ b/xinference/model/multimodal/core.py @@ -15,7 +15,6 @@ import abc import logging import os -import platform from abc import abstractmethod from collections import defaultdict from typing import Dict, Iterator, List, Literal, Optional, Tuple, Type, Union @@ -68,9 +67,6 @@ class LVLMPromptStyleV1(BaseModel): style_name: str system_prompt: str = "" roles: List[str] - image_formatter: str = "" - text_formatter: str = "" - sep: str = "" class LVLMFamilyV1(BaseModel): diff --git a/xinference/model/multimodal/model_spec.json b/xinference/model/multimodal/model_spec.json index e279441b3c..07af7f2f19 100644 --- a/xinference/model/multimodal/model_spec.json +++ b/xinference/model/multimodal/model_spec.json @@ -28,10 +28,7 @@ "roles": [ "user", "assistant" - ], - "image_formatter": "Picture {idx}: {image}", - "text_formatter": "{text}", - "sep": "\n" + ] } } ] diff --git a/xinference/model/multimodal/tests/test_multimodal.py b/xinference/model/multimodal/tests/test_multimodal.py index 5176131019..cc1d0ae234 100644 --- a/xinference/model/multimodal/tests/test_multimodal.py +++ b/xinference/model/multimodal/tests/test_multimodal.py @@ -26,6 +26,7 @@ def test_restful_api_for_qwen_vl(setup): device="cpu", ) model = client.get_model(model_uid) + assert model # openai client import openai From 3fcb452ef0ad80f4b4835adfe41c93a9a0ad4746 Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Wed, 27 Dec 2023 15:42:30 +0800 Subject: [PATCH 16/20] Remove files --- create_test_data.py | 0 qwen_vl_demo.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 create_test_data.py delete mode 100644 qwen_vl_demo.py diff --git a/create_test_data.py b/create_test_data.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/qwen_vl_demo.py b/qwen_vl_demo.py deleted file mode 100644 index e69de29bb2..0000000000 From f9f88c8308a964d9f03b75effd5648eb387e4c23 Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Wed, 27 Dec 2023 15:44:02 +0800 Subject: [PATCH 17/20] Add copyright --- xinference/model/multimodal/qwen_vl.py | 14 ++++++++++++++ xinference/model/multimodal/tests/__init__.py | 13 +++++++++++++ 2 files changed, 27 insertions(+) diff --git a/xinference/model/multimodal/qwen_vl.py b/xinference/model/multimodal/qwen_vl.py index 69b7d14862..55e29fe182 100644 --- a/xinference/model/multimodal/qwen_vl.py +++ b/xinference/model/multimodal/qwen_vl.py @@ -1,3 +1,17 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import operator import time import uuid diff --git a/xinference/model/multimodal/tests/__init__.py b/xinference/model/multimodal/tests/__init__.py index e69de29bb2..37f6558d95 100644 --- a/xinference/model/multimodal/tests/__init__.py +++ b/xinference/model/multimodal/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. From 0f1f6f95995cf116d1a67a7394b7bdeb024fd10a Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Wed, 27 Dec 2023 15:46:03 +0800 Subject: [PATCH 18/20] Fix --- xinference/model/multimodal/core.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/xinference/model/multimodal/core.py b/xinference/model/multimodal/core.py index 88c7eb3331..678c8583b1 100644 --- a/xinference/model/multimodal/core.py +++ b/xinference/model/multimodal/core.py @@ -164,7 +164,7 @@ def match_multimodal( quantization: Optional[str] = None, ) -> Optional[Tuple[LVLMFamilyV1, LVLMSpecV1, str]]: """ - Find an LLM family, spec, and quantization that satisfy given criteria. + Find an multimodal family, spec, and quantization that satisfy given criteria. """ def _match_quantization(q: Union[str, None], quantizations: List[str]): @@ -256,7 +256,7 @@ def match_cls( model_family: LVLMFamilyV1, model_spec: "LVLMSpecV1", quantization: str ) -> Type[LVLM]: """ - Find an LLM implementation for given LLM family and spec. + Find an multimodal implementation for given multimodal family and spec. """ for cls in MODEL_CLASSES: if cls.match(model_family, model_spec, quantization): @@ -440,18 +440,18 @@ def cache_from_huggingface( def cache( - llm_family: LVLMFamilyV1, - llm_spec: "LVLMSpecV1", + model_family: LVLMFamilyV1, + model_spec: "LVLMSpecV1", quantization: Optional[str] = None, ) -> str: - if llm_spec.model_hub == "huggingface": - logger.info(f"Caching from Hugging Face: {llm_spec.model_id}") - return cache_from_huggingface(llm_family, llm_spec, quantization) - elif llm_spec.model_hub == "modelscope": - logger.info(f"Caching from Modelscope: {llm_spec.model_id}") - return cache_from_modelscope(llm_family, llm_spec, quantization) + if model_spec.model_hub == "huggingface": + logger.info(f"Caching from Hugging Face: {model_spec.model_id}") + return cache_from_huggingface(model_family, model_spec, quantization) + elif model_spec.model_hub == "modelscope": + logger.info(f"Caching from Modelscope: {model_spec.model_id}") + return cache_from_modelscope(model_family, model_spec, quantization) else: - raise ValueError(f"Unknown model hub: {llm_spec.model_hub}") + raise ValueError(f"Unknown model hub: {model_spec.model_hub}") def get_cache_status( From b9ed91b34af667f063a4bbd7928816e4dd2ffde7 Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Wed, 27 Dec 2023 17:29:18 +0800 Subject: [PATCH 19/20] Skip ut --- xinference/model/multimodal/tests/test_multimodal.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xinference/model/multimodal/tests/test_multimodal.py b/xinference/model/multimodal/tests/test_multimodal.py index cc1d0ae234..38317049b8 100644 --- a/xinference/model/multimodal/tests/test_multimodal.py +++ b/xinference/model/multimodal/tests/test_multimodal.py @@ -11,8 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. +import pytest +@pytest.mark.skip(reason="Cost too many resources.") def test_restful_api_for_qwen_vl(setup): endpoint, _ = setup from ....client import Client From 9da800671f573084dfdb572ccf929d7a6edcdeb2 Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Fri, 29 Dec 2023 11:19:23 +0800 Subject: [PATCH 20/20] Fix --- xinference/core/supervisor.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/xinference/core/supervisor.py b/xinference/core/supervisor.py index 480e879627..54447597d9 100644 --- a/xinference/core/supervisor.py +++ b/xinference/core/supervisor.py @@ -357,6 +357,13 @@ def get_model_registration(self, model_type: str, model_name: str) -> Any: if f.model_name == model_name: return f raise ValueError(f"Model {model_name} not found") + elif model_type == "multimodal": + from ..model.multimodal import BUILTIN_LVLM_FAMILIES + + for f in BUILTIN_LVLM_FAMILIES: + if f.model_name == model_name: + return f + raise ValueError(f"Model {model_name} not found") else: raise ValueError(f"Unsupported model type: {model_type}")
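
For reference, the device checks introduced alongside the multimodal core reduce to one small helper. The sketch below is a standalone restatement, not the exact code from the series: the function name select_device is hypothetical, since the original signature sits above the quoted hunk.

# Minimal sketch of the "auto"/cuda/mps/cpu selection shown in the patches above.
# `select_device` is a hypothetical name; the series implements the same checks
# inside the multimodal model utilities.
def select_device(device: str = "auto") -> str:
    try:
        import torch
    except ImportError:
        raise ImportError(
            "Failed to import 'torch'. Please make sure 'torch' is installed."
        )

    if device == "auto":
        # When CUDA_VISIBLE_DEVICES=-1, torch.cuda.is_available() returns False,
        # so "auto" falls back to MPS on Apple silicon and finally to CPU.
        if torch.cuda.is_available():
            return "cuda"
        if torch.backends.mps.is_available():
            return "mps"
        return "cpu"
    if device == "cuda" and not torch.cuda.is_available():
        raise ValueError("cuda is unavailable in your environment")
    if device == "mps" and not torch.backends.mps.is_available():
        raise ValueError("mps is unavailable in your environment")
    if device not in ("cuda", "mps", "cpu"):
        raise ValueError(f"Device {device} is not supported")
    return device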
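
The history handling in patches 09 through 11 is the subtle part of the series: the RESTful API receives OpenAI-style messages (a role plus a content list), while Qwen-VL's model.chat() expects a flat prompt string and a history of (query, response) pairs. The sketch below restates that conversion as free functions under a few assumptions: the names message_content_to_qwen and openai_history_to_qwen are hypothetical, and the tokenizer.from_list_format() call that turns the Qwen list format into the final prompt string is omitted because it needs the loaded model's tokenizer.

import operator
from typing import Dict, List, Tuple, Union


def message_content_to_qwen(content: Union[str, List[Dict]]) -> Union[str, List[Dict]]:
    """Map OpenAI content parts to Qwen-VL list format, images sorted first."""
    if isinstance(content, str):
        return content
    converted = [
        {"image": part["image_url"]["url"], "type": "image"}
        if part.get("type") == "image_url"
        else part
        for part in content
    ]
    # Sorting by "type" places "image" entries before "text" entries, which is
    # the order the patches rely on before calling from_list_format().
    return sorted(converted, key=operator.itemgetter("type"))


def openai_history_to_qwen(chat_history: List[Dict]) -> List[Tuple]:
    """Pair consecutive user/assistant messages into (query, response) tuples."""
    qwen_history: List[Tuple] = []
    pending: List = []
    for message in chat_history or []:
        content = message_content_to_qwen(message["content"])
        if not pending and message["role"] == "user":
            pending.append(content)
        elif len(pending) == 1 and message["role"] == "assistant":
            pending.append(content)
        if len(pending) == 2:
            # The patch appends two-element lists; tuples work equally well here.
            qwen_history.append(tuple(pending))
            pending = []
    return qwen_history


if __name__ == "__main__":
    history = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What is in this picture?"},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
                    },
                },
            ],
        },
        {"role": "assistant", "content": "A woman and a dog on a beach."},
    ]
    print(openai_history_to_qwen(history))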