From 36c771a710a25f201f63dff6d565bce4556ec774 Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Tue, 26 Dec 2023 16:32:54 +0800 Subject: [PATCH 01/20] dev --- qwen_vl_demo.py | 0 xinference/model/multimodal/__init__.py | 0 xinference/model/multimodal/core.py | 244 ++++++++++++++++++ xinference/model/multimodal/model_spec.json | 0 xinference/model/multimodal/tests/__init__.py | 0 .../model/multimodal/tests/test_multimodal.py | 0 6 files changed, 244 insertions(+) create mode 100644 qwen_vl_demo.py create mode 100644 xinference/model/multimodal/__init__.py create mode 100644 xinference/model/multimodal/core.py create mode 100644 xinference/model/multimodal/model_spec.json create mode 100644 xinference/model/multimodal/tests/__init__.py create mode 100644 xinference/model/multimodal/tests/test_multimodal.py diff --git a/qwen_vl_demo.py b/qwen_vl_demo.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/xinference/model/multimodal/__init__.py b/xinference/model/multimodal/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/xinference/model/multimodal/core.py b/xinference/model/multimodal/core.py new file mode 100644 index 0000000000..884f986a2b --- /dev/null +++ b/xinference/model/multimodal/core.py @@ -0,0 +1,244 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import abc +import logging +import os +import platform +from abc import abstractmethod +from typing import TYPE_CHECKING, List, Optional, Tuple, Union + +from ...core.utils import parse_replica_model_uid +from ..core import ModelDescription + +if TYPE_CHECKING: + from .llm_family import LLMFamilyV1, LLMSpecV1 + +logger = logging.getLogger(__name__) + + +class LLM(abc.ABC): + def __init__( + self, + replica_model_uid: str, + model_family: "LLMFamilyV1", + model_spec: "LLMSpecV1", + quantization: str, + model_path: str, + *args, + **kwargs, + ): + self.model_uid, self.replica, self.rep_id = parse_replica_model_uid( + replica_model_uid + ) + self.model_family = model_family + self.model_spec = model_spec + self.quantization = quantization + self.model_path = model_path + if args: + raise ValueError(f"Unrecognized positional arguments: {args}") + if kwargs: + raise ValueError(f"Unrecognized keyword arguments: {kwargs}") + + @staticmethod + def handle_model_size(model_size_in_billions: Union[str, int]) -> Union[int, float]: + if isinstance(model_size_in_billions, str): + if "_" in model_size_in_billions: + ms = model_size_in_billions.replace("_", ".") + return float(ms) + else: + raise ValueError("Invalid format for `model_size_in_billions`") + return model_size_in_billions + + @staticmethod + def _is_darwin_and_apple_silicon(): + return platform.system() == "Darwin" and platform.processor() == "arm" + + @staticmethod + def _is_linux(): + return platform.system() == "Linux" + + @staticmethod + def _has_cuda_device(): + from ...utils import cuda_count + + return cuda_count() > 0 + + @staticmethod + def _get_cuda_count(): + from ...utils import cuda_count + + cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES", None) + if cuda_visible_devices is None: + return cuda_count() + + if cuda_visible_devices == "-1": + return 0 + else: + return len(cuda_visible_devices.split(",")) + + @abstractmethod + def load(self): + raise NotImplementedError + + @classmethod + def match( + cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str + ) -> bool: + raise NotImplementedError + + +class LLMDescription(ModelDescription): + def __init__( + self, + address: Optional[str], + devices: Optional[List[str]], + llm_family: "LLMFamilyV1", + llm_spec: "LLMSpecV1", + quantization: Optional[str], + ): + super().__init__(address, devices) + self._llm_family = llm_family + self._llm_spec = llm_spec + self._quantization = quantization + + def to_dict(self): + return { + "model_type": "LLM", + "address": self.address, + "accelerators": self.devices, + "model_name": self._llm_family.model_name, + "model_lang": self._llm_family.model_lang, + "model_ability": self._llm_family.model_ability, + "model_description": self._llm_family.model_description, + "model_format": self._llm_spec.model_format, + "model_size_in_billions": self._llm_spec.model_size_in_billions, + "quantization": self._quantization, + "model_hub": self._llm_spec.model_hub, + "revision": self._llm_spec.model_revision, + "context_length": self._llm_family.context_length, + } + + +def create_llm_model_instance( + subpool_addr: str, + devices: List[str], + model_uid: str, + model_name: str, + model_format: Optional[str] = None, + model_size_in_billions: Optional[int] = None, + quantization: Optional[str] = None, + is_local_deployment: bool = False, + **kwargs, +) -> Tuple[LLM, LLMDescription]: + from . 
import match_llm, match_llm_cls + from .llm_family import cache + + match_result = match_llm( + model_name, + model_format, + model_size_in_billions, + quantization, + is_local_deployment, + ) + if not match_result: + raise ValueError( + f"Model not found, name: {model_name}, format: {model_format}," + f" size: {model_size_in_billions}, quantization: {quantization}" + ) + llm_family, llm_spec, quantization = match_result + + assert quantization is not None + save_path = cache(llm_family, llm_spec, quantization) + + llm_cls = match_llm_cls(llm_family, llm_spec, quantization) + if not llm_cls: + raise ValueError( + f"Model not supported, name: {model_name}, format: {model_format}," + f" size: {model_size_in_billions}, quantization: {quantization}" + ) + logger.debug(f"Launching {model_uid} with {llm_cls.__name__}") + + model = llm_cls(model_uid, llm_family, llm_spec, quantization, save_path, kwargs) + return model, LLMDescription( + subpool_addr, devices, llm_family, llm_spec, quantization + ) + + +def create_speculative_llm_model_instance( + subpool_addr: str, + devices: List[str], + model_uid: str, + model_name: str, + model_size_in_billions: Optional[int], + quantization: Optional[str], + draft_model_name: str, + draft_model_size_in_billions: Optional[int], + draft_quantization: Optional[str], + is_local_deployment: bool = False, +) -> Tuple[LLM, LLMDescription]: + from . import match_llm + from .llm_family import cache + + match_result = match_llm( + model_name, + "pytorch", + model_size_in_billions, + quantization, + is_local_deployment, + ) + + if not match_result: + raise ValueError( + f"Model not found, name: {model_name}, format: pytorch," + f" size: {model_size_in_billions}, quantization: {quantization}" + ) + llm_family, llm_spec, quantization = match_result + assert quantization is not None + save_path = cache(llm_family, llm_spec, quantization) + + draft_match_result = match_llm( + draft_model_name, + "pytorch", + draft_model_size_in_billions, + draft_quantization, + is_local_deployment, + ) + + if not draft_match_result: + raise ValueError( + f"Model not found, name: {draft_model_name}, format: pytorch," + f" size: {draft_model_size_in_billions}, quantization: {draft_quantization}" + ) + draft_llm_family, draft_llm_spec, draft_quantization = draft_match_result + assert draft_quantization is not None + draft_save_path = cache(draft_llm_family, draft_llm_spec, draft_quantization) + + from .pytorch.spec_model import SpeculativeModel + + model = SpeculativeModel( + model_uid, + model_family=llm_family, + model_spec=llm_spec, + quantization=quantization, + model_path=save_path, + draft_model_family=draft_llm_family, + draft_model_spec=draft_llm_spec, + draft_quantization=draft_quantization, + draft_model_path=draft_save_path, + ) + + return model, LLMDescription( + subpool_addr, devices, llm_family, llm_spec, quantization + ) diff --git a/xinference/model/multimodal/model_spec.json b/xinference/model/multimodal/model_spec.json new file mode 100644 index 0000000000..e69de29bb2 diff --git a/xinference/model/multimodal/tests/__init__.py b/xinference/model/multimodal/tests/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/xinference/model/multimodal/tests/test_multimodal.py b/xinference/model/multimodal/tests/test_multimodal.py new file mode 100644 index 0000000000..e69de29bb2 From d1b9a27cbf197d34428632b0da17fda15e408a85 Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Tue, 26 Dec 2023 16:33:40 +0800 Subject: [PATCH 02/20] dev --- xinference/model/core.py | 6 + 
xinference/model/multimodal/__init__.py | 37 +++ xinference/model/multimodal/core.py | 276 +++++++++++--------- xinference/model/multimodal/model_spec.json | 37 +++ 4 files changed, 232 insertions(+), 124 deletions(-) diff --git a/xinference/model/core.py b/xinference/model/core.py index 9414c504e4..3494c8dc51 100644 --- a/xinference/model/core.py +++ b/xinference/model/core.py @@ -45,6 +45,7 @@ def create_model_instance( from .image.core import create_image_model_instance from .llm.core import create_llm_model_instance from .rerank.core import create_rerank_model_instance + from .multimodal.core import create_multimodal_model_instance if model_type == "LLM": return create_llm_model_instance( @@ -74,5 +75,10 @@ def create_model_instance( return create_rerank_model_instance( subpool_addr, devices, model_uid, model_name, **kwargs ) + elif model_type == "multimodal": + kwargs.pop("trust_remote_code", None) + return create_multimodal_model_instance( + subpool_addr, devices, model_uid, model_name, **kwargs + ) else: raise ValueError(f"Unsupported model type: {model_type}.") diff --git a/xinference/model/multimodal/__init__.py b/xinference/model/multimodal/__init__.py index e69de29bb2..1f9c3acee0 100644 --- a/xinference/model/multimodal/__init__.py +++ b/xinference/model/multimodal/__init__.py @@ -0,0 +1,37 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
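With the new "multimodal" branch in create_model_instance above, a standard client launch that passes model_type="multimodal" is routed to create_multimodal_model_instance. A minimal usage sketch of that flow, assuming a locally running Xinference endpoint (the URL below is illustrative, not part of this patch):

from xinference.client import Client

client = Client("http://127.0.0.1:9997")
# model_type="multimodal" goes through the dispatcher branch added above.
model_uid = client.launch_model(model_name="qwen-vl-chat", model_type="multimodal")
model = client.get_model(model_uid)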
+ +import codecs +import json +import os + +from .core import ( + BUILTIN_LVLM_FAMILIES, + BUILTIN_MODELSCOPE_LVLM_FAMILIES, + LVLM, + LVLMFamilyV1, + LVLMPromptStyleV1, +) + + +def _install(): + json_path = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "llm_family.json" + ) + for json_obj in json.load(codecs.open(json_path, "r", encoding="utf-8")): + model_spec = LVLMFamilyV1.parse_obj(json_obj) + BUILTIN_LVLM_FAMILIES.append(model_spec) + + +_install() diff --git a/xinference/model/multimodal/core.py b/xinference/model/multimodal/core.py index 884f986a2b..1d6891ff48 100644 --- a/xinference/model/multimodal/core.py +++ b/xinference/model/multimodal/core.py @@ -17,23 +17,99 @@ import os import platform from abc import abstractmethod -from typing import TYPE_CHECKING, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, List, Literal, Optional, Tuple, Union + +from pydantic import BaseModel, validator from ...core.utils import parse_replica_model_uid from ..core import ModelDescription - -if TYPE_CHECKING: - from .llm_family import LLMFamilyV1, LLMSpecV1 +from ..utils import download_from_modelscope logger = logging.getLogger(__name__) +DEFAULT_CONTEXT_LENGTH = 2048 + + +class LVLMSpecV1(BaseModel): + model_format: Literal["pytorch", "gptq"] + # Must in order that `str` first, then `int` + model_size_in_billions: Union[str, int] + quantizations: List[str] + model_id: str + model_hub: str = "huggingface" + model_uri: Optional[str] + model_revision: Optional[str] + + @validator("model_size_in_billions", pre=False) + def validate_model_size_with_radix(cls, v: object) -> object: + if isinstance(v, str): + if ( + "_" in v + ): # for example, "1_8" just returns "1_8", otherwise int("1_8") returns 18 + return v + else: + return int(v) + return v + + +class LVLMPromptStyleV1(BaseModel): + style_name: str + system_prompt: str = "" + roles: List[str] + image_formatter: str = "" + text_formatter: str = "" + sep: str = "" + + +class LVLMFamilyV1(BaseModel): + version: Literal[1] + context_length: Optional[int] = DEFAULT_CONTEXT_LENGTH + model_name: str + model_lang: List[str] + model_ability: List[Literal["chat"]] + model_description: Optional[str] + model_specs: List["LVLMSpecV1"] + prompt_style: Optional["LVLMPromptStyleV1"] + + +class LVLMDescription(ModelDescription): + def __init__( + self, + address: Optional[str], + devices: Optional[List[str]], + llm_family: "LVLMFamilyV1", + llm_spec: "LVLMSpecV1", + quantization: Optional[str], + ): + super().__init__(address, devices) + self._llm_family = llm_family + self._llm_spec = llm_spec + self._quantization = quantization + + def to_dict(self): + return { + "model_type": "LLM", + "address": self.address, + "accelerators": self.devices, + "model_name": self._llm_family.model_name, + "model_lang": self._llm_family.model_lang, + "model_ability": self._llm_family.model_ability, + "model_description": self._llm_family.model_description, + "model_format": self._llm_spec.model_format, + "model_size_in_billions": self._llm_spec.model_size_in_billions, + "quantization": self._quantization, + "model_hub": self._llm_spec.model_hub, + "revision": self._llm_spec.model_revision, + "context_length": self._llm_family.context_length, + } + -class LLM(abc.ABC): +class LVLM(abc.ABC): def __init__( self, replica_model_uid: str, - model_family: "LLMFamilyV1", - model_spec: "LLMSpecV1", + model_family: "LVLMFamilyV1", + model_spec: "LVLMSpecV1", quantization: str, model_path: str, *args, @@ -94,44 +170,73 @@ def load(self): @classmethod def match( 
- cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str + cls, llm_family: "LVLMFamilyV1", llm_spec: "LVLMSpecV1", quantization: str ) -> bool: raise NotImplementedError -class LLMDescription(ModelDescription): - def __init__( - self, - address: Optional[str], - devices: Optional[List[str]], - llm_family: "LLMFamilyV1", - llm_spec: "LLMSpecV1", - quantization: Optional[str], - ): - super().__init__(address, devices) - self._llm_family = llm_family - self._llm_spec = llm_spec - self._quantization = quantization +BUILTIN_LVLM_FAMILIES: List["LVLMFamilyV1"] = [] +BUILTIN_MODELSCOPE_LVLM_FAMILIES: List["LVLMFamilyV1"] = [] - def to_dict(self): - return { - "model_type": "LLM", - "address": self.address, - "accelerators": self.devices, - "model_name": self._llm_family.model_name, - "model_lang": self._llm_family.model_lang, - "model_ability": self._llm_family.model_ability, - "model_description": self._llm_family.model_description, - "model_format": self._llm_spec.model_format, - "model_size_in_billions": self._llm_spec.model_size_in_billions, - "quantization": self._quantization, - "model_hub": self._llm_spec.model_hub, - "revision": self._llm_spec.model_revision, - "context_length": self._llm_family.context_length, - } + +def match_multimodal( + model_name: str, + model_format: Optional[str] = None, + model_size_in_billions: Optional[int] = None, + quantization: Optional[str] = None, +) -> Optional[Tuple[LVLMFamilyV1, LVLMSpecV1, str]]: + """ + Find an LLM family, spec, and quantization that satisfy given criteria. + """ + + def _match_quantization(q: Union[str, None], quantizations: List[str]): + # Currently, the quantization name could include both uppercase and lowercase letters, + # so it is necessary to ensure that the case sensitivity does not + # affect the matching results. + if q is None: + return q + for quant in quantizations: + if q.lower() == quant.lower(): + return quant + + def _apply_format_to_model_id(spec: LVLMSpecV1, q: str) -> LVLMSpecV1: + # Different quantized versions of some models use different model ids, + # Here we check the `{}` in the model id to format the id. + if "{" in spec.model_id: + spec.model_id = spec.model_id.format(quantization=q) + return spec + + if download_from_modelscope(): + all_families = BUILTIN_MODELSCOPE_LVLM_FAMILIES + else: + all_families = BUILTIN_LVLM_FAMILIES + + for family in all_families: + if model_name != family.model_name: + continue + for spec in family.model_specs: + matched_quantization = _match_quantization(quantization, spec.quantizations) + if ( + model_format + and model_format != spec.model_format + or model_size_in_billions + and model_size_in_billions != spec.model_size_in_billions + or quantization + and matched_quantization is None + ): + continue + if quantization: + return ( + family, + _apply_format_to_model_id(spec, matched_quantization), + matched_quantization, + ) + else: + return family, _apply_format_to_model_id(spec, "none"), "none" + return None -def create_llm_model_instance( +def create_multimodal_model_instance( subpool_addr: str, devices: List[str], model_uid: str, @@ -139,106 +244,29 @@ def create_llm_model_instance( model_format: Optional[str] = None, model_size_in_billions: Optional[int] = None, quantization: Optional[str] = None, - is_local_deployment: bool = False, **kwargs, -) -> Tuple[LLM, LLMDescription]: - from . 
import match_llm, match_llm_cls - from .llm_family import cache +) -> Tuple[LVLM, LVLMDescription]: + from ..llm.llm_family import cache - match_result = match_llm( + match_result = match_multimodal( model_name, model_format, model_size_in_billions, quantization, - is_local_deployment, ) if not match_result: raise ValueError( f"Model not found, name: {model_name}, format: {model_format}," f" size: {model_size_in_billions}, quantization: {quantization}" ) - llm_family, llm_spec, quantization = match_result + model_family, model_spec, quantization = match_result assert quantization is not None - save_path = cache(llm_family, llm_spec, quantization) + save_path = cache(model_family, model_spec, quantization) - llm_cls = match_llm_cls(llm_family, llm_spec, quantization) - if not llm_cls: - raise ValueError( - f"Model not supported, name: {model_name}, format: {model_format}," - f" size: {model_size_in_billions}, quantization: {quantization}" - ) - logger.debug(f"Launching {model_uid} with {llm_cls.__name__}") - - model = llm_cls(model_uid, llm_family, llm_spec, quantization, save_path, kwargs) - return model, LLMDescription( - subpool_addr, devices, llm_family, llm_spec, quantization - ) - - -def create_speculative_llm_model_instance( - subpool_addr: str, - devices: List[str], - model_uid: str, - model_name: str, - model_size_in_billions: Optional[int], - quantization: Optional[str], - draft_model_name: str, - draft_model_size_in_billions: Optional[int], - draft_quantization: Optional[str], - is_local_deployment: bool = False, -) -> Tuple[LLM, LLMDescription]: - from . import match_llm - from .llm_family import cache - - match_result = match_llm( - model_name, - "pytorch", - model_size_in_billions, - quantization, - is_local_deployment, - ) - - if not match_result: - raise ValueError( - f"Model not found, name: {model_name}, format: pytorch," - f" size: {model_size_in_billions}, quantization: {quantization}" - ) - llm_family, llm_spec, quantization = match_result - assert quantization is not None - save_path = cache(llm_family, llm_spec, quantization) - - draft_match_result = match_llm( - draft_model_name, - "pytorch", - draft_model_size_in_billions, - draft_quantization, - is_local_deployment, - ) - - if not draft_match_result: - raise ValueError( - f"Model not found, name: {draft_model_name}, format: pytorch," - f" size: {draft_model_size_in_billions}, quantization: {draft_quantization}" - ) - draft_llm_family, draft_llm_spec, draft_quantization = draft_match_result - assert draft_quantization is not None - draft_save_path = cache(draft_llm_family, draft_llm_spec, draft_quantization) - - from .pytorch.spec_model import SpeculativeModel - - model = SpeculativeModel( - model_uid, - model_family=llm_family, - model_spec=llm_spec, - quantization=quantization, - model_path=save_path, - draft_model_family=draft_llm_family, - draft_model_spec=draft_llm_spec, - draft_quantization=draft_quantization, - draft_model_path=draft_save_path, - ) + logger.debug(f"Launching {model_uid} with {LVLM.__name__}") - return model, LLMDescription( - subpool_addr, devices, llm_family, llm_spec, quantization + model = LVLM(model_uid, model_family, model_spec, quantization, save_path, kwargs) + return model, LVLMDescription( + subpool_addr, devices, model_family, model_spec, quantization ) diff --git a/xinference/model/multimodal/model_spec.json b/xinference/model/multimodal/model_spec.json index e69de29bb2..ef160ee6f4 100644 --- a/xinference/model/multimodal/model_spec.json +++ 
b/xinference/model/multimodal/model_spec.json @@ -0,0 +1,37 @@ +[ + { + "version": 1, + "context_length": 4096, + "model_name": "qwen-vl-chat", + "model_lang": [ + "en", + "zh" + ], + "model_ability": [ + "chat" + ], + "model_description": "Qwen-VL-Chat supports more flexible interaction, such as multiple image inputs, multi-round question answering, and creative capabilities.", + "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": 7, + "quantizations": [ + "none" + ], + "model_id": "Qwen/Qwen-VL-Chat", + "model_revision": "989c61aac20be61660684ab7400e2e383e67b3ef" + } + ], + "prompt_style": { + "style_name": "QWEN", + "system_prompt": "You are a helpful assistant.", + "roles": [ + "user", + "assistant" + ], + "image_formatter": "Picture {idx}: {image}", + "text_formatter": "{text}", + "sep": "\n" + } + } +] From a6fb3feeddae3b7ec88a5bd420894b8007e8baab Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Tue, 26 Dec 2023 17:05:47 +0800 Subject: [PATCH 03/20] dev --- xinference/core/supervisor.py | 32 +++++++++++++++++++++++++ xinference/model/core.py | 2 +- xinference/model/multimodal/__init__.py | 11 ++++++--- xinference/model/multimodal/core.py | 17 ++++++++++--- 4 files changed, 55 insertions(+), 7 deletions(-) diff --git a/xinference/core/supervisor.py b/xinference/core/supervisor.py index 1d531367ed..480e879627 100644 --- a/xinference/core/supervisor.py +++ b/xinference/core/supervisor.py @@ -36,6 +36,7 @@ from ..model.embedding import EmbeddingModelSpec from ..model.image import ImageModelFamilyV1 from ..model.llm import LLMFamilyV1 + from ..model.multimodal import LVLMFamilyV1 from ..model.rerank import RerankModelSpec from .worker import WorkerActor @@ -215,6 +216,25 @@ def _to_image_model_reg( "is_builtin": is_builtin, } + def _to_multimodal_reg( + self, model_family: "LVLMFamilyV1", is_builtin: bool + ) -> Dict[str, Any]: + from ..model.llm import get_cache_status + + if self.is_local_deployment(): + specs = [] + # TODO: does not work when the supervisor and worker are running on separate nodes. 
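With the qwen-vl-chat entry above registered at import time, the matcher introduced in this PR resolves a (family, spec, quantization) triple, falling back to "none" when the caller does not request a quantization. A hedged sketch, assuming the module path and builtin registration land as in these patches:

from xinference.model.multimodal.core import match_multimodal

# Importing the package runs _install(), which registers the builtin families.
result = match_multimodal("qwen-vl-chat", model_format="pytorch", model_size_in_billions=7)
assert result is not None
family, spec, quantization = result
print(family.model_name, spec.model_id, quantization)  # qwen-vl-chat Qwen/Qwen-VL-Chat none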
+ for spec in model_family.model_specs: + cache_status = get_cache_status(model_family, spec) + specs.append({**spec.dict(), "cache_status": cache_status}) + return { + **model_family.dict(), + "is_builtin": is_builtin, + "model_specs": specs, + } + else: + return {**model_family.dict(), "is_builtin": is_builtin} + @log_sync(logger=logger) def list_model_registrations( self, model_type: str, detailed: bool = False @@ -286,6 +306,18 @@ def sort_helper(item): else: ret.append({"model_name": model_name, "is_builtin": True}) + ret.sort(key=sort_helper) + return ret + elif model_type == "multimodal": + from ..model.multimodal import BUILTIN_LVLM_FAMILIES + + ret = [] + for family in BUILTIN_LVLM_FAMILIES: + if detailed: + ret.append(self._to_multimodal_reg(family, True)) + else: + ret.append({"model_name": family.model_name, "is_builtin": True}) + ret.sort(key=sort_helper) return ret else: diff --git a/xinference/model/core.py b/xinference/model/core.py index 3494c8dc51..bcc465247c 100644 --- a/xinference/model/core.py +++ b/xinference/model/core.py @@ -44,8 +44,8 @@ def create_model_instance( from .embedding.core import create_embedding_model_instance from .image.core import create_image_model_instance from .llm.core import create_llm_model_instance - from .rerank.core import create_rerank_model_instance from .multimodal.core import create_multimodal_model_instance + from .rerank.core import create_rerank_model_instance if model_type == "LLM": return create_llm_model_instance( diff --git a/xinference/model/multimodal/__init__.py b/xinference/model/multimodal/__init__.py index 1f9c3acee0..5f0269253a 100644 --- a/xinference/model/multimodal/__init__.py +++ b/xinference/model/multimodal/__init__.py @@ -20,6 +20,7 @@ BUILTIN_LVLM_FAMILIES, BUILTIN_MODELSCOPE_LVLM_FAMILIES, LVLM, + MODEL_NAME_TO_REVISION, LVLMFamilyV1, LVLMPromptStyleV1, ) @@ -27,11 +28,15 @@ def _install(): json_path = os.path.join( - os.path.dirname(os.path.abspath(__file__)), "llm_family.json" + os.path.dirname(os.path.abspath(__file__)), "model_spec.json" ) for json_obj in json.load(codecs.open(json_path, "r", encoding="utf-8")): - model_spec = LVLMFamilyV1.parse_obj(json_obj) - BUILTIN_LVLM_FAMILIES.append(model_spec) + model_family = LVLMFamilyV1.parse_obj(json_obj) + BUILTIN_LVLM_FAMILIES.append(model_family) + for model_spec in model_family.model_specs: + MODEL_NAME_TO_REVISION[model_family.model_name].append( + model_spec.model_revision + ) _install() diff --git a/xinference/model/multimodal/core.py b/xinference/model/multimodal/core.py index 1d6891ff48..5329d96ad3 100644 --- a/xinference/model/multimodal/core.py +++ b/xinference/model/multimodal/core.py @@ -17,17 +17,22 @@ import os import platform from abc import abstractmethod -from typing import TYPE_CHECKING, List, Literal, Optional, Tuple, Union +from collections import defaultdict +from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union from pydantic import BaseModel, validator +from ...constants import XINFERENCE_CACHE_DIR from ...core.utils import parse_replica_model_uid from ..core import ModelDescription -from ..utils import download_from_modelscope +from ..utils import download_from_modelscope, is_model_cached, valid_model_revision logger = logging.getLogger(__name__) DEFAULT_CONTEXT_LENGTH = 2048 +# Used for check whether the model is cached. +# Init when registering all the builtin models. 
+MODEL_NAME_TO_REVISION: Dict[str, List[str]] = defaultdict(list) class LVLMSpecV1(BaseModel): @@ -207,7 +212,7 @@ def _apply_format_to_model_id(spec: LVLMSpecV1, q: str) -> LVLMSpecV1: return spec if download_from_modelscope(): - all_families = BUILTIN_MODELSCOPE_LVLM_FAMILIES + all_families = BUILTIN_MODELSCOPE_LVLM_FAMILIES + BUILTIN_LVLM_FAMILIES else: all_families = BUILTIN_LVLM_FAMILIES @@ -270,3 +275,9 @@ def create_multimodal_model_instance( return model, LVLMDescription( subpool_addr, devices, model_family, model_spec, quantization ) + + +def get_cache_status( + model_spec: LVLMSpecV1, +) -> bool: + return is_model_cached(model_spec, MODEL_NAME_TO_REVISION) From 0fbb70c61105a3ba4ac3a2c1e7eb4283dbba8ef8 Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Tue, 26 Dec 2023 17:27:24 +0800 Subject: [PATCH 04/20] dev --- xinference/model/multimodal/core.py | 226 ++++++++++++++++++++++++++-- 1 file changed, 210 insertions(+), 16 deletions(-) diff --git a/xinference/model/multimodal/core.py b/xinference/model/multimodal/core.py index 5329d96ad3..b3111a804d 100644 --- a/xinference/model/multimodal/core.py +++ b/xinference/model/multimodal/core.py @@ -25,7 +25,13 @@ from ...constants import XINFERENCE_CACHE_DIR from ...core.utils import parse_replica_model_uid from ..core import ModelDescription -from ..utils import download_from_modelscope, is_model_cached, valid_model_revision +from ..utils import ( + download_from_modelscope, + is_model_cached, + retry_download, + symlink_local_file, + valid_model_revision, +) logger = logging.getLogger(__name__) @@ -82,13 +88,13 @@ def __init__( self, address: Optional[str], devices: Optional[List[str]], - llm_family: "LVLMFamilyV1", - llm_spec: "LVLMSpecV1", + model_family: "LVLMFamilyV1", + model_spec: "LVLMSpecV1", quantization: Optional[str], ): super().__init__(address, devices) - self._llm_family = llm_family - self._llm_spec = llm_spec + self._model_family = model_family + self._model_spec = model_spec self._quantization = quantization def to_dict(self): @@ -96,16 +102,16 @@ def to_dict(self): "model_type": "LLM", "address": self.address, "accelerators": self.devices, - "model_name": self._llm_family.model_name, - "model_lang": self._llm_family.model_lang, - "model_ability": self._llm_family.model_ability, - "model_description": self._llm_family.model_description, - "model_format": self._llm_spec.model_format, - "model_size_in_billions": self._llm_spec.model_size_in_billions, + "model_name": self._model_family.model_name, + "model_lang": self._model_family.model_lang, + "model_ability": self._model_family.model_ability, + "model_description": self._model_family.model_description, + "model_format": self._model_spec.model_format, + "model_size_in_billions": self._model_spec.model_size_in_billions, "quantization": self._quantization, - "model_hub": self._llm_spec.model_hub, - "revision": self._llm_spec.model_revision, - "context_length": self._llm_family.context_length, + "model_hub": self._model_spec.model_hub, + "revision": self._model_spec.model_revision, + "context_length": self._model_family.context_length, } @@ -251,8 +257,6 @@ def create_multimodal_model_instance( quantization: Optional[str] = None, **kwargs, ) -> Tuple[LVLM, LVLMDescription]: - from ..llm.llm_family import cache - match_result = match_multimodal( model_name, model_format, @@ -277,6 +281,196 @@ def create_multimodal_model_instance( ) +def _get_cache_dir( + model_family: LVLMFamilyV1, + model_spec: "LVLMSpecV1", + create_if_not_exist=True, +): + cache_dir_name = ( + 
f"{model_family.model_name}-{model_spec.model_format}" + f"-{model_spec.model_size_in_billions}b" + ) + cache_dir = os.path.realpath(os.path.join(XINFERENCE_CACHE_DIR, cache_dir_name)) + if create_if_not_exist and not os.path.exists(cache_dir): + os.makedirs(cache_dir, exist_ok=True) + return cache_dir + + +def _get_meta_path( + cache_dir: str, + model_format: str, + model_hub: str, + quantization: Optional[str] = None, +): + if model_format == "pytorch": + if model_hub == "huggingface": + return os.path.join(cache_dir, "__valid_download") + else: + return os.path.join(cache_dir, f"__valid_download_{model_hub}") + elif model_format in ["ggmlv3", "ggufv2", "gptq"]: + assert quantization is not None + if model_hub == "huggingface": + return os.path.join(cache_dir, f"__valid_download_{quantization}") + else: + return os.path.join( + cache_dir, f"__valid_download_{model_hub}_{quantization}" + ) + else: + raise ValueError(f"Unsupported format: {model_format}") + + +def _skip_download( + cache_dir: str, + model_format: str, + model_hub: str, + model_revision: Optional[str], + quantization: Optional[str] = None, +) -> bool: + if model_format == "pytorch": + model_hub_to_meta_path = { + "huggingface": _get_meta_path( + cache_dir, model_format, "huggingface", quantization + ), + "modelscope": _get_meta_path( + cache_dir, model_format, "modelscope", quantization + ), + } + if valid_model_revision(model_hub_to_meta_path[model_hub], model_revision): + logger.info(f"Cache {cache_dir} exists") + return True + else: + for hub, meta_path in model_hub_to_meta_path.items(): + if hub != model_hub and os.path.exists(meta_path): + # PyTorch models from modelscope can also be loaded by transformers. + logger.warning(f"Cache {cache_dir} exists, but it was from {hub}") + return True + return False + else: + raise ValueError(f"Unsupported format: {model_format}") + + +def _generate_meta_file( + meta_path: str, + model_family: "LVLMFamilyV1", + model_spec: "LVLMSpecV1", + quantization: Optional[str] = None, +): + assert not valid_model_revision( + meta_path, model_spec.model_revision + ), f"meta file {meta_path} should not be valid" + with open(meta_path, "w") as f: + import json + + desc = LVLMDescription(None, None, model_family, model_spec, quantization) + json.dump(desc.to_dict(), f) + + +def cache_from_modelscope( + model_family: LVLMFamilyV1, + model_spec: "LVLMSpecV1", + quantization: Optional[str] = None, +) -> str: + """ + Cache model from Modelscope. Return the cache directory. 
+ """ + from modelscope.hub.snapshot_download import snapshot_download + + cache_dir = _get_cache_dir(model_family, model_spec) + if _skip_download( + cache_dir, + model_spec.model_format, + model_spec.model_hub, + model_spec.model_revision, + quantization, + ): + return cache_dir + + if model_spec.model_format in ["pytorch", "gptq"]: + download_dir = retry_download( + snapshot_download, + model_family.model_name, + { + "model_size": model_spec.model_size_in_billions, + "model_format": model_spec.model_format, + }, + model_spec.model_id, + revision=model_spec.model_revision, + ) + for subdir, dirs, files in os.walk(download_dir): + for file in files: + relpath = os.path.relpath(os.path.join(subdir, file), download_dir) + symlink_local_file(os.path.join(subdir, file), cache_dir, relpath) + else: + raise ValueError(f"Unsupported format: {model_spec.model_format}") + + meta_path = _get_meta_path( + cache_dir, model_spec.model_format, model_spec.model_hub, quantization + ) + _generate_meta_file(meta_path, model_family, model_spec, quantization) + + return cache_dir + + +def cache_from_huggingface( + model_family: LVLMFamilyV1, + model_spec: "LVLMSpecV1", + quantization: Optional[str] = None, +) -> str: + """ + Cache model from Hugging Face. Return the cache directory. + """ + import huggingface_hub + + cache_dir = _get_cache_dir(model_family, model_spec) + if _skip_download( + cache_dir, + model_spec.model_format, + model_spec.model_hub, + model_spec.model_revision, + quantization, + ): + return cache_dir + + if model_spec.model_format in ["pytorch"]: + assert isinstance(model_spec, LVLMSpecV1) + retry_download( + huggingface_hub.snapshot_download, + model_family.model_name, + { + "model_size": model_spec.model_size_in_billions, + "model_format": model_spec.model_format, + }, + model_spec.model_id, + revision=model_spec.model_revision, + local_dir=cache_dir, + local_dir_use_symlinks=True, + ) + else: + raise ValueError(f"Unsupported model format: {model_spec.model_format}") + + meta_path = _get_meta_path( + cache_dir, model_spec.model_format, model_spec.model_hub, quantization + ) + _generate_meta_file(meta_path, model_family, model_spec, quantization) + + return cache_dir + + +def cache( + llm_family: LVLMFamilyV1, + llm_spec: "LVLMSpecV1", + quantization: Optional[str] = None, +) -> str: + if llm_spec.model_hub == "huggingface": + logger.info(f"Caching from Hugging Face: {llm_spec.model_id}") + return cache_from_huggingface(llm_family, llm_spec, quantization) + elif llm_spec.model_hub == "modelscope": + logger.info(f"Caching from Modelscope: {llm_spec.model_id}") + return cache_from_modelscope(llm_family, llm_spec, quantization) + else: + raise ValueError(f"Unknown model hub: {llm_spec.model_hub}") + + def get_cache_status( model_spec: LVLMSpecV1, ) -> bool: From 7253c658bbf2e69a1f43b2e5f71ac5498949906d Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Tue, 26 Dec 2023 18:24:44 +0800 Subject: [PATCH 05/20] dev --- xinference/model/multimodal/__init__.py | 5 ++- xinference/model/multimodal/core.py | 35 ++++++++++++++++--- xinference/model/multimodal/qwen_vl.py | 26 ++++++++++++++ .../model/multimodal/tests/test_multimodal.py | 28 +++++++++++++++ 4 files changed, 89 insertions(+), 5 deletions(-) create mode 100644 xinference/model/multimodal/qwen_vl.py diff --git a/xinference/model/multimodal/__init__.py b/xinference/model/multimodal/__init__.py index 5f0269253a..bae4627739 100644 --- a/xinference/model/multimodal/__init__.py +++ b/xinference/model/multimodal/__init__.py @@ -19,11 +19,14 @@ from 
.core import ( BUILTIN_LVLM_FAMILIES, BUILTIN_MODELSCOPE_LVLM_FAMILIES, - LVLM, + MODEL_CLASSES, MODEL_NAME_TO_REVISION, LVLMFamilyV1, LVLMPromptStyleV1, ) +from .qwen_vl import QwenVLChat + +MODEL_CLASSES.append(QwenVLChat) def _install(): diff --git a/xinference/model/multimodal/core.py b/xinference/model/multimodal/core.py index b3111a804d..02751ae441 100644 --- a/xinference/model/multimodal/core.py +++ b/xinference/model/multimodal/core.py @@ -18,12 +18,13 @@ import platform from abc import abstractmethod from collections import defaultdict -from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union +from typing import Dict, Iterator, List, Literal, Optional, Tuple, Type, Union from pydantic import BaseModel, validator from ...constants import XINFERENCE_CACHE_DIR from ...core.utils import parse_replica_model_uid +from ...types import ChatCompletion, ChatCompletionChunk from ..core import ModelDescription from ..utils import ( download_from_modelscope, @@ -179,9 +180,19 @@ def _get_cuda_count(): def load(self): raise NotImplementedError + @abstractmethod + def chat( + self, + prompt: str, + system_prompt: Optional[str] = None, + chat_history: Optional[List[Dict]] = None, + generate_config: Optional[Dict] = None, + ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: + raise NotImplementedError + @classmethod def match( - cls, llm_family: "LVLMFamilyV1", llm_spec: "LVLMSpecV1", quantization: str + cls, model_family: "LVLMFamilyV1", model_spec: "LVLMSpecV1", quantization: str ) -> bool: raise NotImplementedError @@ -273,14 +284,30 @@ def create_multimodal_model_instance( assert quantization is not None save_path = cache(model_family, model_spec, quantization) - logger.debug(f"Launching {model_uid} with {LVLM.__name__}") + cls = match_cls(model_family, model_spec, quantization) + logger.debug(f"Launching {model_uid} with {cls.__name__}") - model = LVLM(model_uid, model_family, model_spec, quantization, save_path, kwargs) + model = cls(model_uid, model_family, model_spec, quantization, save_path, kwargs) return model, LVLMDescription( subpool_addr, devices, model_family, model_spec, quantization ) +MODEL_CLASSES: List[Type[LVLM]] = [] + + +def match_cls( + model_family: LVLMFamilyV1, model_spec: "LVLMSpecV1", quantization: str +) -> Optional[Type[LVLM]]: + """ + Find an LLM implementation for given LLM family and spec. 
+ """ + for cls in MODEL_CLASSES: + if cls.match(model_family, model_spec, quantization): + return cls + return None + + def _get_cache_dir( model_family: LVLMFamilyV1, model_spec: "LVLMSpecV1", diff --git a/xinference/model/multimodal/qwen_vl.py b/xinference/model/multimodal/qwen_vl.py new file mode 100644 index 0000000000..086040a8c2 --- /dev/null +++ b/xinference/model/multimodal/qwen_vl.py @@ -0,0 +1,26 @@ +from typing import Dict, Iterator, List, Optional, Union + +from ...types import ChatCompletion, ChatCompletionChunk +from .core import LVLM, LVLMFamilyV1, LVLMSpecV1 + + +class QwenVLChat(LVLM): + @classmethod + def match( + cls, model_family: "LVLMFamilyV1", model_spec: "LVLMSpecV1", quantization: str + ) -> bool: + if "qwen" in model_family.model_name: + return True + return False + + def load(self): + raise NotImplementedError + + def chat( + self, + prompt: str, + system_prompt: Optional[str] = None, + chat_history: Optional[List[Dict]] = None, + generate_config: Optional[Dict] = None, + ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: + raise NotImplementedError diff --git a/xinference/model/multimodal/tests/test_multimodal.py b/xinference/model/multimodal/tests/test_multimodal.py index e69de29bb2..ff6a8b0987 100644 --- a/xinference/model/multimodal/tests/test_multimodal.py +++ b/xinference/model/multimodal/tests/test_multimodal.py @@ -0,0 +1,28 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +def test_restful_api_for_multimodal(setup): + endpoint, _ = setup + from ....client import Client + + client = Client(endpoint) + + model_uid = client.launch_model( + model_uid="my_controlnet", + model_name="qwen-vl-chat", + model_type="multimodal", + ) + model = client.get_model(model_uid) + print(model) From b5eb073340695f3f3b3ed84c04c674207833315c Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Wed, 27 Dec 2023 12:22:47 +0800 Subject: [PATCH 06/20] Implement qwen vl chat --- xinference/model/llm/pytorch/core.py | 30 +------ xinference/model/llm/pytorch/spec_model.py | 3 +- xinference/model/multimodal/core.py | 9 +-- xinference/model/multimodal/model_spec.json | 2 +- xinference/model/multimodal/qwen_vl.py | 80 ++++++++++++++++++- .../model/multimodal/tests/test_multimodal.py | 24 ++++++ xinference/model/utils.py | 28 +++++++ 7 files changed, 136 insertions(+), 40 deletions(-) diff --git a/xinference/model/llm/pytorch/core.py b/xinference/model/llm/pytorch/core.py index f57e23d5a0..cc794f8ce3 100644 --- a/xinference/model/llm/pytorch/core.py +++ b/xinference/model/llm/pytorch/core.py @@ -29,6 +29,7 @@ PytorchGenerateConfig, PytorchModelConfig, ) +from ...utils import select_device from ..core import LLM from ..llm_family import LLMFamilyV1, LLMSpecV1 from ..utils import ChatModelMixin @@ -122,7 +123,7 @@ def load(self): quantization = self.quantization num_gpus = len(cuda_visible_devices) if cuda_visible_devices_env != "-1" else 0 device = self._pytorch_model_config.get("device", "auto") - self._pytorch_model_config["device"] = self._select_device(device) + self._pytorch_model_config["device"] = select_device(device) self._device = self._pytorch_model_config["device"] if self._device == "cpu": @@ -185,33 +186,6 @@ def load(self): self._model.to(self._device) logger.debug(f"Model Memory: {self._model.get_memory_footprint()}") - def _select_device(self, device: str) -> str: - try: - import torch - except ImportError: - raise ImportError( - f"Failed to import module 'torch'. Please make sure 'torch' is installed.\n\n" - ) - - if device == "auto": - # When env CUDA_VISIBLE_DEVICES=-1, torch.cuda.is_available() return False - if torch.cuda.is_available(): - return "cuda" - elif torch.backends.mps.is_available(): - return "mps" - return "cpu" - elif device == "cuda": - if not torch.cuda.is_available(): - raise ValueError("cuda is unavailable in your environment") - elif device == "mps": - if not torch.backends.mps.is_available(): - raise ValueError("mps is unavailable in your environment") - elif device == "cpu": - pass - else: - raise ValueError(f"Device {device} is not supported in temporary") - return device - @classmethod def match( cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str diff --git a/xinference/model/llm/pytorch/spec_model.py b/xinference/model/llm/pytorch/spec_model.py index e438bbb264..a66f6fbfc1 100644 --- a/xinference/model/llm/pytorch/spec_model.py +++ b/xinference/model/llm/pytorch/spec_model.py @@ -17,6 +17,7 @@ from typing import Iterator, List, Optional, Union from ....types import Completion, CompletionChunk, Embedding +from ...utils import select_device from .. 
import LLMFamilyV1, LLMSpecV1 from .core import PytorchChatModel, PytorchGenerateConfig, PytorchModelConfig @@ -85,7 +86,7 @@ def load(self): num_gpus = len(cuda_visible_devices) if cuda_visible_devices_env != "-1" else 0 device = self._pytorch_model_config.get("device", "auto") - self._pytorch_model_config["device"] = self._select_device(device) + self._pytorch_model_config["device"] = select_device(device) self._device = self._pytorch_model_config["device"] if self._device == "cpu": diff --git a/xinference/model/multimodal/core.py b/xinference/model/multimodal/core.py index 02751ae441..c153ff9070 100644 --- a/xinference/model/multimodal/core.py +++ b/xinference/model/multimodal/core.py @@ -124,8 +124,7 @@ def __init__( model_spec: "LVLMSpecV1", quantization: str, model_path: str, - *args, - **kwargs, + kwargs: Dict, ): self.model_uid, self.replica, self.rep_id = parse_replica_model_uid( replica_model_uid @@ -134,10 +133,8 @@ def __init__( self.model_spec = model_spec self.quantization = quantization self.model_path = model_path - if args: - raise ValueError(f"Unrecognized positional arguments: {args}") - if kwargs: - raise ValueError(f"Unrecognized keyword arguments: {kwargs}") + self.kwargs = kwargs + logger.info("Init model %s with kwargs: %s", self.model_uid, kwargs) @staticmethod def handle_model_size(model_size_in_billions: Union[str, int]) -> Union[int, float]: diff --git a/xinference/model/multimodal/model_spec.json b/xinference/model/multimodal/model_spec.json index ef160ee6f4..e279441b3c 100644 --- a/xinference/model/multimodal/model_spec.json +++ b/xinference/model/multimodal/model_spec.json @@ -19,7 +19,7 @@ "none" ], "model_id": "Qwen/Qwen-VL-Chat", - "model_revision": "989c61aac20be61660684ab7400e2e383e67b3ef" + "model_revision": "6665c780ade5ff3f08853b4262dcb9c8f9598d42" } ], "prompt_style": { diff --git a/xinference/model/multimodal/qwen_vl.py b/xinference/model/multimodal/qwen_vl.py index 086040a8c2..38cbb6a75f 100644 --- a/xinference/model/multimodal/qwen_vl.py +++ b/xinference/model/multimodal/qwen_vl.py @@ -1,10 +1,25 @@ +import operator +import tempfile +import time +import uuid from typing import Dict, Iterator, List, Optional, Union -from ...types import ChatCompletion, ChatCompletionChunk +from ...types import ( + ChatCompletion, + ChatCompletionChoice, + ChatCompletionChunk, + CompletionUsage, +) +from ..utils import select_device from .core import LVLM, LVLMFamilyV1, LVLMSpecV1 class QwenVLChat(LVLM): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._tokenizer = None + self._model = None + @classmethod def match( cls, model_family: "LVLMFamilyV1", model_spec: "LVLMSpecV1", quantization: str @@ -14,13 +29,70 @@ def match( return False def load(self): - raise NotImplementedError + from transformers import AutoModelForCausalLM, AutoTokenizer + from transformers.generation import GenerationConfig + + device = self.kwargs.get("device", "auto") + device = select_device(device) + + self._tokenizer = AutoTokenizer.from_pretrained( + self.model_path, + trust_remote_code=True, + code_revision=self.model_spec.model_revision, + ) + self._model = AutoModelForCausalLM.from_pretrained( + self.model_path, + device_map=device, + trust_remote_code=True, + code_revision=self.model_spec.model_revision, + ).eval() + # Specify hyperparameters for generation + self._model.generation_config = GenerationConfig.from_pretrained( + self.model_path, + trust_remote_code=True, + code_revision=self.model_spec.model_revision, + ) def chat( self, - prompt: str, + 
prompt: Union[str, List[Dict]], system_prompt: Optional[str] = None, chat_history: Optional[List[Dict]] = None, generate_config: Optional[Dict] = None, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: - raise NotImplementedError + if not isinstance(prompt, str): + prompt = [ + {"image": p["image_url"]["url"], "type": "image"} + if p.get("type") == "image_url" + else p + for p in prompt + ] + prompt = sorted(prompt, key=operator.itemgetter("type")) + prompt = self._tokenizer.from_list_format(prompt) + response, history = self._model.chat( + self._tokenizer, query=prompt, chat_history=chat_history + ) + if "" in response: + image = self._tokenizer.draw_bbox_on_latest_picture(response, history) + if image: + with tempfile.NamedTemporaryFile( + suffix=".jpg", delete_on_close=False + ) as output: + image.save(output) + response = output.name + return ChatCompletion( + id="chat" + str(uuid.uuid1()), + object="chat.completion", + created=int(time.time()), + model=self.model_uid, + choices=[ + ChatCompletionChoice( + index=0, + message={"role": "assistant", "content": response}, + finish_reason="stop", + ) + ], + usage=CompletionUsage( + prompt_tokens=-1, completion_tokens=-1, total_tokens=-1 + ), + ) diff --git a/xinference/model/multimodal/tests/test_multimodal.py b/xinference/model/multimodal/tests/test_multimodal.py index ff6a8b0987..5caf58778a 100644 --- a/xinference/model/multimodal/tests/test_multimodal.py +++ b/xinference/model/multimodal/tests/test_multimodal.py @@ -23,6 +23,30 @@ def test_restful_api_for_multimodal(setup): model_uid="my_controlnet", model_name="qwen-vl-chat", model_type="multimodal", + device="cpu", ) model = client.get_model(model_uid) print(model) + + # openai client + import openai + + client = openai.Client(api_key="not empty", base_url=f"{endpoint}/v1") + completion = client.chat.completions.create( + model=model_uid, + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "What’s in this image?"}, + { + "type": "image_url", + "image_url": { + "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + }, + }, + ], + } + ], + ) + print(completion) diff --git a/xinference/model/utils.py b/xinference/model/utils.py index 9ec82c6c4f..a3f67a8d8d 100644 --- a/xinference/model/utils.py +++ b/xinference/model/utils.py @@ -256,3 +256,31 @@ def _patched_resolve_trust_remote_code(*args, **kwargs): resolve_trust_remote_code.__code__ = ( _patched_resolve_trust_remote_code.__code__ ) + + +def select_device(device): + try: + import torch + except ImportError: + raise ImportError( + f"Failed to import module 'torch'. 
Please make sure 'torch' is installed.\n\n" + ) + + if device == "auto": + # When env CUDA_VISIBLE_DEVICES=-1, torch.cuda.is_available() return False + if torch.cuda.is_available(): + return "cuda" + elif torch.backends.mps.is_available(): + return "mps" + return "cpu" + elif device == "cuda": + if not torch.cuda.is_available(): + raise ValueError("cuda is unavailable in your environment") + elif device == "mps": + if not torch.backends.mps.is_available(): + raise ValueError("mps is unavailable in your environment") + elif device == "cpu": + pass + else: + raise ValueError(f"Device {device} is not supported in temporary") + return device From b7b818635b73d072aa2f9a4f1974cddaee255953 Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Wed, 27 Dec 2023 12:26:05 +0800 Subject: [PATCH 07/20] Fix --- xinference/model/multimodal/qwen_vl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xinference/model/multimodal/qwen_vl.py b/xinference/model/multimodal/qwen_vl.py index 38cbb6a75f..77adb88d5a 100644 --- a/xinference/model/multimodal/qwen_vl.py +++ b/xinference/model/multimodal/qwen_vl.py @@ -70,7 +70,7 @@ def chat( prompt = sorted(prompt, key=operator.itemgetter("type")) prompt = self._tokenizer.from_list_format(prompt) response, history = self._model.chat( - self._tokenizer, query=prompt, chat_history=chat_history + self._tokenizer, query=prompt, history=chat_history ) if "" in response: image = self._tokenizer.draw_bbox_on_latest_picture(response, history) From d663b3cf6fe96cab9db8d31dfc6b8376070ced6d Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Wed, 27 Dec 2023 14:18:32 +0800 Subject: [PATCH 08/20] Add ut --- .../model/multimodal/tests/test_multimodal.py | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/xinference/model/multimodal/tests/test_multimodal.py b/xinference/model/multimodal/tests/test_multimodal.py index 5caf58778a..859dcba56c 100644 --- a/xinference/model/multimodal/tests/test_multimodal.py +++ b/xinference/model/multimodal/tests/test_multimodal.py @@ -50,3 +50,24 @@ def test_restful_api_for_multimodal(setup): ], ) print(completion) + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "这是什么?"}, + { + "type": "image_url", + "image_url": { + "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg", + }, + }, + ], + } + ] + completion = client.chat.completions.create(model=model_uid, messages=messages) + print(completion) + messages.append(completion.choices[0].message.model_dump()) + messages.append({"role": "user", "content": "框出图中击掌的位置"}) + print(messages) + completion = client.chat.completions.create(model=model_uid, messages=messages) + print(completion) From 8c246a4ddf9fabfdee5daa6308819941d9cafd07 Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Wed, 27 Dec 2023 14:27:01 +0800 Subject: [PATCH 09/20] Fix history --- xinference/model/multimodal/qwen_vl.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/xinference/model/multimodal/qwen_vl.py b/xinference/model/multimodal/qwen_vl.py index 77adb88d5a..88b077ef99 100644 --- a/xinference/model/multimodal/qwen_vl.py +++ b/xinference/model/multimodal/qwen_vl.py @@ -53,6 +53,18 @@ def load(self): code_revision=self.model_spec.model_revision, ) + def _message_content_to_qwen(self, content): + if not isinstance(content, str): + content = [ + {"image": c["image_url"]["url"], "type": "image"} + if c.get("type") == "image_url" + else c + for c in content + ] + content = sorted(content, 
key=operator.itemgetter("type")) + return self._tokenizer.from_list_format(content) + return content + def chat( self, prompt: Union[str, List[Dict]], @@ -60,15 +72,9 @@ def chat( chat_history: Optional[List[Dict]] = None, generate_config: Optional[Dict] = None, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: - if not isinstance(prompt, str): - prompt = [ - {"image": p["image_url"]["url"], "type": "image"} - if p.get("type") == "image_url" - else p - for p in prompt - ] - prompt = sorted(prompt, key=operator.itemgetter("type")) - prompt = self._tokenizer.from_list_format(prompt) + prompt = self._message_content_to_qwen(prompt) + for h in chat_history: + h["content"] = self._message_content_to_qwen(h) response, history = self._model.chat( self._tokenizer, query=prompt, history=chat_history ) From fa3a11008089bff04c18900b93568ad35561bf2f Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Wed, 27 Dec 2023 14:34:27 +0800 Subject: [PATCH 10/20] Fix --- xinference/model/multimodal/qwen_vl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xinference/model/multimodal/qwen_vl.py b/xinference/model/multimodal/qwen_vl.py index 88b077ef99..3f7be7240e 100644 --- a/xinference/model/multimodal/qwen_vl.py +++ b/xinference/model/multimodal/qwen_vl.py @@ -74,7 +74,7 @@ def chat( ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: prompt = self._message_content_to_qwen(prompt) for h in chat_history: - h["content"] = self._message_content_to_qwen(h) + h["content"] = self._message_content_to_qwen(h["content"]) response, history = self._model.chat( self._tokenizer, query=prompt, history=chat_history ) From 411a696f0154d2c362ef6aecd3e7eacaf6203f1f Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Wed, 27 Dec 2023 14:44:10 +0800 Subject: [PATCH 11/20] Fix --- xinference/model/multimodal/qwen_vl.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/xinference/model/multimodal/qwen_vl.py b/xinference/model/multimodal/qwen_vl.py index 3f7be7240e..8466bfd815 100644 --- a/xinference/model/multimodal/qwen_vl.py +++ b/xinference/model/multimodal/qwen_vl.py @@ -73,10 +73,21 @@ def chat( generate_config: Optional[Dict] = None, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: prompt = self._message_content_to_qwen(prompt) + # Convert openai history to qwen vl history + qwen_history = [] + query_to_response = [] for h in chat_history: - h["content"] = self._message_content_to_qwen(h["content"]) + role = h["role"] + content = self._message_content_to_qwen(h["content"]) + if len(query_to_response) == 0 and role == "user": + query_to_response.append(content) + if len(query_to_response) == 1 and role == "assistant": + query_to_response.append(content) + if len(query_to_response) == 2: + qwen_history.append(query_to_response) + query_to_response = [] response, history = self._model.chat( - self._tokenizer, query=prompt, history=chat_history + self._tokenizer, query=prompt, history=qwen_history ) if "" in response: image = self._tokenizer.draw_bbox_on_latest_picture(response, history) From a293d77bb3a974ecc5c3ce1a6a01d4cf10404a06 Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Wed, 27 Dec 2023 15:02:24 +0800 Subject: [PATCH 12/20] Remove render bounding box --- xinference/model/multimodal/qwen_vl.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/xinference/model/multimodal/qwen_vl.py b/xinference/model/multimodal/qwen_vl.py index 8466bfd815..92a97fbd5b 100644 --- a/xinference/model/multimodal/qwen_vl.py +++ 
b/xinference/model/multimodal/qwen_vl.py @@ -1,5 +1,4 @@ import operator -import tempfile import time import uuid from typing import Dict, Iterator, List, Optional, Union @@ -89,14 +88,6 @@ def chat( response, history = self._model.chat( self._tokenizer, query=prompt, history=qwen_history ) - if "" in response: - image = self._tokenizer.draw_bbox_on_latest_picture(response, history) - if image: - with tempfile.NamedTemporaryFile( - suffix=".jpg", delete_on_close=False - ) as output: - image.save(output) - response = output.name return ChatCompletion( id="chat" + str(uuid.uuid1()), object="chat.completion", From 4cd65eaeb95bdc472adb0d38bcf1f7498d979f18 Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Wed, 27 Dec 2023 15:12:57 +0800 Subject: [PATCH 13/20] Fix --- xinference/model/multimodal/core.py | 4 ++-- xinference/model/multimodal/qwen_vl.py | 6 +++--- .../model/multimodal/tests/test_multimodal.py | 16 ++++++++++------ 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/xinference/model/multimodal/core.py b/xinference/model/multimodal/core.py index c153ff9070..30fdf31c57 100644 --- a/xinference/model/multimodal/core.py +++ b/xinference/model/multimodal/core.py @@ -295,14 +295,14 @@ def create_multimodal_model_instance( def match_cls( model_family: LVLMFamilyV1, model_spec: "LVLMSpecV1", quantization: str -) -> Optional[Type[LVLM]]: +) -> Type[LVLM]: """ Find an LLM implementation for given LLM family and spec. """ for cls in MODEL_CLASSES: if cls.match(model_family, model_spec, quantization): return cls - return None + raise Exception(f"Model {model_family.model_name} is not supported") def _get_cache_dir( diff --git a/xinference/model/multimodal/qwen_vl.py b/xinference/model/multimodal/qwen_vl.py index 92a97fbd5b..69b7d14862 100644 --- a/xinference/model/multimodal/qwen_vl.py +++ b/xinference/model/multimodal/qwen_vl.py @@ -52,7 +52,7 @@ def load(self): code_revision=self.model_spec.model_revision, ) - def _message_content_to_qwen(self, content): + def _message_content_to_qwen(self, content) -> str: if not isinstance(content, str): content = [ {"image": c["image_url"]["url"], "type": "image"} @@ -74,8 +74,8 @@ def chat( prompt = self._message_content_to_qwen(prompt) # Convert openai history to qwen vl history qwen_history = [] - query_to_response = [] - for h in chat_history: + query_to_response: List = [] + for h in chat_history or []: role = h["role"] content = self._message_content_to_qwen(h["content"]) if len(query_to_response) == 0 and role == "user": diff --git a/xinference/model/multimodal/tests/test_multimodal.py b/xinference/model/multimodal/tests/test_multimodal.py index 859dcba56c..5176131019 100644 --- a/xinference/model/multimodal/tests/test_multimodal.py +++ b/xinference/model/multimodal/tests/test_multimodal.py @@ -13,7 +13,7 @@ # limitations under the License. 
-def test_restful_api_for_multimodal(setup): +def test_restful_api_for_qwen_vl(setup): endpoint, _ = setup from ....client import Client @@ -26,7 +26,6 @@ def test_restful_api_for_multimodal(setup): device="cpu", ) model = client.get_model(model_uid) - print(model) # openai client import openai @@ -49,7 +48,9 @@ def test_restful_api_for_multimodal(setup): } ], ) - print(completion) + assert "grass" in completion.choices[0].message.content + assert "tree" in completion.choices[0].message.content + assert "sky" in completion.choices[0].message.content messages = [ { "role": "user", @@ -65,9 +66,12 @@ def test_restful_api_for_multimodal(setup): } ] completion = client.chat.completions.create(model=model_uid, messages=messages) - print(completion) + assert "女" in completion.choices[0].message.content + assert "狗" in completion.choices[0].message.content + assert "沙滩" in completion.choices[0].message.content messages.append(completion.choices[0].message.model_dump()) messages.append({"role": "user", "content": "框出图中击掌的位置"}) - print(messages) completion = client.chat.completions.create(model=model_uid, messages=messages) - print(completion) + assert "击掌" in completion.choices[0].message.content + assert "" in completion.choices[0].message.content + assert "" in completion.choices[0].message.content From 92fde203decf33feecb315e7a61dbd0c9ea965cf Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Wed, 27 Dec 2023 15:16:09 +0800 Subject: [PATCH 14/20] Clean code --- xinference/model/multimodal/core.py | 39 +---------------------------- 1 file changed, 1 insertion(+), 38 deletions(-) diff --git a/xinference/model/multimodal/core.py b/xinference/model/multimodal/core.py index 30fdf31c57..26d8a8d9da 100644 --- a/xinference/model/multimodal/core.py +++ b/xinference/model/multimodal/core.py @@ -100,7 +100,7 @@ def __init__( def to_dict(self): return { - "model_type": "LLM", + "model_type": "LVLM", "address": self.address, "accelerators": self.devices, "model_name": self._model_family.model_name, @@ -136,43 +136,6 @@ def __init__( self.kwargs = kwargs logger.info("Init model %s with kwargs: %s", self.model_uid, kwargs) - @staticmethod - def handle_model_size(model_size_in_billions: Union[str, int]) -> Union[int, float]: - if isinstance(model_size_in_billions, str): - if "_" in model_size_in_billions: - ms = model_size_in_billions.replace("_", ".") - return float(ms) - else: - raise ValueError("Invalid format for `model_size_in_billions`") - return model_size_in_billions - - @staticmethod - def _is_darwin_and_apple_silicon(): - return platform.system() == "Darwin" and platform.processor() == "arm" - - @staticmethod - def _is_linux(): - return platform.system() == "Linux" - - @staticmethod - def _has_cuda_device(): - from ...utils import cuda_count - - return cuda_count() > 0 - - @staticmethod - def _get_cuda_count(): - from ...utils import cuda_count - - cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES", None) - if cuda_visible_devices is None: - return cuda_count() - - if cuda_visible_devices == "-1": - return 0 - else: - return len(cuda_visible_devices.split(",")) - @abstractmethod def load(self): raise NotImplementedError From 4b262a1a75153a50ee3fe6c2cd8bceba3f442fb8 Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Wed, 27 Dec 2023 15:24:28 +0800 Subject: [PATCH 15/20] Fix --- xinference/model/multimodal/core.py | 4 ---- xinference/model/multimodal/model_spec.json | 5 +---- xinference/model/multimodal/tests/test_multimodal.py | 1 + 3 files changed, 2 insertions(+), 8 deletions(-) diff --git 
a/xinference/model/multimodal/core.py b/xinference/model/multimodal/core.py index 26d8a8d9da..88c7eb3331 100644 --- a/xinference/model/multimodal/core.py +++ b/xinference/model/multimodal/core.py @@ -15,7 +15,6 @@ import abc import logging import os -import platform from abc import abstractmethod from collections import defaultdict from typing import Dict, Iterator, List, Literal, Optional, Tuple, Type, Union @@ -68,9 +67,6 @@ class LVLMPromptStyleV1(BaseModel): style_name: str system_prompt: str = "" roles: List[str] - image_formatter: str = "" - text_formatter: str = "" - sep: str = "" class LVLMFamilyV1(BaseModel): diff --git a/xinference/model/multimodal/model_spec.json b/xinference/model/multimodal/model_spec.json index e279441b3c..07af7f2f19 100644 --- a/xinference/model/multimodal/model_spec.json +++ b/xinference/model/multimodal/model_spec.json @@ -28,10 +28,7 @@ "roles": [ "user", "assistant" - ], - "image_formatter": "Picture {idx}: {image}", - "text_formatter": "{text}", - "sep": "\n" + ] } } ] diff --git a/xinference/model/multimodal/tests/test_multimodal.py b/xinference/model/multimodal/tests/test_multimodal.py index 5176131019..cc1d0ae234 100644 --- a/xinference/model/multimodal/tests/test_multimodal.py +++ b/xinference/model/multimodal/tests/test_multimodal.py @@ -26,6 +26,7 @@ def test_restful_api_for_qwen_vl(setup): device="cpu", ) model = client.get_model(model_uid) + assert model # openai client import openai From 3fcb452ef0ad80f4b4835adfe41c93a9a0ad4746 Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Wed, 27 Dec 2023 15:42:30 +0800 Subject: [PATCH 16/20] Remove files --- create_test_data.py | 0 qwen_vl_demo.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 create_test_data.py delete mode 100644 qwen_vl_demo.py diff --git a/create_test_data.py b/create_test_data.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/qwen_vl_demo.py b/qwen_vl_demo.py deleted file mode 100644 index e69de29bb2..0000000000 From f9f88c8308a964d9f03b75effd5648eb387e4c23 Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Wed, 27 Dec 2023 15:44:02 +0800 Subject: [PATCH 17/20] Add copyright --- xinference/model/multimodal/qwen_vl.py | 14 ++++++++++++++ xinference/model/multimodal/tests/__init__.py | 13 +++++++++++++ 2 files changed, 27 insertions(+) diff --git a/xinference/model/multimodal/qwen_vl.py b/xinference/model/multimodal/qwen_vl.py index 69b7d14862..55e29fe182 100644 --- a/xinference/model/multimodal/qwen_vl.py +++ b/xinference/model/multimodal/qwen_vl.py @@ -1,3 +1,17 @@ +# Copyright 2022-2023 XProbe Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import operator import time import uuid diff --git a/xinference/model/multimodal/tests/__init__.py b/xinference/model/multimodal/tests/__init__.py index e69de29bb2..37f6558d95 100644 --- a/xinference/model/multimodal/tests/__init__.py +++ b/xinference/model/multimodal/tests/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022-2023 XProbe Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. From 0f1f6f95995cf116d1a67a7394b7bdeb024fd10a Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Wed, 27 Dec 2023 15:46:03 +0800 Subject: [PATCH 18/20] Fix --- xinference/model/multimodal/core.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/xinference/model/multimodal/core.py b/xinference/model/multimodal/core.py index 88c7eb3331..678c8583b1 100644 --- a/xinference/model/multimodal/core.py +++ b/xinference/model/multimodal/core.py @@ -164,7 +164,7 @@ def match_multimodal( quantization: Optional[str] = None, ) -> Optional[Tuple[LVLMFamilyV1, LVLMSpecV1, str]]: """ - Find an LLM family, spec, and quantization that satisfy given criteria. + Find an multimodal family, spec, and quantization that satisfy given criteria. """ def _match_quantization(q: Union[str, None], quantizations: List[str]): @@ -256,7 +256,7 @@ def match_cls( model_family: LVLMFamilyV1, model_spec: "LVLMSpecV1", quantization: str ) -> Type[LVLM]: """ - Find an LLM implementation for given LLM family and spec. + Find an multimodal implementation for given multimodal family and spec. """ for cls in MODEL_CLASSES: if cls.match(model_family, model_spec, quantization): @@ -440,18 +440,18 @@ def cache_from_huggingface( def cache( - llm_family: LVLMFamilyV1, - llm_spec: "LVLMSpecV1", + model_family: LVLMFamilyV1, + model_spec: "LVLMSpecV1", quantization: Optional[str] = None, ) -> str: - if llm_spec.model_hub == "huggingface": - logger.info(f"Caching from Hugging Face: {llm_spec.model_id}") - return cache_from_huggingface(llm_family, llm_spec, quantization) - elif llm_spec.model_hub == "modelscope": - logger.info(f"Caching from Modelscope: {llm_spec.model_id}") - return cache_from_modelscope(llm_family, llm_spec, quantization) + if model_spec.model_hub == "huggingface": + logger.info(f"Caching from Hugging Face: {model_spec.model_id}") + return cache_from_huggingface(model_family, model_spec, quantization) + elif model_spec.model_hub == "modelscope": + logger.info(f"Caching from Modelscope: {model_spec.model_id}") + return cache_from_modelscope(model_family, model_spec, quantization) else: - raise ValueError(f"Unknown model hub: {llm_spec.model_hub}") + raise ValueError(f"Unknown model hub: {model_spec.model_hub}") def get_cache_status( From b9ed91b34af667f063a4bbd7928816e4dd2ffde7 Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Wed, 27 Dec 2023 17:29:18 +0800 Subject: [PATCH 19/20] Skip ut --- xinference/model/multimodal/tests/test_multimodal.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xinference/model/multimodal/tests/test_multimodal.py b/xinference/model/multimodal/tests/test_multimodal.py index cc1d0ae234..38317049b8 100644 --- a/xinference/model/multimodal/tests/test_multimodal.py +++ b/xinference/model/multimodal/tests/test_multimodal.py @@ -11,8 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. +import pytest +@pytest.mark.skip(reason="Cost too many resources.") def test_restful_api_for_qwen_vl(setup): endpoint, _ = setup from ....client import Client From 9da800671f573084dfdb572ccf929d7a6edcdeb2 Mon Sep 17 00:00:00 2001 From: codingl2k1 Date: Fri, 29 Dec 2023 11:19:23 +0800 Subject: [PATCH 20/20] Fix --- xinference/core/supervisor.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/xinference/core/supervisor.py b/xinference/core/supervisor.py index 480e879627..54447597d9 100644 --- a/xinference/core/supervisor.py +++ b/xinference/core/supervisor.py @@ -357,6 +357,13 @@ def get_model_registration(self, model_type: str, model_name: str) -> Any: if f.model_name == model_name: return f raise ValueError(f"Model {model_name} not found") + elif model_type == "multimodal": + from ..model.multimodal import BUILTIN_LVLM_FAMILIES + + for f in BUILTIN_LVLM_FAMILIES: + if f.model_name == model_name: + return f + raise ValueError(f"Model {model_name} not found") else: raise ValueError(f"Unsupported model type: {model_type}")
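
For reference, the device checks introduced alongside the multimodal core reduce to one small helper. The sketch below is a standalone restatement, not the exact code from the series: the function name select_device is hypothetical, since the original signature sits above the quoted hunk.

# Minimal sketch of the "auto"/cuda/mps/cpu selection shown in the patches above.
# `select_device` is a hypothetical name; the series implements the same checks
# inside the multimodal model utilities.
def select_device(device: str = "auto") -> str:
    try:
        import torch
    except ImportError:
        raise ImportError(
            "Failed to import 'torch'. Please make sure 'torch' is installed."
        )

    if device == "auto":
        # When CUDA_VISIBLE_DEVICES=-1, torch.cuda.is_available() returns False,
        # so "auto" falls back to MPS on Apple silicon and finally to CPU.
        if torch.cuda.is_available():
            return "cuda"
        if torch.backends.mps.is_available():
            return "mps"
        return "cpu"
    if device == "cuda" and not torch.cuda.is_available():
        raise ValueError("cuda is unavailable in your environment")
    if device == "mps" and not torch.backends.mps.is_available():
        raise ValueError("mps is unavailable in your environment")
    if device not in ("cuda", "mps", "cpu"):
        raise ValueError(f"Device {device} is not supported")
    return device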
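
The history handling in patches 09 through 11 is the subtle part of the series: the RESTful API receives OpenAI-style messages (a role plus a content list), while Qwen-VL's model.chat() expects a flat prompt string and a history of (query, response) pairs. The sketch below restates that conversion as free functions under a few assumptions: the names message_content_to_qwen and openai_history_to_qwen are hypothetical, and the tokenizer.from_list_format() call that turns the Qwen list format into the final prompt string is omitted because it needs the loaded model's tokenizer.

import operator
from typing import Dict, List, Tuple, Union


def message_content_to_qwen(content: Union[str, List[Dict]]) -> Union[str, List[Dict]]:
    """Map OpenAI content parts to Qwen-VL list format, images sorted first."""
    if isinstance(content, str):
        return content
    converted = [
        {"image": part["image_url"]["url"], "type": "image"}
        if part.get("type") == "image_url"
        else part
        for part in content
    ]
    # Sorting by "type" places "image" entries before "text" entries, which is
    # the order the patches rely on before calling from_list_format().
    return sorted(converted, key=operator.itemgetter("type"))


def openai_history_to_qwen(chat_history: List[Dict]) -> List[Tuple]:
    """Pair consecutive user/assistant messages into (query, response) tuples."""
    qwen_history: List[Tuple] = []
    pending: List = []
    for message in chat_history or []:
        content = message_content_to_qwen(message["content"])
        if not pending and message["role"] == "user":
            pending.append(content)
        elif len(pending) == 1 and message["role"] == "assistant":
            pending.append(content)
        if len(pending) == 2:
            # The patch appends two-element lists; tuples work equally well here.
            qwen_history.append(tuple(pending))
            pending = []
    return qwen_history


if __name__ == "__main__":
    history = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What is in this picture?"},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
                    },
                },
            ],
        },
        {"role": "assistant", "content": "A woman and a dog on a beach."},
    ]
    print(openai_history_to_qwen(history))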