diff --git a/xinference/model/llm/pytorch/intern_vl.py b/xinference/model/llm/pytorch/intern_vl.py
index b76899b0be..661987df3a 100644
--- a/xinference/model/llm/pytorch/intern_vl.py
+++ b/xinference/model/llm/pytorch/intern_vl.py
@@ -21,9 +21,7 @@
 
 import requests
 import torch
-import torchvision.transforms as T
 from PIL import Image
-from torchvision.transforms.functional import InterpolationMode
 
 from ....model.utils import select_device
 from ....types import (
@@ -205,47 +203,6 @@ def _image_to_piexl_values(images):
             history.append(tuple(tmp))
         return history, pixel_values
 
-        def _load_image(_url):
-            if _url.startswith("data:"):
-                logging.info("Parse url by base64 decoder.")
-                # https://platform.openai.com/docs/guides/vision/uploading-base-64-encoded-images
-                # e.g. f"data:image/jpeg;base64,{base64_image}"
-                _type, data = _url.split(";")
-                _, ext = _type.split("/")
-                data = data[len("base64,") :]
-                data = base64.b64decode(data.encode("utf-8"))
-
-                return Image.open(BytesIO(data)).convert("RGB")
-            else:
-                try:
-                    response = requests.get(_url)
-                except requests.exceptions.MissingSchema:
-                    return Image.open(_url).convert("RGB")
-                else:
-                    return Image.open(BytesIO(response.content)).convert("RGB")
-
-        if not isinstance(content, str):
-            texts = []
-            image_urls = []
-            for c in content:
-                c_type = c.get("type")
-                if c_type == "text":
-                    texts.append(c["text"])
-                elif c_type == "image_url":
-                    image_urls.append(c["image_url"]["url"])
-            image_futures = []
-            with ThreadPoolExecutor() as executor:
-                for image_url in image_urls:
-                    fut = executor.submit(_load_image, image_url)
-                    image_futures.append(fut)
-            images = [fut.result() for fut in image_futures]
-            text = " ".join(texts)
-            if len(images) == 0:
-                return text
-            else:
-                return text, images
-        return content
-
     def _find_closest_aspect_ratio(
         self, aspect_ratio, target_ratios, width, height, image_size
     ):
@@ -309,6 +266,9 @@ def _dynamic_preprocess(
         return processed_images
 
     def _build_transform(self, input_size):
+        import torchvision.transforms as T
+        from torchvision.transforms.functional import InterpolationMode
+
         MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
         transform = T.Compose(
             [