From 33791ce2a85593e92f1349f03090a186cb1a4f7c Mon Sep 17 00:00:00 2001 From: ZX-ModelCloud <165115237+ZX-ModelCloud@users.noreply.github.com> Date: Fri, 20 Dec 2024 18:26:26 +0800 Subject: [PATCH] Add `modality` to `BaseModel` (#937) * base model added "modality" field * ovis add "modality = [MODALITY.IMAGE_TO_TEXT]" * qwen2_vl add "modality = [MODALITY.TEXT, MODALITY.IMAGE_TO_TEXT]" * fix test * cleanup * cleanup * change quant_override_files hint type to: Dict[str, Union[str | Dict[str, Any]]] * cleanup --- gptqmodel/models/base.py | 59 +++++--- gptqmodel/models/definitions/ovis.py | 68 +++++++++- gptqmodel/models/definitions/qwen2_vl.py | 69 +++++++++- gptqmodel/utils/calibration.py | 12 ++ gptqmodel/utils/image.py | 27 ++++ gptqmodel/utils/model.py | 6 + tests/models/model_test.py | 34 +++-- tests/models/ovis/__init__.py | 0 tests/models/ovis/image_to_test_dataset.py | 52 ++++++++ tests/models/ovis/ovis_calibration_dataset.py | 126 ------------------ tests/models/test_ovis_1_6_llama.py | 79 +++++------ tests/models/test_qwen2_vl.py | 54 +++++++- 12 files changed, 385 insertions(+), 201 deletions(-) create mode 100644 gptqmodel/utils/calibration.py create mode 100644 gptqmodel/utils/image.py create mode 100644 tests/models/ovis/__init__.py create mode 100644 tests/models/ovis/image_to_test_dataset.py delete mode 100644 tests/models/ovis/ovis_calibration_dataset.py diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index fb1e934d..c5de0c65 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -1,9 +1,10 @@ from __future__ import annotations import copy +import json import os import shutil -from typing import Dict, List, Optional, Union +from typing import Dict, List, Optional, Union, Any import accelerate import torch @@ -30,6 +31,7 @@ nested_move_to, pack_model, simple_dispatch_model, + MODALITY, ) from ..utils.progress import ProgressBar from ..utils.torch import torch_empty_cache @@ -87,6 +89,10 @@ class BaseGPTQModel(nn.Module): supports_desc_act = [True, False] + modality: List[MODALITY] = [MODALITY.TEXT] + + quant_override_files: Dict[str, Union[str | Dict[str, Any]]] = {} + def __init__( self, model: PreTrainedModel, @@ -124,7 +130,7 @@ def quantized(self): def hf_device_map(self): return getattr(self.model, "hf_device_map", None) - def _prepare_dataset_for_quantization( + def prepare_dataset( self, calibration_dataset: List[Dict[str, Union[List[int], torch.LongTensor]]], batch_size: int = 1, @@ -265,13 +271,30 @@ def quantize( if BITBLAS_AVAILABLE is False: raise ValueError(BITBLAS_INSTALL_HINT) + + device_map = self.hf_device_map + if device_map: + for name, device in device_map.items(): + if device == "cpu" and best_device != CPU: + logger.info(f"truly offloading {name} to cpu with hook.") + module = get_module_by_name_suffix(self.model, name) + remove_hook_from_module(module, recurse=True) + accelerate.cpu_offload_with_hook(module, best_device) + + calibration_dataset = self.prepare_dataset(calibration_dataset, batch_size, tokenizer,) + # Calculate the average length of the average input_ids total_input_ids_length = 0 max_input_id_length = 0 for row in calibration_dataset: input_ids = row["input_ids"] if isinstance(input_ids, torch.Tensor): - input_ids_length = input_ids.numel() + if input_ids.dim() <= 2: + input_ids_length = input_ids.shape[-1] + else: + raise ValueError( + "Expected a 1-dimensional tensor or 2-dimensional tensor for 'input_ids', but got a tensor with {0} dimensions.".format( + input_ids.dim())) else: input_ids_length = 
len(input_ids) @@ -284,17 +307,6 @@ def quantize( logger.warning(f"The average length of input_ids of calibration_dataset should be greater than " f"{min_calibration_dataset_input_ids_avg_length}: actual avg: {avg}.") - device_map = self.hf_device_map - if device_map: - for name, device in device_map.items(): - if device == "cpu" and best_device != CPU: - logger.info(f"truly offloading {name} to cpu with hook.") - module = get_module_by_name_suffix(self.model, name) - remove_hook_from_module(module, recurse=True) - accelerate.cpu_offload_with_hook(module, best_device) - - calibration_dataset = self._prepare_dataset_for_quantization(calibration_dataset, batch_size, tokenizer,) - if isinstance(self.quantize_config, AutoRoundQuantizeConfig): from auto_round import AutoRound from auto_round import __version__ as auto_round_version @@ -760,14 +772,25 @@ def save( meta_quantizer: Optional[str] = None, **kwargs, ): - preprocessor_config_path = os.path.join(self.model_id_or_path, "preprocessor_config.json") - if os.path.exists(preprocessor_config_path): - os.makedirs(save_dir, exist_ok=True) + extra_json_file_names = ["preprocessor_config.json", "chat_template.json"] + for name in extra_json_file_names: + json_path = os.path.join(self.model_id_or_path, name) + if os.path.exists(json_path): + os.makedirs(save_dir, exist_ok=True) - shutil.copyfile(preprocessor_config_path, os.path.join(save_dir, "preprocessor_config.json")) + shutil.copyfile(json_path, os.path.join(save_dir, name)) if self.quantized: self.save_quantized(save_dir, safetensors_metadata, max_shard_size, meta_quantizer) + + # overwrite quant_override_files + for name, value in self.quant_override_files.items(): + json_path = os.path.join(save_dir, name) + with open(json_path, "w", encoding="utf-8") as f: + if isinstance(value, str): + f.write(value) + else: + f.write(json.dumps(value)) else: self.save_pretrained(save_dir, **kwargs) diff --git a/gptqmodel/models/definitions/ovis.py b/gptqmodel/models/definitions/ovis.py index b4baf43e..c23492c3 100644 --- a/gptqmodel/models/definitions/ovis.py +++ b/gptqmodel/models/definitions/ovis.py @@ -1,3 +1,10 @@ +import copy +import logging +from typing import Dict + +from ...utils.calibration import batched +from ...utils.image import fetch_image +from ...utils.model import MODALITY import torch from ..base import BaseGPTQModel @@ -15,13 +22,68 @@ class OvisGPTQ(BaseGPTQModel): ["mlp.down_proj"], ] - # hack so one can prepare examples outside - def _prepare_dataset_for_quantization( + modality = [MODALITY.IMAGE_TO_TEXT] + + IGNORE_ID = -100 + + def preprocess_dataset(self, sample: Dict) -> Dict: + text_max_length = 832 + conversations = copy.deepcopy(sample["conversations"]) + images = [fetch_image(sample)] + max_partition = 9 + + prompt, input_ids, pixel_values, labels = self.model.preprocess_inputs( + conversations, + images, + max_partition=max_partition, + generation_preface=None, + return_labels=True, + propagate_exception=False + ) + + if pixel_values is None: + pixel_values, _ = self.visual_tokenizer.mock_input() + + input_ids = input_ids[:text_max_length] + labels = labels[:text_max_length] + + return { + "pixel_values": pixel_values, + "input_ids": input_ids, + "labels": labels, + } + + def prepare_dataset( self, calibration_dataset, batch_size: int = 1, tokenizer=None, ): - return calibration_dataset + calib_data = [] + for batch in batched(calibration_dataset, batch_size, self.preprocess_dataset): + pixel_values, input_ids, labels = tuple([instance[key] for instance in batch] + for key 
in ("pixel_values", "input_ids", "labels")) + input_ids = torch.nn.utils.rnn.pad_sequence( + input_ids, + batch_first=True, + padding_value=self.text_tokenizer.pad_token_id) + attention_mask = torch.ne(input_ids, self.text_tokenizer.pad_token_id) + labels = torch.nn.utils.rnn.pad_sequence( + labels, + batch_first=True, + padding_value=self.IGNORE_ID) + + num_valid_label = torch.not_equal(labels, self.IGNORE_ID).sum().item() + if num_valid_label == 0: + logging.warning( + f'[DataCollatorForMultimodalDatasetGPTQ] All labels are ignored, may causing training instability\n{input_ids=}\n{attention_mask=}\n{labels=}') + calib_data.append({ + "input_ids": input_ids, + "attention_mask": attention_mask, + "labels": labels, + "pixel_values": pixel_values, + }) + + return calib_data def generate(self, inputs, **kwargs): """shortcut for model.generate""" diff --git a/gptqmodel/models/definitions/qwen2_vl.py b/gptqmodel/models/definitions/qwen2_vl.py index 85dc6f25..e00046b4 100644 --- a/gptqmodel/models/definitions/qwen2_vl.py +++ b/gptqmodel/models/definitions/qwen2_vl.py @@ -1,6 +1,12 @@ -from transformers import AutoModelForVision2Seq +from typing import Dict + +from qwen_vl_utils import process_vision_info + +from transformers import AutoModelForVision2Seq, Qwen2VLProcessor from ..base import BaseGPTQModel +from ...utils.calibration import batched +from ...utils.model import MODALITY class Qwen2VLGPTQ(BaseGPTQModel): @@ -16,3 +22,64 @@ class Qwen2VLGPTQ(BaseGPTQModel): ["mlp.up_proj", "mlp.gate_proj"], ["mlp.down_proj"], ] + + modality = [MODALITY.TEXT, MODALITY.IMAGE_TO_TEXT] + + quant_override_files = { + "preprocessor_config.json": { + "do_convert_rgb": True, + "do_normalize": True, + "do_rescale": True, + "do_resize": True, + "image_mean": [ + 0.48145466, + 0.4578275, + 0.40821073 + ], + "image_processor_type": "Qwen2VLImageProcessor", + "image_std": [ + 0.26862954, + 0.26130258, + 0.27577711 + ], + "max_pixels": 1003520, + "merge_size": 2, + "min_pixels": 3136, + "patch_size": 14, + "processor_class": "Qwen2VLProcessor", + "resample": 3, + "rescale_factor": 0.00392156862745098, + "size": { + "max_pixels": 1003520, + "min_pixels": 3136 + }, + "temporal_patch_size": 2, + "vision_token_id": 151654 + } + } + + def preprocess_dataset(self, sample: Dict) -> Dict: + return sample + + def prepare_dataset( + self, + calibration_dataset, + batch_size: int = 1, + tokenizer=None, ): + processor = Qwen2VLProcessor.from_pretrained(self.model_id_or_path) + calib_data = [] + for batch in batched(calibration_dataset, batch_size, process_func=self.preprocess_dataset): + text = processor.apply_chat_template( + batch, tokenize=False, add_generation_prompt=True + ) + image_inputs, video_inputs = process_vision_info(batch) + inputs = processor( + text=text, + images=image_inputs, + videos=video_inputs, + padding=True, + return_tensors="pt", + ) + calib_data.append(inputs) + del processor + return calib_data diff --git a/gptqmodel/utils/calibration.py b/gptqmodel/utils/calibration.py new file mode 100644 index 00000000..c4b4fc94 --- /dev/null +++ b/gptqmodel/utils/calibration.py @@ -0,0 +1,12 @@ +def batched(iterable, n: int, process_func): + # batched('ABCDEFG', 3) → ABC DEF G + assert n >= 1, "batch size must be at least one" + from itertools import islice + + iterator = iter(iterable) + + while batch := tuple(islice(iterator, n)): + if process_func is None: + yield batch + else: + yield [process_func(item) for item in batch] diff --git a/gptqmodel/utils/image.py b/gptqmodel/utils/image.py new file mode 100644 
index 00000000..356213a2 --- /dev/null +++ b/gptqmodel/utils/image.py @@ -0,0 +1,27 @@ +from PIL import Image +from io import BytesIO +import requests +import base64 + +def fetch_image(ele: dict[str, str | Image.Image]) -> Image.Image: + if "image" in ele: + image = ele["image"] + else: + image = ele["image_url"] + image_obj = None + if isinstance(image, Image.Image): + image_obj = image + elif image.startswith("http://") or image.startswith("https://"): + image_obj = Image.open(requests.get(image, stream=True).raw) + elif image.startswith("file://"): + image_obj = Image.open(image[7:]) + elif image.startswith("data:image"): + if "base64," in image: + _, base64_data = image.split("base64,", 1) + data = base64.b64decode(base64_data) + image_obj = Image.open(BytesIO(data)) + else: + image_obj = Image.open(image) + if image_obj is None: + raise ValueError(f"Unrecognized image input, support local path, http url, base64 and PIL.Image, got {image}") + return image_obj \ No newline at end of file diff --git a/gptqmodel/utils/model.py b/gptqmodel/utils/model.py index 03f42c42..9a6d56e2 100644 --- a/gptqmodel/utils/model.py +++ b/gptqmodel/utils/model.py @@ -9,6 +9,7 @@ import shutil import sys from concurrent.futures import ThreadPoolExecutor +from enum import Enum from typing import Dict, List, Optional, Tuple, Type, Union import accelerate @@ -782,3 +783,8 @@ def check_requires_version(requires_version, current_version): return OPERATOR_MAP[op_symbol](current_version, required_version) else: return None + +class MODALITY(str, Enum): + TEXT = "text" + IMAGE_TO_TEXT = "image_to_text" + # TEXT_TO_IMAGE = "text_to_image" diff --git a/tests/models/model_test.py b/tests/models/model_test.py index 215e2be1..e4f16eb9 100644 --- a/tests/models/model_test.py +++ b/tests/models/model_test.py @@ -1,7 +1,9 @@ # -- do not touch +import gc import os import sys +from gptqmodel.utils.model import MODALITY if sys.platform == "darwin": os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" @@ -19,8 +21,8 @@ import torch.cuda # noqa: E402 from datasets import load_dataset # noqa: E402 from lm_eval.utils import make_table # noqa: E402 -from ovis.ovis_calibration_dataset import get_calib_dataset # noqa: E402 -from transformers import AutoTokenizer # noqa: E402 +from ovis.image_to_test_dataset import get_calib_dataset # noqa: E402 +from transformers import AutoTokenizer, AutoProcessor # noqa: E402 from gptqmodel import BACKEND, GPTQModel # noqa: E402 from gptqmodel.nn_modules.qlinear import BaseQuantLinear # noqa: E402 @@ -105,7 +107,7 @@ def check_kernel(self, model, expected_kernels): if expected_kernels: assert modules == expected_kernels, f"kernels are different with expected. found: {modules}. 
expected: {expected_kernels}" - def quantModel(self, model_id_or_path, trust_remote_code=False, torch_dtype="auto", need_eval=True, **kwargs): + def quantModel(self, model_id_or_path, trust_remote_code=False, torch_dtype="auto", need_eval=True, batch_size: int=4, **kwargs): quantize_config = QuantizeConfig( bits=4, group_size=128, @@ -125,8 +127,8 @@ def quantModel(self, model_id_or_path, trust_remote_code=False, torch_dtype="aut tokenizer = self.load_tokenizer(model_id_or_path, trust_remote_code=trust_remote_code) - is_ovis_model = "Ovis" in model_id_or_path - calibration_dataset = self.load_dataset(tokenizer) if not is_ovis_model else get_calib_dataset(model) + is_image_to_text_model = MODALITY.IMAGE_TO_TEXT in model.modality + calibration_dataset = get_calib_dataset(model) if is_image_to_text_model else self.load_dataset(tokenizer) # mpt model need if not model.config.pad_token_id: @@ -135,8 +137,12 @@ def quantModel(self, model_id_or_path, trust_remote_code=False, torch_dtype="aut model.config.eos_token_id = tokenizer.eos_token_id or 0 is_quantized = model.quantized + + # ovis cannot load processor + is_ovis_model = model.__class__.__name__ == "OvisGPTQ" + need_create_processor = is_image_to_text_model and not is_ovis_model if not is_quantized: - model.quantize(calibration_dataset, batch_size=4) + model.quantize(calibration_dataset, batch_size=batch_size) self.check_kernel(model, self.KERNEL_QUANT) @@ -144,13 +150,23 @@ def quantModel(self, model_id_or_path, trust_remote_code=False, torch_dtype="aut model.save(tmpdirname) tokenizer.save_pretrained(tmpdirname) q_model, q_tokenizer = self.loadQuantModel(tmpdirname, trust_remote_code=trust_remote_code) - + if need_create_processor: + processor = AutoProcessor.from_pretrained(tmpdirname) + else: + if need_create_processor: + processor = AutoProcessor.from_pretrained(model_id_or_path) if not is_quantized: del model torch_empty_cache() - return q_model, q_tokenizer + if need_create_processor: + return q_model, q_tokenizer, processor + else: + return q_model, q_tokenizer else: - return model, tokenizer + if need_create_processor: + return model, tokenizer, processor + else: + return model, tokenizer def loadQuantModel(self, model_id_or_path, trust_remote_code=False, tokenizer_path=None): if tokenizer_path is None: diff --git a/tests/models/ovis/__init__.py b/tests/models/ovis/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/models/ovis/image_to_test_dataset.py b/tests/models/ovis/image_to_test_dataset.py new file mode 100644 index 00000000..22645c4e --- /dev/null +++ b/tests/models/ovis/image_to_test_dataset.py @@ -0,0 +1,52 @@ +from gptqmodel.models import OvisGPTQ, Qwen2VLGPTQ + + +def format_ovis_dataset(image, assistant): + return { + "image": image, + "conversations": [ + { + "from": "human", + "value": f"\nWrite a detailed description of this image, do not forget about the texts on it if they exist. Also, do not forget to mention the type / style of the image. No bullet points. When writing descriptions, prioritize clarity and direct observation over embellishment or interpretation.\nDon't forget these rules:\n1. **Be Direct and Concise**: Provide straightforward descriptions without adding interpretative or speculative elements.\n2. **Use Segmented Details**: Break down details about different elements of an image into distinct sentences, focusing on one aspect at a time.\n3. **Maintain a Descriptive Focus**: Prioritize purely visible elements of the image, avoiding conclusions or inferences.\n4. 
**Follow a Logical Structure**: Begin with the central figure or subject and expand outward, detailing its appearance before addressing the surrounding setting.\n5. **Avoid Juxtaposition**: Do not use comparison or contrast language; keep the description purely factual.\n6. **Incorporate Specificity**: Mention age, gender, race, and specific brands or notable features when present, and clearly identify the medium if it's discernible." + }, + { + "from": "gpt", + "value": assistant + } + ] + } + + +def format_qwen2_vl_dataset(image, assistant): + return [ + { + "role": "user", + "content": [ + {"type": "image", "image": image}, + {"type": "text", "text": "generate a caption for this image"}, + ], + }, + {"role": "assistant", "content": assistant}, + ] + + +def prepare_dataset(format_func, n_sample: int = 20) -> list[list[dict]]: + from datasets import load_dataset + + dataset = load_dataset( + "laion/220k-GPT4Vision-captions-from-LIVIS", split=f"train[:{n_sample}]" + ) + return [ + format_func(sample["url"], sample["caption"]) + for sample in dataset + ] + + +def get_calib_dataset(model): + if isinstance(model, OvisGPTQ): + return prepare_dataset(format_ovis_dataset, n_sample=20) + + if isinstance(model, Qwen2VLGPTQ): + return prepare_dataset(format_qwen2_vl_dataset, n_sample=20) + + raise NotImplementedError(f"Unsupported MODEL: {model.__class__}") diff --git a/tests/models/ovis/ovis_calibration_dataset.py b/tests/models/ovis/ovis_calibration_dataset.py deleted file mode 100644 index 100caae2..00000000 --- a/tests/models/ovis/ovis_calibration_dataset.py +++ /dev/null @@ -1,126 +0,0 @@ -import copy -import logging -import os.path -from typing import Dict, List, Sequence, Union - -import torch -from PIL import Image -from torch.utils.data import DataLoader, Dataset - - -IGNORE_ID = -100 - -# prepare calibration samples -class CalibrationDataset(Dataset): - """ - Dataset class for calibration. Initialize with the loaded Ovis model, and a sample list in the following format: - data_list = [ - { - "image": "path/to/image/of/this/sample", - "conversations": [ - { - "from": "human", - "value": "\n[Your sample prompt]" - }, - { - "from": "gpt", - "value": "[Anything]" - } - ] - }, - ... 
- ] - """ - - def __init__(self, model, text_max_length, data_list: List[Dict]): - self.data = data_list - self.model = model - self.visual_tokenizer = model.get_visual_tokenizer() - self.text_max_length = text_max_length - - def __len__(self): - return len(self.data) - - def __getitem__(self, i: int) -> Dict[str, torch.Tensor]: - sample = self.data[i] - conversations = copy.deepcopy(sample["conversations"]) - images = [Image.open(os.path.join(self.model.model_id_or_path, f"images/{sample['id']}"))] - max_partition = 9 - - prompt, input_ids, pixel_values, labels = self.model.preprocess_inputs( - conversations, - images, - max_partition=max_partition, - generation_preface=None, - return_labels=True, - propagate_exception=False - ) - - if pixel_values is None: - pixel_values, _ = self.visual_tokenizer.mock_input() - - input_ids = input_ids[:self.text_max_length] - labels = labels[:self.text_max_length] - - return { - "pixel_values": pixel_values, - "input_ids": input_ids, - "labels": labels - } - - -class DataCollatorForMultimodalDatasetGPTQ: - def __init__(self, text_tokenizer): - self.text_tokenizer = text_tokenizer - - def __call__(self, instances: Sequence[Dict]) -> Dict[str, Union[torch.Tensor, List[torch.Tensor]]]: - pixel_values, input_ids, labels = tuple([instance[key] for instance in instances] - for key in ("pixel_values", "input_ids", "labels")) - input_ids = torch.nn.utils.rnn.pad_sequence( - input_ids, - batch_first=True, - padding_value=self.text_tokenizer.pad_token_id) - attention_mask = torch.ne(input_ids, self.text_tokenizer.pad_token_id) - labels = torch.nn.utils.rnn.pad_sequence( - labels, - batch_first=True, - padding_value=IGNORE_ID) - - num_valid_label = torch.not_equal(labels, IGNORE_ID).sum().item() - if num_valid_label == 0: - logging.warning( - f'[DataCollatorForMultimodalDatasetGPTQ] All labels are ignored, may causing training instability\n{input_ids=}\n{attention_mask=}\n{labels=}') - return { - "input_ids": input_ids, - "attention_mask": attention_mask, - "labels": labels, - "pixel_values": pixel_values - } - - -class MyDataLoader(DataLoader): - def __len__(self): - return len(self.dataset) // self.batch_size # must set drop last=True - - -# prepare your own calibration samples here -data_list = [{"id": "10593.jpg", "image": ["10593.jpg"], "conversations": [{"from": "human", "value": "\nWrite a detailed description of this image, do not forget about the texts on it if they exist. Also, do not forget to mention the type / style of the image. No bullet points. When writing descriptions, prioritize clarity and direct observation over embellishment or interpretation.\nDon't forget these rules:\n1. **Be Direct and Concise**: Provide straightforward descriptions without adding interpretative or speculative elements.\n2. **Use Segmented Details**: Break down details about different elements of an image into distinct sentences, focusing on one aspect at a time.\n3. **Maintain a Descriptive Focus**: Prioritize purely visible elements of the image, avoiding conclusions or inferences.\n4. **Follow a Logical Structure**: Begin with the central figure or subject and expand outward, detailing its appearance before addressing the surrounding setting.\n5. **Avoid Juxtaposition**: Do not use comparison or contrast language; keep the description purely factual.\n6. 
**Incorporate Specificity**: Mention age, gender, race, and specific brands or notable features when present, and clearly identify the medium if it's discernible."}, {"from": "gpt", "value": "A digital rendering shows two sleek, metal cabinet handles against a white background. Each handle is cylindrical with a matte silver finish and mounts using two cylindrical pegs, visualizing a minimalistic and modern design suitable for kitchen or furniture cabinetry."}], "url": "https://www.shopgoldenwarm.com/cdn/shop/products/SKU-24-1003-3_4inholecenters_700x700.jpg?v=1691814633"}, {"id": "8383.jpg", "image": ["8383.jpg"], "conversations": [{"from": "human", "value": "Write a detailed description of this image, do not forget about the texts on it if they exist. Also, do not forget to mention the type / style of the image. No bullet points. When writing descriptions, prioritize clarity and direct observation over embellishment or interpretation.\nDon't forget these rules:\n1. **Be Direct and Concise**: Provide straightforward descriptions without adding interpretative or speculative elements.\n2. **Use Segmented Details**: Break down details about different elements of an image into distinct sentences, focusing on one aspect at a time.\n3. **Maintain a Descriptive Focus**: Prioritize purely visible elements of the image, avoiding conclusions or inferences.\n4. **Follow a Logical Structure**: Begin with the central figure or subject and expand outward, detailing its appearance before addressing the surrounding setting.\n5. **Avoid Juxtaposition**: Do not use comparison or contrast language; keep the description purely factual.\n6. **Incorporate Specificity**: Mention age, gender, race, and specific brands or notable features when present, and clearly identify the medium if it's discernible.\n"}, {"from": "gpt", "value": "The image displays a screenshot of a Wikipedia page in French. The title of the article is \"Attention,\" which is prominently displayed at the top of the page. Below the title is the Wikipedia logo and a series of links such as \"Article,\" \"Discussion,\" \"Lire,\" \"Modifier,\" \"Modifier le code,\" \"Historique,\" and more, typical of Wikipedia's interface.\n\nThe main body of the screenshot shows a paragraph of text in French, beginning with a definition of attention as the faculty of focusing the mind on an object. The text explains various aspects of attention, mentioning philosophers, psychologists, and neuroscientists.\n\nTo the right of the paragraph, there is a small image within the Wikipedia page. It features three people sitting on a bench. The person on the left is wearing a green top, the one in the middle is in a red top, and the one on the right is in a blue top. They appear to be waiting or resting, with the central figure looking towards the camera.\n\nBelow the initial paragraph, there is a bullet-point list, which seems to continue the discussion on attention, possibly enumerating different points or aspects related to the topic.\n\nThe Wikipedia interface elements, such as tabs and side navigation options, are visible on the left side of the screenshot, indicating options like \"Accueil,\" \"Portails thématiques,\" \"Contact,\" \"Contribuer,\" and others."}], "url": "http://cdn.pearltrees.com/s/preview/index?urlId=236938"}, {"id": "6660.jpg", "image": ["6660.jpg"], "conversations": [{"from": "human", "value": "Write a detailed description of this image, do not forget about the texts on it if they exist. 
Also, do not forget to mention the type / style of the image. No bullet points. When writing descriptions, prioritize clarity and direct observation over embellishment or interpretation.\nDon't forget these rules:\n1. **Be Direct and Concise**: Provide straightforward descriptions without adding interpretative or speculative elements.\n2. **Use Segmented Details**: Break down details about different elements of an image into distinct sentences, focusing on one aspect at a time.\n3. **Maintain a Descriptive Focus**: Prioritize purely visible elements of the image, avoiding conclusions or inferences.\n4. **Follow a Logical Structure**: Begin with the central figure or subject and expand outward, detailing its appearance before addressing the surrounding setting.\n5. **Avoid Juxtaposition**: Do not use comparison or contrast language; keep the description purely factual.\n6. **Incorporate Specificity**: Mention age, gender, race, and specific brands or notable features when present, and clearly identify the medium if it's discernible.\n"}, {"from": "gpt", "value": "The image is a photograph of an individual riding a two-wheeled electric scooter on a paved road. The rider is wearing protective gear which includes a helmet with a predominantly white, black, and orange color scheme, a black jacket with white accents, and black pants. The jacket is branded with the name \"VANUCCI\" in white letters on the sleeve. The electric scooter features a prominent front wheel with a disc brake, a smaller rear wheel also with a disc brake, and a blue foot platform with orange and white details. The scooter is equipped with a side-mounted kickstand, which is currently retracted, and a digital display screen mounted on the handlebars. The rider's shoes have orange accents that match the scooter's color scheme. The setting is a natural environment with trees and shrubs in the background, suggesting that the location could be a park or a rural area. Additionally, there is a watermark overlay on the image with the word \"PREVIEW\" in capital letters, indicating that the image is likely for review or pre-purchase purposes."}], "url": "https://mcn-images.bauersecure.com/PageFiles/670437/KTM-City-e-Scooter.jpg"}, {"id": "7977.jpg", "image": ["7977.jpg"], "conversations": [{"from": "human", "value": "\nWrite a detailed description of this image, do not forget about the texts on it if they exist. Also, do not forget to mention the type / style of the image. No bullet points. When writing descriptions, prioritize clarity and direct observation over embellishment or interpretation.\nDon't forget these rules:\n1. **Be Direct and Concise**: Provide straightforward descriptions without adding interpretative or speculative elements.\n2. **Use Segmented Details**: Break down details about different elements of an image into distinct sentences, focusing on one aspect at a time.\n3. **Maintain a Descriptive Focus**: Prioritize purely visible elements of the image, avoiding conclusions or inferences.\n4. **Follow a Logical Structure**: Begin with the central figure or subject and expand outward, detailing its appearance before addressing the surrounding setting.\n5. **Avoid Juxtaposition**: Do not use comparison or contrast language; keep the description purely factual.\n6. 
**Incorporate Specificity**: Mention age, gender, race, and specific brands or notable features when present, and clearly identify the medium if it's discernible."}, {"from": "gpt", "value": "This is a promotional image for a StoryBook Legacy product by Forever, featuring \"Delight Diva 11x8.5 Pre-designed Pages\". The image is a collage-style layout that includes several photographs and graphic elements. At the top of the image, the text reads \"StoryBook Legacy™ by FOREVER\" in bold, black letters against a white background. Below that, there's a disclaimer in red text stating \"not all art shown\".\n\nThe central part of the image shows a scrapbook page layout with a pink and black color scheme. It includes three photographs: one large image of a young girl lying in grass, peeking through a hole in a pink fence, and two smaller images showing the same girl—one where she is holding a flower and smiling, and another where she is twirling in a blue dress in a grassy field. The photos are framed with decorative borders, and there are ornamental graphic elements like a circular design with concentric rings and a small graphic resembling a pink luggage tag.\n\nAt the bottom of the image, there is a text bubble in blue with white text that says \"new preview same great content.\" The overall style of the image is colorful and resembles a scrapbook or photo album page, promoting the idea of preserving memories in a creative and decorative way."}], "url": "https://d31czii1zefd9w.cloudfront.net/product_images/images/000/001/019/medium/SBL_DelightDiva11x8PDP_Preview-Med.jpg?1547494354"}, {"id": "8957.jpg", "image": ["8957.jpg"], "conversations": [{"from": "human", "value": "\nWrite a detailed description of this image, do not forget about the texts on it if they exist. Also, do not forget to mention the type / style of the image. No bullet points. When writing descriptions, prioritize clarity and direct observation over embellishment or interpretation.\nDon't forget these rules:\n1. **Be Direct and Concise**: Provide straightforward descriptions without adding interpretative or speculative elements.\n2. **Use Segmented Details**: Break down details about different elements of an image into distinct sentences, focusing on one aspect at a time.\n3. **Maintain a Descriptive Focus**: Prioritize purely visible elements of the image, avoiding conclusions or inferences.\n4. **Follow a Logical Structure**: Begin with the central figure or subject and expand outward, detailing its appearance before addressing the surrounding setting.\n5. **Avoid Juxtaposition**: Do not use comparison or contrast language; keep the description purely factual.\n6. **Incorporate Specificity**: Mention age, gender, race, and specific brands or notable features when present, and clearly identify the medium if it's discernible."}, {"from": "gpt", "value": "The image is a portrait-style photograph of a man standing against a neutral background. The man appears to be in his mid to late twenties, with short brown hair and a light beard. He is looking directly at the camera, with a neutral expression on his face.\n\nHe is wearing a light gray short-sleeved t-shirt with a graphic design across the chest. The design features the word \"HILFIGER\" in capital letters. The letters are stylized with a combination of vertical stripes and a gradient that transitions from blue, white, to pink from left to right.\n\nThe man is also wearing dark blue denim jeans, although only the top part of the jeans is visible in the image. 
The photograph focuses on the man from the waist up, and his arms are hanging by his sides, with his hands slightly curled.\n\nThe style of the image seems to be a typical fashion or retail display meant to showcase the t-shirt design. The lighting is even and bright, enhancing the visibility of the t-shirt's graphic and the details of the man's appearance. The photograph is taken with the subject centered, emphasizing the apparel."}], "url": "https://i8.amplience.net/i/jpl/te_142563_a?qlt=80&unsharp=0,1,1,7&w=600&h=777&v=1"}, {"id": "2627.jpg", "image": ["2627.jpg"], "conversations": [{"from": "human", "value": "\nWrite a detailed description of this image, do not forget about the texts on it if they exist. Also, do not forget to mention the type / style of the image. No bullet points. When writing descriptions, prioritize clarity and direct observation over embellishment or interpretation.\nDon't forget these rules:\n1. **Be Direct and Concise**: Provide straightforward descriptions without adding interpretative or speculative elements.\n2. **Use Segmented Details**: Break down details about different elements of an image into distinct sentences, focusing on one aspect at a time.\n3. **Maintain a Descriptive Focus**: Prioritize purely visible elements of the image, avoiding conclusions or inferences.\n4. **Follow a Logical Structure**: Begin with the central figure or subject and expand outward, detailing its appearance before addressing the surrounding setting.\n5. **Avoid Juxtaposition**: Do not use comparison or contrast language; keep the description purely factual.\n6. **Incorporate Specificity**: Mention age, gender, race, and specific brands or notable features when present, and clearly identify the medium if it's discernible."}, {"from": "gpt", "value": "The image is a digital presentation slide with a minimalist design. The background is split into two distinct colors: the top portion is orange, and the bottom portion is gray. Centered on the orange section in white, sans-serif font is the text \"Intermediate Strategic Management,\" which appears to be the title of the presentation. Below the title, also in white text but smaller size, is the name \"Marco Clemente.\" In the gray section, two lines of white text provide additional information: \"Spring 2014\" on the first line and \"3rd session – 5 March 2014\" on the second line, indicating the time frame of the course and the specific session date. In the upper left corner of the image, there is a stylized 'A' logo with a superscript exclamation mark, suggesting it is the logo of the institution or organization associated with the presentation, which is Aalto University School of Business. The overall style of the image is formal and academic."}], "url": "https://thumbs.slideserve.com/1_1678414.jpg"}, {"id": "9370.jpg", "image": ["9370.jpg"], "conversations": [{"from": "human", "value": "Write a detailed description of this image, do not forget about the texts on it if they exist. Also, do not forget to mention the type / style of the image. No bullet points. When writing descriptions, prioritize clarity and direct observation over embellishment or interpretation.\nDon't forget these rules:\n1. **Be Direct and Concise**: Provide straightforward descriptions without adding interpretative or speculative elements.\n2. **Use Segmented Details**: Break down details about different elements of an image into distinct sentences, focusing on one aspect at a time.\n3. 
**Maintain a Descriptive Focus**: Prioritize purely visible elements of the image, avoiding conclusions or inferences.\n4. **Follow a Logical Structure**: Begin with the central figure or subject and expand outward, detailing its appearance before addressing the surrounding setting.\n5. **Avoid Juxtaposition**: Do not use comparison or contrast language; keep the description purely factual.\n6. **Incorporate Specificity**: Mention age, gender, race, and specific brands or notable features when present, and clearly identify the medium if it's discernible.\n"}, {"from": "gpt", "value": "The image depicts a red wax seal with embossed text. The text within the seal reads \"DUTY\" in large, bold letters at the center, and a border text that is partially obscured, includes the words \"QUALITY\" and \"SERVICE.\" The seal has a circular shape with scalloped edges, which is typical of traditional wax seals, and it has a glossy texture that reflects light, suggesting a three-dimensional appearance. The background is white, which contrasts with the red color of the seal, making the seal the focal point of the image. The style of the image is graphic and appears to be digitally created rather than an actual photograph of a physical wax seal."}], "url": "https://static0.bigstockphoto.com/thumbs/7/8/5/small2/5871191.jpg"}, {"id": "558.jpg", "image": ["558.jpg"], "conversations": [{"from": "human", "value": "\nWrite a detailed description of this image, do not forget about the texts on it if they exist. Also, do not forget to mention the type / style of the image. No bullet points. When writing descriptions, prioritize clarity and direct observation over embellishment or interpretation.\nDon't forget these rules:\n1. **Be Direct and Concise**: Provide straightforward descriptions without adding interpretative or speculative elements.\n2. **Use Segmented Details**: Break down details about different elements of an image into distinct sentences, focusing on one aspect at a time.\n3. **Maintain a Descriptive Focus**: Prioritize purely visible elements of the image, avoiding conclusions or inferences.\n4. **Follow a Logical Structure**: Begin with the central figure or subject and expand outward, detailing its appearance before addressing the surrounding setting.\n5. **Avoid Juxtaposition**: Do not use comparison or contrast language; keep the description purely factual.\n6. **Incorporate Specificity**: Mention age, gender, race, and specific brands or notable features when present, and clearly identify the medium if it's discernible."}, {"from": "gpt", "value": "The image depicts an interior view of a warehouse. The flooring is a smooth concrete surface that extends throughout the space. On the left, there are tall orange and blue industrial shelving units filled with various materials and boxes, suggesting a storage function. A yellow pallet jack stands in front of the shelves, positioned perpendicular to them.\n\nIn the middle of the warehouse, there is a large, yellow overhead crane system with a hook, which spans across the width of the space. Below the crane, there are stacks of construction materials, such as boards and panels, neatly arranged on the floor. These materials appear to be ready for transport or storage.\n\nThe warehouse has a high ceiling with multiple skylights and artificial lighting fixtures that illuminate the space evenly. To the right, there is a semi-opened roller shutter door, providing access to the outside and allowing natural light to enter. 
There are no visible texts or distinctive brands in the image. The style of the photograph is professional, with a focus on capturing the layout and equipment of the warehouse environment for either inventory management or real estate presentation purposes."}], "url": "https://img.agentaccount.com/5d8aee01cb7cd06e6f366e134e11494b8ee1ea47"}, {"id": "11323.jpg", "image": ["11323.jpg"], "conversations": [{"from": "human", "value": "\nWrite a detailed description of this image, do not forget about the texts on it if they exist. Also, do not forget to mention the type / style of the image. No bullet points. When writing descriptions, prioritize clarity and direct observation over embellishment or interpretation.\nDon't forget these rules:\n1. **Be Direct and Concise**: Provide straightforward descriptions without adding interpretative or speculative elements.\n2. **Use Segmented Details**: Break down details about different elements of an image into distinct sentences, focusing on one aspect at a time.\n3. **Maintain a Descriptive Focus**: Prioritize purely visible elements of the image, avoiding conclusions or inferences.\n4. **Follow a Logical Structure**: Begin with the central figure or subject and expand outward, detailing its appearance before addressing the surrounding setting.\n5. **Avoid Juxtaposition**: Do not use comparison or contrast language; keep the description purely factual.\n6. **Incorporate Specificity**: Mention age, gender, race, and specific brands or notable features when present, and clearly identify the medium if it's discernible."}, {"from": "gpt", "value": "The image is a photograph that displays a segment of an object divided into two colored sections: purple on the left and green on the right. On the purple section, there is a circular port or interface with multiple pin slots arranged in a pattern. Above this port, a small rectangular keyboard icon is affixed. On the green section, another circular port or interface is present, with a different configuration of pin slots. Just above this port on the green section, there is an icon depicting a mouse. The background beyond the two sections is black, creating a contrast with the vibrant colors of the object. 
The ports resemble PS/2 ports, commonly used for keyboards and mice in older computers."}], "url": "https://i.imgur.com/skIxCXJ.jpg"}] -target_length = 16 -data_list = (data_list * (target_length // len(data_list) + 1))[:target_length] - - -def get_calib_dataset(model): - train_dataset = CalibrationDataset(model, text_max_length=832, data_list=data_list) - print("Dataset Loaded!") - print(f"Total length of the training set: {len(train_dataset)}") - - train_loader = MyDataLoader( - train_dataset, - collate_fn=DataCollatorForMultimodalDatasetGPTQ(model.get_text_tokenizer()), - shuffle=False, - batch_size=1, - drop_last=True, - pin_memory=True, - ) - print("Dataloader Loaded!") - return train_loader diff --git a/tests/models/test_ovis_1_6_llama.py b/tests/models/test_ovis_1_6_llama.py index 64b321bc..e61b8677 100644 --- a/tests/models/test_ovis_1_6_llama.py +++ b/tests/models/test_ovis_1_6_llama.py @@ -16,49 +16,42 @@ class TestOvis1_6_Llama(ModelTest): BATCH_SIZE = 1 def test_ovis_1_6(self): - model, tokenizer = self.quantModel(self.NATIVE_MODEL_ID, trust_remote_code=self.TRUST_REMOTE_CODE, torch_dtype=self.TORCH_DTYPE, multimodal_max_length=8192,) - with tempfile.TemporaryDirectory() as tmp_dir: - model.save(tmp_dir) - tokenizer.save_pretrained(tmp_dir) - - del model - del tokenizer - - model = GPTQModel.load(tmp_dir, trust_remote_code=True) - - text_tokenizer = model.get_text_tokenizer() - visual_tokenizer = model.get_visual_tokenizer() - - # enter image path and prompt - image_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "ovis/10016.jpg") - image = Image.open(image_path) - text = "What does this picture show?" - query = f'\n{text}' - - # format conversation - prompt, input_ids, pixel_values = model.preprocess_inputs(query, [image]) - attention_mask = torch.ne(input_ids, text_tokenizer.pad_token_id) - input_ids = input_ids.unsqueeze(0).to(device=model.device) - attention_mask = attention_mask.unsqueeze(0).to(device=model.device) - pixel_values = [pixel_values.to(dtype=visual_tokenizer.dtype, device=visual_tokenizer.device)] - - # generate output - with torch.inference_mode(): - gen_kwargs = { - "max_new_tokens": 1024, - "do_sample": False, - "top_p": None, - "top_k": None, - "temperature": None, - "repetition_penalty": None, - "eos_token_id": model.generation_config.eos_token_id, - "pad_token_id": text_tokenizer.pad_token_id, - "use_cache": True - } - output_ids = \ + model, tokenizer = self.quantModel(self.NATIVE_MODEL_ID, trust_remote_code=self.TRUST_REMOTE_CODE, + torch_dtype=self.TORCH_DTYPE, multimodal_max_length=8192, batch_size=1) + + text_tokenizer = model.get_text_tokenizer() + visual_tokenizer = model.get_visual_tokenizer() + + # enter image path and prompt + image_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "ovis/10016.jpg") + image = Image.open(image_path) + text = "What does this picture show?" 
+ query = f'\n{text}' + + # format conversation + prompt, input_ids, pixel_values = model.preprocess_inputs(query, [image]) + attention_mask = torch.ne(input_ids, text_tokenizer.pad_token_id) + input_ids = input_ids.unsqueeze(0).to(device=model.device) + attention_mask = attention_mask.unsqueeze(0).to(device=model.device) + pixel_values = [pixel_values.to(dtype=visual_tokenizer.dtype, device=visual_tokenizer.device)] + + # generate output + with torch.inference_mode(): + gen_kwargs = { + "max_new_tokens": 1024, + "do_sample": False, + "top_p": None, + "top_k": None, + "temperature": None, + "repetition_penalty": None, + "eos_token_id": model.generation_config.eos_token_id, + "pad_token_id": text_tokenizer.pad_token_id, + "use_cache": True + } + output_ids = \ model.generate(input_ids, pixel_values=pixel_values, attention_mask=attention_mask, **gen_kwargs)[0] - output = text_tokenizer.decode(output_ids, skip_special_tokens=True) + output = text_tokenizer.decode(output_ids, skip_special_tokens=True) - print(f'Output:\n{output}') + print(f'Output:\n{output}') - self.assertIn("snow", output.lower()) + self.assertIn("snow", output.lower()) diff --git a/tests/models/test_qwen2_vl.py b/tests/models/test_qwen2_vl.py index b1415804..2c550f16 100644 --- a/tests/models/test_qwen2_vl.py +++ b/tests/models/test_qwen2_vl.py @@ -1,3 +1,5 @@ +from qwen_vl_utils import process_vision_info + from model_test import ModelTest @@ -11,4 +13,54 @@ class TestQwen2_VL(ModelTest): BATCH_SIZE = 6 def test_qwen2_vl(self): - self.quant_lm_eval() + model, tokenizer, processor = self.quantModel(self.NATIVE_MODEL_ID, trust_remote_code=self.TRUST_REMOTE_CODE, + torch_dtype=self.TORCH_DTYPE) + + # check image to text + messages = [ + { + "role": "user", + "content": [ + { + "type": "image", + "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg", + }, + {"type": "text", "text": "Describe this image."}, + ], + } + ] + # Preparation for inference + text = processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + image_inputs, video_inputs = process_vision_info(messages) + inputs = processor( + text=[text], + images=image_inputs, + videos=video_inputs, + padding=True, + return_tensors="pt", + ) + inputs = inputs.to("cuda") + + # Inference: Generation of the output + generated_ids = model.generate(**inputs, max_new_tokens=128) + generated_ids_trimmed = [ + out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) + ] + output_text = processor.batch_decode( + generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False + )[0] + print("output_text:", output_text) + + self.assertIn("dog", output_text) + + + # check lm_eval results + self.check_kernel(model, self.KERNEL_INFERENCE) + + task_results = self.lm_eval(model=model, + apply_chat_template=self.APPLY_CHAT_TEMPLATE, + trust_remote_code=self.TRUST_REMOTE_CODE, + delete_quantized_model=self.DELETE_QUANTIZED_MODEL) + self.check_results(task_results)
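
The patch gives BaseGPTQModel two new class-level hooks: `modality` (defaults to [MODALITY.TEXT]) and `quant_override_files` (defaults to {}; each entry is written into the save directory by save(), with dict values serialized via json.dumps and str values written verbatim). Below is a minimal sketch, not part of the patch above, of how a downstream model definition might use both; the class name, layer layout and processor name are hypothetical placeholders.

    from typing import Any, Dict

    from gptqmodel.models.base import BaseGPTQModel
    from gptqmodel.utils.model import MODALITY


    class MyVisionGPTQ(BaseGPTQModel):
        # placeholder layer layout -- replace with the real module names of the target model
        base_modules = ["model.embed_tokens", "model.norm"]
        layers_node = "model.layers"
        layer_type = "DecoderLayer"
        layer_modules = [
            ["self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj"],
            ["self_attn.o_proj"],
            ["mlp.up_proj", "mlp.gate_proj"],
            ["mlp.down_proj"],
        ]

        # declare that this model consumes images as well as text; the tests use
        # (MODALITY.IMAGE_TO_TEXT in model.modality) to pick an image calibration set
        modality = [MODALITY.TEXT, MODALITY.IMAGE_TO_TEXT]

        # extra config files to (re)write into the quantized save dir; dict values
        # are json.dumps()-ed by BaseGPTQModel.save(), str values are written as-is
        quant_override_files: Dict[str, Any] = {
            "preprocessor_config.json": {"image_processor_type": "MyImageProcessor"},
        }

Models that need non-text calibration data additionally override preprocess_dataset() / prepare_dataset(), as ovis.py and qwen2_vl.py do above.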
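
The two new helpers, batched() in gptqmodel/utils/calibration.py and fetch_image() in gptqmodel/utils/image.py, are general-purpose. A small usage sketch follows; the URLs and paths are illustrative only.

    from gptqmodel.utils.calibration import batched
    from gptqmodel.utils.image import fetch_image

    samples = [
        {"image": "https://example.com/cat.jpg", "caption": "a cat"},  # illustrative URL
        {"image": "file:///tmp/dog.jpg", "caption": "a dog"},          # illustrative path
    ]

    def to_pil(sample):
        # fetch_image() resolves PIL.Image objects, http(s) URLs, file:// paths,
        # base64 data URIs and plain local paths into a PIL.Image
        return {"pil_image": fetch_image(sample), "caption": sample["caption"]}

    # batched() yields lists of processed items (or raw tuples when process_func is None)
    for batch in batched(samples, 2, process_func=to_pil):
        print(len(batch), [item["caption"] for item in batch])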
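
Putting it together, an end-to-end sketch of quantizing a Qwen2-VL checkpoint with an image+text calibration set, mirroring tests/models/test_qwen2_vl.py and tests/models/ovis/image_to_test_dataset.py; the model id, output path and sample count are illustrative, and QuantizeConfig is assumed to be exported from the gptqmodel top-level package as in the tests.

    from datasets import load_dataset

    from gptqmodel import GPTQModel, QuantizeConfig

    def format_qwen2_vl_sample(image_url, caption):
        # same message layout as format_qwen2_vl_dataset() in image_to_test_dataset.py
        return [
            {"role": "user", "content": [
                {"type": "image", "image": image_url},
                {"type": "text", "text": "generate a caption for this image"},
            ]},
            {"role": "assistant", "content": caption},
        ]

    rows = load_dataset("laion/220k-GPT4Vision-captions-from-LIVIS", split="train[:20]")
    calibration_dataset = [format_qwen2_vl_sample(r["url"], r["caption"]) for r in rows]

    model = GPTQModel.load("Qwen/Qwen2-VL-2B-Instruct",  # illustrative model id
                           quantize_config=QuantizeConfig(bits=4, group_size=128))
    # Qwen2VLGPTQ.prepare_dataset() converts the raw chat messages into processor
    # tensors (text plus pixel values) before quantization runs
    model.quantize(calibration_dataset, batch_size=4)
    model.save("Qwen2-VL-gptq-4bit")  # also writes the quant_override_files entries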