Add modality to BaseModel (#937)
* base model: add "modality" field

* ovis: add "modality = [MODALITY.IMAGE_TO_TEXT]"

* qwen2_vl: add "modality = [MODALITY.TEXT, MODALITY.IMAGE_TO_TEXT]"

* fix test

* cleanup

* cleanup

* change quant_override_files hint type to: Dict[str, Union[str | Dict[str, Any]]]

* cleanup
ZX-ModelCloud authored Dec 20, 2024
1 parent 2707ac7 commit 33791ce
Showing 12 changed files with 385 additions and 201 deletions.
59 changes: 41 additions & 18 deletions gptqmodel/models/base.py
@@ -1,9 +1,10 @@
from __future__ import annotations

import copy
import json
import os
import shutil
from typing import Dict, List, Optional, Union
from typing import Dict, List, Optional, Union, Any

import accelerate
import torch
@@ -30,6 +31,7 @@
nested_move_to,
pack_model,
simple_dispatch_model,
MODALITY,
)
from ..utils.progress import ProgressBar
from ..utils.torch import torch_empty_cache
@@ -87,6 +89,10 @@ class BaseGPTQModel(nn.Module):

supports_desc_act = [True, False]

modality: List[MODALITY] = [MODALITY.TEXT]

quant_override_files: Dict[str, Union[str | Dict[str, Any]]] = {}

def __init__(
self,
model: PreTrainedModel,
@@ -124,7 +130,7 @@ def quantized(self):
def hf_device_map(self):
return getattr(self.model, "hf_device_map", None)

def _prepare_dataset_for_quantization(
def prepare_dataset(
self,
calibration_dataset: List[Dict[str, Union[List[int], torch.LongTensor]]],
batch_size: int = 1,
@@ -265,13 +271,30 @@ def quantize(
if BITBLAS_AVAILABLE is False:
raise ValueError(BITBLAS_INSTALL_HINT)


device_map = self.hf_device_map
if device_map:
for name, device in device_map.items():
if device == "cpu" and best_device != CPU:
logger.info(f"truly offloading {name} to cpu with hook.")
module = get_module_by_name_suffix(self.model, name)
remove_hook_from_module(module, recurse=True)
accelerate.cpu_offload_with_hook(module, best_device)

calibration_dataset = self.prepare_dataset(calibration_dataset, batch_size, tokenizer,)

# Calculate the average length of the input_ids in the calibration dataset
total_input_ids_length = 0
max_input_id_length = 0
for row in calibration_dataset:
input_ids = row["input_ids"]
if isinstance(input_ids, torch.Tensor):
input_ids_length = input_ids.numel()
if input_ids.dim() <= 2:
input_ids_length = input_ids.shape[-1]
else:
raise ValueError(
"Expected a 1-dimensional tensor or 2-dimensional tensor for 'input_ids', but got a tensor with {0} dimensions.".format(
input_ids.dim()))
else:
input_ids_length = len(input_ids)

@@ -284,17 +307,6 @@ def quantize(
logger.warning(f"The average length of input_ids of calibration_dataset should be greater than "
f"{min_calibration_dataset_input_ids_avg_length}: actual avg: {avg}.")

device_map = self.hf_device_map
if device_map:
for name, device in device_map.items():
if device == "cpu" and best_device != CPU:
logger.info(f"truly offloading {name} to cpu with hook.")
module = get_module_by_name_suffix(self.model, name)
remove_hook_from_module(module, recurse=True)
accelerate.cpu_offload_with_hook(module, best_device)

calibration_dataset = self._prepare_dataset_for_quantization(calibration_dataset, batch_size, tokenizer,)

if isinstance(self.quantize_config, AutoRoundQuantizeConfig):
from auto_round import AutoRound
from auto_round import __version__ as auto_round_version
@@ -760,14 +772,25 @@ def save(
meta_quantizer: Optional[str] = None,
**kwargs,
):
preprocessor_config_path = os.path.join(self.model_id_or_path, "preprocessor_config.json")
if os.path.exists(preprocessor_config_path):
os.makedirs(save_dir, exist_ok=True)
extra_json_file_names = ["preprocessor_config.json", "chat_template.json"]
for name in extra_json_file_names:
json_path = os.path.join(self.model_id_or_path, name)
if os.path.exists(json_path):
os.makedirs(save_dir, exist_ok=True)

shutil.copyfile(preprocessor_config_path, os.path.join(save_dir, "preprocessor_config.json"))
shutil.copyfile(json_path, os.path.join(save_dir, name))

if self.quantized:
self.save_quantized(save_dir, safetensors_metadata, max_shard_size, meta_quantizer)

# overwrite quant_override_files
for name, value in self.quant_override_files.items():
json_path = os.path.join(save_dir, name)
with open(json_path, "w", encoding="utf-8") as f:
if isinstance(value, str):
f.write(value)
else:
f.write(json.dumps(value))
else:
self.save_pretrained(save_dir, **kwargs)

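For reference, a minimal standalone sketch of what the new quant_override_files handling in save() amounts to: string values are written verbatim, dict values are JSON-encoded into the save directory. The helper name and paths below are illustrative only, not part of the library.

import json
import os
from typing import Any, Dict, Union

def write_override_files(save_dir: str,
                         overrides: Dict[str, Union[str, Dict[str, Any]]]) -> None:
    # Mirrors the loop in BaseGPTQModel.save(): one output file per override entry.
    os.makedirs(save_dir, exist_ok=True)
    for name, value in overrides.items():
        with open(os.path.join(save_dir, name), "w", encoding="utf-8") as f:
            f.write(value if isinstance(value, str) else json.dumps(value))

# Illustrative call, e.g. with a processor override like the Qwen2-VL one later in this commit:
write_override_files("./qwen2-vl-gptq", {"preprocessor_config.json": {"patch_size": 14}})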
68 changes: 65 additions & 3 deletions gptqmodel/models/definitions/ovis.py
@@ -1,3 +1,10 @@
import copy
import logging
from typing import Dict

from ...utils.calibration import batched
from ...utils.image import fetch_image
from ...utils.model import MODALITY
import torch

from ..base import BaseGPTQModel
@@ -15,13 +22,68 @@ class OvisGPTQ(BaseGPTQModel):
["mlp.down_proj"],
]

# hack so one can prepare examples outside
def _prepare_dataset_for_quantization(
modality = [MODALITY.IMAGE_TO_TEXT]

IGNORE_ID = -100

def preprocess_dataset(self, sample: Dict) -> Dict:
text_max_length = 832
conversations = copy.deepcopy(sample["conversations"])
images = [fetch_image(sample)]
max_partition = 9

prompt, input_ids, pixel_values, labels = self.model.preprocess_inputs(
conversations,
images,
max_partition=max_partition,
generation_preface=None,
return_labels=True,
propagate_exception=False
)

if pixel_values is None:
pixel_values, _ = self.visual_tokenizer.mock_input()

input_ids = input_ids[:text_max_length]
labels = labels[:text_max_length]

return {
"pixel_values": pixel_values,
"input_ids": input_ids,
"labels": labels,
}

def prepare_dataset(
self,
calibration_dataset,
batch_size: int = 1,
tokenizer=None, ):
return calibration_dataset
calib_data = []
for batch in batched(calibration_dataset, batch_size, self.preprocess_dataset):
pixel_values, input_ids, labels = tuple([instance[key] for instance in batch]
for key in ("pixel_values", "input_ids", "labels"))
input_ids = torch.nn.utils.rnn.pad_sequence(
input_ids,
batch_first=True,
padding_value=self.text_tokenizer.pad_token_id)
attention_mask = torch.ne(input_ids, self.text_tokenizer.pad_token_id)
labels = torch.nn.utils.rnn.pad_sequence(
labels,
batch_first=True,
padding_value=self.IGNORE_ID)

num_valid_label = torch.not_equal(labels, self.IGNORE_ID).sum().item()
if num_valid_label == 0:
logging.warning(
f'[DataCollatorForMultimodalDatasetGPTQ] All labels are ignored, may cause training instability\n{input_ids=}\n{attention_mask=}\n{labels=}')
calib_data.append({
"input_ids": input_ids,
"attention_mask": attention_mask,
"labels": labels,
"pixel_values": pixel_values,
})

return calib_data

def generate(self, inputs, **kwargs):
"""shortcut for model.generate"""
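For context, a hedged sketch of the calibration sample shape that preprocess_dataset above expects: a ShareGPT-style "conversations" list plus an image reference that fetch_image can resolve. The file path and prompt text are made up for illustration; the exact conversation schema should be checked against the Ovis model card.

calibration_dataset = [
    {
        # anything fetch_image accepts: local path, file:// or http(s):// URL,
        # base64 data URI, or an already-loaded PIL.Image
        "image": "file:///data/calib/sample_0001.jpg",
        "conversations": [
            {"from": "human", "value": "<image>\nDescribe this picture."},
            {"from": "gpt", "value": "A red bicycle leaning against a brick wall."},
        ],
    },
]

# model.quantize(calibration_dataset, batch_size=1)  # prepare_dataset() collates each batch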
69 changes: 68 additions & 1 deletion gptqmodel/models/definitions/qwen2_vl.py
@@ -1,6 +1,12 @@
from transformers import AutoModelForVision2Seq
from typing import Dict

from qwen_vl_utils import process_vision_info

from transformers import AutoModelForVision2Seq, Qwen2VLProcessor

from ..base import BaseGPTQModel
from ...utils.calibration import batched
from ...utils.model import MODALITY


class Qwen2VLGPTQ(BaseGPTQModel):
@@ -16,3 +22,64 @@ class Qwen2VLGPTQ(BaseGPTQModel):
["mlp.up_proj", "mlp.gate_proj"],
["mlp.down_proj"],
]

modality = [MODALITY.TEXT, MODALITY.IMAGE_TO_TEXT]

quant_override_files = {
"preprocessor_config.json": {
"do_convert_rgb": True,
"do_normalize": True,
"do_rescale": True,
"do_resize": True,
"image_mean": [
0.48145466,
0.4578275,
0.40821073
],
"image_processor_type": "Qwen2VLImageProcessor",
"image_std": [
0.26862954,
0.26130258,
0.27577711
],
"max_pixels": 1003520,
"merge_size": 2,
"min_pixels": 3136,
"patch_size": 14,
"processor_class": "Qwen2VLProcessor",
"resample": 3,
"rescale_factor": 0.00392156862745098,
"size": {
"max_pixels": 1003520,
"min_pixels": 3136
},
"temporal_patch_size": 2,
"vision_token_id": 151654
}
}

def preprocess_dataset(self, sample: Dict) -> Dict:
return sample

def prepare_dataset(
self,
calibration_dataset,
batch_size: int = 1,
tokenizer=None, ):
processor = Qwen2VLProcessor.from_pretrained(self.model_id_or_path)
calib_data = []
for batch in batched(calibration_dataset, batch_size, process_func=self.preprocess_dataset):
text = processor.apply_chat_template(
batch, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(batch)
inputs = processor(
text=text,
images=image_inputs,
videos=video_inputs,
padding=True,
return_tensors="pt",
)
calib_data.append(inputs)
del processor
return calib_data
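Likewise, a hedged sketch of a calibration entry that prepare_dataset above can consume: each item is one chat in the Qwen2-VL messages format understood by apply_chat_template and process_vision_info. The image URL and prompt are illustrative only.

calibration_dataset = [
    [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": "https://example.com/demo.jpg"},
                {"type": "text", "text": "Describe this image."},
            ],
        },
    ],
]

# model.quantize(calibration_dataset, batch_size=1)  # prepare_dataset() runs the processor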
12 changes: 12 additions & 0 deletions gptqmodel/utils/calibration.py
@@ -0,0 +1,12 @@
def batched(iterable, n: int, process_func):
# batched('ABCDEFG', 3) → ABC DEF G
assert n >= 1, "batch size must be at least one"
from itertools import islice

iterator = iter(iterable)

while batch := tuple(islice(iterator, n)):
if process_func is None:
yield batch
else:
yield [process_func(item) for item in batch]
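A quick usage sketch of batched() with outputs shown as comments: it chunks any iterable into groups of n and, when process_func is given, maps each item through it. The import path is assumed from the file location above.

from gptqmodel.utils.calibration import batched

list(batched("ABCDEFG", 3, process_func=None))
# [('A', 'B', 'C'), ('D', 'E', 'F'), ('G',)]

list(batched([1, 2, 3, 4, 5], 2, process_func=lambda x: x * 10))
# [[10, 20], [30, 40], [50]]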
27 changes: 27 additions & 0 deletions gptqmodel/utils/image.py
@@ -0,0 +1,27 @@
from PIL import Image
from io import BytesIO
import requests
import base64

def fetch_image(ele: dict[str, str | Image.Image]) -> Image.Image:
if "image" in ele:
image = ele["image"]
else:
image = ele["image_url"]
image_obj = None
if isinstance(image, Image.Image):
image_obj = image
elif image.startswith("http://") or image.startswith("https://"):
image_obj = Image.open(requests.get(image, stream=True).raw)
elif image.startswith("file://"):
image_obj = Image.open(image[7:])
elif image.startswith("data:image"):
if "base64," in image:
_, base64_data = image.split("base64,", 1)
data = base64.b64decode(base64_data)
image_obj = Image.open(BytesIO(data))
else:
image_obj = Image.open(image)
if image_obj is None:
raise ValueError(f"Unrecognized image input, support local path, http url, base64 and PIL.Image, got {image}")
return image_obj
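A brief usage sketch of fetch_image(); the file names and URL are illustrative, and the import path is assumed from the file location above.

from PIL import Image
from gptqmodel.utils.image import fetch_image

img1 = fetch_image({"image": "photo.jpg"})                      # local path
img2 = fetch_image({"image": "https://example.com/cat.png"})    # remote URL
img3 = fetch_image({"image": Image.new("RGB", (32, 32))})       # already a PIL.Image
img4 = fetch_image({"image_url": "file:///tmp/photo.jpg"})      # "image_url" key also works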
6 changes: 6 additions & 0 deletions gptqmodel/utils/model.py
@@ -9,6 +9,7 @@
import shutil
import sys
from concurrent.futures import ThreadPoolExecutor
from enum import Enum
from typing import Dict, List, Optional, Tuple, Type, Union

import accelerate
@@ -782,3 +783,8 @@ def check_requires_version(requires_version, current_version):
return OPERATOR_MAP[op_symbol](current_version, required_version)
else:
return None

class MODALITY(str, Enum):
TEXT = "text"
IMAGE_TO_TEXT = "image_to_text"
# TEXT_TO_IMAGE = "text_to_image"
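Because MODALITY mixes in str, a model class's declared modalities can be checked with ordinary membership and equality. A small hedged sketch; the import path is assumed from this repository layout.

from gptqmodel.utils.model import MODALITY

def supports_images(model_cls) -> bool:
    # True for OvisGPTQ and Qwen2VLGPTQ as defined in this commit
    return MODALITY.IMAGE_TO_TEXT in model_cls.modality

print(MODALITY.TEXT == "text")   # True, thanks to the str mixin
print(MODALITY.TEXT.value)       # "text"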