Token merging updates (#934)

* Updated to the latest diffusers version. Handled vae decoder as well
* Updated to the latest diffusers version. Handled vae decoder as well
* Fixed requirements
* Fixed TOME tests
openvinotoolkit · Oct 11, 2024 · 4272f47
1 parent e6eb43a
commit 4272f47
Show file tree

Hide file tree

Showing 6 changed files with 49 additions and 24 deletions.
diff --git a/.github/workflows/token_merging.yml b/.github/workflows/token_merging.yml
@@ -23,7 +23,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.8]
+        python-version: [3.11]
 
     runs-on: ubuntu-latest
     steps:

diff --git a/modules/token_merging/setup.py b/modules/token_merging/setup.py
@@ -13,7 +13,7 @@
     author="Alexander Kozlov",
     url="https://github.com/openvinotoolkit/openvino_contrib/tree/master/modules/token_merging",
     description="Token Merging for OpenVINO",
-    install_requires=["torch~=1.13.1", "torchvision~=0.14.1"],
+    install_requires=["torch~=2.4", "torchvision~=0.19.1"],
     dependency_links=["https://download.pytorch.org/whl/cpu"],
     extras_require=EXTRAS_REQUIRE,
     packages=find_packages(exclude=("examples", "build")),

diff --git a/modules/token_merging/tests/test_precommit.py b/modules/token_merging/tests/test_precommit.py
@@ -9,10 +9,12 @@
 from PIL import Image
 import torch
 import openvino.runtime as ov
+from openvino import convert_model
 
 import tomeov
 from diffusers import StableDiffusionPipeline, DDPMScheduler
 from optimum.intel.openvino import OVStableDiffusionPipeline
+from optimum.exporters.openvino import export_from_model
 import open_clip
 import timm
 
@@ -33,7 +35,7 @@ def test_stable_diffusion(self):
         tomeov.patch_stable_diffusion(loaded_pipeline, ratio=0.3)
 
         with tempfile.TemporaryDirectory() as tmpdirname:
-            tomeov.export_diffusion_pipeline(loaded_pipeline, tmpdirname)
+            export_from_model(loaded_pipeline, tmpdirname)
             ov_pipe = OVStableDiffusionPipeline.from_pretrained(tmpdirname, compile=False)
             ov_pipe.reshape(batch_size=1, height=height, width=width, num_images_per_prompt=1)
             ov_pipe.compile()
@@ -42,26 +44,16 @@ def test_stable_diffusion(self):
     def test_openclip(self):
         model, _, transform = open_clip.create_model_and_transforms(self.OPENCLIP_MODEL[0], pretrained=self.OPENCLIP_MODEL[1])
         tomeov.patch_openclip(model, 8)
-        dummy_image = np.random.rand(100, 100, 3) * 255
+        dummy_image = np.random.rand(224, 224, 3) * 255
         dummy_image = Image.fromarray(dummy_image.astype("uint8"))
         dummy_image = transform(dummy_image).unsqueeze(0)
 
-        with tempfile.TemporaryDirectory(suffix = ".onnx") as tmpdirname:
-            model_file = os.path.join(tmpdirname, "image_encoder.onnx")
-            torch.onnx.export(
-                model.visual,
-                dummy_image,
-                model_file,
-                opset_version=14,
-                input_names=["image"],
-                output_names=["image_embedding"], 
-                dynamic_axes={ 
-                    "image": {0: "batch"},
-                    "image_embedding": {0: "batch"},
-                }
-            )
-            compiled_model = ov.compile_model(model_file)
-            self.assertTrue(compiled_model)
+        ov_model = convert_model(
+            model.visual,
+            example_input=dummy_image
+        )
+        compiled_model = ov.compile_model(ov_model)
+        self.assertTrue(compiled_model)
 
     def test_timm(self):
         model = timm.create_model(self.TIMM_MODEL, pretrained=False)

diff --git a/modules/token_merging/tomeov/openclip.py b/modules/token_merging/tomeov/openclip.py
@@ -119,7 +119,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
             self._tome_info["source"] = None
 
             # to patches - whether to use dual patchnorm - https://arxiv.org/abs/2302.01327v1
-            if self.input_patchnorm:
+            if hasattr(self, "input_patchnorm") and self.input_patchnorm:
                 # einops - rearrange(x, 'b c (h p1) (w p2) -> b (h w) (c p1 p2)')
                 x = x.reshape(x.shape[0], x.shape[1], self.grid_size[0], self.patch_size[0], self.grid_size[1], self.patch_size[1])
                 x = x.permute(0, 2, 4, 1, 3, 5)

diff --git a/modules/token_merging/tomeov/stable_diffusion.py b/modules/token_merging/tomeov/stable_diffusion.py
@@ -177,7 +177,9 @@ def patch_stable_diffusion(
         use_rand: bool = True,
         merge_attn: bool = True,
         merge_crossattn: bool = False,
-        merge_mlp: bool = False):
+        merge_mlp: bool = False,
+        optimize_image_encoder: bool = True,
+        ):
     """
     Patches a stable diffusion model with ToMe.
     Apply this to the highest level stable diffusion object (i.e., it should have a .model.diffusion_model).
@@ -242,6 +244,36 @@ def patch_stable_diffusion(
             if not hasattr(module, "disable_self_attn") and not is_diffusers:
                 module.disable_self_attn = False
 
+    if optimize_image_encoder and hasattr(model, "vae_encoder"):
+        image_encoder = model.vae_encoder
+
+        image_encoder._tome_info = {
+            "size": None,
+            "hooks": [],
+            "args": {
+                "ratio": ratio,
+                "max_downsample": max_downsample,
+                "sx": sx, "sy": sy,
+                "use_rand": use_rand,
+                "generator": None,
+                "merge_attn": merge_attn,
+                "merge_crossattn": merge_crossattn,
+                "merge_mlp": merge_mlp
+            }
+        }
+        hook_tome_model(image_encoder)
+
+        for _, module in image_encoder.named_modules():
+            # If for some reason this has a different name, create an issue and I'll fix it
+            if isinstance_str(module, "BasicTransformerBlock"):
+                make_tome_block_fn = make_diffusers_tome_block if is_diffusers else make_tome_block
+                module.__class__ = make_tome_block_fn(module.__class__)
+                module._tome_info = image_encoder._tome_info
+
+                # Something introduced in SD 2.0 (LDM only)
+                if not hasattr(module, "disable_self_attn") and not is_diffusers:
+                    module.disable_self_attn = False
+
     return model
 
 

diff --git a/modules/token_merging/tomeov/utils.py b/modules/token_merging/tomeov/utils.py
@@ -10,7 +10,8 @@
 
 from openvino._offline_transformations import apply_moc_transformations, compress_quantize_weights_transformation
 
-from optimum.exporters.onnx import export_models, get_stable_diffusion_models_for_export
+from optimum.exporters.onnx import export_models
+from optimum.exporters.utils import get_diffusion_models_for_export 
 from optimum.intel import OVStableDiffusionPipeline
 from optimum.utils import (
     DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER,
@@ -91,7 +92,7 @@ def _export_to_onnx(pipeline, save_dir):
     ]
 
     with torch.no_grad():
-        models_and_onnx_configs = get_stable_diffusion_models_for_export(pipeline)
+        models_and_onnx_configs = get_diffusion_models_for_export(pipeline)
         pipeline.save_config(save_dir)
         export_models(
             models_and_onnx_configs=models_and_onnx_configs, output_dir=Path(save_dir), output_names=output_names