From 4272f47cb3ffbaf5c0fb5db569deb16856c578a1 Mon Sep 17 00:00:00 2001
From: Alexander Kozlov
Date: Fri, 11 Oct 2024 16:01:45 +0400
Subject: [PATCH] Token merging updates (#934)

* Updated to the latest diffusers version. Handled vae decoder as well

* Updated to the latest diffusers version. Handled vae decoder as well

* Fixed requirements

* Fixed TOME tests
---
 .github/workflows/token_merging.yml           |  2 +-
 modules/token_merging/setup.py                |  2 +-
 modules/token_merging/tests/test_precommit.py | 28 ++++++---------
 modules/token_merging/tomeov/openclip.py      |  2 +-
 .../token_merging/tomeov/stable_diffusion.py  | 34 ++++++++++++++++++-
 modules/token_merging/tomeov/utils.py         |  5 +--
 6 files changed, 49 insertions(+), 24 deletions(-)

diff --git a/.github/workflows/token_merging.yml b/.github/workflows/token_merging.yml
index c24cdde1e..0e5d7cd8e 100644
--- a/.github/workflows/token_merging.yml
+++ b/.github/workflows/token_merging.yml
@@ -23,7 +23,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.8]
+        python-version: [3.11]
     runs-on: ubuntu-latest
 
     steps:
diff --git a/modules/token_merging/setup.py b/modules/token_merging/setup.py
index eb2adb2e1..32005d4cb 100644
--- a/modules/token_merging/setup.py
+++ b/modules/token_merging/setup.py
@@ -13,7 +13,7 @@
     author="Alexander Kozlov",
     url="https://github.com/openvinotoolkit/openvino_contrib/tree/master/modules/token_merging",
     description="Token Merging for OpenVINO",
-    install_requires=["torch~=1.13.1", "torchvision~=0.14.1"],
+    install_requires=["torch~=2.4", "torchvision~=0.19.1"],
     dependency_links=["https://download.pytorch.org/whl/cpu"],
     extras_require=EXTRAS_REQUIRE,
     packages=find_packages(exclude=("examples", "build")),
diff --git a/modules/token_merging/tests/test_precommit.py b/modules/token_merging/tests/test_precommit.py
index ef6ee97be..de380123e 100644
--- a/modules/token_merging/tests/test_precommit.py
+++ b/modules/token_merging/tests/test_precommit.py
@@ -9,10 +9,12 @@
 from PIL import Image
 import torch
 import openvino.runtime as ov
+from openvino import convert_model
 
 import tomeov
 from diffusers import StableDiffusionPipeline, DDPMScheduler
 from optimum.intel.openvino import OVStableDiffusionPipeline
+from optimum.exporters.openvino import export_from_model
 import open_clip
 import timm
 
@@ -33,7 +35,7 @@ def test_stable_diffusion(self):
 
         tomeov.patch_stable_diffusion(loaded_pipeline, ratio=0.3)
         with tempfile.TemporaryDirectory() as tmpdirname:
-            tomeov.export_diffusion_pipeline(loaded_pipeline, tmpdirname)
+            export_from_model(loaded_pipeline, tmpdirname)
             ov_pipe = OVStableDiffusionPipeline.from_pretrained(tmpdirname, compile=False)
             ov_pipe.reshape(batch_size=1, height=height, width=width, num_images_per_prompt=1)
             ov_pipe.compile()
@@ -42,26 +44,16 @@
     def test_openclip(self):
         model, _, transform = open_clip.create_model_and_transforms(self.OPENCLIP_MODEL[0], pretrained=self.OPENCLIP_MODEL[1])
         tomeov.patch_openclip(model, 8)
-        dummy_image = np.random.rand(100, 100, 3) * 255
+        dummy_image = np.random.rand(224, 224, 3) * 255
         dummy_image = Image.fromarray(dummy_image.astype("uint8"))
         dummy_image = transform(dummy_image).unsqueeze(0)
 
-        with tempfile.TemporaryDirectory(suffix = ".onnx") as tmpdirname:
-            model_file = os.path.join(tmpdirname, "image_encoder.onnx")
-            torch.onnx.export(
-                model.visual,
-                dummy_image,
-                model_file,
-                opset_version=14,
-                input_names=["image"],
-                output_names=["image_embedding"],
-                dynamic_axes={
-                    "image": {0: "batch"},
-                    "image_embedding": {0: "batch"},
-                }
-            )
-            compiled_model = ov.compile_model(model_file)
-            self.assertTrue(compiled_model)
+        ov_model = convert_model(
+            model.visual,
+            example_input=dummy_image
+        )
+        compiled_model = ov.compile_model(ov_model)
+        self.assertTrue(compiled_model)
 
     def test_timm(self):
         model = timm.create_model(self.TIMM_MODEL, pretrained=False)
diff --git a/modules/token_merging/tomeov/openclip.py b/modules/token_merging/tomeov/openclip.py
index fdf2107b4..6b91c0ba1 100644
--- a/modules/token_merging/tomeov/openclip.py
+++ b/modules/token_merging/tomeov/openclip.py
@@ -119,7 +119,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         self._tome_info["source"] = None
 
         # to patches - whether to use dual patchnorm - https://arxiv.org/abs/2302.01327v1
-        if self.input_patchnorm:
+        if hasattr(self, "input_patchnorm") and self.input_patchnorm:
             # einops - rearrange(x, 'b c (h p1) (w p2) -> b (h w) (c p1 p2)')
             x = x.reshape(x.shape[0], x.shape[1], self.grid_size[0], self.patch_size[0], self.grid_size[1], self.patch_size[1])
             x = x.permute(0, 2, 4, 1, 3, 5)
diff --git a/modules/token_merging/tomeov/stable_diffusion.py b/modules/token_merging/tomeov/stable_diffusion.py
index 3b95439bb..cd131348f 100644
--- a/modules/token_merging/tomeov/stable_diffusion.py
+++ b/modules/token_merging/tomeov/stable_diffusion.py
@@ -177,7 +177,9 @@ def patch_stable_diffusion(
         use_rand: bool = True,
         merge_attn: bool = True,
         merge_crossattn: bool = False,
-        merge_mlp: bool = False):
+        merge_mlp: bool = False,
+        optimize_image_encoder: bool = True,
+        ):
     """
     Patches a stable diffusion model with ToMe.
     Apply this to the highest level stable diffusion object (i.e., it should have a .model.diffusion_model).
@@ -242,6 +244,36 @@
             if not hasattr(module, "disable_self_attn") and not is_diffusers:
                 module.disable_self_attn = False
 
+    if optimize_image_encoder and hasattr(model, "vae_encoder"):
+        image_encoder = model.vae_encoder
+
+        image_encoder._tome_info = {
+            "size": None,
+            "hooks": [],
+            "args": {
+                "ratio": ratio,
+                "max_downsample": max_downsample,
+                "sx": sx, "sy": sy,
+                "use_rand": use_rand,
+                "generator": None,
+                "merge_attn": merge_attn,
+                "merge_crossattn": merge_crossattn,
+                "merge_mlp": merge_mlp
+            }
+        }
+        hook_tome_model(image_encoder)
+
+        for _, module in image_encoder.named_modules():
+            # If for some reason this has a different name, create an issue and I'll fix it
+            if isinstance_str(module, "BasicTransformerBlock"):
+                make_tome_block_fn = make_diffusers_tome_block if is_diffusers else make_tome_block
+                module.__class__ = make_tome_block_fn(module.__class__)
+                module._tome_info = image_encoder._tome_info
+
+                # Something introduced in SD 2.0 (LDM only)
+                if not hasattr(module, "disable_self_attn") and not is_diffusers:
+                    module.disable_self_attn = False
+
     return model
 
 
diff --git a/modules/token_merging/tomeov/utils.py b/modules/token_merging/tomeov/utils.py
index 2e439c730..3a42778ba 100644
--- a/modules/token_merging/tomeov/utils.py
+++ b/modules/token_merging/tomeov/utils.py
@@ -10,7 +10,8 @@
 
 from openvino._offline_transformations import apply_moc_transformations, compress_quantize_weights_transformation
 
-from optimum.exporters.onnx import export_models, get_stable_diffusion_models_for_export
+from optimum.exporters.onnx import export_models
+from optimum.exporters.utils import get_diffusion_models_for_export
 from optimum.intel import OVStableDiffusionPipeline
 from optimum.utils import (
     DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER,
@@ -91,7 +92,7 @@
     ]
 
     with torch.no_grad():
-        models_and_onnx_configs = get_stable_diffusion_models_for_export(pipeline)
+        models_and_onnx_configs = get_diffusion_models_for_export(pipeline)
         pipeline.save_config(save_dir)
         export_models(
             models_and_onnx_configs=models_and_onnx_configs, output_dir=Path(save_dir), output_names=output_names
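
Note (not part of the patch): below is a minimal sketch of the export flow that the updated test_stable_diffusion exercises after this change — tomeov patches a diffusers pipeline, optimum's export_from_model writes the OpenVINO model, and OVStableDiffusionPipeline reloads it. The checkpoint name, static shapes, and prompt are illustrative placeholders, not values taken from the patch.

# Minimal usage sketch; MODEL_ID and the shapes are placeholder assumptions.
import tempfile

import tomeov
from diffusers import StableDiffusionPipeline
from optimum.exporters.openvino import export_from_model
from optimum.intel.openvino import OVStableDiffusionPipeline

MODEL_ID = "hf-internal-testing/tiny-stable-diffusion-torch"  # placeholder checkpoint
height, width = 64, 64  # placeholder static shapes

pipe = StableDiffusionPipeline.from_pretrained(MODEL_ID)
tomeov.patch_stable_diffusion(pipe, ratio=0.3)  # merge ~30% of tokens in attention blocks

with tempfile.TemporaryDirectory() as tmpdirname:
    export_from_model(pipe, tmpdirname)  # replaces the old tomeov.export_diffusion_pipeline call
    ov_pipe = OVStableDiffusionPipeline.from_pretrained(tmpdirname, compile=False)
    ov_pipe.reshape(batch_size=1, height=height, width=width, num_images_per_prompt=1)
    ov_pipe.compile()
    image = ov_pipe("a photo of an astronaut", num_inference_steps=2).images[0]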