siliconflow · ccssu · Apr 24, 2024 · Apr 24, 2024 · Apr 24, 2024 · May 11, 2024
diff --git a/onediff_diffusers_extensions/examples/ip-adapter/README.md b/onediff_diffusers_extensions/examples/ip-adapter/README.md
@@ -0,0 +1,114 @@
+# Run IP-Adapter with OneDiff
+
+1. [Environment Setup](#environment-setup)
+   - [Set Up OneDiff](#set-up-onediff)
+   - [Set Up OneFlow Backend](#set-up-oneflow-backend)
+   - [Set Up NexFort Backend](#set-up-nexfort-backend)
+   - [Set Up Diffusers Library](#set-up-diffusers)
+   - [Set Up SDXL](#set-up-sdxl)
+   - [Set Up IP-Adapter](#set-up-ip-adapter)
+2. [Execution Instructions](#run)
+   - [Run Without Compilation (Baseline)](#run-without-compilation-baseline)
+   - [Run with oneflow backend compilation](#run-with-oneflow-backend-compilation)
+   - [Run with nexfort backend compilation](#run-with-nexfort-backend-compilation)
+3. [Performance Comparison](#performance-comparison)
+4. [Dynamic Shape for IP-Adapter](#dynamic-shape-for-ip-adapter)
+
+## Environment setup
+### Set up onediff
+https://github.com/siliconflow/onediff?tab=readme-ov-file#installation
+
+### Set up oneflow backend
+https://github.com/siliconflow/onediff?tab=readme-ov-file#oneflow
+
+### Set up nexfort backend
+https://github.com/siliconflow/onediff/tree/main/src/onediff/infer_compiler/backends/nexfort
+
+### Set up diffusers
+
+```
+pip3 install --upgrade diffusers[torch]
+```
+### Set up SDXL
+Model version for diffusers: https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0
+
+HF pipeline: https://github.com/huggingface/diffusers/blob/main/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_xl.md
+
+### Set up IP-Adapter
+Models: https://huggingface.co/h94/IP-Adapter/tree/main/sdxl_models
+
+Docs: https://huggingface.co/docs/diffusers/using-diffusers/ip_adapter
+
+
+## Run
+
+### Run without compilation (Baseline)
+```shell
+python3 onediff_diffusers_extensions/examples/text_to_image_ip_adapter.py \
+  --compiler none \
+  --saved_image ip-adapter-none.png \
+  --print-output \
+  --multi-scale
+```
+
+### Run with oneflow backend compilation
+
+```shell
+python3 onediff_diffusers_extensions/examples/text_to_image_ip_adapter.py \
+  --compiler oneflow \
+  --saved_image ip-adapter-oneflow.png \
+  --print-output \
+  --multi-scale
+```
+
+### Run with nexfort backend compilation
+```shell
+python3 onediff_diffusers_extensions/examples/text_to_image_ip_adapter.py \
+  --compiler nexfort \
+  --saved_image ip-adapter-nexfort.png \
+  --print-output \
+  --multi-scale
+```
+
+## Performance comparison
+
+Tested on NVIDIA GeForce RTX 3090 / 4090 with an image size of 1024*1024 and 100 inference steps:
+| Metric                                         | RTX 3090  1024*1024   | RTX 4090 1024*1024    |
+| ---------------------------------------------- | --------------------- | --------------------- |
+| Data update date (yyyy-mm-dd)                  | 2024-07-26            | 2024-07-26            |
+| PyTorch iteration speed                        | 3.74 it/s             | 7.96 it/s             |
+| OneDiff (oneflow) iteration speed              | 6.90 it/s (+84.5%)    | 14.45 it/s (+81.5%)   |
+| OneDiff (nexfort) iteration speed              | 5.42 it/s (+44.9%)    | 11.97 it/s (+50.4%)   |
+| PyTorch E2E time                               | 27.91 s               | 13.62 s               |
+| OneDiff (oneflow) E2E time                     | 15.61 s (-44.1%)      | 7.88 s (-42.1%)       |
+| OneDiff (nexfort) E2E time                     | 19.60 s (-29.8%)      | 9.28 s (-31.9%)       |
+| PyTorch Max Mem Used                           | 14.58 GiB             | 14.59 GiB             |
+| OneDiff (oneflow) Max Mem Used                 | 17.39 GiB             | 17.54 GiB             |
+| OneDiff (nexfort) Max Mem Used                 | 15.10 GiB             | 15.09 GiB             |
+| PyTorch Warmup with Run time                   |                       |                       |
+| OneDiff (oneflow) Warmup with Compilation time | 131.20 s <sup>1</sup> | 76.07 s <sup>2</sup>  |
+| OneDiff (nexfort) Warmup with Compilation time | 702.90 s <sup>1</sup> | 576.90 s <sup>2</sup> |
+| OneDiff (oneflow) Warmup with Cache time       | N/A                   | N/A                   |
+| OneDiff (nexfort) Warmup with Cache time       | 537.88 s <sup>1</sup> | 433.00 s <sup>2</sup> |
+
+<sup>1</sup> OneDiff warmup-with-compilation time was measured on an Intel(R) Xeon(R) Silver 4314 CPU @ 2.40GHz. It is provided for reference only and varies considerably across CPUs.
+
+<sup>2</sup> Measured on an AMD EPYC 7543 32-Core Processor.
+
+
+## Dynamic shape for IP-Adapter
+
+Run:
+
+```shell
+python3 onediff_diffusers_extensions/examples/text_to_image_ip_adapter.py \
+  --compiler oneflow \
+  --saved_image ip-adapter-oneflow.png \
+  --print-output \
+  --multi-scale \
+  --multi-resolution
+```
+## Note
+When you set the scale for the IP-Adapter, pass it as a **`Tensor`** rather than a `float`. To change the scale afterwards, perform an in-place copy into that same tensor (for example `scale_tensor.copy_(new_scale)`) instead of assigning a new value. For more details, please refer to the [code](../text_to_image_ip_adapter.py).
+
+The reason is as follows: if the scale is a `float`, it is compiled into the computation graph as a constant, so it can no longer be changed after compilation. If the scale is a `Tensor`, it is compiled as a variable, so modifying that tensor in place changes the scale that is actually used in the computation.
diff --git a/onediff_diffusers_extensions/examples/text_to_image_ip_adapter.py b/onediff_diffusers_extensions/examples/text_to_image_ip_adapter.py
@@ -0,0 +1,235 @@
+import argparse
+import inspect
+import json
+import os
+import time
+from pathlib import Path
+
+import torch
+
+from diffusers import AutoPipelineForText2Image
+from diffusers.utils import load_image
+from onediffx import compile_pipe, load_pipe, save_pipe
+
+nexfort_options = {  # default value of --compile-options; passed to compile_pipe for the nexfort backend
+    "mode": "cudagraphs:benchmark:max-autotune:low-precision:cache-all",
+    "memory_format": "channels_last",
+    "options": {
+        "inductor.optimize_linear_epilogue": False,
+        "overrides.conv_benchmark": True,
+        "overrides.matmul_allow_tf32": True,
+    },
+}
+
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "--base", type=str, default="stabilityai/stable-diffusion-xl-base-1.0"
+)
+parser.add_argument("--ipadapter", type=str, default="h94/IP-Adapter")
+parser.add_argument("--subfolder", type=str, default="sdxl_models")
+parser.add_argument("--weight_name", type=str, default="ip-adapter_sdxl.bin")
+parser.add_argument(
+    "--input_image",
+    type=str,
+    default="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_diner.png",
+)
+parser.add_argument(
+    "--prompt",
+    default="a polar bear sitting in a chair drinking a milkshake",
+    help="Prompt",
+)
+parser.add_argument(
+    "--negative-prompt",
+    default="deformed, ugly, wrong proportion, low res, bad anatomy, worst quality, low quality",
+    help="Negative prompt",
+)
+parser.add_argument("--height", type=int, default=1024)
+parser.add_argument("--width", type=int, default=1024)
+parser.add_argument("--n_steps", type=int, default=100)
+parser.add_argument(
+    "--saved_image", type=str, required=False, default="ip-adapter-out.png"
+)
+parser.add_argument("--seed", type=int, default=1)
+parser.add_argument("--scale", type=float, default=1.0)  # initial IP-Adapter scale; wrapped in a tensor below
+parser.add_argument("--warmup", type=int, default=1)  # number of warmup pipeline calls
+parser.add_argument(
+    "--compiler", type=str, default="oneflow", choices=["none", "nexfort", "oneflow"]
+)
+parser.add_argument("--compile-options", type=str, default=nexfort_options)  # dict by default; JSON text when given on the CLI
+parser.add_argument("--cache-dir", default="./onediff_cache", help="cache directory")
+parser.add_argument("--multi-scale", action="store_true")  # re-run at several IP-Adapter scales
+parser.add_argument("--multi-resolution", action="store_true")  # re-run at several height/width pairs
+parser.add_argument("--print-output", action="store_true")
+args = parser.parse_args()
+
+
+class IterationProfiler:
+    """Measure denoising iterations per second using CUDA timing events."""
+
+    def __init__(self):
+        self.begin = None  # event recorded by the first callback
+        self.end = None  # event recorded by the latest subsequent callback
+        self.num_iterations = 0  # callbacks counted after the first one
+
+    def get_iter_per_sec(self):
+        """Return iterations/second, or None before two callbacks have run."""
+        if self.begin is None or self.end is None:
+            return None
+        self.end.synchronize()
+        return self.num_iterations / self.begin.elapsed_time(self.end) * 1000.0
+
+    def callback_on_step_end(self, pipe, i, t, callback_kwargs=None):
+        event = torch.cuda.Event(enable_timing=True)  # one timing mark per step
+        event.record()
+        if self.begin is None:
+            self.begin = event
+        else:
+            self.end = event
+            self.num_iterations += 1
+        return {} if callback_kwargs is None else callback_kwargs  # no shared default
+
+
+# Load the reference image that is passed to the pipeline as `ip_adapter_image`.
+ip_adapter_image = load_image(args.input_image)
+
+# Load the base pipeline in fp16 and attach the IP-Adapter weights to it.
+pipe = AutoPipelineForText2Image.from_pretrained(
+    args.base,
+    torch_dtype=torch.float16,
+    variant="fp16",
+)
+pipe.load_ip_adapter(
+    args.ipadapter, subfolder=args.subfolder, weight_name=args.weight_name
+)
+
+# Set the IP-Adapter scale as a CUDA tensor instead of a float.
+# If the scale is a float, it cannot be modified after the graph is traced.
+ipadapter_scale = torch.tensor(args.scale, dtype=torch.float, device="cuda")
+pipe.set_ip_adapter_scale(ipadapter_scale)
+pipe.to("cuda")
+
+
+cache_path = os.path.join(args.cache_dir, type(pipe).__name__)  # one cache dir per pipeline class
+
+if args.compiler == "none":
+    pass  # baseline: leave the pipeline uncompiled
+elif args.compiler == "nexfort":
+    compile_options = args.compile_options
+    if isinstance(compile_options, str):  # options given on the CLI arrive as JSON text
+        compile_options = json.loads(compile_options)
+    if args.multi_resolution:
+        compile_options["dynamic"] = True  # set only for --multi-resolution runs
+    os.environ.setdefault("TORCHINDUCTOR_CACHE_DIR", "./.torchinductor")
+    pipe = compile_pipe(pipe, backend="nexfort", options=compile_options)
+else:
+    pipe = compile_pipe(pipe, backend="oneflow")
+    if os.path.exists(cache_path):
+        # TODO(WangYi): load_pipe has a bug here which makes the scale unchangeable,
+        # so loading stays disabled for now: load_pipe(pipe, cache_path)
+        pass
+
+
+# NOTE: Warm up the pipeline before the timed run.
+# The initial calls will trigger compilation and might be very slow.
+# After that, subsequent calls should be very fast.
+if args.warmup > 0:
+    begin = time.time()  # wall-clock start of the whole warmup phase
+    print("=======================================")
+    print("Begin warmup")
+    for _ in range(args.warmup):
+        pipe(
+            prompt=args.prompt,
+            height=args.height,
+            width=args.width,
+            ip_adapter_image=ip_adapter_image,
+            num_inference_steps=args.n_steps,
+        )
+    end = time.time()
+    print("End warmup")
+    print(f"Warmup time: {end - begin:.3f}s")
+    print("=======================================")
+
+# Let's see it!
+# Note: Progress bar might work incorrectly due to the async nature of CUDA.
+kwarg_inputs = dict(
+    prompt=args.prompt,
+    ip_adapter_image=ip_adapter_image,
+    negative_prompt=args.negative_prompt,
+    height=args.height,
+    width=args.width,
+    num_inference_steps=args.n_steps,
+    generator=torch.Generator(device="cpu").manual_seed(args.seed),  # honor --seed
+)
+iter_profiler = IterationProfiler()
+if "callback_on_step_end" in inspect.signature(pipe).parameters:
+    kwarg_inputs["callback_on_step_end"] = iter_profiler.callback_on_step_end
+elif "callback" in inspect.signature(pipe).parameters:
+    kwarg_inputs["callback"] = iter_profiler.callback_on_step_end
+begin = time.time()
+image_to_print = pipe(**kwarg_inputs).images[0]
+end = time.time()  # stop the clock before any disk I/O so only inference is timed
+image_to_print.save("result.png")
+image_path = (
+    f"{Path(args.saved_image).stem}_{args.scale}_{args.compiler}"
+    + Path(args.saved_image).suffix
+)
+print(f"save output image to {image_path}")
+image_to_print.save(image_path)
+
+print("=======================================")
+print(f"Inference time: {end - begin:.3f}s")
+iter_per_sec = iter_profiler.get_iter_per_sec()
+if iter_per_sec is not None:
+    print(f"Iterations per second: {iter_per_sec:.3f}")
+if args.compiler == "oneflow":
+    import oneflow as flow  # usort: skip
+
+    cuda_mem_after_used = flow._oneflow_internal.GetCUDAMemoryUsed() / 1024  # presumably MiB -> GiB; confirm
+else:
+    cuda_mem_after_used = torch.cuda.max_memory_allocated() / (1024**3)  # bytes -> GiB
+print(f"Max used CUDA memory : {cuda_mem_after_used:.3f}GiB")
+print("=======================================")
+
+
+if args.multi_scale:
+    saved = Path(args.saved_image)
+    generator = kwarg_inputs["generator"]
+    for scale in (0.1, 0.5, 1):
+        # Use ipadapter_scale.copy_ to modify the scale in place so the traced graph sees it
+        ipadapter_scale.copy_(torch.tensor(scale, dtype=torch.float, device="cuda"))
+        pipe.set_ip_adapter_scale(ipadapter_scale)
+        generator.manual_seed(generator.initial_seed())  # same starting noise for every scale
+        image = pipe(**kwarg_inputs).images[0]
+        image_path = f"{saved.stem}_{scale}{saved.suffix}"
+        print(f"save output image to {image_path}")
+        image.save(image_path)
+
+if args.multi_resolution:
+    from itertools import product
+
+    sizes = [1024, 512, 768, 256]
+    for h, w in product(sizes, sizes):
+        print(f"Running at resolution: {h}x{w}")  # announce before the (possibly slow) run
+        image = pipe(
+            prompt=args.prompt,
+            ip_adapter_image=ip_adapter_image,
+            negative_prompt=args.negative_prompt,
+            height=h,
+            width=w,
+            num_inference_steps=args.n_steps,
+            generator=torch.Generator(device="cpu").manual_seed(args.seed),  # honor --seed
+        ).images[0]
+
+if args.print_output:  # optional terminal preview of the timed run's image
+    from onediff.utils.import_utils import is_nexfort_available
+
+    if is_nexfort_available():  # print_image is imported from nexfort; otherwise nothing is printed
+        from nexfort.utils.term_image import print_image
+
+        print_image(image_to_print, max_width=80)
+
+
+if args.compiler == "oneflow":
+    # exist_ok=True replaces the separate exists() check and avoids a check-then-create race.
+    os.makedirs(cache_path, exist_ok=True)
+    save_pipe(pipe, cache_path)
diff --git a/src/infer_compiler_registry/register_diffusers/__init__.py b/src/infer_compiler_registry/register_diffusers/__init__.py
@@ -70,6 +70,7 @@
 from .attention_processor_oflow import (
     Attention as AttentionOflow,
     AttnProcessor as AttnProcessorOflow,
+    is_ip_adapter_available,
     LoRAAttnProcessor2_0 as LoRAAttnProcessorOflow,
 )
 from .resnet_oflow import Upsample2D as Upsample2DOflow
@@ -102,6 +103,22 @@
         LoRAAttnProcessor2_0: LoRAAttnProcessorOflow,
     }
 
+if is_ip_adapter_available():  # presumably true when diffusers provides the IP-Adapter processors; confirm
+    from diffusers.models.attention_processor import (
+        IPAdapterAttnProcessor,
+        IPAdapterAttnProcessor2_0,
+    )
+
+    from .attention_processor_oflow import (
+        IPAdapterAttnProcessor as IPAdapterAttnProcessorOflow,
+        IPAdapterAttnProcessor2_0 as IPAdapterLoRAAttnProcessor2_0Oflow,  # NOTE(review): alias says "LoRA" but names the plain 2_0 processor
+    )
+
+    torch2oflow_class_map.update({IPAdapterAttnProcessor: IPAdapterAttnProcessorOflow})  # diffusers class -> oflow class
+    torch2oflow_class_map.update(
+        {IPAdapterAttnProcessor2_0: IPAdapterLoRAAttnProcessor2_0Oflow}
+    )
+
 torch2oflow_class_map.update({Transformer2DModel: Transformer2DModelOflow})
 torch2oflow_class_map.update({UNet2DConditionModel: UNet2DConditionModelOflow})
 torch2oflow_class_map.update({AttnUpBlock2D: AttnUpBlock2DOflow})