Commit e9521cf (1 parent: 0acf4cb)

Feature/qwen edit plus (#180)

* support qwen edit 2509
* fix attn type
* revert loader && ruff
* apply suggestions

Co-authored-by: zhuguoxuan.zgx <zhuguoxuan.zgx@alibaba-inc.com>

7 files changed: +94 additions, -20 deletions

diffsynth_engine/models/qwen_image/qwen_image_dit.py

Lines changed: 7 additions & 5 deletions
@@ -449,7 +449,7 @@ def forward(
         cfg_parallel(
             (
                 image,
-                edit,
+                *(edit if edit is not None else ()),
                 timestep,
                 text,
                 text_seq_lens,
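
The star-unpack above lets the now-optional, possibly multi-element edit list be spliced into the argument tuple without a separate branch. A tiny illustration with placeholder strings (names are illustrative only):

edit = None
args = ("image", *(edit if edit is not None else ()), "timestep")
# -> ("image", "timestep")

edit = ["edit_0", "edit_1"]
args = ("image", *(edit if edit is not None else ()), "timestep")
# -> ("image", "edit_0", "edit_1", "timestep")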
@@ -472,10 +472,12 @@ def forward(
             image = torch.cat([image, context_latents], dim=1)
             video_fhw += [(1, h // 2, w // 2)]
         if edit is not None:
-            edit = edit.to(dtype=image.dtype)
-            edit = self.patchify(edit)
-            image = torch.cat([image, edit], dim=1)
-            video_fhw += [(1, h // 2, w // 2)]
+            for img in edit:
+                img = img.to(dtype=image.dtype)
+                edit_h, edit_w = img.shape[-2:]
+                img = self.patchify(img)
+                image = torch.cat([image, img], dim=1)
+                video_fhw += [(1, edit_h // 2, edit_w // 2)]
 
         rotary_emb = self.pos_embed(video_fhw, text_seq_len, image.device)

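For context, a self-contained sketch of what the new loop does: each edit latent is patchified independently and appends its own (1, edit_h // 2, edit_w // 2) entry to video_fhw, so reference images no longer need to match the target resolution. The 2x2 patchify below is an assumption for illustration; the module's real patchify may differ in detail.

import torch

def patchify(x: torch.Tensor) -> torch.Tensor:
    # Assumed behavior: flatten 2x2 patches, [B, C, H, W] -> [B, (H//2)*(W//2), C*4].
    b, c, h, w = x.shape
    x = x.view(b, c, h // 2, 2, w // 2, 2)
    return x.permute(0, 2, 4, 1, 3, 5).reshape(b, (h // 2) * (w // 2), c * 4)

image = torch.randn(1, 16, 64, 64)  # target latent
tokens = patchify(image)
video_fhw = [(1, 32, 32)]

edit = [torch.randn(1, 16, 48, 64), torch.randn(1, 16, 64, 32)]  # two differently sized edit latents
for img in edit:
    img = img.to(dtype=tokens.dtype)
    edit_h, edit_w = img.shape[-2:]
    tokens = torch.cat([tokens, patchify(img)], dim=1)
    video_fhw += [(1, edit_h // 2, edit_w // 2)]

print(tokens.shape)  # torch.Size([1, 2304, 64]) == 1024 + 768 + 512 tokens
print(video_fhw)     # [(1, 32, 32), (1, 24, 32), (1, 16, 16)]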
diffsynth_engine/pipelines/qwen_image.py

Lines changed: 41 additions & 13 deletions
@@ -107,10 +107,14 @@ def __init__(
             dtype=config.model_dtype,
         )
         self.config = config
+        # qwen image
         self.prompt_template_encode = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
         self.prompt_template_encode_start_idx = 34
-
+        # qwen image edit
         self.edit_prompt_template_encode = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n"
+        # qwen image edit plus
+        self.edit_plus_prompt_template_encode = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
+
         self.edit_prompt_template_encode_start_idx = 64
 
         # sampler
@@ -282,7 +286,7 @@ def load_loras(self, lora_list: List[Tuple[str, float]], fused: bool = True, sav
 
     def unload_loras(self):
         self.dit.unload_loras()
-        self.noise_scheduler.restore_scheduler_config()
+        self.noise_scheduler.restore_config()
 
     def apply_scheduler_config(self, scheduler_config: Dict):
         self.noise_scheduler.update_config(scheduler_config)
@@ -339,16 +343,27 @@ def encode_prompt(
     def encode_prompt_with_image(
         self,
         prompt: Union[str, List[str]],
-        image: torch.Tensor,
+        vae_image: List[torch.Tensor],
+        condition_image: List[torch.Tensor],  # edit plus
         num_images_per_prompt: int = 1,
         max_sequence_length: int = 1024,
+        is_edit_plus: bool = True,
     ):
         prompt = [prompt] if isinstance(prompt, str) else prompt
 
         batch_size = len(prompt)
         template = self.edit_prompt_template_encode
         drop_idx = self.edit_prompt_template_encode_start_idx
-        texts = [template.format(txt) for txt in prompt]
+        if not is_edit_plus:
+            template = self.edit_prompt_template_encode
+            texts = [template.format(txt) for txt in prompt]
+            image = vae_image
+        else:
+            template = self.edit_plus_prompt_template_encode
+            img_prompt_template = "Picture {}: <|vision_start|><|image_pad|><|vision_end|>"
+            img_prompt = "".join([img_prompt_template.format(i + 1) for i in range(len(condition_image))])
+            texts = [template.format(img_prompt + e) for e in prompt]
+            image = condition_image
 
         model_inputs = self.processor(text=texts, images=image, max_length=max_sequence_length + drop_idx)
         input_ids, attention_mask, pixel_values, image_grid_thw = (
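
In the edit-plus branch above, each reference image receives a numbered "Picture N" vision placeholder in front of the user instruction. A minimal sketch of the string assembly (template abbreviated here; the full system prompt appears in the __init__ diff above, and the example prompt is invented):

template = "<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"  # abbreviated
img_prompt_template = "Picture {}: <|vision_start|><|image_pad|><|vision_end|>"

condition_image = ["img_a", "img_b"]  # stand-ins for two resized PIL images
prompt = ["Put the person from Picture 1 into the scene from Picture 2."]

img_prompt = "".join(img_prompt_template.format(i + 1) for i in range(len(condition_image)))
texts = [template.format(img_prompt + p) for p in prompt]
# texts[0] starts with:
#   "<|im_start|>user\nPicture 1: <|vision_start|><|image_pad|><|vision_end|>"
#   "Picture 2: <|vision_start|><|image_pad|><|vision_end|>Put the person from ..."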
@@ -454,7 +469,7 @@ def predict_noise_with_cfg(
             entity_masks = [torch.cat([mask, mask], dim=0) for mask in entity_masks]
         latents = torch.cat([latents, latents], dim=0)
         if image_latents is not None:
-            image_latents = torch.cat([image_latents, image_latents], dim=0)
+            image_latents = [torch.cat([image_latent, image_latent], dim=0) for image_latent in image_latents]
         if context_latents is not None:
             context_latents = torch.cat([context_latents, context_latents], dim=0)
         timestep = torch.cat([timestep, timestep], dim=0)
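
Since image_latents is now a list with one tensor per reference image, the true-CFG batch duplication is applied element-wise rather than to a single tensor. A minimal sketch with made-up shapes:

import torch

image_latents = [torch.randn(1, 16, 64, 64), torch.randn(1, 16, 48, 64)]
# Duplicate along the batch dim so the positive and negative prompts
# share one forward pass.
image_latents = [torch.cat([x, x], dim=0) for x in image_latents]
assert [x.shape[0] for x in image_latents] == [2, 2]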
@@ -543,7 +558,8 @@ def __call__(
         self,
         prompt: str,
         negative_prompt: str = "",
-        input_image: Image.Image | None = None,  # use for img2img
+        # single image for edit, list for edit plus (QwenImageEdit2509)
+        input_image: List[Image.Image] | Image.Image | None = None,
         cfg_scale: float = 4.0,  # true cfg
         height: int = 1328,
         width: int = 1328,
@@ -555,10 +571,20 @@ def __call__(
         entity_prompts: Optional[List[str]] = None,
         entity_masks: Optional[List[Image.Image]] = None,
     ):
+        is_edit_plus = isinstance(input_image, list)
         if input_image is not None:
-            width, height = input_image.size
-            width, height = self.calculate_dimensions(1024 * 1024, width / height)
-            input_image = input_image.resize((width, height), Image.LANCZOS)
+            if not isinstance(input_image, list):
+                input_image = [input_image]
+            condition_images = []
+            vae_images = []
+            for img in input_image:
+                img_width, img_height = img.size
+                condition_width, condition_height = self.calculate_dimensions(384 * 384, img_width / img_height)
+                vae_width, vae_height = self.calculate_dimensions(1024 * 1024, img_width / img_height)
+                condition_images.append(img.resize((condition_width, condition_height), Image.LANCZOS))
+                vae_images.append(img.resize((vae_width, vae_height), Image.LANCZOS))
+
+            width, height = vae_images[-1].size
 
         self.validate_image_size(height, width, minimum=64, multiple_of=16)

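Each input image is now prepared at two resolutions: a small copy with roughly 384*384 pixels of area for the vision-language encoder (condition_images) and a larger copy with roughly 1024*1024 pixels for VAE encoding (vae_images), both preserving aspect ratio. The sketch below uses a hypothetical calculate_dimensions; the pipeline's actual helper may round sides differently:

import math
from PIL import Image

def calculate_dimensions(target_area: int, ratio: float, multiple_of: int = 32):
    # Hypothetical stand-in: fit target_area at the given aspect ratio,
    # snapping both sides to a multiple of 32.
    width = math.sqrt(target_area * ratio)
    height = width / ratio
    return int(round(width / multiple_of) * multiple_of), int(round(height / multiple_of) * multiple_of)

img = Image.new("RGB", (1920, 1080))
ratio = img.width / img.height
print(calculate_dimensions(384 * 384, ratio))    # (512, 288)   -> encoder copy
print(calculate_dimensions(1024 * 1024, ratio))  # (1376, 768)  -> VAE copy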
@@ -567,7 +593,7 @@ def __call__(
 
         context_latents = None
         for param in controlnet_params:
-            self.load_lora(param.model, param.scale, fused=True, save_original_weight=False)
+            self.load_lora(param.model, param.scale, fused=False, save_original_weight=False)
             if param.control_type == QwenImageControlType.in_context:
                 width, height = param.image.size
                 self.validate_image_size(height, width, minimum=64, multiple_of=16)
@@ -585,16 +611,18 @@ def __call__(
 
         self.load_models_to_device(["vae"])
         if input_image:
-            image_latents = self.prepare_image_latents(input_image)
+            image_latents = [self.prepare_image_latents(img) for img in vae_images]
         else:
             image_latents = None
 
         self.load_models_to_device(["encoder"])
         if image_latents is not None:
-            prompt_emb, prompt_emb_mask = self.encode_prompt_with_image(prompt, input_image, 1, 4096)
+            prompt_emb, prompt_emb_mask = self.encode_prompt_with_image(
+                prompt, vae_images, condition_images, 1, 4096, is_edit_plus
+            )
             if cfg_scale > 1.0 and negative_prompt != "":
                 negative_prompt_emb, negative_prompt_emb_mask = self.encode_prompt_with_image(
-                    negative_prompt, input_image, 1, 4096
+                    negative_prompt, vae_images, condition_images, 1, 4096, is_edit_plus
                 )
             else:
                 negative_prompt_emb, negative_prompt_emb_mask = None, None
(binary image, 1.09 MB)

tests/data/input/qwen_1.png (binary image, 421 KB)

tests/data/input/qwen_2.png (binary image, 466 KB)

tests/test_pipelines/test_qwen_image_controlnet.py

Lines changed: 4 additions & 2 deletions
@@ -38,7 +38,8 @@ def test_incontext_canny(self):
             seed=42,
             controlnet_params=param,
         )
-        self.assertImageEqualAndSaveFailed(image, "qwen_image/qwen_image_canny.png", threshold=0.99)
+        self.assertImageEqualAndSaveFailed(image, "qwen_image/qwen_image_canny.png", threshold=0.95)
+        self.pipe.unload_loras()
 
     def test_incontext_depth(self):
         param = QwenImageControlNetParams(
@@ -54,7 +55,8 @@ def test_incontext_depth(self):
             seed=42,
             controlnet_params=param,
         )
-        self.assertImageEqualAndSaveFailed(image, "qwen_image/qwen_image_depth.png", threshold=0.99)
+        self.assertImageEqualAndSaveFailed(image, "qwen_image/qwen_image_depth.png", threshold=0.95)
+        self.pipe.unload_loras()
 
 
 if __name__ == "__main__":
Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
+import unittest
+import torch
+
+from diffsynth_engine import QwenImagePipelineConfig
+from diffsynth_engine.pipelines import QwenImagePipeline
+from diffsynth_engine.utils.download import fetch_model
+from tests.common.test_case import ImageTestCase
+
+
+class TestQwenImageEditPlusPipeline(ImageTestCase):
+    @classmethod
+    def setUpClass(cls):
+        config = QwenImagePipelineConfig(
+            model_path=fetch_model("Qwen/Qwen-Image-Edit-2509", path="transformer/*.safetensors"),
+            encoder_path=fetch_model("Qwen/Qwen-Image-Edit-2509", path="text_encoder/*.safetensors"),
+            vae_path=fetch_model("Qwen/Qwen-Image-Edit-2509", path="vae/*.safetensors"),
+            model_dtype=torch.bfloat16,
+            encoder_dtype=torch.bfloat16,
+            vae_dtype=torch.float32,
+        )
+        cls.pipe = QwenImagePipeline.from_pretrained(config)
+
+    @classmethod
+    def tearDownClass(cls):
+        del cls.pipe
+
+    def test_txt2img(self):
+        image = self.pipe(
+            prompt="根据这图1中女性和图2中的男性,生成一组结婚照,并遵循以下描述:新郎穿着红色的中式马褂,新娘穿着精致的秀禾服,头戴金色凤冠。他们并肩站立在古老的朱红色宫墙前,背景是雕花的木窗。光线明亮柔和,构图对称,氛围喜庆而庄重。",
+            input_image=[self.get_input_image("qwen_1.png"), self.get_input_image("qwen_2.png")],
+            negative_prompt=" ",
+            cfg_scale=4.0,
+            width=1328,
+            height=1328,
+            num_inference_steps=40,
+            seed=42,
+        )
+        self.assertImageEqualAndSaveFailed(image, "qwen_image/qwen_image_edit_plus.png", threshold=0.95)
+
+
+if __name__ == "__main__":
+    unittest.main()
