From 5cbed8e0d157f65d3ddc2420dfd09f2df630e978 Mon Sep 17 00:00:00 2001
From: Suraj Patil
Date: Fri, 26 Aug 2022 21:16:43 +0530
Subject: [PATCH] Fix inpainting script (#258)

* expand latents before the check, style

* update readme
---
 examples/inference/inpainting.py | 29 ++++++++++-----------
 examples/inference/readme.md     | 44 +++++++++++++++++++++++++++++++
 2 files changed, 59 insertions(+), 14 deletions(-)

diff --git a/examples/inference/inpainting.py b/examples/inference/inpainting.py
index dec56bf2e6..d8082c6209 100644
--- a/examples/inference/inpainting.py
+++ b/examples/inference/inpainting.py
@@ -11,7 +11,7 @@
 from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
 
 
-def preprocess(image):
+def preprocess_image(image):
     w, h = image.size
     w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
     image = image.resize((w, h), resample=PIL.Image.LANCZOS)
@@ -20,15 +20,16 @@ def preprocess(image):
     image = torch.from_numpy(image)
     return 2.0 * image - 1.0
 
+
 def preprocess_mask(mask):
-    mask=mask.convert("L")
+    mask = mask.convert("L")
     w, h = mask.size
     w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32
-    mask = mask.resize((w//8, h//8), resample=PIL.Image.NEAREST)
+    mask = mask.resize((w // 8, h // 8), resample=PIL.Image.NEAREST)
     mask = np.array(mask).astype(np.float32) / 255.0
-    mask = np.tile(mask,(4,1,1))
-    mask = mask[None].transpose(0, 1, 2, 3)#what does this step do?
-    mask = 1 - mask #repaint white, keep black
+    mask = np.tile(mask, (4, 1, 1))
+    mask = mask[None].transpose(0, 1, 2, 3)  # add a batch dimension (this transpose is a no-op)
+    mask = 1 - mask  # repaint white, keep black
     mask = torch.from_numpy(mask)
     return mask
 
@@ -90,25 +91,25 @@ def __call__(
 
         self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs)
 
-        #preprocess image
-        init_image = preprocess(init_image).to(self.device)
+        # preprocess image
+        init_image = preprocess_image(init_image).to(self.device)
 
         # encode the init image into latents and scale the latents
         init_latents = self.vae.encode(init_image).sample()
         init_latents = 0.18215 * init_latents
+
+        # prepare init_latents noise to latents
+        init_latents = torch.cat([init_latents] * batch_size)
         init_latents_orig = init_latents
 
         # preprocess mask
         mask = preprocess_mask(mask_image).to(self.device)
         mask = torch.cat([mask] * batch_size)
 
-        #check sizes
+        # check sizes
         if not mask.shape == init_latents.shape:
             raise ValueError(f"The mask and init_image should be the same size!")
 
-        # prepare init_latents noise to latents
-        init_latents = torch.cat([init_latents] * batch_size)
-
         # get the original timestep using init_timestep
         init_timestep = int(num_inference_steps * strength) + offset
         init_timestep = min(init_timestep, num_inference_steps)
@@ -172,9 +173,9 @@ def __call__(
             # compute the previous noisy sample x_t -> x_t-1
             latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs)["prev_sample"]
 
-            #masking
+            # masking
             init_latents_proper = self.scheduler.add_noise(init_latents_orig, noise, t)
-            latents = ( init_latents_proper * mask ) + ( latents * (1-mask) )
+            latents = (init_latents_proper * mask) + (latents * (1 - mask))
 
         # scale and decode the image latents with vae
         latents = 1 / 0.18215 * latents
diff --git a/examples/inference/readme.md b/examples/inference/readme.md
index 023912434d..d934231a36 100644
--- a/examples/inference/readme.md
+++ b/examples/inference/readme.md
@@ -52,3 +52,47 @@ You can also run this example on colab [![Open In Colab](https://colab.research
 ## Tweak prompts reusing seeds and latents
 
 You can generate your own latents to reproduce results, or tweak your prompt on a specific result you liked. [This notebook](stable-diffusion-seeds.ipynb) shows how to do it step by step. You can also run it in Google Colab [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pcuenca/diffusers-examples/blob/main/notebooks/stable-diffusion-seeds.ipynb).
+
+
+## In-painting using Stable Diffusion
+
+The `inpainting.py` script implements `StableDiffusionInpaintingPipeline`, which lets you edit specific parts of an image by providing a mask and a text prompt.
+
+### How to use it
+
+```python
+from io import BytesIO
+
+import torch
+from torch import autocast
+import requests
+import PIL
+
+from inpainting import StableDiffusionInpaintingPipeline
+
+def download_image(url):
+    response = requests.get(url)
+    return PIL.Image.open(BytesIO(response.content)).convert("RGB")
+
+img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png"
+mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png"
+
+init_image = download_image(img_url).resize((512, 512))
+mask_image = download_image(mask_url).resize((512, 512))
+
+device = "cuda"
+pipe = StableDiffusionInpaintingPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4",
+    revision="fp16",
+    torch_dtype=torch.float16,
+    use_auth_token=True
+).to(device)
+
+prompt = "a cat sitting on a bench"
+with autocast("cuda"):
+    images = pipe(prompt=prompt, init_image=init_image, mask_image=mask_image, strength=0.75)["sample"]
+
+images[0].save("cat_on_bench.png")
+```
+
+You can also run this example on Colab [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/Notebooks/blob/master/in_painting_with_stable_diffusion_using_diffusers.ipynb)
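
Why the latents are now expanded before the check: `preprocess_mask` tiles the mask to the 4 latent channels and the pipeline repeats it `batch_size` times, so `mask.shape == init_latents.shape` can only hold if `init_latents` is expanded to the batch size first. A minimal sketch of that shape logic with plain tensors (hypothetical 512x512 input, hence 64x64 latents; not the real pipeline):

```python
import torch

batch_size = 2

# Latents for a single VAE-encoded 512x512 image: (1, 4, 64, 64)
init_latents = torch.randn(1, 4, 64, 64)
# Mask after preprocess_mask, tiled to 4 channels: (1, 4, 64, 64)
mask = torch.rand(1, 4, 64, 64)  # values stand in for a real binary mask

# The pipeline repeats the mask once per batch element ...
mask = torch.cat([mask] * batch_size)  # (2, 4, 64, 64)

# ... so the latents must be expanded *before* the size check,
# otherwise (1, 4, 64, 64) != (2, 4, 64, 64) whenever batch_size > 1.
init_latents = torch.cat([init_latents] * batch_size)  # (2, 4, 64, 64)

assert mask.shape == init_latents.shape
```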
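
The mask convention follows from `mask = 1 - mask  # repaint white, keep black` in `preprocess_mask`: white pixels mark the region to regenerate and black pixels keep the original image. If you want to build a mask programmatically instead of downloading one, a hypothetical sketch with PIL (the rectangle coordinates are arbitrary):

```python
import PIL.Image
import PIL.ImageDraw

# Start fully black: keep the whole original image.
mask_image = PIL.Image.new("L", (512, 512), 0)

# Paint the region to be regenerated white.
draw = PIL.ImageDraw.Draw(mask_image)
draw.rectangle((128, 128, 384, 384), fill=255)

mask_image.save("mask.png")
```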