From 5285e8e3f629b0a418b2713e36693e067c8234e9 Mon Sep 17 00:00:00 2001
From: shauray8 <shauray9@gmail.com>
Date: Sat, 29 Jun 2024 23:48:31 +0530
Subject: [PATCH 1/6] add watermark utils

---
 src/diffusers/utils/testing_utils.py | 81 ++++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)

diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py
index 8a6afd768428..dbca32266746 100644
--- a/src/diffusers/utils/testing_utils.py
+++ b/src/diffusers/utils/testing_utils.py
@@ -583,6 +583,87 @@ def export_to_video(video_frames: List[np.ndarray], output_video_path: str = Non
     return output_video_path
 
 
+def watermark_video_frames(frames: List[PIL.Image], watermark_path: str, opacity: float = .75, scale_down: int = 5, padding_right: int = 10, padding_down: int = 10) -> List[PIL.Image]:
+    if not frames:
+        return []
+
+    def prepare_watermark(watermark, target_width):
+        aspect_ratio = watermark.width / watermark.height
+        new_height = int(target_width / aspect_ratio)
+        watermark = watermark.resize((target_width, new_height), Image.LANCZOS)
+        return np.array(watermark)
+
+    def apply_watermark(frame, watermark, position):
+        frame_array = np.array(frame.convert("RGBA"))
+        h, w = watermark.shape[:2]
+        x, y = position
+        roi = frame_array[y:y+h, x:x+w]
+        alpha = watermark[:, :, 3] / 255.
+        for c in range(3):
+            roi[:, :, c] = watermark[:, :, c] * (alpha * opacity) + roi[:, :, c] * (1 - alpha * opacity)
+        roi[:, :, 3] = np.maximum(roi[:, :, 3], alpha * 255)
+        return Image.fromarray(frame_array.astype(np.uint8))
+
+    sample_frame = frames[0]
+    with Image.open(watermark_path) as watermark:
+        watermark = watermark.convert("RGBA")
+        watermark = prepare_watermark(watermark, sample_frame.width // scale_down)
+
+    position = (sample_frame.width - watermark.shape[1] - padding_right,
+                sample_frame.height - watermark.shape[0] - padding_down)
+
+    watermark_func = partial(apply_watermark, watermark=watermark, position=position)
+
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        return list(executor.map(watermark_func, frames))
+
+def watermark_image(
+    image: PIL.Image,
+    watermark_path: str,
+    opacity: float = 0.713,
+    scale_down: int = 6,
+    padding_right: int = 10,
+    padding_down: int = 10,
+) -> PIL.Image:
+
+    watermark_array = np.array(PIL.Image.open(logo))
+    img_array = np.array(image)
+    img_h, img_w = img_array.shape[:2]
+    water_h, water_w = watermark_array.shape[:2]
+    scale = min(img_w / (water_w * scale_down), img_h / (water_h * scale_down))
+    new_size = (int(water_w * scale), int(water_h * scale))
+    _watermark_array = np.array(
+        Image.fromarray(watermark_array).resize(new_size, Image.LANCZOS)
+    )
+
+    x, y = (img_w - new_size[0] - padding_right), (img_h - new_size[1] - padding_down)
+
+    if img_array.shape[2] == 3:
+        img_array = np.dstack(
+            [img_array, np.full(img_array.shape[:2], 255, dtype=np.uint8)]
+        )
+    if _watermark_array.shape[2] == 3:
+        _watermark_array = np.dstack(
+            [_watermark_array, np.full(_watermark_array.shape[:2], 255, dtype=np.uint8)]
+        )
+
+    alpha_watermark = _watermark_array[:, :, 3] / 255.0
+    alpha_image = img_array[:, :, 3] / 255.0
+
+    roi = img_array[y : y + new_size[1], x : x + new_size[0]]
+    for c in range(3):
+        roi[:, :, c] = _watermark_array[:, :, c] * (alpha_watermark * opacity) + roi[
+            :, :, c
+        ] * (1 - alpha_watermark * opacity)
+
+    roi[:, :, 3] = (
+        alpha_watermark
+        + alpha_image[y : y + new_size[1], x : x + new_size[0]] * (1 - alpha_watermark)
+    ) * 255
+
+    return Image.fromarray(img_array)
+
+
 def load_hf_numpy(path) -> np.ndarray:
     base_url = "https://huggingface.co/datasets/fusing/diffusers-testing/resolve/main"
 

From b934a9a22b20a8433ccf3babdf9aa93746ade9a7 Mon Sep 17 00:00:00 2001
From: shauray8 <shauray9@gmail.com>
Date: Sat, 29 Jun 2024 23:49:27 +0530
Subject: [PATCH 2/6] export watermark utils from utils/__init__.py

---
 src/diffusers/utils/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py
index f635ea03f62f..4979891509be 100644
--- a/src/diffusers/utils/__init__.py
+++ b/src/diffusers/utils/__init__.py
@@ -38,7 +38,7 @@
 from .deprecation_utils import deprecate
 from .doc_utils import replace_example_docstring
 from .dynamic_modules_utils import get_class_from_dynamic_module
-from .export_utils import export_to_gif, export_to_obj, export_to_ply, export_to_video
+from .export_utils import export_to_gif, export_to_obj, export_to_ply, export_to_video, watermark_image, watermark_video_frames
 from .hub_utils import (
     PushToHubMixin,
     _add_variant,

From dea209a0dfef93d2f51cb1e9d6736167b5d57681 Mon Sep 17 00:00:00 2001
From: shauray8 <shauray9@gmail.com>
Date: Sat, 29 Jun 2024 23:52:44 +0530
Subject: [PATCH 3/6] reference Image as PIL.Image and sort utils imports

---
 src/diffusers/utils/__init__.py      | 11 +++++++++--
 src/diffusers/utils/testing_utils.py |  8 ++++----
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py
index 4979891509be..c909ce49c320 100644
--- a/src/diffusers/utils/__init__.py
+++ b/src/diffusers/utils/__init__.py
@@ -38,7 +38,14 @@
 from .deprecation_utils import deprecate
 from .doc_utils import replace_example_docstring
 from .dynamic_modules_utils import get_class_from_dynamic_module
-from .export_utils import export_to_gif, export_to_obj, export_to_ply, export_to_video, watermark_image, watermark_video_frames
+from .export_utils import (
+    export_to_gif,
+    export_to_obj,
+    export_to_ply,
+    export_to_video,
+    watermark_image,
+    watermark_video_frames,
+)
 from .hub_utils import (
     PushToHubMixin,
     _add_variant,
@@ -80,8 +87,8 @@
     is_safetensors_available,
     is_scipy_available,
     is_tensorboard_available,
-    is_torch2_available,
     is_timm_available,
+    is_torch2_available,
     is_torch_available,
     is_torch_npu_available,
     is_torch_version,
diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py
index dbca32266746..98038b4f1b28 100644
--- a/src/diffusers/utils/testing_utils.py
+++ b/src/diffusers/utils/testing_utils.py
@@ -602,10 +602,10 @@ def apply_watermark(frame, watermark, position):
         for c in range(3):
             roi[:, :, c] = watermark[:, :, c] * (alpha * opacity) + roi[:, :, c] * (1 - alpha * opacity)
         roi[:, :, 3] = np.maximum(roi[:, :, 3], alpha * 255)
-        return Image.fromarray(frame_array.astype(np.uint8))
+        return PIL.Image.fromarray(frame_array.astype(np.uint8))
 
     sample_frame = frames[0]
-    with Image.open(watermark_path) as watermark:
+    with PIL.Image.open(watermark_path) as watermark:
         watermark = watermark.convert("RGBA")
         watermark = prepare_watermark(watermark, sample_frame.width // scale_down)
 
@@ -633,7 +633,7 @@ def watermark_image(
     scale = min(img_w / (water_w * scale_down), img_h / (water_h * scale_down))
     new_size = (int(water_w * scale), int(water_h * scale))
     _watermark_array = np.array(
-        Image.fromarray(watermark_array).resize(new_size, Image.LANCZOS)
+        PIL.Image.fromarray(watermark_array).resize(new_size, PIL.Image.LANCZOS)
     )
 
     x, y = (img_w - new_size[0] - padding_right), (img_h - new_size[1] - padding_down)
@@ -661,7 +661,7 @@ def watermark_image(
         + alpha_image[y : y + new_size[1], x : x + new_size[0]] * (1 - alpha_watermark)
     ) * 255
 
-    return Image.fromarray(img_array)
+    return PIL.Image.fromarray(img_array)
 
 
 def load_hf_numpy(path) -> np.ndarray:

From 38c341e0397b985ac6c54582dc84aee266dd148b Mon Sep 17 00:00:00 2001
From: shauray8 <shauray9@gmail.com>
Date: Thu, 25 Jul 2024 23:32:21 +0530
Subject: [PATCH 4/6] fix copied

---
 src/diffusers/pipelines/__init__.py                | 2 +-
 src/diffusers/plus_pipelines/ella/pipeline_ella.py | 8 ++++----
 src/diffusers/utils/testing_utils.py               | 2 +-
 tests/pipelines/test_pipelines_common.py           | 1 -
 4 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py
index 8ebde7fd191c..bce9924ce5a9 100644
--- a/src/diffusers/pipelines/__init__.py
+++ b/src/diffusers/pipelines/__init__.py
@@ -527,8 +527,8 @@
         )
         from .paint_by_example import PaintByExamplePipeline
         from .pia import PIAPipeline
-        from .plus_pipelines import ClothAdapter, OmsDiffusionPipeline
         from .pixart_alpha import PixArtAlphaPipeline, PixArtSigmaPipeline
+        from .plus_pipelines import ClothAdapter, OmsDiffusionPipeline
         from .semantic_stable_diffusion import SemanticStableDiffusionPipeline
         from .shap_e import ShapEImg2ImgPipeline, ShapEPipeline
         from .stable_cascade import (
diff --git a/src/diffusers/plus_pipelines/ella/pipeline_ella.py b/src/diffusers/plus_pipelines/ella/pipeline_ella.py
index b687d0e3dfd1..9962d2c378b6 100644
--- a/src/diffusers/plus_pipelines/ella/pipeline_ella.py
+++ b/src/diffusers/plus_pipelines/ella/pipeline_ella.py
@@ -720,7 +720,7 @@ def prepare_latents(
     # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
     def get_guidance_scale_embedding(
         self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
-    ) -> torch.FloatTensor:
+    ) -> torch.Tensor:
         """
         See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
 
@@ -733,7 +733,7 @@ def get_guidance_scale_embedding(
                 Data type of the generated embeddings.
 
         Returns:
-            `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
+            `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
         """
         assert len(w.shape) == 1
         w = w * 1000.0
@@ -1651,7 +1651,7 @@ def prepare_latents(
     # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
     def get_guidance_scale_embedding(
         self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
-    ) -> torch.FloatTensor:
+    ) -> torch.Tensor:
         """
         See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
 
@@ -1664,7 +1664,7 @@ def get_guidance_scale_embedding(
                 Data type of the generated embeddings.
 
         Returns:
-            `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
+            `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
         """
         assert len(w.shape) == 1
         w = w * 1000.0
diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py
index 14accc54933b..f445b4803bdf 100644
--- a/src/diffusers/utils/testing_utils.py
+++ b/src/diffusers/utils/testing_utils.py
@@ -593,7 +593,7 @@ def watermark_video_frames(frames: List[PIL.Image], watermark_path: str, opacity
     def prepare_watermark(watermark, target_width):
         aspect_ratio = watermark.width / watermark.height
         new_height = int(target_width / aspect_ratio)
-        watermark = watermark.resize((target_width, new_height), Image.LANCZOS)
+        watermark = watermark.resize((target_width, new_height), PIL.Image.LANCZOS)
         return np.array(watermark)
 
     def apply_watermark(frame, watermark, position):
diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py
index e602fa095052..06fcc1c90b71 100644
--- a/tests/pipelines/test_pipelines_common.py
+++ b/tests/pipelines/test_pipelines_common.py
@@ -38,7 +38,6 @@
 from diffusers.models.unets.unet_i2vgen_xl import I2VGenXLUNet
 from diffusers.models.unets.unet_motion_model import UNetMotionModel
 from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
-from diffusers.plus_models import ELLAProxyUNet
 from diffusers.schedulers import KarrasDiffusionSchedulers
 from diffusers.utils import logging
 from diffusers.utils.import_utils import is_accelerate_available, is_accelerate_version, is_xformers_available

From 939f8eeddecfb1dd16a513c284b20c858e399f1b Mon Sep 17 00:00:00 2001
From: shauray8 <shauray9@gmail.com>
Date: Thu, 25 Jul 2024 23:37:03 +0530
Subject: [PATCH 5/6] fix undefined names and reformat watermark utils

---
 src/diffusers/utils/testing_utils.py | 47 +++++++++++++++-------------
 1 file changed, 25 insertions(+), 22 deletions(-)

diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py
index f445b4803bdf..95e71248055c 100644
--- a/src/diffusers/utils/testing_utils.py
+++ b/src/diffusers/utils/testing_utils.py
@@ -1,3 +1,4 @@
+import concurrent
 import functools
 import importlib
 import inspect
@@ -586,7 +587,14 @@ def export_to_video(video_frames: List[np.ndarray], output_video_path: str = Non
     return output_video_path
 
 
-def watermark_video_frames(frames: List[PIL.Image], watermark_path: str, opacity: float = .75, scale_down: int = 5, padding_right: int = 10, padding_down: int = 10) -> List[PIL.Image]:
+def watermark_video_frames(
+    frames: List[PIL.Image],
+    watermark_path: str,
+    opacity: float = 0.75,
+    scale_down: int = 5,
+    padding_right: int = 10,
+    padding_down: int = 10,
+) -> List[PIL.Image]:
     if not frames:
         return []
 
@@ -600,8 +608,8 @@ def apply_watermark(frame, watermark, position):
         frame_array = np.array(frame.convert("RGBA"))
         h, w = watermark.shape[:2]
         x, y = position
-        roi = frame_array[y:y+h, x:x+w]
-        alpha = watermark[:, :, 3] / 255.
+        roi = frame_array[y : y + h, x : x + w]
+        alpha = watermark[:, :, 3] / 255.0
         for c in range(3):
             roi[:, :, c] = watermark[:, :, c] * (alpha * opacity) + roi[:, :, c] * (1 - alpha * opacity)
         roi[:, :, 3] = np.maximum(roi[:, :, 3], alpha * 255)
@@ -612,14 +620,17 @@ def apply_watermark(frame, watermark, position):
         watermark = watermark.convert("RGBA")
         watermark = prepare_watermark(watermark, sample_frame.width // scale_down)
 
-    position = (sample_frame.width - watermark.shape[1] - padding_right,
-                sample_frame.height - watermark.shape[0] - padding_down)
+    position = (
+        sample_frame.width - watermark.shape[1] - padding_right,
+        sample_frame.height - watermark.shape[0] - padding_down,
+    )
 
-    watermark_func = partial(apply_watermark, watermark=watermark, position=position)
+    watermark_func = functools.partial(apply_watermark, watermark=watermark, position=position)
 
     with concurrent.futures.ThreadPoolExecutor() as executor:
         return list(executor.map(watermark_func, frames))
 
+
 def watermark_image(
     image: PIL.Image,
     watermark_path: str,
@@ -628,40 +639,32 @@ def watermark_image(
     padding_right: int = 10,
     padding_down: int = 10,
 ) -> PIL.Image:
-
-    watermark_array = np.array(PIL.Image.open(logo))
+    watermark_array = np.array(PIL.Image.open(watermark_path))
     img_array = np.array(image)
     img_h, img_w = img_array.shape[:2]
     water_h, water_w = watermark_array.shape[:2]
     scale = min(img_w / (water_w * scale_down), img_h / (water_h * scale_down))
     new_size = (int(water_w * scale), int(water_h * scale))
-    _watermark_array = np.array(
-        PIL.Image.fromarray(watermark_array).resize(new_size, PIL.Image.LANCZOS)
-    )
+    _watermark_array = np.array(PIL.Image.fromarray(watermark_array).resize(new_size, PIL.Image.LANCZOS))
 
     x, y = (img_w - new_size[0] - padding_right), (img_h - new_size[1] - padding_down)
 
     if img_array.shape[2] == 3:
-        img_array = np.dstack(
-            [img_array, np.full(img_array.shape[:2], 255, dtype=np.uint8)]
-        )
+        img_array = np.dstack([img_array, np.full(img_array.shape[:2], 255, dtype=np.uint8)])
     if _watermark_array.shape[2] == 3:
-        _watermark_array = np.dstack(
-            [_watermark_array, np.full(_watermark_array.shape[:2], 255, dtype=np.uint8)]
-        )
+        _watermark_array = np.dstack([_watermark_array, np.full(_watermark_array.shape[:2], 255, dtype=np.uint8)])
 
     alpha_watermark = _watermark_array[:, :, 3] / 255.0
     alpha_image = img_array[:, :, 3] / 255.0
 
     roi = img_array[y : y + new_size[1], x : x + new_size[0]]
     for c in range(3):
-        roi[:, :, c] = _watermark_array[:, :, c] * (alpha_watermark * opacity) + roi[
-            :, :, c
-        ] * (1 - alpha_watermark * opacity)
+        roi[:, :, c] = _watermark_array[:, :, c] * (alpha_watermark * opacity) + roi[:, :, c] * (
+            1 - alpha_watermark * opacity
+        )
 
     roi[:, :, 3] = (
-        alpha_watermark
-        + alpha_image[y : y + new_size[1], x : x + new_size[0]] * (1 - alpha_watermark)
+        alpha_watermark + alpha_image[y : y + new_size[1], x : x + new_size[0]] * (1 - alpha_watermark)
     ) * 255
 
     return PIL.Image.fromarray(img_array)

From 34fb7903bc223c32c5d6688ca27d87990044edb2 Mon Sep 17 00:00:00 2001
From: shauray8 <shauray9@gmail.com>
Date: Thu, 25 Jul 2024 23:52:09 +0530
Subject: [PATCH 6/6] pass

---
 examples/community/ip_adapter_face_id.py      | 183 +-----------------
 examples/community/lpw_stable_diffusion_xl.py |   1 -
 .../plus_pipelines/champ/pipeline_champ.py    |  42 ++--
 .../plus_pipelines/ella/pipeline_ella.py      |  29 ++-
 .../plus_pipelines/pipeline_utils.py          |   7 +-
 tests/models/test_attention_processor.py      |   3 +
 6 files changed, 55 insertions(+), 210 deletions(-)

diff --git a/examples/community/ip_adapter_face_id.py b/examples/community/ip_adapter_face_id.py
index bfad6e74a62b..c7dc775eeee3 100644
--- a/examples/community/ip_adapter_face_id.py
+++ b/examples/community/ip_adapter_face_id.py
@@ -20,12 +20,7 @@
 import torch.nn.functional as F
 from packaging import version
 from safetensors import safe_open
-from transformers import (
-    CLIPImageProcessor,
-    CLIPTextModel,
-    CLIPTokenizer,
-    CLIPVisionModelWithProjection,
-)
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
 
 from diffusers.configuration_utils import FrozenDict
 from diffusers.image_processor import VaeImageProcessor
@@ -45,12 +40,8 @@
 from diffusers.models.embeddings import MultiIPAdapterImageProjection
 from diffusers.models.lora import adjust_lora_scale_text_encoder
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
-from diffusers.pipelines.stable_diffusion.pipeline_output import (
-    StableDiffusionPipelineOutput,
-)
-from diffusers.pipelines.stable_diffusion.safety_checker import (
-    StableDiffusionSafetyChecker,
-)
+from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
 from diffusers.schedulers import KarrasDiffusionSchedulers
 from diffusers.utils import (
     USE_PEFT_BACKEND,
@@ -66,149 +57,6 @@
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
 
-class LoRAIPAdapterAttnProcessor(nn.Module):
-    r"""
-    Attention processor for IP-Adapater.
-    Args:
-        hidden_size (`int`):
-            The hidden size of the attention layer.
-        cross_attention_dim (`int`):
-            The number of channels in the `encoder_hidden_states`.
-        rank (`int`, defaults to 4):
-            The dimension of the LoRA update matrices.
-        network_alpha (`int`, *optional*):
-            Equivalent to `alpha` but it's usage is specific to Kohya (A1111) style LoRAs.
-        lora_scale (`float`, defaults to 1.0):
-            the weight scale of LoRA.
-        scale (`float`, defaults to 1.0):
-            the weight scale of image prompt.
-        num_tokens (`int`, defaults to 4 when do ip_adapter_plus it should be 16):
-            The context length of the image features.
-    """
-
-    def __init__(
-        self,
-        hidden_size,
-        cross_attention_dim=None,
-        rank=4,
-        network_alpha=None,
-        lora_scale=1.0,
-        scale=1.0,
-        num_tokens=4,
-    ):
-        super().__init__()
-
-        self.rank = rank
-        self.lora_scale = lora_scale
-
-        self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
-        self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
-        self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
-        self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
-
-        self.hidden_size = hidden_size
-        self.cross_attention_dim = cross_attention_dim
-        self.scale = scale
-        self.num_tokens = num_tokens
-
-        self.to_k_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
-        self.to_v_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
-
-    def __call__(
-        self,
-        attn,
-        hidden_states,
-        encoder_hidden_states=None,
-        attention_mask=None,
-        temb=None,
-    ):
-        residual = hidden_states
-
-        # separate ip_hidden_states from encoder_hidden_states
-        if encoder_hidden_states is not None:
-            if isinstance(encoder_hidden_states, tuple):
-                encoder_hidden_states, ip_hidden_states = encoder_hidden_states
-            else:
-                deprecation_message = (
-                    "You have passed a tensor as `encoder_hidden_states`.This is deprecated and will be removed in a future release."
-                    " Please make sure to update your script to pass `encoder_hidden_states` as a tuple to supress this warning."
-                )
-                deprecate(
-                    "encoder_hidden_states not a tuple",
-                    "1.0.0",
-                    deprecation_message,
-                    standard_warn=False,
-                )
-                end_pos = encoder_hidden_states.shape[1] - self.num_tokens[0]
-                encoder_hidden_states, ip_hidden_states = (
-                    encoder_hidden_states[:, :end_pos, :],
-                    [encoder_hidden_states[:, end_pos:, :]],
-                )
-
-        if attn.spatial_norm is not None:
-            hidden_states = attn.spatial_norm(hidden_states, temb)
-
-        input_ndim = hidden_states.ndim
-
-        if input_ndim == 4:
-            batch_size, channel, height, width = hidden_states.shape
-            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
-
-        batch_size, sequence_length, _ = (
-            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
-        )
-        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
-
-        if attn.group_norm is not None:
-            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
-
-        query = attn.to_q(hidden_states) + self.lora_scale * self.to_q_lora(hidden_states)
-
-        if encoder_hidden_states is None:
-            encoder_hidden_states = hidden_states
-        elif attn.norm_cross:
-            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
-
-        key = attn.to_k(encoder_hidden_states) + self.lora_scale * self.to_k_lora(encoder_hidden_states)
-        value = attn.to_v(encoder_hidden_states) + self.lora_scale * self.to_v_lora(encoder_hidden_states)
-
-        query = attn.head_to_batch_dim(query)
-        key = attn.head_to_batch_dim(key)
-        value = attn.head_to_batch_dim(value)
-
-        attention_probs = attn.get_attention_scores(query, key, attention_mask)
-        hidden_states = torch.bmm(attention_probs, value)
-        hidden_states = attn.batch_to_head_dim(hidden_states)
-
-        # for ip-adapter
-        ip_key = self.to_k_ip(ip_hidden_states)
-        ip_value = self.to_v_ip(ip_hidden_states)
-
-        ip_key = attn.head_to_batch_dim(ip_key)
-        ip_value = attn.head_to_batch_dim(ip_value)
-
-        ip_attention_probs = attn.get_attention_scores(query, ip_key, None)
-        ip_hidden_states = torch.bmm(ip_attention_probs, ip_value)
-        ip_hidden_states = attn.batch_to_head_dim(ip_hidden_states)
-
-        hidden_states = hidden_states + self.scale * ip_hidden_states
-
-        # linear proj
-        hidden_states = attn.to_out[0](hidden_states) + self.lora_scale * self.to_out_lora(hidden_states)
-        # dropout
-        hidden_states = attn.to_out[1](hidden_states)
-
-        if input_ndim == 4:
-            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
-
-        if attn.residual_connection:
-            hidden_states = hidden_states + residual
-
-        hidden_states = hidden_states / attn.rescale_output_factor
-
-        return hidden_states
-
-
 class IPAdapterFullImageProjection(nn.Module):
     def __init__(self, image_embed_dim=1024, cross_attention_dim=1024, mult=1, num_tokens=1):
         super().__init__()
@@ -216,12 +64,7 @@ def __init__(self, image_embed_dim=1024, cross_attention_dim=1024, mult=1, num_t
 
         self.num_tokens = num_tokens
         self.cross_attention_dim = cross_attention_dim
-        self.ff = FeedForward(
-            image_embed_dim,
-            cross_attention_dim * num_tokens,
-            mult=mult,
-            activation_fn="gelu",
-        )
+        self.ff = FeedForward(image_embed_dim, cross_attention_dim * num_tokens, mult=mult, activation_fn="gelu")
         self.norm = nn.LayerNorm(cross_attention_dim)
 
     def forward(self, image_embeds: torch.Tensor):
@@ -732,9 +575,7 @@ def encode_prompt(
                 prompt_embeds = prompt_embeds[0]
             else:
                 prompt_embeds = self.text_encoder(
-                    text_input_ids.to(device),
-                    attention_mask=attention_mask,
-                    output_hidden_states=True,
+                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
                 )
                 # Access the `hidden_states` first, that contains a tuple of
                 # all the hidden states from the encoder layers. Then index into
@@ -1240,11 +1081,7 @@ def __call__(
 
                 if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
                     # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
-                    noise_pred = rescale_noise_cfg(
-                        noise_pred,
-                        noise_pred_text,
-                        guidance_rescale=self.guidance_rescale,
-                    )
+                    noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
 
                 # compute the previous noisy sample x_t -> x_t-1
                 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
@@ -1267,11 +1104,9 @@ def __call__(
                         callback(step_idx, t, latents)
 
         if not output_type == "latent":
-            image = self.vae.decode(
-                latents / self.vae.config.scaling_factor,
-                return_dict=False,
-                generator=generator,
-            )[0]
+            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
+                0
+            ]
             image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
         else:
             image = latents
diff --git a/examples/community/lpw_stable_diffusion_xl.py b/examples/community/lpw_stable_diffusion_xl.py
index 1860bf4bc783..bba112a65b69 100644
--- a/examples/community/lpw_stable_diffusion_xl.py
+++ b/examples/community/lpw_stable_diffusion_xl.py
@@ -1175,7 +1175,6 @@ def prepare_latents(
                 num_channels_latents,
                 int(height) // self.vae_scale_factor,
                 int(width) // self.vae_scale_factor,
-
             )
             if isinstance(generator, list) and len(generator) != batch_size:
                 raise ValueError(
diff --git a/src/diffusers/plus_pipelines/champ/pipeline_champ.py b/src/diffusers/plus_pipelines/champ/pipeline_champ.py
index 1342fe429145..ae4e12642242 100644
--- a/src/diffusers/plus_pipelines/champ/pipeline_champ.py
+++ b/src/diffusers/plus_pipelines/champ/pipeline_champ.py
@@ -37,10 +37,14 @@
         >>> from diffusers import StableVideoDiffusionPipeline
         >>> from diffusers.utils import load_image, export_to_video
 
-        >>> pipe = StableVideoDiffusionPipeline.from_pretrained("stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16")
+        >>> pipe = StableVideoDiffusionPipeline.from_pretrained(
+        ...     "stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16"
+        ... )
         >>> pipe.to("cuda")
 
-        >>> image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd-docstring-example.jpeg")
+        >>> image = load_image(
+        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd-docstring-example.jpeg"
+        ... )
         >>> image = image.resize((1024, 576))
 
         >>> frames = pipe(image, num_frames=25, decode_chunk_size=8).frames[0]
@@ -86,8 +90,8 @@ class StableVideoDiffusionPipelineOutput(BaseOutput):
 
     Args:
         frames (`[List[List[PIL.Image.Image]]`, `np.ndarray`, `torch.FloatTensor`]):
-            List of denoised PIL images of length `batch_size` or numpy array or torch tensor
-            of shape `(batch_size, num_frames, height, width, num_channels)`.
+            List of denoised PIL images of length `batch_size` or numpy array or torch tensor of shape `(batch_size,
+            num_frames, height, width, num_channels)`.
     """
 
     frames: Union[List[List[PIL.Image.Image]], np.ndarray, torch.FloatTensor]
@@ -104,7 +108,8 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
         vae ([`AutoencoderKLTemporalDecoder`]):
             Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
         image_encoder ([`~transformers.CLIPVisionModelWithProjection`]):
-            Frozen CLIP image-encoder ([laion/CLIP-ViT-H-14-laion2B-s32B-b79K](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K)).
+            Frozen CLIP image-encoder
+            ([laion/CLIP-ViT-H-14-laion2B-s32B-b79K](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K)).
         unet ([`UNetSpatioTemporalConditionModel`]):
             A `UNetSpatioTemporalConditionModel` to denoise the encoded image latents.
         scheduler ([`EulerDiscreteScheduler`]):
@@ -357,14 +362,15 @@ def __call__(
 
         Args:
             image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`):
-                Image(s) to guide image generation. If you provide a tensor, the expected value range is between `[0, 1]`.
+                Image(s) to guide image generation. If you provide a tensor, the expected value range is between `[0,
+                1]`.
             height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                 The height in pixels of the generated image.
             width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                 The width in pixels of the generated image.
             num_frames (`int`, *optional*):
-                The number of video frames to generate. Defaults to `self.unet.config.num_frames`
-                (14 for `stable-video-diffusion-img2vid` and to 25 for `stable-video-diffusion-img2vid-xt`).
+                The number of video frames to generate. Defaults to `self.unet.config.num_frames` (14 for
+                `stable-video-diffusion-img2vid` and 25 for `stable-video-diffusion-img2vid-xt`).
             num_inference_steps (`int`, *optional*, defaults to 25):
                 The number of denoising steps. More denoising steps usually lead to a higher quality video at the
                 expense of slower inference. This parameter is modulated by `strength`.
@@ -373,16 +379,18 @@ def __call__(
             max_guidance_scale (`float`, *optional*, defaults to 3.0):
                 The maximum guidance scale. Used for the classifier free guidance with last frame.
             fps (`int`, *optional*, defaults to 7):
-                Frames per second. The rate at which the generated images shall be exported to a video after generation.
-                Note that Stable Diffusion Video's UNet was micro-conditioned on fps-1 during training.
+                Frames per second. The rate at which the generated images shall be exported to a video after
+                generation. Note that Stable Video Diffusion's UNet was micro-conditioned on fps-1 during training.
             motion_bucket_id (`int`, *optional*, defaults to 127):
                 Used for conditioning the amount of motion for the generation. The higher the number the more motion
                 will be in the video.
             noise_aug_strength (`float`, *optional*, defaults to 0.02):
-                The amount of noise added to the init image, the higher it is the less the video will look like the init image. Increase it for more motion.
+                The amount of noise added to the init image. The higher it is, the less the video will look like the
+                init image. Increase it for more motion.
             decode_chunk_size (`int`, *optional*):
-                The number of frames to decode at a time. Higher chunk size leads to better temporal consistency at the expense of more memory usage. By default, the decoder decodes all frames at once for maximal
-                quality. For lower memory usage, reduce `decode_chunk_size`.
+                The number of frames to decode at a time. Higher chunk size leads to better temporal consistency at the
+                expense of more memory usage. By default, the decoder decodes all frames at once for maximal quality.
+                For lower memory usage, reduce `decode_chunk_size`.
             num_videos_per_prompt (`int`, *optional*, defaults to 1):
                 The number of videos to generate per prompt.
             generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
@@ -398,7 +406,8 @@ def __call__(
                 A function that is called at the end of each denoising step during inference. The function is called
                 with the following arguments:
                     `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`.
-                `callback_kwargs` will include a list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
+                `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
             callback_on_step_end_tensor_inputs (`List`, *optional*):
                 The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                 will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
@@ -411,8 +420,9 @@ def __call__(
 
         Returns:
             [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] or `tuple`:
-                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] is returned,
-                otherwise a `tuple` of (`List[List[PIL.Image.Image]]` or `np.ndarray` or `torch.FloatTensor`) is returned.
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] is
+                returned, otherwise a `tuple` of (`List[List[PIL.Image.Image]]` or `np.ndarray` or `torch.FloatTensor`)
+                is returned.
         """
         # 0. Default height and width to unet
         height = height or self.unet.config.sample_size * self.vae_scale_factor
diff --git a/src/diffusers/plus_pipelines/ella/pipeline_ella.py b/src/diffusers/plus_pipelines/ella/pipeline_ella.py
index 9962d2c378b6..0a568c6f7a19 100644
--- a/src/diffusers/plus_pipelines/ella/pipeline_ella.py
+++ b/src/diffusers/plus_pipelines/ella/pipeline_ella.py
@@ -98,8 +98,8 @@ def retrieve_timesteps(
         scheduler (`SchedulerMixin`):
             The scheduler to get timesteps from.
         num_inference_steps (`int`):
-            The number of diffusion steps used when generating samples with a pre-trained model. If used,
-            `timesteps` must be `None`.
+            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
+            must be `None`.
         device (`str` or `torch.device`, *optional*):
             The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
         timesteps (`List[int]`, *optional*):
@@ -850,10 +850,10 @@ def __call__(
                 not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
             ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
             ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
-                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
-                Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
-                if `do_classifier_free_guidance` is set to `True`.
-                If not provided, embeddings are computed from the `ip_adapter_image` input argument.
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
             output_type (`str`, *optional*, defaults to `"pil"`):
                 The output format of the generated image. Choose between `PIL.Image` or `np.array`.
             return_dict (`bool`, *optional*, defaults to `True`):
@@ -1391,18 +1391,15 @@ def encode_prompt(
 
             """
             if (
-                hasattr(self.text_encoder.config, "use_attention_mask")
-                and self.text_encoder.config.use_attention_mask
+                hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask
             ):
                 attention_mask = uncond_input.attention_mask.to(device)
             else:
                 attention_mask = None
 
             negative_prompt_embeds = self.text_encoder(
-                uncond_input.input_ids.to(device),
-                attention_mask=attention_mask,
-            )
-            negative_prompt_embeds = negative_prompt_embeds[0]
+                uncond_input.input_ids.to(device), attention_mask=attention_mask,
+            )[0]
             """
         if do_classifier_free_guidance:
             # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
@@ -1781,10 +1778,10 @@ def __call__(
                 not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
             ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
             ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
-                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
-                Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
-                if `do_classifier_free_guidance` is set to `True`.
-                If not provided, embeddings are computed from the `ip_adapter_image` input argument.
+                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
+                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
             output_type (`str`, *optional*, defaults to `"pil"`):
                 The output format of the generated image. Choose between `PIL.Image` or `np.array`.
             return_dict (`bool`, *optional*, defaults to `True`):
diff --git a/src/diffusers/plus_pipelines/pipeline_utils.py b/src/diffusers/plus_pipelines/pipeline_utils.py
index 25285b34338b..1925bb7c12c2 100644
--- a/src/diffusers/plus_pipelines/pipeline_utils.py
+++ b/src/diffusers/plus_pipelines/pipeline_utils.py
@@ -537,7 +537,8 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
                 allowed by Git.
             custom_revision (`str`, *optional*):
                 The specific model version to use. It can be a branch name, a tag name, or a commit id similar to
-                `revision` when loading a custom pipeline from the Hub. Defaults to the latest stable 🤗 Diffusers version.
+                `revision` when loading a custom pipeline from the Hub. Defaults to the latest stable 🤗 Diffusers
+                version.
             mirror (`str`, *optional*):
                 Mirror source to resolve accessibility issues if you’re downloading a model in China. We do not
                 guarantee the timeliness or safety of the source, and you should refer to the mirror site for more
@@ -1716,8 +1717,8 @@ def disable_freeu(self):
 
     def fuse_qkv_projections(self, unet: bool = True, vae: bool = True):
         """
-        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
-        key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
+        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
+        are fused. For cross-attention modules, key and value projection matrices are fused.
 
         <Tip warning={true}>
 
diff --git a/tests/models/test_attention_processor.py b/tests/models/test_attention_processor.py
index 15d7b51a1cf3..2489604274b4 100644
--- a/tests/models/test_attention_processor.py
+++ b/tests/models/test_attention_processor.py
@@ -1,7 +1,10 @@
+import tempfile
 import unittest
 
+import numpy as np
 import torch
 
+from diffusers import DiffusionPipeline
 from diffusers.models.attention_processor import Attention, AttnAddedKVProcessor