From 5285e8e3f629b0a418b2713e36693e067c8234e9 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Sat, 29 Jun 2024 23:48:31 +0530 Subject: [PATCH 1/6] add watermark utils --- src/diffusers/utils/testing_utils.py | 81 ++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py index 8a6afd768428..dbca32266746 100644 --- a/src/diffusers/utils/testing_utils.py +++ b/src/diffusers/utils/testing_utils.py @@ -583,6 +583,87 @@ def export_to_video(video_frames: List[np.ndarray], output_video_path: str = Non return output_video_path +def watermark_video_frames(frames: List[PIL.Image], watermark_path: str, opacity: float = .75, scale_down: int = 5, padding_right: int = 10, padding_down: int = 10) -> List[PIL.Image]: + if not frames: + return [] + + def prepare_watermark(watermark, target_width): + aspect_ratio = watermark.width / watermark.height + new_height = int(target_width / aspect_ratio) + watermark = watermark.resize((target_width, new_height), Image.LANCZOS) + return np.array(watermark) + + def apply_watermark(frame, watermark, position): + frame_array = np.array(frame.convert("RGBA")) + h, w = watermark.shape[:2] + x, y = position + roi = frame_array[y:y+h, x:x+w] + alpha = watermark[:, :, 3] / 255. + for c in range(3): + roi[:, :, c] = watermark[:, :, c] * (alpha * opacity) + roi[:, :, c] * (1 - alpha * opacity) + roi[:, :, 3] = np.maximum(roi[:, :, 3], alpha * 255) + return Image.fromarray(frame_array.astype(np.uint8)) + + sample_frame = frames[0] + with Image.open(watermark_path) as watermark: + watermark = watermark.convert("RGBA") + watermark = prepare_watermark(watermark, sample_frame.width // scale_down) + + position = (sample_frame.width - watermark.shape[1] - padding_right, + sample_frame.height - watermark.shape[0] - padding_down) + + watermark_func = partial(apply_watermark, watermark=watermark, position=position) + + with concurrent.futures.ThreadPoolExecutor() as executor: + return list(executor.map(watermark_func, frames)) + +def watermark_image( + image: PIL.Image, + watermark_path: str, + opacity: float = 0.713, + scale_down: int = 6, + padding_right: int = 10, + padding_down: int = 10, +) -> PIL.Image: + + watermark_array = np.array(PIL.Image.open(logo)) + img_array = np.array(image) + img_h, img_w = img_array.shape[:2] + water_h, water_w = watermark_array.shape[:2] + scale = min(img_w / (water_w * scale_down), img_h / (water_h * scale_down)) + new_size = (int(water_w * scale), int(water_h * scale)) + _watermark_array = np.array( + Image.fromarray(watermark_array).resize(new_size, Image.LANCZOS) + ) + + x, y = (img_w - new_size[0] - padding_right), (img_h - new_size[1] - padding_down) + + if img_array.shape[2] == 3: + img_array = np.dstack( + [img_array, np.full(img_array.shape[:2], 255, dtype=np.uint8)] + ) + if _watermark_array.shape[2] == 3: + _watermark_array = np.dstack( + [_watermark_array, np.full(_watermark_array.shape[:2], 255, dtype=np.uint8)] + ) + + alpha_watermark = _watermark_array[:, :, 3] / 255.0 + alpha_image = img_array[:, :, 3] / 255.0 + + roi = img_array[y : y + new_size[1], x : x + new_size[0]] + for c in range(3): + roi[:, :, c] = _watermark_array[:, :, c] * (alpha_watermark * opacity) + roi[ + :, :, c + ] * (1 - alpha_watermark * opacity) + + roi[:, :, 3] = ( + alpha_watermark + + alpha_image[y : y + new_size[1], x : x + new_size[0]] * (1 - alpha_watermark) + ) * 255 + + return Image.fromarray(img_array) + + def load_hf_numpy(path) -> np.ndarray: base_url = "https://huggingface.co/datasets/fusing/diffusers-testing/resolve/main" From b934a9a22b20a8433ccf3babdf9aa93746ade9a7 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Sat, 29 Jun 2024 23:49:27 +0530 Subject: [PATCH 2/6] init adds --- src/diffusers/utils/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py index f635ea03f62f..4979891509be 100644 --- a/src/diffusers/utils/__init__.py +++ b/src/diffusers/utils/__init__.py @@ -38,7 +38,7 @@ from .deprecation_utils import deprecate from .doc_utils import replace_example_docstring from .dynamic_modules_utils import get_class_from_dynamic_module -from .export_utils import export_to_gif, export_to_obj, export_to_ply, export_to_video +from .export_utils import export_to_gif, export_to_obj, export_to_ply, export_to_video, watermark_image, watermark_video_frames from .hub_utils import ( PushToHubMixin, _add_variant, From dea209a0dfef93d2f51cb1e9d6736167b5d57681 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Sat, 29 Jun 2024 23:52:44 +0530 Subject: [PATCH 3/6] add aImage --- src/diffusers/utils/__init__.py | 11 +++++++++-- src/diffusers/utils/testing_utils.py | 8 ++++---- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py index 4979891509be..c909ce49c320 100644 --- a/src/diffusers/utils/__init__.py +++ b/src/diffusers/utils/__init__.py @@ -38,7 +38,14 @@ from .deprecation_utils import deprecate from .doc_utils import replace_example_docstring from .dynamic_modules_utils import get_class_from_dynamic_module -from .export_utils import export_to_gif, export_to_obj, export_to_ply, export_to_video, watermark_image, watermark_video_frames +from .export_utils import ( + export_to_gif, + export_to_obj, + export_to_ply, + export_to_video, + watermark_image, + watermark_video_frames, +) from .hub_utils import ( PushToHubMixin, _add_variant, @@ -80,8 +87,8 @@ is_safetensors_available, is_scipy_available, is_tensorboard_available, - is_torch2_available, is_timm_available, + is_torch2_available, is_torch_available, is_torch_npu_available, is_torch_version, diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py index dbca32266746..98038b4f1b28 100644 --- a/src/diffusers/utils/testing_utils.py +++ b/src/diffusers/utils/testing_utils.py @@ -602,10 +602,10 @@ def apply_watermark(frame, watermark, position): for c in range(3): roi[:, :, c] = watermark[:, :, c] * (alpha * opacity) + roi[:, :, c] * (1 - alpha * opacity) roi[:, :, 3] = np.maximum(roi[:, :, 3], alpha * 255) - return Image.fromarray(frame_array.astype(np.uint8)) + return PIL.Image.fromarray(frame_array.astype(np.uint8)) sample_frame = frames[0] - with Image.open(watermark_path) as watermark: + with PIL.Image.open(watermark_path) as watermark: watermark = watermark.convert("RGBA") watermark = prepare_watermark(watermark, sample_frame.width // scale_down) @@ -633,7 +633,7 @@ def watermark_image( scale = min(img_w / (water_w * scale_down), img_h / (water_h * scale_down)) new_size = (int(water_w * scale), int(water_h * scale)) _watermark_array = np.array( - Image.fromarray(watermark_array).resize(new_size, Image.LANCZOS) + PIL.Image.fromarray(watermark_array).resize(new_size, PIL.Image.LANCZOS) ) x, y = (img_w - new_size[0] - padding_right), (img_h - new_size[1] - padding_down) @@ -661,7 +661,7 @@ def watermark_image( + alpha_image[y : y + new_size[1], x : x + new_size[0]] * (1 - alpha_watermark) ) * 255 - return Image.fromarray(img_array) + return PIL.Image.fromarray(img_array) def load_hf_numpy(path) -> np.ndarray: From 38c341e0397b985ac6c54582dc84aee266dd148b Mon Sep 17 00:00:00 2001 From: shauray8 Date: Thu, 25 Jul 2024 23:32:21 +0530 Subject: [PATCH 4/6] fix copied --- src/diffusers/pipelines/__init__.py | 2 +- src/diffusers/plus_pipelines/ella/pipeline_ella.py | 8 ++++---- src/diffusers/utils/testing_utils.py | 2 +- tests/pipelines/test_pipelines_common.py | 1 - 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 8ebde7fd191c..bce9924ce5a9 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -527,8 +527,8 @@ ) from .paint_by_example import PaintByExamplePipeline from .pia import PIAPipeline - from .plus_pipelines import ClothAdapter, OmsDiffusionPipeline from .pixart_alpha import PixArtAlphaPipeline, PixArtSigmaPipeline + from .plus_pipelines import ClothAdapter, OmsDiffusionPipeline from .semantic_stable_diffusion import SemanticStableDiffusionPipeline from .shap_e import ShapEImg2ImgPipeline, ShapEPipeline from .stable_cascade import ( diff --git a/src/diffusers/plus_pipelines/ella/pipeline_ella.py b/src/diffusers/plus_pipelines/ella/pipeline_ella.py index b687d0e3dfd1..9962d2c378b6 100644 --- a/src/diffusers/plus_pipelines/ella/pipeline_ella.py +++ b/src/diffusers/plus_pipelines/ella/pipeline_ella.py @@ -720,7 +720,7 @@ def prepare_latents( # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -733,7 +733,7 @@ def get_guidance_scale_embedding( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. """ assert len(w.shape) == 1 w = w * 1000.0 @@ -1651,7 +1651,7 @@ def prepare_latents( # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding def get_guidance_scale_embedding( self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32 - ) -> torch.FloatTensor: + ) -> torch.Tensor: """ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 @@ -1664,7 +1664,7 @@ def get_guidance_scale_embedding( Data type of the generated embeddings. Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`. + `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`. """ assert len(w.shape) == 1 w = w * 1000.0 diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py index 14accc54933b..f445b4803bdf 100644 --- a/src/diffusers/utils/testing_utils.py +++ b/src/diffusers/utils/testing_utils.py @@ -593,7 +593,7 @@ def watermark_video_frames(frames: List[PIL.Image], watermark_path: str, opacity def prepare_watermark(watermark, target_width): aspect_ratio = watermark.width / watermark.height new_height = int(target_width / aspect_ratio) - watermark = watermark.resize((target_width, new_height), Image.LANCZOS) + watermark = watermark.resize((target_width, new_height), PIL.Image.LANCZOS) return np.array(watermark) def apply_watermark(frame, watermark, position): diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index e602fa095052..06fcc1c90b71 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -38,7 +38,6 @@ from diffusers.models.unets.unet_i2vgen_xl import I2VGenXLUNet from diffusers.models.unets.unet_motion_model import UNetMotionModel from diffusers.pipelines.pipeline_utils import StableDiffusionMixin -from diffusers.plus_models import ELLAProxyUNet from diffusers.schedulers import KarrasDiffusionSchedulers from diffusers.utils import logging from diffusers.utils.import_utils import is_accelerate_available, is_accelerate_version, is_xformers_available From 939f8eeddecfb1dd16a513c284b20c858e399f1b Mon Sep 17 00:00:00 2001 From: shauray8 Date: Thu, 25 Jul 2024 23:37:03 +0530 Subject: [PATCH 5/6] more fixes --- src/diffusers/utils/testing_utils.py | 47 +++++++++++++++------------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py index f445b4803bdf..95e71248055c 100644 --- a/src/diffusers/utils/testing_utils.py +++ b/src/diffusers/utils/testing_utils.py @@ -1,3 +1,4 @@ +import concurrent import functools import importlib import inspect @@ -586,7 +587,14 @@ def export_to_video(video_frames: List[np.ndarray], output_video_path: str = Non return output_video_path -def watermark_video_frames(frames: List[PIL.Image], watermark_path: str, opacity: float = .75, scale_down: int = 5, padding_right: int = 10, padding_down: int = 10) -> List[PIL.Image]: +def watermark_video_frames( + frames: List[PIL.Image], + watermark_path: str, + opacity: float = 0.75, + scale_down: int = 5, + padding_right: int = 10, + padding_down: int = 10, +) -> List[PIL.Image]: if not frames: return [] @@ -600,8 +608,8 @@ def apply_watermark(frame, watermark, position): frame_array = np.array(frame.convert("RGBA")) h, w = watermark.shape[:2] x, y = position - roi = frame_array[y:y+h, x:x+w] - alpha = watermark[:, :, 3] / 255. + roi = frame_array[y : y + h, x : x + w] + alpha = watermark[:, :, 3] / 255.0 for c in range(3): roi[:, :, c] = watermark[:, :, c] * (alpha * opacity) + roi[:, :, c] * (1 - alpha * opacity) roi[:, :, 3] = np.maximum(roi[:, :, 3], alpha * 255) @@ -612,14 +620,17 @@ def apply_watermark(frame, watermark, position): watermark = watermark.convert("RGBA") watermark = prepare_watermark(watermark, sample_frame.width // scale_down) - position = (sample_frame.width - watermark.shape[1] - padding_right, - sample_frame.height - watermark.shape[0] - padding_down) + position = ( + sample_frame.width - watermark.shape[1] - padding_right, + sample_frame.height - watermark.shape[0] - padding_down, + ) - watermark_func = partial(apply_watermark, watermark=watermark, position=position) + watermark_func = functools.partial(apply_watermark, watermark=watermark, position=position) with concurrent.futures.ThreadPoolExecutor() as executor: return list(executor.map(watermark_func, frames)) + def watermark_image( image: PIL.Image, watermark_path: str, @@ -628,40 +639,32 @@ def watermark_image( padding_right: int = 10, padding_down: int = 10, ) -> PIL.Image: - - watermark_array = np.array(PIL.Image.open(logo)) + watermark_array = np.array(PIL.Image.open(watermark_path)) img_array = np.array(image) img_h, img_w = img_array.shape[:2] water_h, water_w = watermark_array.shape[:2] scale = min(img_w / (water_w * scale_down), img_h / (water_h * scale_down)) new_size = (int(water_w * scale), int(water_h * scale)) - _watermark_array = np.array( - PIL.Image.fromarray(watermark_array).resize(new_size, PIL.Image.LANCZOS) - ) + _watermark_array = np.array(PIL.Image.fromarray(watermark_array).resize(new_size, PIL.Image.LANCZOS)) x, y = (img_w - new_size[0] - padding_right), (img_h - new_size[1] - padding_down) if img_array.shape[2] == 3: - img_array = np.dstack( - [img_array, np.full(img_array.shape[:2], 255, dtype=np.uint8)] - ) + img_array = np.dstack([img_array, np.full(img_array.shape[:2], 255, dtype=np.uint8)]) if _watermark_array.shape[2] == 3: - _watermark_array = np.dstack( - [_watermark_array, np.full(_watermark_array.shape[:2], 255, dtype=np.uint8)] - ) + _watermark_array = np.dstack([_watermark_array, np.full(_watermark_array.shape[:2], 255, dtype=np.uint8)]) alpha_watermark = _watermark_array[:, :, 3] / 255.0 alpha_image = img_array[:, :, 3] / 255.0 roi = img_array[y : y + new_size[1], x : x + new_size[0]] for c in range(3): - roi[:, :, c] = _watermark_array[:, :, c] * (alpha_watermark * opacity) + roi[ - :, :, c - ] * (1 - alpha_watermark * opacity) + roi[:, :, c] = _watermark_array[:, :, c] * (alpha_watermark * opacity) + roi[:, :, c] * ( + 1 - alpha_watermark * opacity + ) roi[:, :, 3] = ( - alpha_watermark - + alpha_image[y : y + new_size[1], x : x + new_size[0]] * (1 - alpha_watermark) + alpha_watermark + alpha_image[y : y + new_size[1], x : x + new_size[0]] * (1 - alpha_watermark) ) * 255 return PIL.Image.fromarray(img_array) From 34fb7903bc223c32c5d6688ca27d87990044edb2 Mon Sep 17 00:00:00 2001 From: shauray8 Date: Thu, 25 Jul 2024 23:52:09 +0530 Subject: [PATCH 6/6] pass --- examples/community/ip_adapter_face_id.py | 183 +----------------- examples/community/lpw_stable_diffusion_xl.py | 1 - .../plus_pipelines/champ/pipeline_champ.py | 42 ++-- .../plus_pipelines/ella/pipeline_ella.py | 29 ++- .../plus_pipelines/pipeline_utils.py | 7 +- tests/models/test_attention_processor.py | 3 + 6 files changed, 55 insertions(+), 210 deletions(-) diff --git a/examples/community/ip_adapter_face_id.py b/examples/community/ip_adapter_face_id.py index bfad6e74a62b..c7dc775eeee3 100644 --- a/examples/community/ip_adapter_face_id.py +++ b/examples/community/ip_adapter_face_id.py @@ -20,12 +20,7 @@ import torch.nn.functional as F from packaging import version from safetensors import safe_open -from transformers import ( - CLIPImageProcessor, - CLIPTextModel, - CLIPTokenizer, - CLIPVisionModelWithProjection, -) +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection from diffusers.configuration_utils import FrozenDict from diffusers.image_processor import VaeImageProcessor @@ -45,12 +40,8 @@ from diffusers.models.embeddings import MultiIPAdapterImageProjection from diffusers.models.lora import adjust_lora_scale_text_encoder from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin -from diffusers.pipelines.stable_diffusion.pipeline_output import ( - StableDiffusionPipelineOutput, -) -from diffusers.pipelines.stable_diffusion.safety_checker import ( - StableDiffusionSafetyChecker, -) +from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput +from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker from diffusers.schedulers import KarrasDiffusionSchedulers from diffusers.utils import ( USE_PEFT_BACKEND, @@ -66,149 +57,6 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name -class LoRAIPAdapterAttnProcessor(nn.Module): - r""" - Attention processor for IP-Adapater. - Args: - hidden_size (`int`): - The hidden size of the attention layer. - cross_attention_dim (`int`): - The number of channels in the `encoder_hidden_states`. - rank (`int`, defaults to 4): - The dimension of the LoRA update matrices. - network_alpha (`int`, *optional*): - Equivalent to `alpha` but it's usage is specific to Kohya (A1111) style LoRAs. - lora_scale (`float`, defaults to 1.0): - the weight scale of LoRA. - scale (`float`, defaults to 1.0): - the weight scale of image prompt. - num_tokens (`int`, defaults to 4 when do ip_adapter_plus it should be 16): - The context length of the image features. - """ - - def __init__( - self, - hidden_size, - cross_attention_dim=None, - rank=4, - network_alpha=None, - lora_scale=1.0, - scale=1.0, - num_tokens=4, - ): - super().__init__() - - self.rank = rank - self.lora_scale = lora_scale - - self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) - self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha) - self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha) - self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) - - self.hidden_size = hidden_size - self.cross_attention_dim = cross_attention_dim - self.scale = scale - self.num_tokens = num_tokens - - self.to_k_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) - self.to_v_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) - - def __call__( - self, - attn, - hidden_states, - encoder_hidden_states=None, - attention_mask=None, - temb=None, - ): - residual = hidden_states - - # separate ip_hidden_states from encoder_hidden_states - if encoder_hidden_states is not None: - if isinstance(encoder_hidden_states, tuple): - encoder_hidden_states, ip_hidden_states = encoder_hidden_states - else: - deprecation_message = ( - "You have passed a tensor as `encoder_hidden_states`.This is deprecated and will be removed in a future release." - " Please make sure to update your script to pass `encoder_hidden_states` as a tuple to supress this warning." - ) - deprecate( - "encoder_hidden_states not a tuple", - "1.0.0", - deprecation_message, - standard_warn=False, - ) - end_pos = encoder_hidden_states.shape[1] - self.num_tokens[0] - encoder_hidden_states, ip_hidden_states = ( - encoder_hidden_states[:, :end_pos, :], - [encoder_hidden_states[:, end_pos:, :]], - ) - - if attn.spatial_norm is not None: - hidden_states = attn.spatial_norm(hidden_states, temb) - - input_ndim = hidden_states.ndim - - if input_ndim == 4: - batch_size, channel, height, width = hidden_states.shape - hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) - - batch_size, sequence_length, _ = ( - hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape - ) - attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) - - if attn.group_norm is not None: - hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) - - query = attn.to_q(hidden_states) + self.lora_scale * self.to_q_lora(hidden_states) - - if encoder_hidden_states is None: - encoder_hidden_states = hidden_states - elif attn.norm_cross: - encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) - - key = attn.to_k(encoder_hidden_states) + self.lora_scale * self.to_k_lora(encoder_hidden_states) - value = attn.to_v(encoder_hidden_states) + self.lora_scale * self.to_v_lora(encoder_hidden_states) - - query = attn.head_to_batch_dim(query) - key = attn.head_to_batch_dim(key) - value = attn.head_to_batch_dim(value) - - attention_probs = attn.get_attention_scores(query, key, attention_mask) - hidden_states = torch.bmm(attention_probs, value) - hidden_states = attn.batch_to_head_dim(hidden_states) - - # for ip-adapter - ip_key = self.to_k_ip(ip_hidden_states) - ip_value = self.to_v_ip(ip_hidden_states) - - ip_key = attn.head_to_batch_dim(ip_key) - ip_value = attn.head_to_batch_dim(ip_value) - - ip_attention_probs = attn.get_attention_scores(query, ip_key, None) - ip_hidden_states = torch.bmm(ip_attention_probs, ip_value) - ip_hidden_states = attn.batch_to_head_dim(ip_hidden_states) - - hidden_states = hidden_states + self.scale * ip_hidden_states - - # linear proj - hidden_states = attn.to_out[0](hidden_states) + self.lora_scale * self.to_out_lora(hidden_states) - # dropout - hidden_states = attn.to_out[1](hidden_states) - - if input_ndim == 4: - hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) - - if attn.residual_connection: - hidden_states = hidden_states + residual - - hidden_states = hidden_states / attn.rescale_output_factor - - return hidden_states - - class IPAdapterFullImageProjection(nn.Module): def __init__(self, image_embed_dim=1024, cross_attention_dim=1024, mult=1, num_tokens=1): super().__init__() @@ -216,12 +64,7 @@ def __init__(self, image_embed_dim=1024, cross_attention_dim=1024, mult=1, num_t self.num_tokens = num_tokens self.cross_attention_dim = cross_attention_dim - self.ff = FeedForward( - image_embed_dim, - cross_attention_dim * num_tokens, - mult=mult, - activation_fn="gelu", - ) + self.ff = FeedForward(image_embed_dim, cross_attention_dim * num_tokens, mult=mult, activation_fn="gelu") self.norm = nn.LayerNorm(cross_attention_dim) def forward(self, image_embeds: torch.Tensor): @@ -732,9 +575,7 @@ def encode_prompt( prompt_embeds = prompt_embeds[0] else: prompt_embeds = self.text_encoder( - text_input_ids.to(device), - attention_mask=attention_mask, - output_hidden_states=True, + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True ) # Access the `hidden_states` first, that contains a tuple of # all the hidden states from the encoder layers. Then index into @@ -1240,11 +1081,7 @@ def __call__( if self.do_classifier_free_guidance and self.guidance_rescale > 0.0: # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf - noise_pred = rescale_noise_cfg( - noise_pred, - noise_pred_text, - guidance_rescale=self.guidance_rescale, - ) + noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale) # compute the previous noisy sample x_t -> x_t-1 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] @@ -1267,11 +1104,9 @@ def __call__( callback(step_idx, t, latents) if not output_type == "latent": - image = self.vae.decode( - latents / self.vae.config.scaling_factor, - return_dict=False, - generator=generator, - )[0] + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[ + 0 + ] image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) else: image = latents diff --git a/examples/community/lpw_stable_diffusion_xl.py b/examples/community/lpw_stable_diffusion_xl.py index 1860bf4bc783..bba112a65b69 100644 --- a/examples/community/lpw_stable_diffusion_xl.py +++ b/examples/community/lpw_stable_diffusion_xl.py @@ -1175,7 +1175,6 @@ def prepare_latents( num_channels_latents, int(height) // self.vae_scale_factor, int(width) // self.vae_scale_factor, - ) if isinstance(generator, list) and len(generator) != batch_size: raise ValueError( diff --git a/src/diffusers/plus_pipelines/champ/pipeline_champ.py b/src/diffusers/plus_pipelines/champ/pipeline_champ.py index 1342fe429145..ae4e12642242 100644 --- a/src/diffusers/plus_pipelines/champ/pipeline_champ.py +++ b/src/diffusers/plus_pipelines/champ/pipeline_champ.py @@ -37,10 +37,14 @@ >>> from diffusers import StableVideoDiffusionPipeline >>> from diffusers.utils import load_image, export_to_video - >>> pipe = StableVideoDiffusionPipeline.from_pretrained("stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16") + >>> pipe = StableVideoDiffusionPipeline.from_pretrained( + ... "stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16" + ... ) >>> pipe.to("cuda") - >>> image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd-docstring-example.jpeg") + >>> image = load_image( + ... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd-docstring-example.jpeg" + ... ) >>> image = image.resize((1024, 576)) >>> frames = pipe(image, num_frames=25, decode_chunk_size=8).frames[0] @@ -86,8 +90,8 @@ class StableVideoDiffusionPipelineOutput(BaseOutput): Args: frames (`[List[List[PIL.Image.Image]]`, `np.ndarray`, `torch.FloatTensor`]): - List of denoised PIL images of length `batch_size` or numpy array or torch tensor - of shape `(batch_size, num_frames, height, width, num_channels)`. + List of denoised PIL images of length `batch_size` or numpy array or torch tensor of shape `(batch_size, + num_frames, height, width, num_channels)`. """ frames: Union[List[List[PIL.Image.Image]], np.ndarray, torch.FloatTensor] @@ -104,7 +108,8 @@ class StableVideoDiffusionPipeline(DiffusionPipeline): vae ([`AutoencoderKLTemporalDecoder`]): Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. image_encoder ([`~transformers.CLIPVisionModelWithProjection`]): - Frozen CLIP image-encoder ([laion/CLIP-ViT-H-14-laion2B-s32B-b79K](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K)). + Frozen CLIP image-encoder + ([laion/CLIP-ViT-H-14-laion2B-s32B-b79K](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K)). unet ([`UNetSpatioTemporalConditionModel`]): A `UNetSpatioTemporalConditionModel` to denoise the encoded image latents. scheduler ([`EulerDiscreteScheduler`]): @@ -357,14 +362,15 @@ def __call__( Args: image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`): - Image(s) to guide image generation. If you provide a tensor, the expected value range is between `[0, 1]`. + Image(s) to guide image generation. If you provide a tensor, the expected value range is between `[0, + 1]`. height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): The height in pixels of the generated image. width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): The width in pixels of the generated image. num_frames (`int`, *optional*): - The number of video frames to generate. Defaults to `self.unet.config.num_frames` - (14 for `stable-video-diffusion-img2vid` and to 25 for `stable-video-diffusion-img2vid-xt`). + The number of video frames to generate. Defaults to `self.unet.config.num_frames` (14 for + `stable-video-diffusion-img2vid` and to 25 for `stable-video-diffusion-img2vid-xt`). num_inference_steps (`int`, *optional*, defaults to 25): The number of denoising steps. More denoising steps usually lead to a higher quality video at the expense of slower inference. This parameter is modulated by `strength`. @@ -373,16 +379,18 @@ def __call__( max_guidance_scale (`float`, *optional*, defaults to 3.0): The maximum guidance scale. Used for the classifier free guidance with last frame. fps (`int`, *optional*, defaults to 7): - Frames per second. The rate at which the generated images shall be exported to a video after generation. - Note that Stable Diffusion Video's UNet was micro-conditioned on fps-1 during training. + Frames per second. The rate at which the generated images shall be exported to a video after + generation. Note that Stable Diffusion Video's UNet was micro-conditioned on fps-1 during training. motion_bucket_id (`int`, *optional*, defaults to 127): Used for conditioning the amount of motion for the generation. The higher the number the more motion will be in the video. noise_aug_strength (`float`, *optional*, defaults to 0.02): - The amount of noise added to the init image, the higher it is the less the video will look like the init image. Increase it for more motion. + The amount of noise added to the init image, the higher it is the less the video will look like the + init image. Increase it for more motion. decode_chunk_size (`int`, *optional*): - The number of frames to decode at a time. Higher chunk size leads to better temporal consistency at the expense of more memory usage. By default, the decoder decodes all frames at once for maximal - quality. For lower memory usage, reduce `decode_chunk_size`. + The number of frames to decode at a time. Higher chunk size leads to better temporal consistency at the + expense of more memory usage. By default, the decoder decodes all frames at once for maximal quality. + For lower memory usage, reduce `decode_chunk_size`. num_videos_per_prompt (`int`, *optional*, defaults to 1): The number of videos to generate per prompt. generator (`torch.Generator` or `List[torch.Generator]`, *optional*): @@ -398,7 +406,8 @@ def __call__( A function that is called at the end of each denoising step during inference. The function is called with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. - `callback_kwargs` will include a list of all tensors as specified by `callback_on_step_end_tensor_inputs`. + `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. callback_on_step_end_tensor_inputs (`List`, *optional*): The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the @@ -411,8 +420,9 @@ def __call__( Returns: [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] or `tuple`: - If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] is returned, - otherwise a `tuple` of (`List[List[PIL.Image.Image]]` or `np.ndarray` or `torch.FloatTensor`) is returned. + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] is + returned, otherwise a `tuple` of (`List[List[PIL.Image.Image]]` or `np.ndarray` or `torch.FloatTensor`) + is returned. """ # 0. Default height and width to unet height = height or self.unet.config.sample_size * self.vae_scale_factor diff --git a/src/diffusers/plus_pipelines/ella/pipeline_ella.py b/src/diffusers/plus_pipelines/ella/pipeline_ella.py index 9962d2c378b6..0a568c6f7a19 100644 --- a/src/diffusers/plus_pipelines/ella/pipeline_ella.py +++ b/src/diffusers/plus_pipelines/ella/pipeline_ella.py @@ -98,8 +98,8 @@ def retrieve_timesteps( scheduler (`SchedulerMixin`): The scheduler to get timesteps from. num_inference_steps (`int`): - The number of diffusion steps used when generating samples with a pre-trained model. If used, - `timesteps` must be `None`. + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. device (`str` or `torch.device`, *optional*): The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. timesteps (`List[int]`, *optional*): @@ -850,10 +850,10 @@ def __call__( not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. - Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding - if `do_classifier_free_guidance` is set to `True`. - If not provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): @@ -1391,18 +1391,15 @@ def encode_prompt( """ if ( - hasattr(self.text_encoder.config, "use_attention_mask") - and self.text_encoder.config.use_attention_mask + hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask ): attention_mask = uncond_input.attention_mask.to(device) else: attention_mask = None negative_prompt_embeds = self.text_encoder( - uncond_input.input_ids.to(device), - attention_mask=attention_mask, - ) - negative_prompt_embeds = negative_prompt_embeds[0] + uncond_input.input_ids.to(device), attention_mask=attention_mask, + ) negative_prompt_embeds = negative_prompt_embeds[0] """ if do_classifier_free_guidance: # duplicate unconditional embeddings for each generation per prompt, using mps friendly method @@ -1781,10 +1778,10 @@ def __call__( not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): - Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. - Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding - if `do_classifier_free_guidance` is set to `True`. - If not provided, embeddings are computed from the `ip_adapter_image` input argument. + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): diff --git a/src/diffusers/plus_pipelines/pipeline_utils.py b/src/diffusers/plus_pipelines/pipeline_utils.py index 25285b34338b..1925bb7c12c2 100644 --- a/src/diffusers/plus_pipelines/pipeline_utils.py +++ b/src/diffusers/plus_pipelines/pipeline_utils.py @@ -537,7 +537,8 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P allowed by Git. custom_revision (`str`, *optional*): The specific model version to use. It can be a branch name, a tag name, or a commit id similar to - `revision` when loading a custom pipeline from the Hub. Defaults to the latest stable 🤗 Diffusers version. + `revision` when loading a custom pipeline from the Hub. Defaults to the latest stable 🤗 Diffusers + version. mirror (`str`, *optional*): Mirror source to resolve accessibility issues if you’re downloading a model in China. We do not guarantee the timeliness or safety of the source, and you should refer to the mirror site for more @@ -1716,8 +1717,8 @@ def disable_freeu(self): def fuse_qkv_projections(self, unet: bool = True, vae: bool = True): """ - Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, - key, value) are fused. For cross-attention modules, key and value projection matrices are fused. + Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value) + are fused. For cross-attention modules, key and value projection matrices are fused. diff --git a/tests/models/test_attention_processor.py b/tests/models/test_attention_processor.py index 15d7b51a1cf3..2489604274b4 100644 --- a/tests/models/test_attention_processor.py +++ b/tests/models/test_attention_processor.py @@ -1,7 +1,10 @@ +import tempfile import unittest +import numpy as np import torch +from diffusers import DiffusionPipeline from diffusers.models.attention_processor import Attention, AttnAddedKVProcessor